2018 - Employment Permit by County

In [1]:
# Load the raw 2018 spreadsheet from the folder on my personal computer
import pandas as pd

# Unlike the 2009 and 2010 files (which used the .xls extension), this year's
# file is .xlsx and has a different file name.
raw_xlsx = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/raw_data/2018/permits-by-county-2018.xlsx"

# Skip the sheet line at 0-based index 1. According to the original note, the
# skipped content was the per-year summary of the values (not interesting here)
# plus a row that made no sense.
# TODO confirm that skipping a single index really covers both of those rows.
rows_to_skip = [1]

df = pd.read_excel(raw_xlsx, skiprows=rows_to_skip)

In [2]:
# Every missing "Year" value is filled with the year of this file (as the text "2018")
df = df.assign(Year=df["Year"].fillna("2018"))

In [3]:
# Files from before 2020 have a different column layout: they carry "New",
# "Renewal" and "Total", whereas the files from 2020 onwards only have "Issued".
# To keep one standard across the project, "Total" is renamed to "Issued" and
# the "New"/"Renewal" breakdown is dropped. "County/Country" is shortened to
# "County" in the same step.
df = (
    df
    .rename(columns={"Total": "Issued", "County/Country": "County"})
    .drop(columns=["New", "Renewal"])
)

# Note on the "Obs" column: I created it to check that every county belongs to
# Ireland and to record any issue found when cross-checking other databases.
# (No cell in this notebook creates it, so it presumably lives in the
# spreadsheet itself - TODO confirm.)

In [4]:
# This part of the project only looks at counties, so every other place is
# collapsed into a single "Others" row.

# Rows with something written in "Obs" are the ones to collapse (per the
# original notes: countries, or counties from Northern Ireland).
has_obs = df["Obs"].notna()

# Work on a copy of the flagged rows and give all of them the same label,
# which makes the aggregation below trivial
df_others = df.loc[has_obs].copy()
df_others["County"] = "Others"

# Add up the numeric columns of the flagged rows per (Year, County)
df_others_grouped = (
    df_others
    .groupby(["Year", "County"], as_index=False)[["Issued", "Refused", "Withdrawn"]]
    .sum()
)

# The rows without any "Obs" note are kept exactly as they are
df_main = df.loc[~has_obs].copy()

# Untouched rows first, aggregated "Others" rows after them, with a fresh index
df = pd.concat([df_main, df_others_grouped], ignore_index=True)

# "Obs" is not needed in the final file
df = df.drop(columns=["Obs"])

In [5]:
# Build the primary key "id_county" = Year followed directly by the County name.
# Both parts are converted to text first. County loses its leading/trailing
# whitespace; inside the key only, any remaining run of whitespace becomes "_".
year_text = df["Year"].astype(str)
county_text = df["County"].astype(str).str.strip()
county_key = county_text.str.replace(r"\s+", "_", regex=True)

df = df.assign(Year=year_text, County=county_text, id_county=year_text + county_key)

# Put the columns in the final order, silently skipping any that are absent
cols = ["id_county", "Year", "County", "Issued", "Refused", "Withdrawn"]
present_cols = [name for name in cols if name in df.columns]
df = df.loc[:, present_cols]

In [6]:
# The result is saved as .csv because it is lighter and easy to work with in
# libraries such as pandas and sqlalchemy. The row index is not written.
output_csv = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/2018/permits-issued-by-county-2018.csv"
df.to_csv(output_csv, index=False)

# Show the exported table
print(df)

        id_county  Year     County  Issued  Refused  Withdrawn
0      2018Carlow  2018     Carlow      18        2          4
1       2018Cavan  2018      Cavan     182        8         10
2       2018Clare  2018      Clare      90       18          6
3        2018Cork  2018       Cork    1145      123         57
4     2018Donegal  2018    Donegal     186       21         18
5      2018Dublin  2018     Dublin    7341      600        281
6      2018Galway  2018     Galway     567       58         21
7       2018Kerry  2018      Kerry     222       11          9
8     2018Kildare  2018    Kildare     375       68         14
9    2018Kilkenny  2018   Kilkenny     192       10         10
10      2018Laois  2018      Laois      71        6          3
11    2018Leitrim  2018    Leitrim      22        2          2
12   2018Limerick  2018   Limerick     546       56         15
13   2018Longford  2018   Longford      49       14          1
14      2018Louth  2018      Louth     419       31    