2016 - Employment Permit by County

In [1]:
# Reading the raw data in the folder on my personal computer
import pandas as pd

# Load the 2016 workbook. This year is distributed as .xlsx; the 2009 and 2010
# files used the .xls extension instead.
# skiprows=[1] drops the row holding the per-year summary of the values,
# which is not needed for this analysis.
df = pd.read_excel(
    io=r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/raw_data/2016/permits-issued-by-county-2016.xlsx",
    skiprows=[1],
)

In [2]:
# Every row with a missing "Year" belongs to this file's year, so fill the gaps with "2016".
df = df.assign(Year=df["Year"].fillna(value="2016"))


In [3]:
# Files published before 2020 use a different column layout from the newer ones:
# the newer files carry a single "Issued" column instead of "New"/"Renewal"/"Total".
# To keep one standard across the project, "Total" is renamed to "Issued"
# ("County/Country" becomes "County") and the "New" and "Renewal" columns are discarded.
df = (
    df
    .rename(columns={"Total": "Issued", "County/Country": "County"})
    .drop(columns=["New", "Renewal"])
)

# I created the column "Obs" to check that every county belongs to Ireland and to note any issues I found when cross-checking against other databases

In [4]:
# This part of the project only looks at counties, so every row that is flagged
# in "Obs" (per the notes: countries, or counties from Northern Ireland) is
# collapsed into a single "Others" entry.

# Flagged rows: take the rows whose "Obs" is filled in and relabel them all as "Others".
df_others = df.loc[df["Obs"].notna()].assign(County="Others")

# Add up the numeric columns so the flagged rows become one row per Year/"Others".
df_others_grouped = (
    df_others
    .groupby(["Year", "County"], as_index=False)[["Issued", "Refused", "Withdrawn"]]
    .sum()
)

# Unflagged rows: the rows with an empty "Obs" are kept unchanged.
df_main = df.loc[df["Obs"].isna()].copy()

# Stack the untouched rows with the aggregated "Others" row, then discard "Obs",
# which is not wanted in the final file.
df = (
    pd.concat([df_main, df_others_grouped], ignore_index=True)
    .drop(columns=["Obs"])
)

In [5]:
# Build the primary key "id_county".
# First cast Year and County to strings and trim leading/trailing whitespace
# from County; then concatenate Year with County, where every run of inner
# whitespace in County is replaced by a single underscore.
df = df.assign(
    Year=lambda frame: frame["Year"].astype(str),
    County=lambda frame: frame["County"].astype(str).str.strip(),
).assign(
    id_county=lambda frame: frame["Year"]
    + frame["County"].str.replace(r"\s+", "_", regex=True),
)

# Put the columns in the project's standard order, keeping only those that exist.
cols = ["id_county", "Year", "County", "Issued", "Refused", "Withdrawn"]
df = df.loc[:, [name for name in cols if name in df.columns]]

In [6]:
# The result is saved as .csv because it is lightweight and easy to consume
# from libraries such as pandas and sqlalchemy. The index is not written.
df.to_csv(
    path_or_buf=r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/2016/permits-issued-by-county-2016.csv",
    index=False,
)

print(df)

        id_county  Year     County  Issued  Refused  Withdrawn
0      2016Carlow  2016     Carlow      14        7          0
1       2016Cavan  2016      Cavan     118        7          1
2       2016Clare  2016      Clare      72       24          1
3        2016Cork  2016       Cork     733      108         16
4     2016Donegal  2016    Donegal     197       17          0
5      2016Dublin  2016     Dublin    5310      688        123
6      2016Galway  2016     Galway     365       76         10
7       2016Kerry  2016      Kerry     128       27          1
8     2016Kildare  2016    Kildare     360       37         13
9    2016Kilkenny  2016   Kilkenny     102        9          2
10      2016Laois  2016      Laois      49        3          3
11    2016Leitrim  2016    Leitrim       3        1          0
12   2016Limerick  2016   Limerick     428       48          8
13   2016Longford  2016   Longford      28        5          0
14      2016Louth  2016      Louth     192       48    