2020 - Employment Permit by County

In [1]:
# Load the raw 2020 workbook from the folder on my personal computer.
import pandas as pd

# Unlike the 2009 and 2010 files (which were .xls), this one is .xlsx and has a different file name.
raw_file = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/raw_data/2020/permits-by-county-2020.xlsx"

df = pd.read_excel(
    raw_file,
    header=1,
    # Skip the row holding the per-year summary of the values; it is not needed for this analysis.
    skiprows=[2],
)

In [2]:
# In this file layout the year sits above the table instead of in a column,
# so it is added explicitly as a constant column.
df["Year"] = 2020

# Move "Year" to the front while keeping the remaining columns in their current order.
remaining = [name for name in df.columns if name != "Year"]
df = df[["Year", *remaining]]

In [3]:
# This part of the project looks only at counties, so every row that carries a
# note in "Obs" (per the author: countries or Northern Ireland counties) is
# collapsed into a single "Others" row.

# True for rows that have something written in "Obs".
obs_flagged = df["Obs"].notna()

# Rows without an "Obs" note are kept exactly as they are.
df_main = df[~obs_flagged].copy()

# Rows with an "Obs" note are relabelled so that they all share the county name "Others".
df_others = df[obs_flagged].assign(County="Others")

# Sum the permit counts of the relabelled rows (one row per Year / "Others").
df_others_grouped = (
    df_others
    .groupby(["Year", "County"], as_index=False)[["Issued", "Refused", "Withdrawn"]]
    .sum()
)

# Stack the untouched rows and the "Others" total, then drop "Obs",
# which is not wanted in the final file.
df = (
    pd.concat([df_main, df_others_grouped], ignore_index=True)
    .drop(columns=["Obs"])
)

In [4]:
# Build the primary key "id_county" as <Year><County>.
# Both parts are cast to strings first; County is stripped of leading/trailing
# whitespace, and any inner whitespace run is replaced by "_" in the key only.
year_text = df["Year"].astype(str)
county_text = df["County"].astype(str).str.strip()

df["Year"] = year_text
df["County"] = county_text
df["id_county"] = year_text + county_text.str.replace(r"\s+", "_", regex=True)

# Put the columns in their final order, ignoring any that are not present.
cols = ["id_county", "Year", "County", "Issued", "Refused", "Withdrawn"]
present = [name for name in cols if name in df.columns]
df = df[present]

In [5]:
# The result is saved as .csv because it is lighter and easy to work with
# from libraries such as pandas and sqlalchemy.
output_file = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/2020/permits-issued-by-county-2020.csv"
df.to_csv(output_file, index=False)

# Show the exported table for a visual check.
print(df)

        id_county  Year     County  Issued  Refused  Withdrawn
0      2020Carlow  2020     Carlow    62.0        3        3.0
1       2020Cavan  2020      Cavan   316.0       44       10.0
2       2020Clare  2020      Clare   136.0       30       11.0
3        2020Cork  2020       Cork  1243.0       91       44.0
4     2020Donegal  2020    Donegal   205.0       11       10.0
5      2020Dublin  2020     Dublin  8520.0      461      321.0
6      2020Galway  2020     Galway   590.0       50       31.0
7       2020Kerry  2020      Kerry   235.0       15       10.0
8     2020Kildare  2020    Kildare   572.0       31       20.0
9    2020Kilkenny  2020   Kilkenny   229.0        6        9.0
10      2020Laois  2020      Laois    82.0        3        NaN
11    2020Leitrim  2020    Leitrim    23.0        3        2.0
12   2020Limerick  2020   Limerick   670.0       39       38.0
13   2020Longford  2020   Longford    86.0        1        3.0
14      2020Louth  2020      Louth   418.0       12    