2023 - Employment Permit by County

In [1]:
# Reading the raw data in the folder on my personal computer
import pandas as pd

# The 2023 workbook is an .xlsx with its own file name
# (the 2009 and 2010 files used the older .xls extension).
raw_xlsx_path = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/raw_data/2023/permits-by-county-2023.xlsx"

# header=1   -> the sheet's second row (index 1) supplies the column names.
# skiprows=[2] -> the row at index 2 is dropped; per the author's note it only
#                 holds the per-year summary of the values, which is not needed here.
df = pd.read_excel(raw_xlsx_path, header=1, skiprows=[2])

In [2]:
# The column layout changed after 2020 and the first column arrives without a
# header (pandas labels it "Unnamed: 0"), so give it its proper name: "County".
df = df.rename(columns={"Unnamed: 0": "County"})

In [3]:
# In the new layout the year sits above the table instead of in a column,
# so tag every row with it explicitly.
df = df.assign(Year=2023)

# Move "Year" to the front; the remaining columns keep their relative order.
df = df[["Year", *(name for name in df.columns if name != "Year")]]

In [4]:
# This part of the project focuses only on the counties, so every row that is
# not a regular county entry is folded into a single "Others" row.

# Rows with a value in "Obs" are the flagged ones (per the author's note these
# are countries or counties from Northern Ireland).
has_obs = df["Obs"].notna()

# Copy of the flagged rows, all relabelled "Others" so they aggregate together.
df_others = df.loc[has_obs].assign(County="Others")
# Sum the permit counts of the flagged rows into one row per year.
count_cols = ["Issued", "Refused", "Withdrawn"]
df_others_grouped = df_others.groupby(["Year", "County"], as_index=False)[count_cols].sum()
# Regular county rows (no "Obs" value), kept unchanged.
df_main = df.loc[~has_obs].copy()
# Stack the county rows with the aggregated "Others" row, then discard "Obs",
# which is not needed in the final file.
df = pd.concat([df_main, df_others_grouped], ignore_index=True).drop(columns=["Obs"])

In [5]:
# Build the primary key "id_county" = <Year><County>.
# Both parts are cast to strings first; County is trimmed at the edges and any
# run of inner whitespace becomes a single underscore inside the key only
# (the "County" column itself keeps its inner spaces).
# The keyword arguments are evaluated in order, so "id_county" is built from
# the already-converted "Year" and "County" columns.
df = df.assign(
    Year=lambda frame: frame["Year"].astype(str),
    County=lambda frame: frame["County"].astype(str).str.strip(),
    id_county=lambda frame: frame["Year"] + frame["County"].str.replace(r"\s+", "_", regex=True),
)

# Final column order; any name missing from the frame is silently skipped.
cols = ["id_county", "Year", "County", "Issued", "Refused", "Withdrawn"]
present_cols = [name for name in cols if name in df.columns]
df = df[present_cols]

In [6]:
# The cleaned table is saved as .csv because it is lightweight and easy to
# load from libraries such as pandas and sqlalchemy.
output_csv_path = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/2023/permits-issued-by-county-2023.csv"
df.to_csv(output_csv_path, index=False)

# Show the exported table as a final visual check.
print(df)

        id_county  Year     County   Issued  Refused  Withdrawn
0      2023Carlow  2023     Carlow    161.0        7        3.0
1       2023Cavan  2023      Cavan    312.0       18        5.0
2       2023Clare  2023      Clare    496.0       30        6.0
3        2023Cork  2023       Cork   2613.0      123       49.0
4     2023Donegal  2023    Donegal    387.0       19       14.0
5      2023Dublin  2023     Dublin  15401.0      752      265.0
6      2023Galway  2023     Galway   1387.0       85       35.0
7       2023Kerry  2023      Kerry    384.0       26       15.0
8     2023Kildare  2023    Kildare   1734.0       53       29.0
9    2023Kilkenny  2023   Kilkenny    491.0       24       10.0
10      2023Laois  2023      Laois    213.0       14        2.0
11    2023Leitrim  2023    Leitrim     54.0        5        2.0
12   2023Limerick  2023   Limerick   1600.0       84       34.0
13   2023Longford  2023   Longford    120.0       13        2.0
14      2023Louth  2023      Louth    64