2024 - Employment Permit by County

In [1]:
# Reading the raw data in the folder on my personal computer
import pandas as pd

df = pd.read_excel(r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/raw_data/2024/permits-by-county-2024.xlsx", # This case is using .xlsx and has a different file name, 2010 and 2009 were using the extension .xls
                   header=0,
                   skiprows=[1] # I needed to skip these rows because I had the summarization of the values by year (which was uninteresting)
)

In [2]:
# As the structure of the columns were modified after 2020, I had to rename automatically the first one by "County"
df.rename(columns={"Unnamed: 0": "County"}, inplace=True)

In [3]:
# Creating a new column because of the new structure where the year is on top of the table
df["Year"] = 2024

# Sorting the year column to be the first one 
df = df[["Year"] + [col for col in df.columns if col != "Year"]]

In [4]:
# As this part of project is focused only in the counties, I had to group all missed places by "Others"

# I had to made a copy of the df with the filtered rows by not being "NaN" (because some cases are about countries or counties from North Ireland)
df_others = df[df["Obs"].notna()].copy()
# Replacing all values by just "Others" to facilitate the analysis
df_others["County"] = "Others"
# Summarizing the columns
df_others_grouped = df_others.groupby(["Year", "County"], as_index=False)[["Issued", "Refused", "Withdrawn"]].sum()
# Creating a dataframe with only the "Others" row
df_main = df[df["Obs"].isna()].copy()
# Concatenating both dataframes, original and the modified copy
df = pd.concat([df_main, df_others_grouped], ignore_index=True)
# As I don't need to use the "Obs" column, I removed for the final file
df.drop(columns=["Obs"], inplace=True)

In [5]:
# Creating the Primary Key
# Ensure Year and County are strings and remove spaces from County
df["Year"] = df["Year"].astype(str)
df["County"] = df["County"].astype(str).str.strip()
df["id_county"] = df["Year"] + df["County"].str.replace(r"\s+", "_", regex=True)

# Reordering columns
cols = ["id_county", "Year", "County", "Issued", "Refused", "Withdrawn"]
df = df[[c for c in cols if c in df.columns]]

In [6]:
# I used the extension .csv because is lighter and easy to work with some libraries like pandas, sqlalchemy
df.to_csv(r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/2024/permits-issued-by-county-2024.csv", index=False)

print(df)

        id_county  Year     County   Issued  Refused  Withdrawn
0      2024Carlow  2024     Carlow    233.0       27       11.0
1       2024Cavan  2024      Cavan    573.0       27        9.0
2       2024Clare  2024      Clare    657.0       63       15.0
3        2024Cork  2024       Cork   3293.0      195       91.0
4     2024Donegal  2024    Donegal    428.0       27       26.0
5      2024Dublin  2024     Dublin  19141.0      967      507.0
6      2024Galway  2024     Galway   1269.0       85       25.0
7       2024Kerry  2024      Kerry    542.0       29       10.0
8     2024Kildare  2024    Kildare   2342.0      122       57.0
9    2024Kilkenny  2024   Kilkenny    433.0       30       16.0
10      2024Laois  2024      Laois    193.0       14       16.0
11    2024Leitrim  2024    Leitrim     67.0        8        2.0
12   2024Limerick  2024   Limerick   1905.0       86       57.0
13   2024Longford  2024   Longford    147.0       17        5.0
14      2024Louth  2024      Louth    80