2011 - Employment Permit by County

In [8]:
# Load the raw 2011 "permits issued by county" workbook from the author's local drive.
import pandas as pd

# Unlike the 2009 and 2010 sources (.xls), the 2011 source is an .xlsx file.
raw_xlsx_path = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/raw_data/2011/permits-issued-by-county-2011.xlsx"

# Per the author's note, the skipped row holds the per-year summary of the
# values, which is not wanted in a county-level table.
rows_to_skip = [1]

df = pd.read_excel(raw_xlsx_path, skiprows=rows_to_skip)

In [9]:
# Fill every blank "Year" cell with the year this file covers (stored as the string "2011").
df = df.assign(Year=df["Year"].fillna("2011"))

In [10]:
# Harmonise the pre-2020 column layout with the project-wide schema.
# Files from 2020 onwards only provide an "Issued" column (no "New"/"Renewals" split),
# so here "Total" becomes "Issued" and the split columns are discarded.
# "County/Country" is shortened to "County" because this analysis is about counties.
column_renames = {"Total": "Issued", "County/Country": "County"}
obsolete_columns = ["New", "Renewal"]

df = df.rename(columns=column_renames)
df = df.drop(columns=obsolete_columns)

# Author's note: the "Obs" column was created by the author to check that every
# county belongs to Ireland and to record issues found when cross-checking with
# other databases.

In [11]:
# This part of the project looks at counties only, so every row carrying a note
# in "Obs" is folded into a single "Others" row.

# True where "Obs" holds a value. Per the author's notes, these rows are
# countries or Northern Ireland counties rather than counties of Ireland.
flagged = df["Obs"].notna()

# Independent copy of the flagged rows, all relabelled as "Others".
df_others = df.loc[flagged].assign(County="Others")

# Sum the permit counts of the flagged rows per (Year, County).
df_others_grouped = (
    df_others
    .groupby(["Year", "County"], as_index=False)[["Issued", "Refused", "Withdrawn"]]
    .sum()
)

# The remaining rows (no "Obs" note) are kept unchanged.
df_main = df.loc[~flagged].copy()

# Stack the untouched rows on top of the aggregated "Others" row(s), then drop
# "Obs", which is not needed in the final file.
df = pd.concat([df_main, df_others_grouped], ignore_index=True).drop(columns=["Obs"])

In [12]:
# Build the primary key "id_county" as <Year><County>.
# Both parts are cast to strings; County is trimmed of leading/trailing
# whitespace, and any inner whitespace run becomes "_" in the key only.
year_text = df["Year"].astype(str)
county_text = df["County"].astype(str).str.strip()

df["Year"] = year_text
df["County"] = county_text
df["id_county"] = year_text + county_text.str.replace(r"\s+", "_", regex=True)

# Put the columns in the project's standard order, keeping only those that exist.
cols = ["id_county", "Year", "County", "Issued", "Refused", "Withdrawn"]
present_in_order = [name for name in cols if name in df.columns]
df = df[present_in_order]

In [13]:
# Save the cleaned table as CSV: per the author, the format is lightweight and
# easy to use with libraries such as pandas and sqlalchemy.
output_csv_path = r"G:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/2011/permits-issued-by-county-2011.csv"
df.to_csv(output_csv_path, index=False)

# Show the final table for a visual check.
print(df)

        id_county  Year     County  Issued  Refused  Withdrawn
0      2011Carlow  2011     Carlow      31        7          1
1       2011Cavan  2011      Cavan      44        6          0
2       2011Clare  2011      Clare      81       20          4
3        2011Cork  2011       Cork     417       64         12
4     2011Donegal  2011    Donegal      65       12          6
5      2011Dublin  2011     Dublin    2717      534        108
6      2011Galway  2011     Galway     206       36          4
7       2011Kerry  2011      Kerry      89       20          2
8     2011Kildare  2011    Kildare     324       51         20
9    2011Kilkenny  2011   Kilkenny      53       15          2
10      2011Laois  2011      Laois      31        2          0
11    2011Leitrim  2011    Leitrim       3        4          0
12   2011Limerick  2011   Limerick     151       17          6
13   2011Longford  2011   Longford      24        5          1
14      2011Louth  2011      Louth      73       15    