2019 - Employment Permit by County

In [1]:
# Reading the raw data in the folder on my personal computer
import pandas as pd

# Source workbook for 2019. Unlike the 2009 and 2010 files (which used the .xls extension),
# this one is an .xlsx file and has a different file name.
raw_xlsx_path = r"E:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/raw_data/2019/permits-by-county-2019.xlsx"

df = pd.read_excel(
    raw_xlsx_path,
    header=1,      # row index 1 of the sheet supplies the column names
    skiprows=[2],  # this row holds the summary of the values by year, which is not needed
)

# This sheet is structured differently and carries an empty column:
# keep only the columns whose label does not start with "Unnamed".
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]

In [2]:
# Missing entries in "Year" are filled with the year this file covers (stored as a string).
REPORT_YEAR = "2019"
df = df.assign(Year=df["Year"].fillna(REPORT_YEAR))

In [3]:
# Files before 2020 have a different column structure. From 2020 onwards there are no
# "New" / "Renewal" columns, only "Issued". To keep one standard across the project,
# "Total" is renamed to "Issued" (and "County/Country" to "County"), and the
# "New" / "Renewal" breakdown is dropped.
df = (
    df.rename(columns={"Total": "Issued", "County/Country": "County"})
      .drop(columns=["New", "Renewal"])
)

# Note: the "Obs" column read by the next cell was added by the author to check that every
# county belongs to Ireland and to record issues found when cross-checking other databases.
# No code in this notebook creates it, so it presumably lives in the source workbook.

In [4]:
# This part of the project looks only at the counties, so every other place
# (countries, or counties from Northern Ireland) is collapsed into a single "Others" row.

# Rows with a non-null "Obs" are the ones flagged as not being a regular county.
has_obs = df["Obs"].notna()

# Copy of the flagged rows, all relabelled as "Others" to simplify the analysis.
df_others = df.loc[has_obs].assign(County="Others")

# Sum the numeric columns so the flagged rows become one row per (Year, "Others").
value_columns = ["Issued", "Refused", "Withdrawn"]
df_others_grouped = df_others.groupby(["Year", "County"], as_index=False)[value_columns].sum()

# Copy of the unflagged rows, i.e. the counties that are kept as they are.
df_main = df.loc[~has_obs].copy()

# Stack the county rows and the aggregated "Others" row, then drop "Obs",
# which is not wanted in the final file.
df = pd.concat([df_main, df_others_grouped], ignore_index=True).drop(columns=["Obs"])

In [5]:
# The result is saved as .csv because it is lighter and easy to work with
# from libraries such as pandas and sqlalchemy.
output_csv_path = r"E:/My Drive/ESTUDOS DATA SCIENCE/ie-employment-permit/data/2019/permits-issued-by-county-2019.csv"
df.to_csv(output_csv_path, index=False)  # the row index is not written to the file

# Show the final table that was written.
print(df)

    Year     County  Issued  Refused  Withdrawn
0   2019     Carlow      88       18          3
1   2019      Cavan     299       26         29
2   2019      Clare     197       30          6
3   2019       Cork    1191      100         64
4   2019    Donegal     268       12         11
5   2019     Dublin    9338      616        378
6   2019     Galway     623       53         25
7   2019      Kerry     212       23          7
8   2019    Kildare     339       42         26
9   2019   Kilkenny     152       10         13
10  2019      Laois      88        7          4
11  2019    Leitrim      15        2          0
12  2019   Limerick     555       47         40
13  2019   Longford      87       22         11
14  2019      Louth     361       29         24
15  2019       Mayo     199       17         24
16  2019      Meath     307       51         23
17  2019   Monaghan     283       21         29
18  2019     Offaly     152       53          3
19  2019  Roscommon      85        4    