# Immigration and emigration analysis

## Analysis of the main immigration and emigration trends in recent years

In [1]:
# We first import the necessary libraries for our analysis
import pandas as pd
import numpy as np
import os

In [2]:
# Load the provided data: every CSV in the population folder whose name starts
# with "imm" becomes a module-level DataFrame named after the file (without the
# ".csv" extension); df_lst keeps the list of those names.
population_dir = '../Sources/3.-Population/'
sources_list = [
    entry for entry in os.listdir(population_dir)
    if entry.startswith("imm") and entry.endswith('.csv')
]
df_lst = []
for file in sources_list:
    frame_name = file[:-4]  # strip the 4-character ".csv" suffix
    globals()[frame_name] = pd.read_csv(population_dir + file)
    df_lst.append(frame_name)

# External revenue data, with the two columns used later renamed to English.
revenue_neighborhoods = pd.read_csv(
    '../Sources/External_Sources/_distribucio_territorial_renda_familiar.csv'
).rename(
    columns={'Índex RFD Barcelona = 100': 'Revenue_Index', 'Població': 'Population'}
)

In [3]:
# Keep only the 2017 rows of each dataset.
# .copy() makes every result an independent DataFrame instead of a filtered
# slice of its source, so columns can be added to it later without pandas
# raising SettingWithCopyWarning.
revenue_neighborhoods_2017 = revenue_neighborhoods[revenue_neighborhoods["Any"] == 2017].copy()
immigrants_by_nationality_2017 = immigrants_by_nationality[immigrants_by_nationality["Year"] == 2017].copy()
immigrants_emigrants_by_sex_2017 = immigrants_emigrants_by_sex[immigrants_emigrants_by_sex["Year"] == 2017].copy()
immigrants_emigrants_by_age_2017 = immigrants_emigrants_by_age[immigrants_emigrants_by_age["Year"] == 2017].copy()

In [4]:
# Preview one of the immigration DataFrames (2017 rows only): one row per
# neighborhood and nationality, with the count in the "Number" column.
immigrants_by_nationality_2017

Unnamed: 0,Year,District Code,District Name,Neighborhood Code,Neighborhood Name,Nationality,Number
0,2017,1,Ciutat Vella,1,el Raval,Spain,1109
1,2017,1,Ciutat Vella,2,el Barri Gòtic,Spain,482
2,2017,1,Ciutat Vella,3,la Barceloneta,Spain,414
3,2017,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Spain,537
4,2017,2,Eixample,5,el Fort Pienc,Spain,663
...,...,...,...,...,...,...,...
11761,2017,10,Sant Martí,70,el Besòs i el Maresme,No information,0
11762,2017,10,Sant Martí,71,Provençals del Poblenou,No information,0
11763,2017,10,Sant Martí,72,Sant Martí de Provençals,No information,0
11764,2017,10,Sant Martí,73,la Verneda i la Pau,No information,0


In [5]:
# Summary statistics of the numeric columns of the 2017 revenue dataset.
revenue_neighborhoods_2017.describe()

Unnamed: 0,Any,Codi_Districte,Codi_Barri,Population,Revenue_Index
count,73.0,73.0,73.0,73.0,73.0
mean,2017.0,6.246575,37.0,22262.150685,93.671233
std,0.0,2.807633,21.217131,14664.585752,42.709775
min,2017.0,1.0,1.0,611.0,38.6
25%,2017.0,4.0,19.0,10422.0,65.1
50%,2017.0,7.0,37.0,20649.0,82.9
75%,2017.0,8.0,55.0,30622.0,105.7
max,2017.0,10.0,73.0,58315.0,248.8


In [10]:
# Classify every neighborhood into one of 6 revenue categories based on its
# Revenue_Index (the source column is labelled "Barcelona = 100").
# Each entry is (exclusive upper bound, label); values at or above the last
# bound fall into "Very rich".
revenue_bounds = [
    (63, "Very poor"),
    (79, "Poor"),
    (100, "Low middle class"),
    (126, "High middle class"),
    (159, "Rich"),
]


def revenue_category(index_value):
    """Return the revenue category label for a single Revenue_Index value."""
    for upper_bound, label in revenue_bounds:
        if index_value < upper_bound:
            return label
    return "Very rich"


# assign() builds a new DataFrame, so the column is never written onto a
# filtered slice (no SettingWithCopyWarning) and re-running the cell is safe.
revenue_neighborhoods_2017 = revenue_neighborhoods_2017.assign(
    Category=revenue_neighborhoods_2017["Revenue_Index"].map(revenue_category)
)

# Total population and mean revenue index per category, largest population first.
pop_by_class = (
    revenue_neighborhoods_2017
    .groupby("Category")[["Population", "Revenue_Index"]]
    .aggregate({"Population": "sum", "Revenue_Index": "mean"})
    .sort_values(by="Population", ascending=False)
)
# Share of the total population, rendered as a percentage string.
population_share = pop_by_class["Population"] / pop_by_class["Population"].sum()
pop_by_class["% of total pop"] = [str(round(share * 100, 2)) + '%' for share in population_share]
pop_by_class.to_excel('../Results.xls', sheet_name='Population by revenues')
pop_by_class

Unnamed: 0_level_0,Population,Revenue_Index,% of total pop
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Low middle class,443565,87.081818,27.29%
High middle class,416741,107.553846,25.64%
Poor,282609,70.336364,17.39%
Very poor,210499,49.95625,12.95%
Very rich,181195,196.428571,11.15%
Rich,90528,144.0,5.57%


In [11]:
# Attach the revenue category and population of each neighborhood to the three
# 2017 immigration/emigration datasets.
revenue_columns = revenue_neighborhoods_2017[["Category", "Population", "Codi_Barri"]]


def merge_revenue(frame):
    """Left-join the revenue columns onto `frame`, matching neighborhood codes."""
    return frame.merge(
        revenue_columns,
        left_on="Neighborhood Code",
        right_on="Codi_Barri",
        how="left",
    )


immigrants_by_nationality_merged = merge_revenue(immigrants_by_nationality_2017)
immigrants_emigrants_by_age_merged = merge_revenue(immigrants_emigrants_by_age_2017)
# Previously a chained assignment stored the *nationality* merge in this
# variable (and set a stray attribute on the by-sex DataFrame); the by-sex data
# is now merged itself.
# NOTE(review): assumes the by-sex dataset has a "Neighborhood Code" column like
# the other two — confirm.
immigrants_emigrants_by_sex_merged = merge_revenue(immigrants_emigrants_by_sex_2017)

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Check non-null counts after the left merge: rows whose neighborhood code has
# no match in the revenue data have NaN in Category, Population and Codi_Barri.
immigrants_by_nationality_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11766 entries, 0 to 11765
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               11766 non-null  int64  
 1   District Code      11766 non-null  int64  
 2   District Name      11766 non-null  object 
 3   Neighborhood Code  11766 non-null  int64  
 4   Neighborhood Name  11766 non-null  object 
 5   Nationality        11766 non-null  object 
 6   Number             11766 non-null  int64  
 7   Category           11607 non-null  object 
 8   Population         11607 non-null  float64
 9   Codi_Barri         11607 non-null  float64
dtypes: float64(2), int64(4), object(4)
memory usage: 1011.1+ KB


In [13]:
# Discard, in place, every row containing a NaN in each merged dataset.
# Per the original analysis note, these rows belong to an unidentified
# neighborhood with only 2 immigrants in 2017.
for merged_frame in (
    immigrants_by_nationality_merged,
    immigrants_emigrants_by_age_merged,
    immigrants_emigrants_by_sex_merged,
):
    merged_frame.dropna(inplace=True)

### 1- Analysis of immigration and emigration by revenue level


In [29]:
# Immigration by revenue level.
# The count column is called "Number" in the source data; everything below
# expects "Immigrants". The rename is done here, non-inplace, so the cell works
# on a fresh kernel and can be re-run safely (renaming an absent column is a
# no-op). The helper code columns are left in place; nothing in this cell needs
# them removed.
immigrants_by_nationality_merged = immigrants_by_nationality_merged.rename(
    columns={'Number': 'Immigrants'}
)

# Total immigrants per revenue category.
immigrants_nat_summ = immigrants_by_nationality_merged.groupby('Category')[['Immigrants']].sum()
# Share of all immigrants, rendered as a percentage string.
immigrant_share = immigrants_nat_summ["Immigrants"] / immigrants_nat_summ["Immigrants"].sum()
immigrants_nat_summ["% of total immigrants"] = [str(round(share * 100, 2)) + '%' for share in immigrant_share]
immigrants_nat_summ.to_excel('../Results.xls', sheet_name='Immigrants by revenue')
immigrants_nat_summ

In [33]:
# Find the 10 nationalities with the most immigrants, then show, for each of
# them and each revenue category, their share of all immigrants.
immigrants_nat = (
    immigrants_by_nationality_merged
    .groupby(['Nationality'])[['Immigrants']]
    .sum()
    .nlargest(10, 'Immigrants')
)
top_nationalities = immigrants_nat.index.tolist()

# Flag the rows that belong to one of the top-10 nationalities.
immigrants_by_nationality_merged["Top 10"] = (
    immigrants_by_nationality_merged["Nationality"].isin(top_nationalities)
)

total_immigrants = immigrants_by_nationality_merged["Immigrants"].sum()
top_10 = (
    immigrants_by_nationality_merged.loc[immigrants_by_nationality_merged["Top 10"]]
    .groupby(['Nationality', 'Category'])[['Immigrants']]
    .sum()
)
# Share of *all* immigrants (not only the top 10), as a two-decimal percentage string.
top_10["% of immigration"] = (top_10["Immigrants"] / total_immigrants).map(
    lambda share: "{0:.2f}%".format(share * 100)
)
# Keep only the percentage column and pivot the categories into columns.
top_10 = top_10.drop(columns="Immigrants").unstack()
top_10

Unnamed: 0_level_0,% of immigration,% of immigration,% of immigration,% of immigration,% of immigration,% of immigration
Category,High middle class,Low middle class,Poor,Rich,Very poor,Very rich
Nationality,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
China,1.12%,0.83%,0.66%,0.17%,0.36%,0.25%
Colombia,0.94%,0.96%,0.61%,0.23%,0.39%,0.22%
France,0.89%,0.73%,0.33%,0.23%,0.05%,0.52%
Honduras,0.33%,0.61%,0.59%,0.06%,1.13%,0.12%
Italy,2.23%,2.17%,0.73%,0.46%,0.27%,0.63%
Morocco,0.40%,0.67%,0.38%,0.06%,0.38%,0.10%
Pakistan,0.58%,0.74%,0.83%,0.04%,0.83%,0.03%
Peru,0.65%,0.77%,0.47%,0.13%,0.39%,0.13%
Spain,10.25%,9.96%,5.58%,2.28%,3.96%,4.29%
Venezuela,0.98%,0.86%,0.46%,0.23%,0.32%,0.25%


In [34]:
# Write the three result tables to a single workbook, one sheet per table.
result_sheets = {
    'Population by revenue': pop_by_class,
    'Immigrants by revenue': immigrants_nat_summ,
    'Top 10 nationalities': top_10,
}
with pd.ExcelWriter('../Results.xls') as writer:
    for sheet_title, result_frame in result_sheets.items():
        result_frame.to_excel(writer, sheet_name=sheet_title)