In [1]:
# Importing the pandas library as pd

import pandas as pd

In [2]:
# Reading the CSV files 

unemployment = pd.read_csv('unemployment.csv')

# Checking how many rows & columns we have

unemployment.shape

(14208, 9)

In [3]:
# Inspecting the data

unemployment.head()

Unnamed: 0,Year,Month,District Code,District Name,Neighborhood Code,Neighborhood Name,Gender,Demand_occupation,Number
0,2017,January,1,Ciutat Vella,1,el Raval,Male,Registered unemployed,2107
1,2017,January,1,Ciutat Vella,2,el Barri Gòtic,Male,Registered unemployed,538
2,2017,January,1,Ciutat Vella,3,la Barceloneta,Male,Registered unemployed,537
3,2017,January,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Male,Registered unemployed,741
4,2017,January,2,Eixample,5,el Fort Pienc,Male,Registered unemployed,630


In [35]:
# We create a new dataframe: we will drop Month, District Code, District Name 

unemployment_filtered = unemployment[['Year','Neighborhood Code','Neighborhood Name','Gender','Demand_occupation','Number']]


In [36]:
# We drop years 2013 and 2014 since these are not in our other dataset that we use for comparison

unemployment_filtered = unemployment_filtered[unemployment_filtered['Year']>2014]

In [37]:
# The new dataframe is named unemployment_filtered

unemployment_filtered.head()

Unnamed: 0,Year,Neighborhood Code,Neighborhood Name,Gender,Demand_occupation,Number
0,2017,1,el Raval,Male,Registered unemployed,2107
1,2017,2,el Barri Gòtic,Male,Registered unemployed,538
2,2017,3,la Barceloneta,Male,Registered unemployed,537
3,2017,4,"Sant Pere, Santa Caterina i la Ribera",Male,Registered unemployed,741
4,2017,5,el Fort Pienc,Male,Registered unemployed,630


In [38]:
# Shape of the new dataframe unemployment_filtered

unemployment_filtered.shape

(10656, 6)

In [39]:
# Check with .describe() method

unemployment_filtered.describe()

Unnamed: 0,Year,Neighborhood Code,Number
count,10656.0,10656.0,10656.0
mean,2016.0,37.837838,354.160191
std,0.816535,22.119898,356.028668
min,2015.0,1.0,0.0
25%,2015.0,19.0,97.0
50%,2016.0,37.5,217.0
75%,2017.0,56.0,511.0
max,2017.0,99.0,2688.0


In [40]:
# Check for missing values 
## NaN

unemployment_filtered.isnull().sum() 

Year                 0
Neighborhood Code    0
Neighborhood Name    0
Gender               0
Demand_occupation    0
Number               0
dtype: int64

In [41]:
# Check for other values that do not belong to the dataframe

unemployment_filtered.tail()

Unnamed: 0,Year,Neighborhood Code,Neighborhood Name,Gender,Demand_occupation,Number
10651,2015,70,el Besòs i el Maresme,Female,Unemployment demand,173
10652,2015,71,Provençals del Poblenou,Female,Unemployment demand,132
10653,2015,72,Sant Martí de Provençals,Female,Unemployment demand,197
10654,2015,73,la Verneda i la Pau,Female,Unemployment demand,225
10655,2015,99,No consta,Female,Unemployment demand,27


In [44]:
# to delete 99 neighborhood code

unemployment_filtered = unemployment_filtered[unemployment_filtered["Neighborhood Code"] != 99]
unemployment_filtered.tail()

Unnamed: 0,Year,Neighborhood Code,Neighborhood Name,Gender,Demand_occupation,Number
10650,2015,69,Diagonal Mar i el Front Marítim del Poblenou,Female,Unemployment demand,75
10651,2015,70,el Besòs i el Maresme,Female,Unemployment demand,173
10652,2015,71,Provençals del Poblenou,Female,Unemployment demand,132
10653,2015,72,Sant Martí de Provençals,Female,Unemployment demand,197
10654,2015,73,la Verneda i la Pau,Female,Unemployment demand,225


In [None]:
## NEXT DATASET immigrants by nationality

In [2]:
# import dataset immigrants by nationality

immigrants_nat = pd.read_csv('immigrants_by_nationality.csv')
immigrants_nat.head()

Unnamed: 0,Year,District Code,District Name,Neighborhood Code,Neighborhood Name,Nationality,Number
0,2017,1,Ciutat Vella,1,el Raval,Spain,1109
1,2017,1,Ciutat Vella,2,el Barri Gòtic,Spain,482
2,2017,1,Ciutat Vella,3,la Barceloneta,Spain,414
3,2017,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",Spain,537
4,2017,2,Eixample,5,el Fort Pienc,Spain,663


In [3]:
#Get rid of District code and District Name
immigrants_nat_clean = immigrants_nat[["Year", "Neighborhood Code", "Neighborhood Name", "Nationality", "Number"]]
immigrants_nat_clean = immigrants_nat_clean.sort_values("Year")
immigrants_nat_clean.head()

Unnamed: 0,Year,Neighborhood Code,Neighborhood Name,Nationality,Number
35223,2015,99,No consta,No information,0
27630,2015,29,el Coll,Syria,0
27629,2015,28,Vallcarca i els Penitents,Syria,1
27628,2015,27,el Putxet i el Farró,Syria,3
27627,2015,26,Sant Gervasi - Galvany,Syria,4


In [4]:
immigrants_nat.shape

(35224, 7)

In [5]:
# Look at the datasets using the .describe() method

immigrants_nat_clean.describe()

Unnamed: 0,Year,Neighborhood Code,Number
count,35224.0,35224.0,35224.0
mean,2016.010504,37.837838,7.707273
std,0.810846,22.119174,50.421883
min,2015.0,1.0,0.0
25%,2015.0,19.0,0.0
50%,2016.0,37.5,0.0
75%,2017.0,56.0,2.0
max,2017.0,99.0,1603.0


In [6]:
#Get rid of Nationality without information
immigrants_nat_clean = immigrants_nat_clean[immigrants_nat_clean.Nationality != "No information"]
immigrants_nat_clean.head()

Unnamed: 0,Year,Neighborhood Code,Neighborhood Name,Nationality,Number
27630,2015,29,el Coll,Syria,0
27629,2015,28,Vallcarca i els Penitents,Syria,1
27628,2015,27,el Putxet i el Farró,Syria,3
27627,2015,26,Sant Gervasi - Galvany,Syria,4
27626,2015,25,Sant Gervasi - la Bonanova,Syria,1


In [7]:
#Less 222 rows
immigrants_nat_clean.shape

(35002, 5)

In [8]:
#Get rid of Neighborhood Code that dont have meaning
immigrants_nat_clean = immigrants_nat_clean[immigrants_nat_clean["Neighborhood Code"] != 99]
immigrants_nat_clean.head()

Unnamed: 0,Year,Neighborhood Code,Neighborhood Name,Nationality,Number
27630,2015,29,el Coll,Syria,0
27629,2015,28,Vallcarca i els Penitents,Syria,1
27628,2015,27,el Putxet i el Farró,Syria,3
27627,2015,26,Sant Gervasi - Galvany,Syria,4
27626,2015,25,Sant Gervasi - la Bonanova,Syria,1


In [9]:
#Less 473 rows
immigrants_nat_clean.shape

(34529, 5)

In [10]:
#Number of unique nationalities in the city
n = immigrants_nat_clean["Nationality"].unique()
len(n)

176

In [17]:
#Immigrants that groped by Neighborhoods and the number of imigrants in every neighborhood
#for the year 2017
immigrants_2017 = immigrants_nat_clean[immigrants_nat_clean['Year']== 2017]
                                         
immigrants_2017_byhood = immigrants_2017.groupby(['Neighborhood Code',"Neighborhood Name"], 
                                                 as_index=False).agg({'Number': "sum"})
immigrants_2017_byhood.head()

Unnamed: 0,Neighborhood Code,Neighborhood Name,Number
0,1,el Raval,5398
1,2,el Barri Gòtic,2686
2,3,la Barceloneta,1759
3,4,"Sant Pere, Santa Caterina i la Ribera",2765
4,5,el Fort Pienc,2235


In [16]:
#Immigrants that groped by Neighborhoods and the number of imigrants in every neighborhood
#for the year 2016
immigrants_2016 = immigrants_nat_clean[immigrants_nat_clean['Year']== 2016]
                                         
immigrants_2016_byhood = immigrants_2016.groupby(['Neighborhood Code',"Neighborhood Name"], 
                                                 as_index=False).agg({'Number': "sum"})
immigrants_2016_byhood.head()

Unnamed: 0,Neighborhood Code,Neighborhood Name,Number
0,1,el Raval,4975
1,2,el Barri Gòtic,2197
2,3,la Barceloneta,1547
3,4,"Sant Pere, Santa Caterina i la Ribera",2376
4,5,el Fort Pienc,1990


In [15]:
#Immigrants that groped by Neighborhoods and the number of imigrants in every neighborhood
#for the year 2015
immigrants_2015 = immigrants_nat_clean[immigrants_nat_clean['Year']== 2015]
                                         
immigrants_2015_byhood = immigrants_2015.groupby(['Neighborhood Code',"Neighborhood Name"], 
                                                 as_index=False).agg({'Number': "sum"})
immigrants_2015_byhood.head()

Unnamed: 0,Neighborhood Code,Neighborhood Name,Number
0,1,el Raval,5335
1,2,el Barri Gòtic,2201
2,3,la Barceloneta,1553
3,4,"Sant Pere, Santa Caterina i la Ribera",2551
4,5,el Fort Pienc,2000


In [14]:
#Total immigrants by years
immigrants_2015_total = immigrants_2015_byhood["Number"].sum()
immigrants_2016_total = immigrants_2016_byhood["Number"].sum()
immigrants_2017_total = immigrants_2017_byhood["Number"].sum()

print("Total imigrants in 2017 = ", immigrants_2017_total)
print("Total imigrants in 2016 = ", immigrants_2016_total)
print("Total imigrants in 2015 = ", immigrants_2015_total)

Total imigrants in 2017 =  97280
Total imigrants in 2016 =  85123
Total imigrants in 2015 =  88911


In [18]:
immigrants_nat_clean.tail()
print(immigrants_nat_clean.shape)
immigrants_nat_clean.head()

(34529, 5)


Unnamed: 0,Year,Neighborhood Code,Neighborhood Name,Nationality,Number
27630,2015,29,el Coll,Syria,0
27629,2015,28,Vallcarca i els Penitents,Syria,1
27628,2015,27,el Putxet i el Farró,Syria,3
27627,2015,26,Sant Gervasi - Galvany,Syria,4
27626,2015,25,Sant Gervasi - la Bonanova,Syria,1


In [19]:
# Check if data types make sense

print(immigrants_nat_clean.dtypes)

Year                  int64
Neighborhood Code     int64
Neighborhood Name    object
Nationality          object
Number                int64
dtype: object


In [56]:
print(unemployment_filtered.dtypes)

Year                  int64
Neighborhood Code     int64
Neighborhood Name    object
Gender               object
Demand_occupation    object
Number                int64
dtype: object


In [None]:
#### HERE GROUP BY NEIGHBORHOOD CODE

In [16]:
### THIS IS A DRAFT

# Merge datasets

combined = immigrants_nat.merge(unemployment, on='Neighborhood Code', how='inner')


In [17]:

combined.shape

(6763008, 15)

In [18]:
combined.head()

Unnamed: 0,Year_x,District Code_x,District Name_x,Neighborhood Code,Neighborhood Name_x,Nationality,Number_x,Year_y,Month,District Code_y,District Name_y,Neighborhood Name_y,Gender,Demand_occupation,Number_y
0,2017,1,Ciutat Vella,1,el Raval,Spain,1109,2017,January,1,Ciutat Vella,el Raval,Male,Registered unemployed,2107
1,2017,1,Ciutat Vella,1,el Raval,Spain,1109,2017,January,1,Ciutat Vella,el Raval,Male,Unemployment demand,429
2,2017,1,Ciutat Vella,1,el Raval,Spain,1109,2017,January,1,Ciutat Vella,el Raval,Female,Registered unemployed,1452
3,2017,1,Ciutat Vella,1,el Raval,Spain,1109,2017,January,1,Ciutat Vella,el Raval,Female,Unemployment demand,288
4,2017,1,Ciutat Vella,1,el Raval,Spain,1109,2017,February,1,Ciutat Vella,el Raval,Male,Registered unemployed,2063


In [19]:
# Drop columns that are not interesting for the analysis and save in a new dataframe

combined_new = combined.drop(['Month', 'District Code_x','District Name_x','Neighborhood Name_x'], axis=1)
combined_new.head()


Unnamed: 0,Year_x,Neighborhood Code,Nationality,Number_x,Year_y,District Code_y,District Name_y,Neighborhood Name_y,Gender,Demand_occupation,Number_y
0,2017,1,Spain,1109,2017,1,Ciutat Vella,el Raval,Male,Registered unemployed,2107
1,2017,1,Spain,1109,2017,1,Ciutat Vella,el Raval,Male,Unemployment demand,429
2,2017,1,Spain,1109,2017,1,Ciutat Vella,el Raval,Female,Registered unemployed,1452
3,2017,1,Spain,1109,2017,1,Ciutat Vella,el Raval,Female,Unemployment demand,288
4,2017,1,Spain,1109,2017,1,Ciutat Vella,el Raval,Male,Registered unemployed,2063


In [20]:
# Rename column titles

combined_new = combined_new.rename(columns={'Year_x': 'Year_Unemployment', 'Year_y': 'Year_Immigrant','District Code_y':'District Code', 'District Name_y':'District_Name','Neighborhood Name_y':'Neighborhood Name','Number_y':'Number_Immigrants','Number_x':'Number_Unemployed'})

In [21]:
combined_new.tail()


Unnamed: 0,Year_Unemployment,Neighborhood Code,Nationality,Number_Unemployed,Year_Immigrant,District Code,District_Name,Neighborhood Name,Gender,Demand_occupation,Number_Immigrants
6763003,2015,99,No information,0,2013,99,No consta,No consta,Female,Registered unemployed,4
6763004,2015,99,No information,0,2013,99,No consta,No consta,Male,Registered unemployed,0
6763005,2015,99,No information,0,2013,99,No consta,No consta,Female,Registered unemployed,1
6763006,2015,99,No information,0,2013,99,No consta,No consta,Male,Registered unemployed,1
6763007,2015,99,No information,0,2013,99,No consta,No consta,Female,Registered unemployed,0


In [22]:
# Changing the column order

column_order = ['Neighborhood Code','Neighborhood Name','District Code','District_Name', 

                'Gender','Nationality','Demand_occupation', 

                'Year_Unemployment','Number_Unemployed','Year_Immigrant', 

                'Number_Immigrants'] 

  

combined_new = combined_new[column_order] 


In [23]:
combined_new.head(56)

Unnamed: 0,Neighborhood Code,Neighborhood Name,District Code,District_Name,Gender,Nationality,Demand_occupation,Year_Unemployment,Number_Unemployed,Year_Immigrant,Number_Immigrants
0,1,el Raval,1,Ciutat Vella,Male,Spain,Registered unemployed,2017,1109,2017,2107
1,1,el Raval,1,Ciutat Vella,Male,Spain,Unemployment demand,2017,1109,2017,429
2,1,el Raval,1,Ciutat Vella,Female,Spain,Registered unemployed,2017,1109,2017,1452
3,1,el Raval,1,Ciutat Vella,Female,Spain,Unemployment demand,2017,1109,2017,288
4,1,el Raval,1,Ciutat Vella,Male,Spain,Registered unemployed,2017,1109,2017,2063
5,1,el Raval,1,Ciutat Vella,Male,Spain,Unemployment demand,2017,1109,2017,471
6,1,el Raval,1,Ciutat Vella,Female,Spain,Registered unemployed,2017,1109,2017,1462
7,1,el Raval,1,Ciutat Vella,Female,Spain,Unemployment demand,2017,1109,2017,332
8,1,el Raval,1,Ciutat Vella,Male,Spain,Registered unemployed,2017,1109,2017,2109
9,1,el Raval,1,Ciutat Vella,Male,Spain,Unemployment demand,2017,1109,2017,448


In [24]:
# Check for missing values

combined_new.isnull().sum() 

Neighborhood Code    0
Neighborhood Name    0
District Code        0
District_Name        0
Gender               0
Nationality          0
Demand_occupation    0
Year_Unemployment    0
Number_Unemployed    0
Year_Immigrant       0
Number_Immigrants    0
dtype: int64

In [25]:
# Check for missing values like No consta or No information (LIKE NO?)
