In [None]:
import os
import pandas as pd
import json
import folium
import copy
from itertools import chain
import numpy as np
%matplotlib inline

#### 1. Europe

In [None]:
# Load the unemployment table and keep only the 2016 rows with SEX == "Total",
# skipping every row whose GEO mentions "European" or "Euro"
# (presumably the EU / euro-area aggregates -- verify against the CSV).
europe_unemploy_rate_path = 'datasets/europe_unemploy_ratio.csv'
europe_unemploy_rate = pd.read_csv(europe_unemploy_rate_path)
europe_unemploy_rate = europe_unemploy_rate.loc[
    (europe_unemploy_rate["TIME"] == 2016)
    & (europe_unemploy_rate["SEX"] == "Total")
    & ~europe_unemploy_rate["GEO"].str.contains("European|Euro"),
    ["GEO", "Value"]
]

# Normalise two long-form country labels (presumably so that they match the
# names used by the map geometry -- verify against the topojson).
europe_unemploy_rate.loc[
    europe_unemploy_rate["GEO"].str.contains("Germany"), "GEO"
] = "Germany"
europe_unemploy_rate.loc[
    europe_unemploy_rate["GEO"].str.contains("Former Yugoslav Republic of Macedonia"), "GEO"
] = "The former Yugoslav Republic of Macedonia"

print("We have the unemployement rate the following states: ", europe_unemploy_rate.shape[0])
europe_unemploy_rate.head()

In [None]:
# load the europe data (geometry of the states)
europe_topo_path = 'topojson/europe.topojson.json'
# The context manager closes the file handle as soon as the JSON is parsed;
# the encoding is explicit because JSON text is UTF-8 and the platform default
# may differ.
with open(europe_topo_path, encoding="utf-8") as topo_file:
    topo_data = json.load(topo_file)
# Names of all the states whose boundaries are described by the topology
topo_states = [state["properties"]["NAME"] for state in topo_data["objects"]["europe"]["geometries"]]
print("We draw the boundaries of the following states:")
len(topo_states), ", ".join(topo_states)

In [None]:
# we don't have the data of all the states!
# States drawn on the map that have no row in the unemployment table.
# Sorted so that the displayed list is the same on every run: the iteration
# order of a set of strings is not stable across interpreter sessions.
missing_states = sorted(set(topo_states) - set(europe_unemploy_rate.GEO))
print("We are missing the unemployement rate about these states:")
len(missing_states), ", ".join(missing_states)

In [None]:
def fill_invalid(state, missing=None):
    """Return the fill opacity of the grey overlay for ``state``.

    Parameters
    ----------
    state : str
        Name of the state being styled.
    missing : collection of str, optional
        Names of the states without data. Defaults to the notebook-level
        ``missing_states``, looked up at call time.

    Returns
    -------
    float or int
        0.7 when ``state`` is in ``missing``, otherwise 0.
    """
    if missing is None:
        # Fall back to the global computed in the notebook; no per-call copy
        # of the collection is needed for a membership test.
        missing = missing_states
    return 0.7 if state in missing else 0

In [None]:
def optimize(data):
    """Empty every arc of a topology that no geometry references.

    The arcs of all geometries in all objects of ``data`` are scanned; a
    negative arc index ``i`` is counted as a reference to arc ``~i``. Each
    entry of ``data['arcs']`` that is never referenced is replaced by an
    empty list. ``data`` is modified in place and also returned.
    """
    referenced = set()
    for obj in data['objects'].values():
        for geometry in obj['geometries']:
            # MultiPolygon arcs carry one extra nesting level (polygon -> ring)
            if geometry['type'] == 'MultiPolygon':
                rings = chain.from_iterable(geometry['arcs'])
            else:
                rings = geometry['arcs']

            for ring in rings:
                for arc_index in ring:
                    referenced.add(~arc_index if arc_index < 0 else arc_index)

    for position, _ in enumerate(data['arcs']):
        if position not in referenced:
            data['arcs'][position] = []
    return data

In [None]:
# Centre of the Europe map, passed as `location` to folium.Map
europe_location = [55, 15]

m = folium.Map(location=europe_location, zoom_start=3)

# fill the states with colors depending on their unemployement rate:
# rows of europe_unemploy_rate (GEO, Value) are matched to the geometries
# through their 'properties.NAME' field
m.choropleth(
    geo_data=topo_data, 
    topojson='objects.europe',
    data=europe_unemploy_rate,
    columns=['GEO', 'Value'],
    #threshold_scale=[0, 5, 10, 15, 20, 25],
    key_on='properties.NAME',
    fill_color='BuGn', 
    fill_opacity=0.7, 
#     line_opacity=1,
    legend_name='Percentage of unemployement (%)')

# fill with gray the states of which we have no info and also add a popup to every state to show more information 
# about the states
for state in topo_data['objects']['europe']['geometries']:
    state_name = state["properties"]["NAME"]
    # Rate of this state as a Series: empty when the state has no row in the table
    unemp_rate = europe_unemploy_rate.Value[europe_unemploy_rate.GEO == state_name]
    if unemp_rate.size == 0:
        unemp_rate = "not available"
    else:
        # Only the first matching row is shown, formatted with one decimal
        unemp_rate = "{0:.1f}".format(unemp_rate.iloc[0])

    # Work on a deep copy so that topo_data itself is left untouched, then keep
    # only the current state's geometry in the copy
    tdata = copy.deepcopy(topo_data)
    tdata['objects']['europe']['geometries'] = [state]
    # optimize() empties the arcs that the single remaining geometry does not use
    folium.TopoJson(
        optimize(tdata),
        'objects.europe',
        name=state['id'],
        style_function=lambda geometry: {
#                 'color' : 'transparent',
#                 'fillColor': 'trasparent',
                # grey fill; it is only visible for the states without data,
                # because fill_invalid returns an opacity of 0 for the others
                'fillColor': "#424949", 
                'fillOpacity': fill_invalid(geometry["properties"]["NAME"]),
            }
    ).add_child(folium.Popup("<span style=\"font-weight:bold;\"> Rate: </span> " + unemp_rate)).add_to(m)

#m.save('test.html')

# Last expression of the cell: displays the map
m

#### 2-3. Switzerland
Important notions (from https://www.amstat.ch/v2/definition.jsp?lang=it):
- The unemployment ratio is computed as unemployed people / active population, where the unemployed people are the people who are looking for a job without having one. 
- "Numero dei disoccupati registrati, nel giorno di riferimento (ultimo giorno del mese), diviso per il numero di persone attive, moltiplicato per 100."
- Persone attive: Persone che svolgono un’attività lucrativa di almeno un’ora per settimana e le persone non occupate.
- Il tasso di disoccupazione è calcolato prendendo come denominatore il numero delle persone attive

**Problem (for question 3)**: The site does not allow downloading the unemployment rate broken down by age and nationality at the same time, but only the absolute number of unemployed people broken down in that way. Since we are interested in the ratio of unemployed people, we need the active population of each category Ci, defined as a tuple {Canton, Nationality, age class}, to compute ratio = (number of unemployed people in category Ci) / (active population of category Ci). 
Solution: If we consider three age classes (15-24, 15-49, 50+), we are unable to compute the active population of each category Ci, since the site does not give (or does not have) enough information. However, we can retrieve the active population of the people aged 15-24 on a per Canton and Nationality basis. Therefore, we decided to use only two age classes: 15-24 and 25+.

In order to avoid downloading one dataset per task we decided to proceed as follows:
1. Compute the active population of each category:
    - *Active population divided per Canton and Nationality*: Download the dataset with the unemployment ratio and the number of unemployed people divided by Canton and Nationality. From this dataset we compute the active population of each tuple {Canton, Nationality} as active population = number of unemployed people * 100 / ratio of unemployed people.
    - *Active population of people of age 15-24 divided per Canton and Nationality*: this information can be directly downloadable from the site.
    - *Active population of people of age 25+ divided per Canton and Nationality*: this is easily derived from the previous two. 
2. Download the complete dataset with the number of unemployed people divided by canton, age class and nationality. We clean this dataset and, as explained before, we consider only the two age classes 15-24 and 25+. Then we transform each number of unemployed people into the ratio of unemployed people using the previously computed active population values.
3. To answer the questions we will group by the fields we are interested in.

In [None]:
# Helper: the integers in the datasets come as strings with extra characters
def parseInt(numStr):
    """Build an integer from the digit characters of ``numStr``.

    Every character for which ``str.isdigit`` is false is discarded; the
    remaining characters are joined and converted with ``int``.
    """
    digits_only = "".join(char for char in numStr if char.isdigit())
    return int(digits_only)

In [None]:
def absoluteToRatios(data):
    """Turn the absolute jobseeker counts of ``data`` into percentages.

    The columns "Jobseekers", "Unemployed Jobseekers" and "Employed Jobseekers"
    are each divided by "Active Population", multiplied by 100 and rounded to
    one decimal; they are then renamed with a "Ratio of " prefix.
    ``data`` is modified in place and also returned.
    """
    ratio_names = {
        "Jobseekers": "Ratio of Jobseekers",
        "Unemployed Jobseekers": "Ratio of Unemployed Jobseekers",
        "Employed Jobseekers": "Ratio of Employed Jobseekers"
    }
    active = data["Active Population"]

    # overwrite each count with its percentage of the active population
    for count_column in ratio_names:
        data[count_column] = (data[count_column] * 100 / active).round(1)

    data.rename(columns=ratio_names, inplace=True)
    return data
    

In [None]:
# load the swiss topology (geometry of the cantons)
ch_topo_path = 'topojson/ch-cantons.topojson.json'
# The context manager closes the file handle as soon as the JSON is parsed
with open(ch_topo_path, encoding="utf-8") as topo_file_ch:
    topo_data_ch = json.load(topo_file_ch)
# Names of all the cantons whose boundaries are described by the topology
topo_cantons = [state["properties"]["name"] for state in topo_data_ch["objects"]["cantons"]["geometries"]]
print("We draw the boundaries of the following cantons:")
len(topo_cantons), ", ".join(topo_cantons)

    1. Load the data regarding the unemployment ratio, so as to compute the active population per category.

   - *Active population divided per Canton and Nationality*

In [None]:
path = 'datasets/ch_unemploy_rate_bycanton_bynationality.csv'
ch_ap_canton_nationality = pd.read_csv(path)

# 1. keep only the needed columns, rename them to English and drop the "Totale" rows
column_rename = {
    "Cantone":"Canton", 
    "Nazionalità": "Nationality",
    "Tasso di disoccupazione": "Unemployement rate", 
    "Disoccupati registrati": "Unemployed"
}
# rename() returns a new frame: no in-place mutation of a column slice
ch_ap_canton_nationality = (
    ch_ap_canton_nationality[list(column_rename.keys())]
    .rename(columns=column_rename)
)
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning
ch_ap_canton_nationality = ch_ap_canton_nationality[ch_ap_canton_nationality["Canton"] != "Totale"].copy()

# 2. convert objects to integers
ch_ap_canton_nationality["Unemployed"] = ch_ap_canton_nationality["Unemployed"].apply(parseInt)

# 3. compute the active population: rate = 100 * unemployed / active population,
#    hence active population = 100 * unemployed / rate (truncated to an integer)
ch_ap_canton_nationality["Active Population"] = \
    (100*ch_ap_canton_nationality["Unemployed"]/ch_ap_canton_nationality["Unemployement rate"]).astype("int")

# 4. translate column Nationality: "stranieri" -> "Foreigners", anything else -> "Swiss"
ch_ap_canton_nationality["Nationality"] = ch_ap_canton_nationality["Nationality"].apply(
    lambda natio: "Foreigners" if (natio=="stranieri") else "Swiss")

# 5. Rename the cantons with the names used by the topology.
#    NOTE(review): this is positional -- it assumes the CSV lists the cantons in
#    the same order as the topojson, with exactly two rows per canton. Confirm.
ch_ap_canton_nationality["Canton"] = [c for canton in topo_cantons for c in [canton]*2]

ch_ap_canton_nationality = ch_ap_canton_nationality[["Canton", "Nationality", "Active Population"]]
ch_ap_canton_nationality.head(2)

- *Active population of people of age 15-24 divided per Canton and Nationality*

In [None]:
path = 'datasets/ch_activepopulation_class1_bynationality.csv'
ch_ap_class1_canton_nationality = pd.read_csv(path)

# 0. Add a column Age to indicate that this dataframe is related to people of age 15-24
ch_ap_class1_canton_nationality["Age"] = "15-24"

# 1. keep only the needed columns and rename them to English
column_rename = {
    "Cantone":"Canton", 
    "Nazionalità": "Nationality",
    "Age": "Age",
    "Persone attive (giovane)": "Active Population"
}
# rename() returns a new frame, so the assignments below act on an independent
# object instead of a column slice (no SettingWithCopyWarning)
ch_ap_class1_canton_nationality = (
    ch_ap_class1_canton_nationality[list(column_rename.keys())]
    .rename(columns=column_rename)
)

# 2. convert objects to integers
ch_ap_class1_canton_nationality["Active Population"] = ch_ap_class1_canton_nationality["Active Population"].apply(parseInt)

# 3. translate column Nationality: "stranieri" -> "Foreigners", anything else -> "Swiss"
ch_ap_class1_canton_nationality["Nationality"] = ch_ap_class1_canton_nationality["Nationality"].apply(
    lambda natio: "Foreigners" if (natio=="stranieri") else "Swiss")

# 4. Rename the cantons with the names used by the topology.
#    NOTE(review): this is positional -- it assumes the CSV lists the cantons in
#    the same order as the topojson, with exactly two rows per canton. Confirm.
ch_ap_class1_canton_nationality["Canton"] = [c for canton in topo_cantons for c in [canton]*2]

ch_ap_class1_canton_nationality.head(2)

- *Active population of people of age 25+ divided per Canton and Nationality*

In [None]:
# Active population aged 25+ = total active population - active population aged 15-24.
# The two tables are aligned on their (Canton, Nationality) keys rather than on
# their row index: the total table had rows filtered out, so its index labels
# are not guaranteed to line up with those of the 15-24 table.
keys = ["Canton", "Nationality"]
total_ap = ch_ap_canton_nationality.set_index(keys)["Active Population"]
young_ap = ch_ap_class1_canton_nationality.set_index(keys)["Active Population"]
# reindex() puts the totals in the row order of the 15-24 table
older_ap = total_ap.reindex(young_ap.index) - young_ap

ch_ap_class2_canton_nationality = ch_ap_class1_canton_nationality.copy()
ch_ap_class2_canton_nationality["Age"] = "25+"
# .values: assign by position, older_ap is already in this frame's row order
ch_ap_class2_canton_nationality["Active Population"] = older_ap.values

ch_ap_class2_canton_nationality.head(2)

Finally we can merge this information in one dataframe.

In [None]:
# Stack the 15-24 and the 25+ tables into a single long table
active_population = pd.concat(
    [ch_ap_class1_canton_nationality, ch_ap_class2_canton_nationality]
)
active_population.sort_values(by=['Canton', 'Nationality']).tail(4)

    2. Now that we have the active population values we can collect all the information so that it will be possible to compute the ratios that we need for each one of the following tasks.

In [None]:
path = 'datasets/ch_unemploed_bycanton_byage_bynationality.csv'
ch_unemployed = pd.read_csv(path)
# dataset cleaning:
# 1. keep only the needed columns and rename them to English
column_rename = {
    "Cantone": "Canton",
    "Nazionalità": "Nationality",
    "Classi d'età 15-24, 15-49, 50 anni e più": "Age",     
    "Persone in cerca d'impiego": "Jobseekers",
    "Disoccupati registrati": "Unemployed Jobseekers",
    "Persone in cerca d'impiego non disoccupate": "Employed Jobseekers"
}
# rename() returns a new frame: no in-place mutation of a column slice
ch_unemployed = ch_unemployed[list(column_rename.keys())].rename(columns=column_rename)

# 2. Drop totals: every row holding "Totale" in any column.
#    .copy() makes the filtered frame independent of the original, so the
#    column assignments below do not raise SettingWithCopyWarning
ch_unemployed = ch_unemployed[~(ch_unemployed=="Totale").any(axis=1)].copy()

# 3. Canton's name and nationality translate.
#    NOTE(review): the canton names are assigned by position -- this assumes the
#    CSV lists the cantons in the same order as the topojson, with exactly six
#    rows per canton. Confirm.
canton_names = [name for canton in topo_data_ch['objects']['cantons']["geometries"] for name in [canton["properties"]["name"]]*6]
ch_unemployed["Canton"] = canton_names
ch_unemployed["Nationality"] = ch_unemployed["Nationality"].apply(
    lambda natio: "Foreigners" if (natio=="stranieri") else "Swiss")  

# 4. Convert objects to integers
ch_unemployed["Jobseekers"] = ch_unemployed["Jobseekers"].apply(parseInt)
ch_unemployed["Unemployed Jobseekers"] = ch_unemployed["Unemployed Jobseekers"].apply(parseInt)
ch_unemployed["Employed Jobseekers"] = ch_unemployed["Employed Jobseekers"].apply(parseInt)

# 5. Convert the three age classes to the two we use: class 1 becomes "15-24",
#    every other class becomes "25+"; the counts of the merged classes are summed
ch_unemployed["Age"] = ch_unemployed["Age"].astype("int")
ch_unemployed["Age"] = ch_unemployed["Age"].apply(lambda class_: "15-24" if class_==1 else "25+")
ch_unemployed = ch_unemployed.groupby(by=["Canton", "Nationality", "Age"]).sum().reset_index()

# 6. Add column with active population
ch_unemployed = pd.merge(ch_unemployed, active_population, on=["Canton", "Nationality", "Age"], how='left')

# sort_values() returns a new frame: the result has to be assigned back,
# otherwise the statement has no effect
ch_unemployed = ch_unemployed.sort_values(by=["Canton", "Nationality", "Age"])
ch_unemployed.tail(4)

On the resulting DataFrame we will group by the fields we are interested in, so as to compute the corresponding ratio of unemployed people.

    3. Answer the questions by grouping on the fields we are interested in.

- **question 2**: ... "unemployment rates in Switzerland at a recent date"

In [None]:
# Sum the absolute counts per canton, then turn them into percentages.
# The count columns are selected explicitly: the string columns (Nationality,
# Age) must not take part in the sum, and recent pandas versions no longer
# drop non-numeric columns silently.
count_columns = ["Jobseekers", "Unemployed Jobseekers", "Employed Jobseekers", "Active Population"]
ch_unemploy_rate_canton = absoluteToRatios(
    ch_unemployed.groupby(by="Canton")[count_columns].sum().reset_index())
ch_unemploy_rate_canton.head()

First show only the unemployed people ratio

In [None]:
# Centre of the Swiss maps, passed as `location` to folium.Map
ch_location = [46.8, 8.5]
m = folium.Map(location=ch_location, zoom_start=8)

# Colour each canton according to its ratio of unemployed jobseekers; the rows
# are matched to the geometries through their 'properties.name' field
m.choropleth(
    geo_data=topo_data_ch,
    topojson='objects.cantons',
    key_on='properties.name',
    data=ch_unemploy_rate_canton,
    columns=['Canton', 'Ratio of Unemployed Jobseekers'],
    fill_color='BuGn',
    fill_opacity=0.7,
    legend_name='Percentage of unemployement (%)',
)
m

Then show all the jobseekers, considering the employed ones too.

In [None]:
m = folium.Map(location=ch_location, zoom_start=8)

# fill the cantons with colors depending on their ratio of jobseekers
# (unemployed and employed jobseekers together)
m.choropleth(
    geo_data=topo_data_ch, 
    topojson='objects.cantons',
    data=ch_unemploy_rate_canton,
    columns=['Canton', 'Ratio of Jobseekers'],
    #threshold_scale=[0, 5, 10, 15, 20, 25],
    key_on='properties.name',
    fill_color='BuGn', 
    fill_opacity=0.7, 
#     line_opacity=1,
    # the legend names the plotted quantity: all the jobseekers, not only the unemployed ones
    legend_name='Percentage of jobseekers (%)')
m

TODO: comment on the differences

- **question 3**: ...

In [None]:
# group by Canton and Nationality, then turn the summed counts into percentages.
# The count columns are selected explicitly: the string column Age must not
# take part in the sum, and recent pandas versions no longer drop non-numeric
# columns silently.
count_columns = ["Jobseekers", "Unemployed Jobseekers", "Employed Jobseekers", "Active Population"]
ch_unemploy_rate_canton_nationality = \
    absoluteToRatios(ch_unemployed.groupby(by=["Canton", "Nationality"])[count_columns].sum().reset_index())
ch_unemploy_rate_canton_nationality.head(4)

In [None]:
# pivot on "Ratio of Unemployed Jobseekers" so to compute the ratio afterwards:
# one row per canton, one column per nationality
ch_unemploy_rate_canton_nationality = ch_unemploy_rate_canton_nationality.pivot(
    index="Canton", columns="Nationality", values="Ratio of Unemployed Jobseekers").reset_index()
# Two-level header: the canton column on its own, the two nationality columns
# under "Ratio of Unemployed Jobseekers". Built with from_tuples because the
# `labels` argument of the MultiIndex constructor no longer exists in recent
# pandas versions. The tuples follow the column order after the pivot
# (Canton, Foreigners, Swiss).
ch_unemploy_rate_canton_nationality.columns = pd.MultiIndex.from_tuples([
    ("", "Canton"),
    ("Ratio of Unemployed Jobseekers", "Foreigners"),
    ("Ratio of Unemployed Jobseekers", "Swiss"),
])

ch_unemploy_rate_canton_nationality.head(4)

In [None]:
# Swiss unemployment rate divided by the foreigners' one, rounded to one decimal
ch_unemploy_rate_canton_nationality[("", "Ratio (Swiss/Foreigners)")] = (
    ch_unemploy_rate_canton_nationality[("Ratio of Unemployed Jobseekers", "Swiss")]
    .div(ch_unemploy_rate_canton_nationality[("Ratio of Unemployed Jobseekers", "Foreigners")])
    .round(1)
)
ch_unemploy_rate_canton_nationality.head(4)

In [None]:
m = folium.Map(location=ch_location, zoom_start=8)

# fill the cantons with colors depending on the ratio between the Swiss and the
# foreigners' unemployment rates; the columns are addressed with tuples because
# the dataframe has a two-level column index
m.choropleth(
    geo_data=topo_data_ch, 
    topojson='objects.cantons',
    data=ch_unemploy_rate_canton_nationality,
    columns=[('', 'Canton'), ('', 'Ratio (Swiss/Foreigners)')],
    #threshold_scale=[0, 5, 10, 15, 20, 25],
    key_on='properties.name',
    fill_color='BuGn', 
    fill_opacity=0.7, 
#     line_opacity=1,
    # the plotted value is a ratio of two rates, not a percentage of unemployment
    legend_name='Ratio of unemployment rates (Swiss / Foreigners)')
m

Question 3:  group by Canton, Nationality and age

In [None]:
# Percentages per (Canton, Nationality, Age) group; only the ratio of
# unemployed jobseekers is kept, the group keys stay in the index
ch_unemploy_rate_canton_nationality_age = absoluteToRatios(
    ch_unemployed.groupby(by=["Canton", "Nationality", "Age"]).sum()
)[["Ratio of Unemployed Jobseekers"]]
ch_unemploy_rate_canton_nationality_age.head(4)

TODO: represent the dataframe in a bar plot?... so many bars...

In [None]:
# ch_unemploy_rate_canton_nationality_age.head(8).plot(kind="bar")

TODO: task 4