In [None]:
import os

import folium
import numpy as np
import pandas as pd
import requests
%matplotlib inline

In [None]:
# Load the semicolon-separated grant export; its first column becomes the row index.
df = pd.read_csv(
    "P3_GrantExport.csv",
    sep=";",
    index_col=0,
)

In [None]:
# Peek at five randomly chosen rows of the raw export.
df.sample(n=5)

In [None]:
# Columns kept for the rest of the analysis; everything else is discarded.
wanted_columns = [
    "Project Title",
    "Funding Instrument",
    "Institution",
    "University",
    "Start Date",
    "End Date",
    "Approved Amount",
]
df = df[wanted_columns]
df.sample(n=3)

**We can see there are a lot of missing values in the Approved Amount column. They are denoted with a "data not included in P3" string**

In [None]:
# Raw "Approved Amount" value of project 53829, read with a single
# row/column .loc lookup instead of chained indexing.
df.loc[53829, "Approved Amount"]

**About one sixth of the rows have no approved amount. We have no option other than to drop them.**

In [None]:
# Count the "Approved Amount" entries that cannot be parsed as a number.
(
    pd.to_numeric(df["Approved Amount"], errors="coerce")
    .isna()
    .sum()
)

**After inspecting the rows with missing data, it looks like most of the Institutions are not located in Switzerland. The documentation page also indicates that the amounts of all mobility fellowships are missing.**

In [None]:
# Boolean mask: True where "Approved Amount" does not parse as a number.
missing_amounts = pd.to_numeric(df["Approved Amount"], errors="coerce").isna()
# Per-institution row counts among the projects with a missing amount
# (five random institutions are shown).
df.loc[missing_amounts].groupby("Institution").count().sample(n=5)

**The documentation says that this field is missing for mobility, but we notice that there are other Funding Instruments like "Fellowships for prospective researchers" which lack many Amount values.** 

In [None]:
# Per-funding-instrument row counts among the projects with a missing amount.
df.loc[missing_amounts].groupby("Funding Instrument").count()

In [None]:
# Dropping projects with missing amounts
print("Length before drop: ", len(df))
# .copy() gives df its own data, so the column assignments made on df in
# later cells do not write into a filtered slice (SettingWithCopyWarning).
df = df[~missing_amounts].copy()
print("Length after drop: ", len(df))

In [None]:
# Convert the remaining amounts to a numeric dtype. assign() returns a new
# frame, so nothing is written into the filtered slice produced by the drop.
df = df.assign(**{"Approved Amount": pd.to_numeric(df["Approved Amount"])})
df["Approved Amount"].describe()

In [None]:
# Row(s) whose approved amount equals the maximum over all projects
df[df["Approved Amount"].eq(df["Approved Amount"].max())]

In [None]:
# Projects whose approved amount is exactly zero; show five of them at random.
zero_amount_pj = df[df["Approved Amount"].eq(0)]
zero_amount_pj.sample(n=5)

In [None]:
# Number of zero-amount projects (row count of the filtered frame).
zero_amount_pj.shape[0]

**There is a gap in the project numbering; let's see whether there is also a gap in the dates (a missing period).**

In [None]:
# Line plot of the approved amounts against the frame's index.
df["Approved Amount"].plot.line()

In [None]:
# Parse both date columns into datetimes. assign() builds a new frame, which
# avoids assigning into a filtered slice of the original data.
# NOTE(review): if the export stores dates day-first (e.g. dd.mm.yyyy),
# pd.to_datetime needs dayfirst=True to keep day and month apart -- confirm
# against the CSV. The year, which is all the later cells use, is unaffected.
df = df.assign(**{
    "Start Date": pd.to_datetime(df["Start Date"]),
    "End Date": pd.to_datetime(df["End Date"]),
})

In [None]:
# Plot the start year of every project against the frame's index.
# The vectorised .dt accessor replaces a per-row Python lambda.
df["Start Date"].dt.year.plot()

**From our crude analysis, we can see that the data looks inconsistent before 1990 and that the project numbering changed around 2004, but the data looks complete from 1990 to today.**

In [None]:
# Plot the end year of every project against the frame's index.
# The vectorised .dt accessor replaces a per-row Python lambda.
df["End Date"].dt.year.plot()

In [None]:
# Add the calendar year of the start date as its own column, used for the
# per-year grouping below. .dt.year is vectorised, and assign() returns a new
# frame instead of writing into a possibly filtered slice.
df = df.assign(**{"Start Year": df["Start Date"].dt.year})

In [None]:
# Approved amount of every project against its start year; the '.' style
# draws one point per project instead of a connected line.
df.plot(
    x='Start Year',
    y='Approved Amount',
    style='.',
)

In [None]:
# Plot the median approved amount for each start year.
# The amount column is selected before aggregating so that median() is not
# applied to the text and date columns of df as well.
df.groupby("Start Year")[["Approved Amount"]].median().plot()

In [None]:
# Plot the total of the approved amounts for each start year.
# The amount column is selected before aggregating so that sum() is not
# applied to the text and date columns of df as well.
df.groupby("Start Year")[["Approved Amount"]].sum().plot()

**We see that the yearly budget generally increases over time.**

In [None]:
# GeoNames account name sent with every request. It can be overridden through
# the GEONAMES_USERNAME environment variable so a personal account does not
# have to be written into the notebook; the original value stays as fallback.
username = os.environ.get("GEONAMES_USERNAME", "coolestteamada")
# GeoNames JSON search endpoint queried by the lookups below.
GEO_URL = "http://api.geonames.org/searchJSON"

In [None]:
# Probe the search endpoint with a known place name. The timeout keeps the
# cell from blocking forever if the service never answers.
epfl_geo = requests.get(GEO_URL, params={"username": username, "q": "EPFL"}, timeout=10)
epfl_geo.status_code

In [None]:
# Decode the response body and keep only the first entry of its 'geonames'
# list. epfl_geo is rebound from the response object to that entry, so this
# cell cannot be run a second time without re-running the request cell.
geonames_hits = epfl_geo.json()['geonames']
epfl_geo = geonames_hits[0]
epfl_geo

In [None]:
# seems like we can get the canton easily:
# it is read from the 'adminCode1' entry of the search hit kept in epfl_geo
epfl_geo['adminCode1']

In [None]:
def parse_canton(json):
    """Return the 'adminCode1' value of the first hit in a decoded search payload.

    The notebook uses this value as the canton code of the queried place.

    Parameters
    ----------
    json : dict
        Decoded JSON body of a search response; expected to hold a list of
        hits under the 'geonames' key.

    Returns
    -------
    The 'adminCode1' entry of the first hit, or None when the payload has no
    hits, lacks one of the expected keys, or is not subscriptable at all.
    """
    try:
        return json['geonames'][0]['adminCode1']
    except (KeyError, IndexError, TypeError):
        # Only the "payload has no usable result" shapes are mapped to None.
        # Anything else (including KeyboardInterrupt) propagates.
        return None

In [None]:
def get_uni_canton(uni, timeout=10):
    """Query the search endpoint for a university name and return its canton code.

    Parameters
    ----------
    uni : str
        Free-text name sent as the 'q' search parameter.
    timeout : float, optional
        Seconds to wait for the service before requests raises a timeout
        error, so one unanswered request cannot stall a whole batch of lookups.

    Returns
    -------
    Whatever parse_canton extracts from the decoded response, or None when
    the response body is not valid JSON.
    """
    r = requests.get(GEO_URL, params={"username": username, "q": uni}, timeout=timeout)
    try:
        payload = r.json()
    except ValueError:
        # A body that is not JSON carries no search result; treat it the same
        # way parse_canton treats an empty result.
        return None
    return parse_canton(payload)

In [None]:
# Distinct university names, taken from the group keys of a groupby on the
# "University" column; the first five are displayed.
unis = list(df.groupby("University").groups)
unis[:5]

In [None]:
# Number of different universities (unis holds one entry per group key)
len(unis)

**This looks like a reasonable number of calls to our API**

In [None]:
# Percentage of projects whose "University" field is empty
"{0:.2f}%".format(100 * df["University"].isna().sum() / df.shape[0])

In [None]:
# Share of the total approved amount that belongs to projects whose
# "University" field is empty
"{0:.2f}%".format(
    100 * df.loc[df["University"].isna(), "Approved Amount"].sum() / df["Approved Amount"].sum()
)

**This is only about 4% of the projects and 0.4% of the total funding, so we can omit these projects without a large loss of information.**

In [None]:
# One lookup per distinct university name. The result of each lookup
# (None when nothing usable came back) is stored under that name.
canton_dict = {}
for uni in unis:
    canton_dict.update({uni: get_uni_canton(uni)})

In [None]:
# Show the lookup result stored for each university (None marks a lookup
# that produced no usable canton code).
canton_dict.values()

**The lookup does not work for most of the universities.**

In [None]:
# Two-letter codes of the 26 cantons, in alphabetical order.
swiss_cantons_list = [
    "AG", "AI", "AR", "BE", "BL", "BS", "FR", "GE", "GL",
    "GR", "JU", "LU", "NE", "NW", "OW", "SG", "SH", "SO",
    "SZ", "TG", "TI", "UR", "VD", "VS", "ZG", "ZH",
]

In [None]:
# Display the full university -> lookup-result mapping.
canton_dict

**Let's test folium's choropleth with random values for each canton.**

In [None]:
from random import random, seed

# Fix the seed so the placeholder values, and therefore the test map, are
# identical on every run of the notebook.
seed(0)
random_values = [random() for _ in swiss_cantons_list]

# One row per canton: its code and the placeholder value used to colour it.
random_cantons = pd.DataFrame({"Canton": swiss_cantons_list, "Value": random_values})
random_cantons.head()


In [None]:
# TopoJSON file with the canton outlines.
geo_path = 'ch-cantons.topojson.json'

# Base map centred on the given latitude/longitude.
switzerland = folium.Map(location=[46.57, 8], zoom_start=8)

# Colour each canton by its "Value": rows of random_cantons are matched to
# shapes by comparing the "Canton" column with each feature's id.
# NOTE(review): Map.choropleth with geo_path looks like the legacy folium
# API; newer releases may need folium.Choropleth(geo_data=...) -- confirm
# against the installed folium version.
switzerland.choropleth(
    geo_path=geo_path,
    data=random_cantons,
    columns=['Canton', 'Value'],
    key_on='feature.id',
    topojson='objects.cantons',
    fill_color='YlOrRd',
)
switzerland