# Get cantons and coordinates for universities

Import libraries. Here we use a very useful library, `requests_cache`, which stores the result of every HTTP request in a local database (SQLite by default) in order to **eliminate redundant requests**. We can store the responses for later use because we are querying **static information**, such as the location of a place, meaning the returned results won't change in later requests.

In [1]:
import pandas as pd
import numpy as np
import requests
import requests_cache
import csv

from collections import Counter


# Install a cache named 'ada_cache' for the HTTP requests made below, so
# re-running the notebook does not repeat lookups that were already answered.
requests_cache.install_cache('ada_cache')

Notice the csv file uses **semicolon** as its delimiter

In [2]:
# The grant export separates its fields with semicolons rather than commas.
csv_data = pd.read_csv('P3_GrantExport.csv', delimiter=';')

Extract out the two columns we are concerned about

In [3]:
# Keep every row, but only the university name and the approved amount.
data = csv_data.loc[:, ['University', 'Approved Amount']]

In [62]:
data_list = data.to_dict('records')  # convert the dataframe to a list of dicts, one dict per row

In [6]:
import os

# The Google API key is read from the GOOGLE_API_KEY environment variable so
# the secret never has to be written into the notebook. An unset variable
# yields an empty key rather than an error at this point.
KEY = os.environ.get('GOOGLE_API_KEY', '')
# Google Maps web-service endpoints, both queried for JSON responses:
# address -> geocoding result, and free-text place search.
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
TEXT_URL = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

In [36]:
def _parse_amount(raw):
    """
    convert one 'Approved Amount' cell to a float

    params:
        raw: the cell value; only strings containing a '.' are treated as amounts

    return:
        the amount as a float, or None when the cell does not hold a decimal amount
    """
    # Check the type before the substring test: `'.' in raw` raises TypeError
    # for a non-string cell (e.g. a float NaN standing in for a missing value).
    if not isinstance(raw, str) or '.' not in raw:
        return None
    try:
        return float(raw)
    except ValueError:
        return None


def _dump_universities_csv(universities, path='universities.csv'):
    """
    write the universities collected so far to a csv file

    params:
        universities (dict): university name -> dict with 'amount', 'canton', 'location'
        path (str): output file, one row per university
    """
    # newline='' is what the csv module requires of files handed to csv.writer;
    # an explicit utf-8 encoding keeps non-ASCII names from depending on the locale.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for name, info in universities.items():
            writer.writerow([name,
                             "{0:.2f}".format(info['amount']),
                             info['canton'],
                             info['location']['lat'],
                             info['location']['lng']
                            ])


def get_university_amount(data_list):
    """
    find the total funding amount for a university 
    
    params:
        data_list (list): a list of dicts containing university name and amount for one project
        
    return:
        universities (dict): contains university name, total amount of funding, 
                             canton of the university, latitude and longitude
        cnt (Counter): count how many times we missed the data for one university

    When the text search reports OVER_QUERY_LIMIT, the universities gathered so
    far are written to 'universities.csv' and the same (universities, cnt)
    pair is returned early with the partial results.
    """
    cnt = Counter()
    universities = {}
    for d in data_list:
        university = d['University']
        # The amount is only inspected when the name is a string, so a row
        # with a non-string name never needs an 'Approved Amount' entry.
        amount = _parse_amount(d['Approved Amount']) if isinstance(university, str) else None
        if amount is None:
            cnt['not_university'] += 1
            continue

        uni_fullname = university.split(' - ')[0]  # drop the ' - ABBR' suffix
        if uni_fullname in universities:
            universities[uni_fullname]['amount'] += amount
            continue

        # First sighting of this university: look up its location info.
        text_params = {
            'key': KEY,
            'query': uni_fullname
        }
        text_res = requests.get(TEXT_URL, params=text_params).json()  # we use google json API
        status = text_res['status']
        if status == 'OVER_QUERY_LIMIT':
            _dump_universities_csv(universities)
            # Same tuple shape as the normal exit so callers can always unpack two values.
            return universities, cnt
        if status != 'OK':
            cnt[university] += 1
            continue

        address = text_res['results'][0]['formatted_address']  # get the first formatted address
        # send second request to get canton, location etc.
        geo_params = {
            'key': KEY,
            'address': address
        }
        geocode_res = requests.get(GEOCODE_URL, params=geo_params).json()
        geo_results = geocode_res.get('results') or []
        if not geo_results:
            # The geocoder returned nothing usable; record it as a missed
            # university, like a failed text search, instead of raising IndexError.
            cnt[university] += 1
            continue

        first_res = geo_results[0]   # get the first result
        if 'geometry' not in first_res:
            cnt['no_geometry'] += 1
            continue
        geometry = first_res['geometry']
        if 'location' not in geometry:
            cnt['no_location'] += 1
            continue

        canton = None
        for addr in first_res.get('address_components', []):
            # Match the type anywhere in the list, not only in first position.
            if 'administrative_area_level_1' in addr.get('types', []):
                canton = addr['long_name']
        universities[uni_fullname] = {
            'location': geometry['location'],
            'amount': amount,
            'canton': canton
        }  # add a new university

    return universities, cnt

# Pass the list of per-row dicts built from the dataframe (`data_list`).
uni, c = get_university_amount(data_list)

We can check the returned Counter object to see the universities whose information we couldn't get from the Google API. The values are the number of times each one appears in the data.

In [37]:
# Show the miss counts returned by get_university_amount.
c

Counter({'Eidg. Forschungsanstalt für Wald,Schnee,Land - WSL': 223,
         'Eidg. Material und Prüfungsanstalt - EMPA': 238,
         'Fachhochschule Nordwestschweiz (ohne PH) - FHNW': 225,
         'Firmen/Privatwirtschaft - FP': 492,
         'Forschungsanstalten Agroscope - AGS': 135,
         'Forschungskommission SAGW': 1,
         'Haute école pédagogique BE, JU, NE - HEPBEJUNE': 7,
         'NPO (Biblioth., Museen, Verwalt.) - NPO': 1473,
         'Nicht zuteilbar - NA': 2595,
         'Physikal.-Meteorolog. Observatorium Davos - PMOD': 48,
         'Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP': 2,
         'Schweizer Kompetenzzentrum Sozialwissensch. - FORS': 30,
         'Staatsunabh. Theologische Hochschule Basel - STHB': 3,
         'Swiss Center for Electronics and Microtech. - CSEM': 28,
         'Swiss Institute of Bioinformatics - SIB': 31,
         'Weitere Institute - FINST': 43,
         'Weitere Spitäler - ASPIT': 81,
         'not_university': 13091})

Considering there are only a few missed universities, we decided to add them **manually**. Notice there are some invalid university names listed in the following:

1. Firmen/Privatwirtschaft - FP, meaning companies/private sector in German
2. NPO (Biblioth., Museen, Verwalt.) - NPO, meaning non-profit organization
3. Nicht zuteilbar - NA, meaning not assignable in German
4. Weitere Institute - FINST, meaning other institutes in German
5. Weitere Spitäler - ASPIT, meaning other hospitals in German
6. not_university, meaning there's no university name or amount for this project in P3 data

so we only add the rest. In addition, we found that **Pädag. Hochschule Tessin** and **Forschungsanstalten Agroscope** have multiple locations, and we don't know which office received the funding, so we simply ignore these two.

We use the following function to calculate the total amount for each missing university

In [56]:
def generate_record(university):
    """
    sum the approved amounts recorded for one university

    params:
        university (str): university name, as it appears in the 'University' column

    return:
        a tuple: the university name and its total amount formatted with two decimals
    """
    # Select this university's rows and drop the placeholder rows in one mask.
    is_target = data.University == university
    has_amount = data['Approved Amount'] != 'data not included in P3'
    amounts = data.loc[is_target & has_amount, 'Approved Amount'].astype('float')
    total = amounts.sum()

    return university, "{0:.2f}".format(total)

In [60]:
# Print the (name, total amount) record for every key of the miss counter.
for missed_name in c:
    print(generate_record(missed_name))

('Fachhochschule Nordwestschweiz (ohne PH) - FHNW', '42771914.12')
('Haute école pédagogique BE, JU, NE - HEPBEJUNE', '627380.00')
('NPO (Biblioth., Museen, Verwalt.) - NPO', '334130583.79')
('Firmen/Privatwirtschaft - FP', '111686719.90')
('Nicht zuteilbar - NA', '142425719.57')
('Swiss Institute of Bioinformatics - SIB', '11583219.00')
('Eidg. Material und Prüfungsanstalt - EMPA', '58574515.92')
('not_university', '0.00')
('Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP', '159317.00')
('Schweizer Kompetenzzentrum Sozialwissensch. - FORS', '34735816.00')
('Physikal.-Meteorolog. Observatorium Davos - PMOD', '12098436.00')
('Weitere Institute - FINST', '9256736.00')
('Forschungskommission SAGW', '100000.00')
('Staatsunabh. Theologische Hochschule Basel - STHB', '17300.00')
('Swiss Center for Electronics and Microtech. - CSEM', '18068246.00')
('Forschungsanstalten Agroscope - AGS', '33115719.00')
('Weitere Spitäler - ASPIT', '10749808.00')
('Eidg. Forschungsanstalt für Wald,Schnee,Lan

So finally we have a result csv containing **70 unique universities** of **46058 records** from original P3 data.

In [7]:
# Load the per-university result table from results.csv and show it.
results = pd.read_csv('results.csv')
results

Unnamed: 0,university,amount,canton,latitude,longitude
0,Forschungsinstitut für biologischen Landbau,7.442410e+06,Hessen,50.119810,8.639162
1,Facoltà di Teologia di Lugano,8.000000e+03,Ticino,46.010731,8.958132
2,Schweiz. Hochschule für Logopädie Rorschach,4.296000e+05,Sankt Gallen,47.472613,9.494863
3,Université de Fribourg,4.575262e+08,Fribourg,46.806211,7.151754
4,Universität St. Gallen,6.919495e+07,Sankt Gallen,47.431611,9.374611
5,Eidg. Hochschulinstitut für Berufsbildung,2.086572e+06,Bern,47.001482,7.450728
6,HES de Suisse occidentale,3.416297e+07,Jura,47.362654,7.351270
7,Fachhochschule Ostschweiz,4.437056e+06,St. Gallen,47.424563,9.374897
8,Pädagogische Hochschule Zürich,3.298346e+06,Zürich,47.377639,8.534637
9,SUP della Svizzera italiana,2.404008e+07,Ticino,46.010731,8.958132
