# Preprocessing SNSF Public Data

In [43]:
import os
import numpy as np
import pandas as pd
import feather
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## Gather

In [2]:
# Set to True to re-download the raw exports; False reuses the files already on disk.
UPDATE_DATA = False

# Folder holding the raw CSV exports.
folder_name = 'rawdata'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(folder_name, exist_ok=True)

In [46]:
# grants and grantees
# Export file names: grants first, persons second (the loading cell indexes this list).
file_names = ["P3_GrantExport.csv", "P3_PersonExport.csv"]

if UPDATE_DATA:
    for k in file_names:
        url_grant = "http://p3.snf.ch/P3Export/" + k
        print(url_grant)
        # A timeout keeps a stalled connection from hanging the notebook indefinitely.
        response = requests.get(url_grant, timeout=60)
        assert response.status_code == 200, (
            "status code for " + k + " not ok: " + str(response.status_code))

        # Store the response bytes unchanged under the raw-data folder.
        with open(os.path.join(folder_name, k), mode="wb") as file:
            file.write(response.content)

In [47]:
# Both exports are read as semicolon-separated files: grants first, persons second.
grants_csv = os.path.join(folder_name, file_names[0])
person_csv = os.path.join(folder_name, file_names[1])

grants_raw = pd.read_csv(grants_csv, sep=';')
person_raw = pd.read_csv(person_csv, sep=';')

## Assess

In [48]:
# Fixed seed so a Restart & Run All shows the same three rows every time.
grants_raw.sample(3, random_state=0)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
46074,128977,IZK0Z1_128977,The Microeconomics of Successful Design,The Microeconomics of Successful Design,Steinert Martin,International short research visits,Careers,International Institute of Management in Techn...,Switzerland,University of Fribourg - FR,10204,Science of management,"Humanities and Social Sciences;Economics, law",10204/20502,01.09.2009,30.11.2009,9030.0,engineering design process; value chain analys...
34595,107324,32CO30-107324,2nd international conference: clinical ethics ...,2nd international conference: clinical ethics ...,Reiter-Theil Stella,Scientific Conferences,Science communication,Klinische Ethik Universitätsspital Basel Unive...,Switzerland,University of Basel - BS,30701,Internal Medicine,Biology and Medicine;Clinical Medicine,30701,01.12.2004,31.05.2005,10000.0,
59852,157615,PP00P2_157615,Theoretical investigation of photocatalytic wa...,Theoretical investigation of photocatalytic wa...,Aschauer Ulrich,SNSF Professorships,Careers,Departement für Chemie und Biochemie Universit...,Switzerland,University of Berne - BE,20301,Physical Chemistry,"Mathematics, Natural- and Engineering Sciences...",20301/20505/20404,01.01.2016,31.12.2019,1426477.0,Surface Properties; Perovskite; Oxynitride; De...


In [49]:
# Print column dtypes and non-null counts for the raw grants table.
grants_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74519 non-null  int64 
 1   Project Number String         74519 non-null  object
 2   Project Title                 74519 non-null  object
 3   Project Title English         31400 non-null  object
 4   Responsible Applicant         74519 non-null  object
 5   Funding Instrument            74519 non-null  object
 6   Funding Instrument Hierarchy  74479 non-null  object
 7   Institution                   68860 non-null  object
 8   Institution Country           68794 non-null  object
 9   University                    74514 non-null  object
 10  Discipline Number             74519 non-null  int64 
 11  Discipline Name               74519 non-null  object
 12  Discipline Name Hierarchy     74020 non-null  object
 13  All disciplines 

In [50]:
# Fixed seed so a Restart & Run All shows the same three rows every time.
person_raw.sample(3, random_state=0)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
26606,Ellemers,Naomi,female,Universiteit Leiden Faculteit der Sociale Wete...,AK Leiden,629617,,,154264.0,,,,
76280,Pianezzi,Fabian,male,,,644610,,,,,,149453.0,
83411,Roth,Christian,male,Computational Biology Unit University of Bergen,Postboks,135813,,68405.0,,,,,


In [51]:
# Print column dtypes and non-null counts for the raw person table.
person_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Last Name                          111903 non-null  object
 1   First Name                         111896 non-null  object
 2   Gender                             111903 non-null  object
 3   Institute Name                     54186 non-null   object
 4   Institute Place                    54083 non-null   object
 5   Person ID SNSF                     111903 non-null  int64 
 6   OCRID                              7092 non-null    object
 7   Projects as responsible Applicant  28898 non-null   object
 8   Projects as Applicant              18934 non-null   object
 9   Projects as Partner                5300 non-null    object
 10  Projects as Practice Partner       735 non-null     object
 11  Projects as Employee               82000 non-null   

### Tidiness

#### `grants_raw` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline`, ... `Discipline Name Hierarchy` are confusing
- Details about `Institution` out of scope
- `University` contains both long and short names: details out of scope

#### `person_raw` (ie. `PersonExport`) table

- **`Projects as ...` columns mix variables and observations: the role is encoded in the column name and the project numbers in the values**
- Details about `Institute` out of scope

#### Quality

- spaces in column names

##### `grants_raw` (ie. `GrantExport`) table

- **`Project Number` and `Project Number String` are redundant**
- **`Project Number String` encodes division information?**
- **`Responsible Applicant` not an uid**
- **`Start Date` and `End Date` string, not date type**
- **`Approved Amount` not numeric**
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)

##### `person_raw` (ie. `PersonExport`) table

- typo in column name: `OCRID` should be `ORCID`
- gender not categorical variable

## Clean

In [65]:
# good practice: work on renamed copies so the raw frames stay untouched.
# Column names are lower-cased and their spaces replaced by underscores.
grants = grants_raw.rename(columns=lambda name: name.lower().replace(' ', '_'))
person = person_raw.rename(columns=lambda name: name.lower().replace(' ', '_'))

In [66]:
# https://github.com/zambujo/p3data/issues/4
# grantee table: one row per (person, role, project) built from the wide
# `projects_as_*` columns of the person table.
role_columns = [
    'projects_as_responsible_applicant',
    'projects_as_applicant',
    'projects_as_partner',
    'projects_as_practice_partner',
    'projects_as_employee',
    'projects_as_contact_person',
]

grantee = (
    person
    # wide -> long: the source column name becomes `role`
    .melt(id_vars='person_id_snsf',
          value_vars=role_columns,
          var_name='role',
          value_name='project_number')
    .dropna()
    # each cell holds a ';'-separated list of project numbers: one row per number
    .assign(project_number=lambda frame: frame['project_number'].str.split(';'))
    .explode('project_number')
    .reset_index(drop=True)
    # drop the leading 12 characters ('projects_as_') from the role label
    .assign(role=lambda frame: frame['role'].str[12:])
    .drop_duplicates()
    # entries that are not numeric become NaN and are removed by the next step
    .assign(project_number=lambda frame: pd.to_numeric(frame['project_number'], errors='coerce'))
    .dropna()
    .astype({'project_number': int, 'role': 'category'})
)

assert grantee['project_number'].dtype.name == 'int64', 'project number column not an integer'
assert grantee['role'].dtype.name == 'category', 'role column not a category'


In [67]:
# person table
# Keep only the identifying columns. `astype` on the selection returns a new
# frame, so the category conversion no longer assigns into a slice of the wider
# table (which raised pandas' SettingWithCopyWarning).
person = (
    person[['person_id_snsf', 'gender', 'first_name', 'last_name']]
    .astype({'gender': 'category'})
)
assert person.gender.dtype.name == 'category', 'gender column not a category'
person.gender.value_counts()

male      70007
female    41896
Name: gender, dtype: int64

In [68]:
# Print dtypes and non-null counts of the two tidy person-side tables.
for frame in (grantee, person):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255507 entries, 0 to 258790
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   person_id_snsf  255507 non-null  int64   
 1   role            255507 non-null  category
 2   project_number  255507 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 6.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   person_id_snsf  111903 non-null  int64   
 1   gender          111903 non-null  category
 2   first_name      111896 non-null  object  
 3   last_name       111903 non-null  object  
dtypes: category(1), int64(1), object(2)
memory usage: 2.7+ MB


In [72]:
# https://github.com/zambujo/p3data/issues/1
# Drop the last six characters of the project number string, then strip the
# separators, leaving the code prefix.
# NOTE(review): assumes the trailing project number is always six characters — confirm.
# regex=True is explicit because '[-_ ]' is a character class: pandas >= 2.0
# treats the pattern as a literal string by default and would remove nothing.
grants['string_code'] = (
    grants.project_number_string
    .str[:-6]
    .str.replace('[-_ ]', '', regex=True)
)
# Keyword form: the positional `axis` argument of drop() was removed in pandas 2.0.
grants = grants.drop(columns='project_number_string')

In [73]:
# not clear what to test... [TODO]
# Most frequent (string_code, funding_instrument) pairs, largest count first.
(
    grants
    .groupby(["string_code", "funding_instrument"])
    .size()
    .reset_index(name="count")
    .sort_values(by='count', ascending=False)
    .head()
)

Unnamed: 0,string_code,funding_instrument,count
181,2000,Project funding (Div. I-III),6072
286,3100,Project funding (Div. I-III),4997
183,200020,Project funding (Div. I-III),3506
188,200021,Project funding (Div. I-III),3306
292,31003A,Project funding (Div. I-III),2677


In [75]:
# https://github.com/zambujo/p3data/issues/6
# Entries that are not numeric become NaN instead of raising.
grants['approved_amount'] = pd.to_numeric(grants['approved_amount'], errors='coerce')

# The export writes dates as day.month.year (e.g. 30.11.2009). Without an
# explicit format pandas reads ambiguous values such as 01.09.2009 month-first
# (9 January instead of 1 September), so the format is pinned here.
DATE_FORMAT = '%d.%m.%Y'
grants['start_date'] = pd.to_datetime(grants.start_date, format=DATE_FORMAT)
grants['end_date'] = pd.to_datetime(grants.end_date, format=DATE_FORMAT)

assert grants.approved_amount.dtype.name == 'float64', 'approved amount column not a float'
assert grants.start_date.dtype.name == 'datetime64[ns]', 'start_date column not datetime type'
assert grants.end_date.dtype.name == 'datetime64[ns]', 'end_date column not datetime type'

In [77]:
# Split the ';'-separated discipline hierarchy into its two levels.
discipline_levels = grants['discipline_name_hierarchy'].str.split(';', expand=True)
grants[['domain', 'topic']] = discipline_levels

# Columns stored as categoricals, each with the message used if the check fails.
discipline_checks = {
    'domain': 'domain column not a category',
    'topic': 'topic column not a category',
    'discipline_name': 'discipline name column not a category',
}
for column in discipline_checks:
    grants[column] = grants[column].astype('category')

for column, message in discipline_checks.items():
    assert grants[column].dtype.name == 'category', message

In [80]:
# Split the ';'-separated funding instrument hierarchy into its three levels.
programme_levels = grants['funding_instrument_hierarchy'].str.split(';', expand=True)
grants[['programme', 'programme_details', 'programme_specifics']] = programme_levels

# Columns stored as categoricals, each with the message used if the check fails.
programme_checks = {
    'programme': 'programme column not a category',
    'programme_details': 'programme details column not a category',
    'programme_specifics': 'programme specifics column not a category',
    'funding_instrument': 'funding instrument column not a category',
}
for column in programme_checks:
    grants[column] = grants[column].astype('category')

for column, message in programme_checks.items():
    assert grants[column].dtype.name == 'category', message

In [82]:
# Columns kept in the tidy grants table, in output order.
tidy_columns = [
    'project_number',
    'programme',
    'funding_instrument',
    'domain',
    'topic',
    'discipline_name',
    'discipline_number',
    'start_date',
    'end_date',
    'approved_amount',
]
grants = grants[tidy_columns]
grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   project_number      74519 non-null  int64         
 1   programme           74479 non-null  category      
 2   funding_instrument  74519 non-null  category      
 3   domain              74020 non-null  category      
 4   topic               70748 non-null  category      
 5   discipline_name     74519 non-null  category      
 6   discipline_number   74519 non-null  int64         
 7   start_date          74517 non-null  datetime64[ns]
 8   end_date            74517 non-null  datetime64[ns]
 9   approved_amount     61387 non-null  float64       
dtypes: category(5), datetime64[ns](2), float64(1), int64(2)
memory usage: 3.4 MB


In [41]:
# minimal set of tidy datasets
# Create the output folder first: to_feather does not create missing
# directories, so the export failed on a fresh checkout without `data/`.
output_folder = 'data'
os.makedirs(output_folder, exist_ok=True)

# reset_index() gives each frame the default index feather requires; the
# previous index is written out as an `index` column.
grants.reset_index().to_feather(os.path.join(output_folder, 'grants.feather'))
person.reset_index().to_feather(os.path.join(output_folder, 'person.feather'))
grantee.reset_index().to_feather(os.path.join(output_folder, 'grantee.feather'))

See also: [exploration.html](./exploration.html) for exploratory steps, [slides](./index.html) for insights