# Preprocessing SNSF Public Data

In [1]:
import os
import sqlite3

import numpy as np
import pandas as pd
import feather
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## Gather

In [2]:
# Set UPDATE_DATA to True to re-download the raw exports; by default the
# CSV files already present in RAW_DATA_FOLDER are reused.
UPDATE_DATA = False
RAW_DATA_FOLDER = '../rawdata'
DATA_FOLDER = '../data/'

# Create both folders up front: the raw folder receives the downloaded CSVs
# and the data folder must exist before the SQLite file can be created in it.
# exist_ok=True makes the cell safe to re-run.
for folder in (RAW_DATA_FOLDER, DATA_FOLDER):
    os.makedirs(folder, exist_ok=True)

In [3]:
# grants and grantees
file_names = ["P3_GrantExport.csv", "P3_PersonExport.csv"]

# Only hit the network when explicitly requested via UPDATE_DATA.
if UPDATE_DATA:
    for file_name in file_names:
        url_grant = "http://p3.snf.ch/P3Export/" + file_name
        print(url_grant)
        # A timeout prevents the cell from hanging forever on a stalled server.
        response = requests.get(url_grant, timeout=60)
        assert response.status_code == 200, "status code for " + file_name + " not ok"

        # Write the payload as bytes so the export is stored unmodified.
        with open(os.path.join(RAW_DATA_FOLDER, file_name), mode="wb") as raw_file:
            raw_file.write(response.content)

In [4]:
# Both exports are semicolon-separated; grants come first in file_names.
raw_paths = [os.path.join(RAW_DATA_FOLDER, name) for name in file_names]
grants_raw = pd.read_csv(raw_paths[0], sep=';')
people_raw = pd.read_csv(raw_paths[1], sep=';')

## Assess

In [5]:
# Peek at three randomly chosen grants.
grants_raw.sample(n=3)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
62671,163628,IZK0Z1_163628,The Power of Personal Geographies: Fine-tuning...,The Power of Personal Geographies: Fine-tuning...,Ciornei Irina,International short research visits,Careers,,,University of Berne - BE,10201,Sociology,"Humanities and Social Sciences;Sociology, soci...",10201,01.07.2015,31.07.2015,3280.0,mobility; European integration; European identity
60957,159884,100011_159884,Die verfassungs- und völkerrechtlichen Vorgabe...,The constitutional and international framework...,Belser Eva Maria,Project funding (Div. I-III),Project funding,Institut für Föderalismus Universität Freiburg,Switzerland,University of Fribourg - FR,10205,Legal sciences,"Humanities and Social Sciences;Economics, law",10205,01.10.2015,31.03.2019,200904.0,Soziale Grundrechte; Soziale Sicherheit; Nothi...
70307,182206,100018_182206,Forecasting Behaviour under Risk and over Time...,Forecasting Behaviour under Risk and over Time...,Di Falco Salvatore,Project funding (Div. I-III),Project funding,Département d'économie Université de Genève,Switzerland,University of Geneva - GE,10203,Economics,"Humanities and Social Sciences;Economics, law",10203,01.03.2019,29.02.2024,723661.0,Ethiopia; Forecasting; Time and risk preferenc...


In [6]:
# Peek at three randomly chosen people.
people_raw.sample(n=3)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
51645,Klotz,Alexander,male,,,710588,,,,,,166503;185372,
61736,Marangi,Angelo,male,,,111883,,,,,,53706,
50099,Keller,Florian,male,,,514290,,,,,,102634,


In [7]:
# Column dtypes and non-null counts of both raw tables, visually separated.
separator = '            ====================================='
grants_raw.info()
print(separator)
people_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74519 non-null  int64 
 1   Project Number String         74519 non-null  object
 2   Project Title                 74519 non-null  object
 3   Project Title English         31400 non-null  object
 4   Responsible Applicant         74519 non-null  object
 5   Funding Instrument            74519 non-null  object
 6   Funding Instrument Hierarchy  74479 non-null  object
 7   Institution                   68860 non-null  object
 8   Institution Country           68794 non-null  object
 9   University                    74514 non-null  object
 10  Discipline Number             74519 non-null  int64 
 11  Discipline Name               74519 non-null  object
 12  Discipline Name Hierarchy     74020 non-null  object
 13  All disciplines 

### Tidiness

#### `grants_raw` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline`, ... `Discipline Name Hierarchy` are confusing
- Details about `Institute` out of scope
- `University` contains both long and short names: details out of scope

#### `people_raw` (ie. `PersonExport`) table

- **`Projects as ...` columns mix variables and observations: each column name encodes a role and each cell holds a `;`-separated list of grants**
- Details about `Institute` out of scope

### Quality

- spaces in column names

##### `grants_raw` (ie. `GrantExport`) table

- **`Project Number` and `Project Number String` are redundant**
- **`Project Number String` encodes division information?**
- **`Responsible Applicant` not an uid**
- **`Start Date` and `End Date` string, not date type**
- **`Approved Amount` not numeric**
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)
- **2 observations without starting/ending date**

##### `people_raw` (ie. `PersonExport`) table

- typo in column name: `OCRID` should be `ORCID`
- gender not categorical variable
- make sure `person_id_snsf` is unique

## Clean

In [8]:
# good practice
grants = grants_raw.copy()
grants.columns = grants.columns.str.lower().str.replace(' ', '_')
people = people_raw.copy()
people.columns = people.columns.str.lower().str.replace(' ', '_')

In [9]:
# https://github.com/zambujo/p3data/issues/4
# role_in_grants table: one row per (person, role, project number).
ROLE_PREFIX = 'projects_as_'
role_columns = [
    'projects_as_responsible_applicant',
    'projects_as_applicant',
    'projects_as_partner',
    'projects_as_practice_partner',
    'projects_as_employee',
    'projects_as_contact_person',
]

role_in_grants = (
    people
    # wide -> long: the former column name becomes the role
    .melt(id_vars='person_id_snsf',
          value_vars=role_columns,
          var_name='role',
          value_name='project_number')
    .dropna()
    # Cast to str before splitting: .str.split yields NaN for non-string
    # cells, so a role column parsed as numeric by read_csv would otherwise
    # lose all of its rows further down.
    .assign(project_number=lambda df: df.project_number.astype(str).str.split(';'))
    # one row per project number in the ';'-separated list
    .explode('project_number')
    .reset_index(drop=True)
    # strip the common column prefix, keeping only the role itself
    .assign(role=lambda df: df.role.str[len(ROLE_PREFIX):])
    .drop_duplicates()
    # entries that are not numbers become NaN and are dropped
    .assign(project_number=lambda df: pd.to_numeric(df.project_number, errors='coerce'))
    .dropna()
    .assign(project_number=lambda df: df.project_number.astype(int))
)
assert role_in_grants.project_number.dtype.name == 'int64', 'project number column not an integer'

In [10]:
# people table: keep the identifying attributes only, without repeated rows
person_columns = ['person_id_snsf', 'gender', 'first_name', 'last_name']
people = people.loc[:, person_columns].drop_duplicates()
assert not people.person_id_snsf.duplicated().any(), 'person_id_snsf contains duplicates'

In [11]:
# Column dtypes and non-null counts of the two derived tables.
separator = '    ====================================='
role_in_grants.info()
print(separator)
people.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255507 entries, 0 to 258790
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  255507 non-null  int64 
 1   role            255507 non-null  object
 2   project_number  255507 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 7.8+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 111120 entries, 0 to 111902
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  111120 non-null  int64 
 1   gender          111120 non-null  object
 2   first_name      111113 non-null  object
 3   last_name       111120 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.2+ MB


In [12]:
# https://github.com/zambujo/p3data/issues/1
# Derive a code from the project number string: drop its last six characters
# (the numeric project number in the sampled rows, e.g. 'IZK0Z1_163628'),
# then strip any '-', '_' or ' ' characters that remain.
grants['string_code'] = (
    grants.project_number_string
    .str[:-6]
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the separators in place.
    .str.replace('[-_ ]', '', regex=True)
)
# Keyword form: the positional axis argument was removed in pandas 2.0.
grants = grants.drop(columns='project_number_string')

In [26]:
# https://github.com/zambujo/p3data/issues/13
# Drop grants lacking either date; both columns are asserted below, so both
# must be part of the filter.
grants = grants.dropna(subset=['start_date', 'end_date'])

assert grants.start_date.notna().all(), 'NA values in grants.start_date'
assert grants.end_date.notna().all(), 'NA values in grants.end_date'

In [14]:
# not clear what to test... [TODO]
# Most frequent (string_code, funding_instrument) combinations.
(
    grants
    .groupby(["string_code", "funding_instrument"])
    .size()
    .reset_index(name="count")
    .sort_values(by='count', ascending=False)
    .head()
)

Unnamed: 0,string_code,funding_instrument,count
181,2000,Project funding (Div. I-III),6072
286,3100,Project funding (Div. I-III),4997
183,200020,Project funding (Div. I-III),3506
188,200021,Project funding (Div. I-III),3306
292,31003A,Project funding (Div. I-III),2677


In [15]:
# https://github.com/zambujo/p3data/issues/6
# The export writes dates day-first (e.g. '31.07.2015'). Without an explicit
# format, ambiguous values such as '01.07.2015' are read month-first.
DATE_FORMAT = '%d.%m.%Y'

grants = grants.assign(
    # amounts that are not numbers become NaN
    approved_amount=pd.to_numeric(grants['approved_amount'], errors='coerce'),
    start_date=pd.to_datetime(grants.start_date, format=DATE_FORMAT),
    end_date=pd.to_datetime(grants.end_date, format=DATE_FORMAT),
)

assert grants.approved_amount.dtype.name == 'float64', 'approved amount column not a float'
assert grants.start_date.dtype.name == 'datetime64[ns]', 'start_date column not datetime type'
assert grants.end_date.dtype.name == 'datetime64[ns]', 'end_date column not datetime type'

In [16]:
# Split the ';'-separated hierarchy strings into one column per level.
discipline_levels = grants.discipline_name_hierarchy.str.split(';', expand=True)
grants[['domain', 'topic']] = discipline_levels

programme_levels = grants.funding_instrument_hierarchy.str.split(';', expand=True)
grants[['programme', 'programme_details', 'programme_specifics']] = programme_levels

# Keep only the columns that go into the grants table.
kept_columns = [
    'project_number',
    'programme',
    'funding_instrument',
    'domain',
    'topic',
    'discipline_name',
    'discipline_number',
    'start_date',
    'end_date',
    'approved_amount',
]
grants = grants[kept_columns]

In [17]:
# Build the database path from DATA_FOLDER instead of repeating the folder
# literal, so the output location is configured in one place.
conn = sqlite3.connect(os.path.join(DATA_FOLDER, 'p3.db'))
c = conn.cursor()

In [18]:
# Schema of the grants table; project_number is the primary key.
grants_ddl = '''
CREATE TABLE IF NOT EXISTS "grants" (
  "project_number" INTEGER,
  "programme" TEXT,
  "funding_instrument" TEXT,
  "domain" TEXT,
  "topic" TEXT,
  "discipline_name" TEXT,
  "discipline_number" INTEGER,
  "start_date" DATETIME,
  "end_date" DATETIME,
  "approved_amount" FLOAT,
  PRIMARY KEY("project_number")
);
'''
c.execute(grants_ddl)
conn.commit()

In [19]:
# Schema of the people table; person_id_snsf is the primary key.
people_ddl = '''
CREATE TABLE IF NOT EXISTS "people" (
  "person_id_snsf" INTEGER,
  "gender" TEXT,
  "first_name" TEXT,
  "last_name" TEXT,
  PRIMARY KEY("person_id_snsf")
);
'''
c.execute(people_ddl)
conn.commit()

In [20]:
# Schema of the link table; both id columns reference their parent tables.
role_in_grants_ddl = '''
CREATE TABLE IF NOT EXISTS "role_in_grants" (
  "person_id_snsf" INTEGER,
  "role" TEXT,
  "project_number" INTEGER,
  FOREIGN KEY("person_id_snsf") REFERENCES "people"("person_id_snsf"),
  FOREIGN KEY("project_number") REFERENCES "grants"("project_number")
);
'''
c.execute(role_in_grants_ddl)
conn.commit()

In [21]:
# Load the cleaned frames into the tables declared by the CREATE TABLE cells.
# if_exists='replace' would drop those tables and let pandas recreate them
# without the PRIMARY KEY / FOREIGN KEY constraints, so the rows are appended
# to the existing schema instead.
# NOTE(review): a p3.db produced by an earlier run with 'replace' still has
# the constraint-free schema; delete that file once to get the declared one.
tables = [
    ('grants', grants),
    ('people', people),
    ('role_in_grants', role_in_grants),
]

# Empty the tables first (children before parents) so that re-running the
# notebook does not duplicate rows.
for table_name, _ in reversed(tables):
    conn.execute('DELETE FROM "{}"'.format(table_name))
conn.commit()

for table_name, frame in tables:
    frame.to_sql(table_name, con=conn, if_exists='append', index=False)
conn.commit()