# Preprocessing SNSF Public Data

In [1]:
import os
import sqlite3

import numpy as np
import pandas as pd

RAW_DATA_FOLDER = '../rawdata'
DATA_FOLDER = '../data/'

In [2]:
# file_names = os.listdir(RAW_DATA_FOLDER) # file_names[0], file_names[1]
grants = pd.read_csv(os.path.join(RAW_DATA_FOLDER, 'P3_GrantExport.csv'), sep=';')
people = pd.read_csv(os.path.join(RAW_DATA_FOLDER, 'P3_PersonExport.csv'), sep=';')

## Assess

In [3]:
grants.sample(3)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
43961,124814,13DPD6_124814,Automatisation des enquêtes sur l'exposition d...,,Le Coultre Régis,DORE project funding,Project funding;Project funding (special),Filière Technique en radiologie médicale TRM H...,Switzerland,University of Applied Sciences and Arts Wester...,10602,Health,"Humanities and Social Sciences;Sociology, soci...",10602,01.04.2009,30.09.2011,141140.0,Radiology; Radiation protection; Patient dose;...
28301,64695,2000-064695,Particle-particle and particle-matrix interact...,,Suter Ulrich W.,Project funding (Div. I-III),Project funding,Institut für Polymere ETH Zürich,Switzerland,ETH Zurich - ETHZ,20505,Material Sciences,"Mathematics, Natural- and Engineering Sciences...",20505,01.04.2002,31.03.2005,420239.8,NANOCOMPOSITE; POLYMER; INTERACTION; ASPECT RA...
56118,149006,10CO11_149006,The Gender of Authority: Celibate and Childles...,,Höfert Almut,Scientific Conferences,Science communication,Historisches Seminar Universität Zürich,Switzerland,University of Zurich - ZH,10301,General history (without pre-and early history),Humanities and Social Sciences;Theology & reli...,10301/10303/10503/10403/10103,01.08.2013,31.10.2013,10000.0,Globalgeschichte; Geschichte der Männlichkeite...


In [4]:
people.sample(3)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
74509,Pascale,Patrizio,male,CHUV,Lausanne,584122,0000-0001-7328-2481,134268;192720,,,,,
82756,Roditi,Isabel,female,Institut für Zellbiologie Departement Biologie...,Bern,31289,0000-0003-2812-6513,30233;30964;40501;48987;50932;63987;111663;112...,37379;117446,191762.0,,,
93636,Sobolev,Andrej N.,male,,,638691,,,,177557.0,,,


In [5]:
grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74807 entries, 0 to 74806
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74807 non-null  int64 
 1   Project Number String         74807 non-null  object
 2   Project Title                 74807 non-null  object
 3   Project Title English         31553 non-null  object
 4   Responsible Applicant         74807 non-null  object
 5   Funding Instrument            74807 non-null  object
 6   Funding Instrument Hierarchy  74767 non-null  object
 7   Institution                   69131 non-null  object
 8   Institution Country           69066 non-null  object
 9   University                    74802 non-null  object
 10  Discipline Number             74807 non-null  int64 
 11  Discipline Name               74807 non-null  object
 12  Discipline Name Hierarchy     74303 non-null  object
 13  All disciplines 

In [6]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112476 entries, 0 to 112475
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Last Name                          112476 non-null  object
 1   First Name                         112469 non-null  object
 2   Gender                             112476 non-null  object
 3   Institute Name                     54497 non-null   object
 4   Institute Place                    54393 non-null   object
 5   Person ID SNSF                     112476 non-null  int64 
 6   OCRID                              7328 non-null    object
 7   Projects as responsible Applicant  29056 non-null   object
 8   Projects as Applicant              18969 non-null   object
 9   Projects as Partner                5460 non-null    object
 10  Projects as Practice Partner       735 non-null     object
 11  Projects as Employee               82415 non-null   

### Tidiness

#### `grants` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline`, ... `Discipline Name Hierarchy` are confusing
- Details about `Institute` out of scope
- `University` contains both long and short names: details out of scope

#### `people` (ie. `PersonExport`) table

- [x]`Project as...` contain mixed variables and observations for grant and role
- Details about `Institute` out of scope

#### Quality

- [x] spaces in column names

##### `grants` (ie. `GrantExport`) table

- [x] `Project Number` and `Project Number String` are redundant
- [x] `Project Number String` encodes division information?
- [x] `Responsible Applicant` not an uid
- [x] `Start Date` and `End Date` string, not date type
- [x] `Aproved Amount` not numeric
- [x] 2 observations without `start_date` or `end_date`
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)

##### `people` (ie. `PersonExport`) table

- typo in col name for `ORCID`
- gender not categorical variable
- make sure `person_id_snsf` is unique

## Clean

In [7]:
# Fix column names
grants.columns = grants.columns.str.lower().str.replace(' ', '_')
people.columns = people.columns.str.lower().str.replace(' ', '_')

assert all(grants.columns.str.contains(' ') == False), 'Space in grants column names'
assert all(people.columns.str.contains(' ') == False), 'Space in people column names'
assert all(grants.columns == grants.columns.str.lower()), 'Uppercase in grants column names'
assert all(people.columns == people.columns.str.lower()), 'Uppercase in people column names'

In [8]:
# https://github.com/zambujo/p3data/issues/4
role_types = (np.array(people.columns[people.columns.
                                        str.contains('projects_as_')]))
role_in_grants = (people
    .melt('person_id_snsf', 
          var_name='role', 
          value_name='project_number', 
          value_vars=role_types).dropna())
role_in_grants = (role_in_grants.
                      assign(project_number=role_in_grants.
                      project_number.str.split(';')).
                      explode('project_number').
                      reset_index(drop=True))
role_in_grants['role'] = role_in_grants.role.str[12:] # remove 'projects_as_'
role_in_grants['project_number'] = pd.to_numeric(role_in_grants['project_number'], 
                                                 errors='coerce')
role_in_grants=role_in_grants.dropna()
role_in_grants['project_number'] = role_in_grants['project_number'].astype(int)
role_in_grants.drop_duplicates(inplace=True)

# role_in_grants.role.value_counts()
assert role_in_grants.project_number.dtype.name == 'int64', 'project number column not an integer'

In [9]:
# people table
people = people[['person_id_snsf', 'gender', 'first_name', 'last_name']]
people = people.drop_duplicates()

assert all(people.person_id_snsf.duplicated() == False), 'person_id_snsf contains duplicates'

In [10]:
role_in_grants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256654 entries, 0 to 259956
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  256654 non-null  int64 
 1   role            256654 non-null  object
 2   project_number  256654 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 7.8+ MB


In [11]:
people.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111690 entries, 0 to 112475
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  111690 non-null  int64 
 1   gender          111690 non-null  object
 2   first_name      111683 non-null  object
 3   last_name       111690 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.3+ MB


In [12]:
# https://github.com/zambujo/p3data/issues/1
# remove ending project_number >> remove ending separator(s)
grants['string_code'] = (
    grants.project_number_string.
        str.replace('([0-9]+)$', '').
        str.replace('[^a-zA-Z0-9]+$', ''))
grants = grants.drop('project_number_string', 1)

In [19]:
# string_code vs funding_instrument
# grants.groupby(["string_code", "funding_instrument"]).size().reset_index().rename(columns={0: "count"}).sort_values(by='count', ascending=False)

In [20]:
# https://github.com/zambujo/p3data/issues/6
grants['approved_amount'] = pd.to_numeric(grants['approved_amount'], errors='coerce')
grants['start_date'] = pd.to_datetime(grants.start_date)
grants['end_date'] = pd.to_datetime(grants.end_date)

assert grants.approved_amount.dtype.name == 'float64', 'approved amount column not a float'
assert grants.start_date.dtype.name == 'datetime64[ns]', 'start_date column not datetime type'
assert grants.end_date.dtype.name == 'datetime64[ns]', 'end_date column not datetime type'

In [21]:
# https://github.com/zambujo/p3data/issues/13
grants=grants.dropna(subset=['start_date'])
grants=grants.dropna(subset=['end_date']) # to be on safe side

assert all(pd.isna(grants.start_date) == False), 'NA values in grants.start_date'
assert all(pd.isna(grants.end_date) == False), 'NA values in grants.end_date'

In [24]:
grants[['domain','topic']] = grants.discipline_name_hierarchy.str.split(';', expand=True)
grants[['programme', 'programme_details', 'programme_specifics']] = grants.funding_instrument_hierarchy.str.split(';', expand=True)
grants = grants[['project_number', 'programme', 'funding_instrument', 'domain', 'topic', 'discipline_name', 'discipline_number', 'start_date', 'end_date', 'approved_amount']]

In [25]:
conn = sqlite3.connect('../data/p3.db')
c = conn.cursor()

In [26]:
c.execute('''
CREATE TABLE IF NOT EXISTS "grants" (
  "project_number" INTEGER,
  "programme" TEXT,
  "funding_instrument" TEXT,
  "domain" TEXT,
  "topic" TEXT,
  "discipline_name" TEXT,
  "discipline_number" INTEGER,
  "start_date" DATETIME,
  "end_date" DATETIME,
  "approved_amount" FLOAT,
  PRIMARY KEY("project_number")
);
''')
conn.commit()

In [27]:
c.execute('''
CREATE TABLE IF NOT EXISTS "people" (
  "person_id_snsf" INTEGER,
  "gender" TEXT,
  "first_name" TEXT,
  "last_name" TEXT,
  PRIMARY KEY("person_id_snsf")
);
''')
conn.commit()

In [28]:
c.execute('''
CREATE TABLE IF NOT EXISTS "role_in_grants" (
  "person_id_snsf" INTEGER,
  "role" TEXT,
  "project_number" INTEGER,
  FOREIGN KEY("person_id_snsf") REFERENCES "people"("person_id_snsf"),
  FOREIGN KEY("project_number") REFERENCES "grants"("project_number")
);
''')
conn.commit()

In [29]:
grants.to_sql('grants', con=conn, if_exists='replace', index=False)
people.to_sql('people', con=conn, if_exists='replace', index=False)
role_in_grants.to_sql('role_in_grants', con=conn, if_exists='replace', index=False)