# Preprocessing SNSF Public Data

In [1]:
import os
import sqlite3

import numpy as np
import pandas as pd

RAW_DATA_FOLDER = '../rawdata'
DATA_FOLDER = '../data/'

In [2]:
# Read the two raw P3 exports; both files use ';' as the field separator.
grants_csv = os.path.join(RAW_DATA_FOLDER, 'P3_GrantExport.csv')
people_csv = os.path.join(RAW_DATA_FOLDER, 'P3_PersonExport.csv')

grants_raw = pd.read_csv(grants_csv, sep=';')
people_raw = pd.read_csv(people_csv, sep=';')

## Assess

In [3]:
# Show three randomly drawn grant rows (no seed is set, so the rows differ between runs).
grants_raw.sample(3)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
9852,20686,81FR-020686,Die EG-Direktive über die Produktehaftpflicht ...,,Henninger Anton,Fellowships for prospective researchers,Careers;Fellowships,UNI: University of Cambridgeoffe di GB,Great Britain and Northern Ireland,Institution abroad - IACH,10205,Legal sciences,"Humanities and Social Sciences;Economics, law",10205,01.01.1989,31.12.1989,data not included in P3,
21563,48977,3234-048977,Identification of antigens expressed by P. viv...,,Romero F. Jackeline,Marie Heim-Voegtlin grants,Careers;Academic and research careers (non-pro...,Centre Integratif de Genomique Faculté de Biol...,Switzerland,University of Lausanne - LA,30102,Molecular Biology,Biology and Medicine;Basic Biological Research,30102/30403,01.01.1997,31.12.2000,154650.00,
46512,129828,310030_129828,Investigating the primary immune response agai...,Investigating the primary immune response agai...,Pichler Werner Joseph,Project funding (Div. I-III),Project funding,"Universitätsklinik für Rheumatologie, Immunolo...",Switzerland,University of Berne - BE,30718,Clinical Immunology and Immunopathology,Biology and Medicine;Clinical Medicine,30718,01.05.2010,31.10.2013,484520.00,drug hypersensitivity; p-i concept; prediction...


In [4]:
# Show three randomly drawn person rows (no seed is set, so the rows differ between runs).
people_raw.sample(3)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
19030,Colldeweih,Rachael Marie,female,,,681504,,,,,,144288;162967,
45316,Hunziker,Patrick,male,Klinik für Intensivmedizin Universitätsspital ...,Basel,81176,,59517;108486;121208;125653;126078;131653;160178,59817;137194,,,,
65240,Merendino,Maura,female,,,18930,,,,,,11057;11072,


In [5]:
# Print column names, non-null counts and dtypes of both raw tables;
# the printed bar only separates the two reports visually.
grants_raw.info()
print('            =====================================')
people_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74807 entries, 0 to 74806
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74807 non-null  int64 
 1   Project Number String         74807 non-null  object
 2   Project Title                 74807 non-null  object
 3   Project Title English         31553 non-null  object
 4   Responsible Applicant         74807 non-null  object
 5   Funding Instrument            74807 non-null  object
 6   Funding Instrument Hierarchy  74767 non-null  object
 7   Institution                   69131 non-null  object
 8   Institution Country           69066 non-null  object
 9   University                    74802 non-null  object
 10  Discipline Number             74807 non-null  int64 
 11  Discipline Name               74807 non-null  object
 12  Discipline Name Hierarchy     74303 non-null  object
 13  All disciplines 

### Tidiness

#### `grants_raw` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline Number`, `Discipline Name` and `Discipline Name Hierarchy` are confusing
- Details about `Institution` out of scope
- `University` contains both long and short names: details out of scope

#### `people_raw` (ie. `PersonExport`) table

- **`Projects as ...` columns mix variables and observations (grant and role)**
- Details about `Institute` out of scope

#### Quality

- spaces in column names

##### `grants_raw` (ie. `GrantExport`) table

- **`Project Number` and `Project Number String` are redundant**
- **`Project Number String` encodes division information?**
- **`Responsible Applicant` not an uid**
- **`Start Date` and `End Date` string, not date type**
- **`Approved Amount` not numeric**
- **2 observations without `start_date` or `end_date`**
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)

##### `people_raw` (ie. `PersonExport`) table

- typo in column name for `ORCID` (spelled `OCRID`)
- gender not categorical variable
- make sure `person_id_snsf` is unique

## Clean

In [6]:
# good practice: leave the raw frames untouched and work on renamed copies
def _snake_case(column_name):
    """Return the column name in lower case with spaces replaced by underscores."""
    return column_name.lower().replace(' ', '_')


grants = grants_raw.rename(columns=_snake_case)
people = people_raw.rename(columns=_snake_case)

In [7]:
# https://github.com/zambujo/p3data/issues/4
# role_in_grants table: one row per (person, role, project).
# Each `projects_as_<role>` column holds a ';'-separated list of project
# numbers, so the columns are melted into (role, list) pairs and every list
# is then exploded into one row per project number.
ROLE_PREFIX = 'projects_as_'
role_columns = [
    'projects_as_responsible_applicant',
    'projects_as_applicant',
    'projects_as_partner',
    'projects_as_practice_partner',
    'projects_as_employee',
    'projects_as_contact_person',
]

role_in_grants = (
    people
    .melt(id_vars='person_id_snsf',
          value_vars=role_columns,
          var_name='role',
          value_name='project_number')
    .dropna()
    .assign(project_number=lambda df: df.project_number.str.split(';'))
    .explode('project_number')
    .reset_index(drop=True)
    # keep only the role name, i.e. strip the leading 'projects_as_'
    .assign(role=lambda df: df.role.str[len(ROLE_PREFIX):])
    .drop_duplicates()
    # entries that are not numbers become NaN here and are dropped right after
    .assign(project_number=lambda df: pd.to_numeric(df.project_number, errors='coerce'))
    .dropna()
    # explicit 'int64': plain `int` maps to a platform-dependent width,
    # which can make the assertion below fail
    .astype({'project_number': 'int64'})
)
assert role_in_grants.project_number.dtype.name == 'int64', 'project number column not an integer'

In [8]:
# people table: keep the identifying columns only and collapse repeated rows
person_columns = ['person_id_snsf', 'gender', 'first_name', 'last_name']
people = people.loc[:, person_columns].drop_duplicates()
assert not people.person_id_snsf.duplicated().any(), 'person_id_snsf contains duplicates'

In [9]:
# Print column names, non-null counts and dtypes of the two cleaned tables;
# the printed bar only separates the two reports visually.
role_in_grants.info()
print('    =====================================')
people.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256654 entries, 0 to 259956
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  256654 non-null  int64 
 1   role            256654 non-null  object
 2   project_number  256654 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 7.8+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 111690 entries, 0 to 112475
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  111690 non-null  int64 
 1   gender          111690 non-null  object
 2   first_name      111683 non-null  object
 3   last_name       111690 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.3+ MB


In [10]:
# https://github.com/zambujo/p3data/issues/1
# Derive `string_code` from `project_number_string`: cut off the six trailing
# characters, then remove any '-', '_' or space left in the remainder.
grants['string_code'] = (
    grants.project_number_string
    .str[:-6]
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the separators in place.
    .str.replace('[-_ ]', '', regex=True)
)
# `columns=` instead of a positional axis argument, which pandas 2.0 no longer accepts.
grants = grants.drop(columns='project_number_string')

assert not grants.string_code.str.contains('[-_ ]', regex=True, na=False).any(), 'separator left in string_code'

In [11]:
# https://github.com/zambujo/p3data/issues/13
# Drop grants that lack either date. Filtering on both columns makes the
# end_date assertion below hold by construction rather than by coincidence.
grants = grants.dropna(subset=['start_date', 'end_date'])

assert grants.start_date.notna().all(), 'NA values in grants.start_date'
assert grants.end_date.notna().all(), 'NA values in grants.end_date'

In [12]:
# not clear what to test... [TODO]
# Most frequent (string_code, funding_instrument) pairs, largest count first.
(
    grants
    .groupby(["string_code", "funding_instrument"])
    .size()
    .reset_index(name="count")
    .sort_values(by='count', ascending=False)
    .head()
)

Unnamed: 0,string_code,funding_instrument,count
181,2000,Project funding (Div. I-III),6072
286,3100,Project funding (Div. I-III),4997
183,200020,Project funding (Div. I-III),3512
188,200021,Project funding (Div. I-III),3318
292,31003A,Project funding (Div. I-III),2677


In [13]:
# https://github.com/zambujo/p3data/issues/6
# The export writes dates as day.month.year (e.g. 31.12.1989). Without an
# explicit format pandas reads ambiguous values such as 01.05.2010
# month-first, silently swapping day and month.
P3_DATE_FORMAT = '%d.%m.%Y'

# Non-numeric amounts (e.g. 'data not included in P3') are coerced to NaN.
grants['approved_amount'] = pd.to_numeric(grants['approved_amount'], errors='coerce')
grants['start_date'] = pd.to_datetime(grants.start_date, format=P3_DATE_FORMAT)
grants['end_date'] = pd.to_datetime(grants.end_date, format=P3_DATE_FORMAT)

assert grants.approved_amount.dtype.name == 'float64', 'approved amount column not a float'
assert grants.start_date.dtype.name == 'datetime64[ns]', 'start_date column not datetime type'
assert grants.end_date.dtype.name == 'datetime64[ns]', 'end_date column not datetime type'

In [14]:
def _split_levels(hierarchy, level_names):
    """Split a ';'-separated hierarchy column into one column per level name.

    The result always has exactly ``len(level_names)`` columns: shallower
    hierarchies are padded with missing values, and any levels beyond the
    last name stay joined in the last column. This keeps the multi-column
    assignment below from raising when the data holds a different number
    of levels than expected.
    """
    levels = hierarchy.str.split(';', n=len(level_names) - 1, expand=True)
    levels = levels.reindex(columns=range(len(level_names)))
    levels.columns = level_names
    return levels


discipline_levels = ['domain', 'topic']
programme_levels = ['programme', 'programme_details', 'programme_specifics']

grants[discipline_levels] = _split_levels(grants.discipline_name_hierarchy, discipline_levels)
grants[programme_levels] = _split_levels(grants.funding_instrument_hierarchy, programme_levels)

# Keep only the columns that are written to the `grants` database table.
grants = grants[[
    'project_number',
    'programme',
    'funding_instrument',
    'domain',
    'topic',
    'discipline_name',
    'discipline_number',
    'start_date',
    'end_date',
    'approved_amount',
]]

In [15]:
# Open (or create) the SQLite database inside DATA_FOLDER. The folder is
# created first because sqlite3.connect cannot create missing directories.
os.makedirs(DATA_FOLDER, exist_ok=True)
conn = sqlite3.connect(os.path.join(DATA_FOLDER, 'p3.db'))
c = conn.cursor()

In [16]:
# Schema of the grants table, keyed by project_number.
create_grants_sql = '''
CREATE TABLE IF NOT EXISTS "grants" (
  "project_number" INTEGER,
  "programme" TEXT,
  "funding_instrument" TEXT,
  "domain" TEXT,
  "topic" TEXT,
  "discipline_name" TEXT,
  "discipline_number" INTEGER,
  "start_date" DATETIME,
  "end_date" DATETIME,
  "approved_amount" FLOAT,
  PRIMARY KEY("project_number")
);
'''
c.execute(create_grants_sql)
conn.commit()

In [17]:
# Schema of the people table, keyed by person_id_snsf.
create_people_sql = '''
CREATE TABLE IF NOT EXISTS "people" (
  "person_id_snsf" INTEGER,
  "gender" TEXT,
  "first_name" TEXT,
  "last_name" TEXT,
  PRIMARY KEY("person_id_snsf")
);
'''
c.execute(create_people_sql)
conn.commit()

In [18]:
# Schema of the link table between people and grants; both id columns are
# declared as foreign keys into their parent tables.
create_role_in_grants_sql = '''
CREATE TABLE IF NOT EXISTS "role_in_grants" (
  "person_id_snsf" INTEGER,
  "role" TEXT,
  "project_number" INTEGER,
  FOREIGN KEY("person_id_snsf") REFERENCES "people"("person_id_snsf"),
  FOREIGN KEY("project_number") REFERENCES "grants"("project_number")
);
'''
c.execute(create_role_in_grants_sql)
conn.commit()

In [19]:
# Load the cleaned frames into the existing database tables.
# if_exists='replace' would drop each table and recreate it from the frame's
# dtypes, discarding the primary/foreign keys declared in the CREATE TABLE
# statements. The tables are emptied and appended to instead, which also
# keeps the cell safe to re-run.
with conn:
    # child table first, then the tables it references
    conn.execute('DELETE FROM "role_in_grants"')
    conn.execute('DELETE FROM "people"')
    conn.execute('DELETE FROM "grants"')

grants.to_sql('grants', con=conn, if_exists='append', index=False)
people.to_sql('people', con=conn, if_exists='append', index=False)
role_in_grants.to_sql('role_in_grants', con=conn, if_exists='append', index=False)
conn.commit()