# Pre-processing SNSF Public Data

In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## Gather

In [7]:
UPDATE_DATA = False
folder_name = 'rawdata'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

In [9]:
file_names = ["P3_GrantExport.csv", "P3_PersonExport.csv", "P3_PublicationExport.csv", "P3_GrantOutputDataExport.csv"]

if UPDATE_DATA:
    for k in file_names:
        url_grant = "http://p3.snf.ch/P3Export/" + k
        print(url_grant)
        response = requests.get(url_grant)
        assert response.status_code == 200, "status code for" + k + " not ok"
        
        with open(os.path.join(folder_name, k), mode="wb") as file:
                file.write(response.content)

In [11]:
grants = pd.read_csv(os.path.join(folder_name, file_names[0]), sep=';')
people = pd.read_csv(os.path.join(folder_name, file_names[1]), sep=';')
publications = pd.read_csv(os.path.join(folder_name, file_names[2]), sep=';')
output_data = pd.read_csv(os.path.join(folder_name, file_names[3]), sep=';')

## Assess

In [13]:
grants.sample(3)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
10269,25470,1113-025470,"Lehrerethos als positive Einstellung, Verpflic...",,Oser Fritz,Project funding (Div. I-III),Project funding,Institut de pédagogie Université de Fribourg,Switzerland,University of Fribourg - FR,10104,Educational science and Pedagogy,"Humanities and Social Sciences;Psychology, edu...",10104,01.12.1989,30.11.1990,78950.0,1D
1502,1993,1000-001993,"Dal carcere alla cattedra, genesi, innovazioni...",,López José Manuel,Project funding (Div. I-III),Project funding,Departement für Sprach- und Literaturwissensch...,Switzerland,University of Berne - BE,10502,Romance languages and literature,Humanities and Social Sciences;Linguistics and...,10502,01.10.1986,31.08.1988,49350.0,
63923,166485,CR23I3_166485,Magnetic Resonance Imaging-Guided Computationa...,Magnetic Resonance Imaging-Guided Computationa...,Kozerke Sebastian,Interdisciplinary projects,Project funding,Institut für Biomedizinische Technik Universit...,Switzerland,ETH Zurich - ETHZ,30714,Biomedical Engineering,Biology and Medicine;Clinical Medicine,30714/30715/20502,01.07.2016,30.06.2020,648688.0,Heart Failure; Computational Modelling; Comput...


In [15]:
grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74519 non-null  int64 
 1   Project Number String         74519 non-null  object
 2   Project Title                 74519 non-null  object
 3   Project Title English         31400 non-null  object
 4   Responsible Applicant         74519 non-null  object
 5   Funding Instrument            74519 non-null  object
 6   Funding Instrument Hierarchy  74479 non-null  object
 7   Institution                   68860 non-null  object
 8   Institution Country           68794 non-null  object
 9   University                    74514 non-null  object
 10  Discipline Number             74519 non-null  int64 
 11  Discipline Name               74519 non-null  object
 12  Discipline Name Hierarchy     74020 non-null  object
 13  All disciplines 

In [17]:
people.sample(3)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
4727,Ballif,Christophe,male,Laboratoire de photovoltaïque et couches mince...,Neuchâtel 2,67065,,62444;107469;116630;125177;126926;128762;13383...,137833;141563;153728;153952;153965;153976;1625...,,,33630;40283;45660,141563.0
75979,Pfammatter,Tamara,female,,,584876,,,,,,129694,
66401,Miss,Fabia Mirjam,female,,,700071,,,,,,149796;172979,


In [19]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Last Name                          111903 non-null  object
 1   First Name                         111896 non-null  object
 2   Gender                             111903 non-null  object
 3   Institute Name                     54186 non-null   object
 4   Institute Place                    54083 non-null   object
 5   Person ID SNSF                     111903 non-null  int64 
 6   OCRID                              7092 non-null    object
 7   Projects as responsible Applicant  28898 non-null   object
 8   Projects as Applicant              18934 non-null   object
 9   Projects as Partner                5300 non-null    object
 10  Projects as Practice Partner       735 non-null     object
 11  Projects as Employee               82000 non-null   

In [21]:
publications.sample(3)

Unnamed: 0,Publication ID SNSF,Project Number,Peer Review Status,Type of Publication,Title of Publication,Authors,Status,Publication Year,ISBN,DOI,...,Publisher,Editors,Journal Title,Volume,Issue / Number,Page from,Page to,Proceeding Title,Proceeding Place,Abstract
66589,{40A46ECB-7186-4D9D-819C-B7E9BDCB2F1E},144646,Peer-reviewed,Proceedings (peer-reviewed),Properties of self-assembled nanostructures: g...,"Reguera Javier, Malachosky Edward, Martin Ma...",Published,2015.0,,10.1039/c5fd90042e,...,"Royal Socitey of Chemistry,Cambridge, UK",,"Faraday Discussions, ,181",,,,,"Faraday Discussions, ,181",,
9372,{06E2F6B9-9E4A-4942-A3A8-C3BBA3DDE33B},123501,Peer-reviewed,Original article (peer-reviewed),Measurement of the Inclusive Z Cross Section v...,"Chatrchyan Serguei, others",Published,2011.0,,10.1007/JHEP08(2011)117 ...,...,,,JHEP,1108.0,,117.0,117.0,JHEP,,
23087,{08CFECF5-4622-4442-8F47-CACC5257CD96},129708,Peer-reviewed,Original article (peer-reviewed),Environmental versatility promotes modularity ...,"Samal A., Wagner Andreas, Martin O.C.",Published,2011.0,,10.1186/1752-0509-5-135,...,,,BMC Systems Biology,5.0,,135.0,135.0,BMC Systems Biology,,


In [23]:
publications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133541 entries, 0 to 133540
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Publication ID SNSF        133541 non-null  object 
 1   Project Number             133541 non-null  int64  
 2   Peer Review Status         133541 non-null  object 
 3   Type of Publication        128867 non-null  object 
 4   Title of Publication       133504 non-null  object 
 5   Authors                    131026 non-null  object 
 6   Status                     133541 non-null  object 
 7   Publication Year           118806 non-null  float64
 8   ISBN                       15201 non-null   object 
 9   DOI                        78871 non-null   object 
 10  Import Source              107910 non-null  object 
 11  Last Change of Outputdata  0 non-null       float64
 12  Open Access Status         133541 non-null  int64  
 13  Open Access Type           42

In [25]:
publications['Open Access Type'].value_counts()

Publisher (Gold Open Access)                                           18131
Repository (Green Open Access)                                         12349
Website                                                                10359
Green OA Embargo (Freely available via Repository after an embargo)     1791
Name: Open Access Type, dtype: int64

In [27]:
publications['Status'].value_counts()

Published    118975
Accepted      14464
NotSet          102
Name: Status, dtype: int64

In [None]:
publications['Volume'].value_counts()

In [30]:
publications['Issue / Number'].value_counts()

1                       8688
2                       6471
3                       5426
4                       5071
5                       3638
                        ... 
867                        1
Herbst                     1
7611                       1
Numéros thématiques        1
n° 7                       1
Name: Issue / Number, Length: 2720, dtype: int64

In [32]:
publications[(publications.DOI.isna() == False) & (publications[['DOI', 'Project Number']].duplicated())].shape

(1724, 26)

In [37]:
output_data.sample(3)

Unnamed: 0,Project Number,Output Type,Output Title,Url,Year
28359,184485,"Print (books, brochures, leaflets)",How horizontal inequalities lead to conflict i...,https://doi.org/10.46446/Publication_r4d.2019....,2019.0
24859,170645,Talks/events/exhibitions,Film Screening and Panel Discussion “In Our Ha...,,2018.0
27608,178636,"Media relations: print media, online media",Des films contre l'âgisme,,2019.0


In [39]:
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28730 entries, 0 to 28729
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Project Number  28730 non-null  int64  
 1   Output Type     28730 non-null  object 
 2   Output Title    28726 non-null  object 
 3   Url             18712 non-null  object 
 4   Year            28487 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.1+ MB


In [41]:
output_data['Output Type'].value_counts()

Media relations: print media, online media           10709
Talks/events/exhibitions                              7201
New media (web, blogs, podcasts, news feeds etc.)     3679
Media relations: radio, television                    3623
Print (books, brochures, leaflets)                    1413
Other activities                                      1003
Video/Film                                             680
Software                                               286
Start-up                                               136
Name: Output Type, dtype: int64

#### Tidiness

##### `grants` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline`, ... `Discipline Name Hierarchy` are confusing
- Details about `Institute` out of scope
- `University` contains both long and short names: details out of scope

##### `people` (ie. `PersonExport`) table

- **`Project as...` contain mixed variables and observations for grant and role**
- Details about `Institute` out of scope

##### `publications` table

- `Authors` contains multiple observations

##### `collaborations` table

- `Types of collaboration` contains multiple observations

#### Quality

- spaces in column names

##### `grants` (ie. `GrantExport`) table

- **`Project Number` and `Project Number String` are redundant**
- **`Project Number String` encodes division information?**
- **`Responsible Applicant` not an uid**
- **`Start Date` and `End Date` string, not date type**
- **`Aproved Amount` not numeric**
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)

##### `people` (ie. `PersonExport`) table

- typo in col name for `ORCID`
- gender not categorical variable

##### `publications` table

- Missing DOIs
- `Last Change of Outputdata` empty
- `Publication Year` shows as float
- `Status`, `Peer Review Status`, `Type of Publication`, and `Open Acces Type` strings, not categories
- `Volume`, `Issue / Number`, `Page from`, `Page to` strings, not numeric
- `[..] Title` show inconsistencies re capitalization
- Duplicated entries: 1'724 duplicated non null DOIs and project numbers
Í
##### `output_data` table

- `Output Type` string, not category
- `Year` float, not integer

## Clean

In [131]:
# good practice to work on copies of the raw data
df_grants = grants.copy()
df_grants.columns = df_grants.columns.str.lower().str.replace(' ', '_')
df_people = people.copy()
df_people.columns = df_people.columns.str.lower().str.replace(' ', '_')

In [126]:
# https://github.com/zambujo/p3data/issues/4
# grantee table
grantee = (df_people
    .melt('person_id_snsf', 
          var_name='role', 
          value_name='project_number', 
          value_vars=['projects_as_responsible_applicant', 
                      'projects_as_applicant', 
                      'projects_as_partner', 
                      'projects_as_practice_partner', 
                      'projects_as_employee', 
                      'projects_as_contact_person']).dropna())
grantee = grantee.assign(project_number=grantee.project_number.str.split(';')).explode('project_number').reset_index(drop=True)
grantee['role'] = grantee.role.str[12:]
grantee.drop_duplicates(inplace=True)
grantee['project_number'] = pd.to_numeric(grantee['project_number'], errors='coerce')
grantee=grantee.dropna()
grantee['project_number'] = grantee['project_number'].astype(int)



In [141]:
# person table
person = df_people[['person_id_snsf', 'gender', 'first_name', 'last_name']]

In [143]:
grantee.info()
person.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255507 entries, 0 to 258790
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  255507 non-null  int64 
 1   role            255507 non-null  object
 2   project_number  255507 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 7.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  111903 non-null  int64 
 1   gender          111903 non-null  object
 2   first_name      111896 non-null  object
 3   last_name       111903 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.4+ MB


In [129]:
grantee.nunique()

person_id_snsf    111120
role                   6
project_number     74494
dtype: int64

In [145]:
grantee.role.value_counts()

employee                 127500
responsible_applicant     73608
applicant                 46609
partner                    5961
contact_person             1033
practice_partner            796
Name: role, dtype: int64

In [146]:
person.gender.value_counts()

male      70007
female    41896
Name: gender, dtype: int64

In [44]:
# https://github.com/zambujo/p3data/issues/1
df_grants['string_code'] = df_grants.project_number_string.str[:-6]
df_grants['string_code'] = df_grants.string_code.str.replace('[-_ ]', '')
df_grants = df_grants.drop('project_number_string', 1)

_Acceptance tests_

In [45]:
df_grants.string_code.value_counts()

2000       6072
3100       4997
200020     3506
200021     3306
31003A     2677
           ... 
20ED21        1
32NM30        1
K33K0         1
315130        1
CR32I3L       1
Name: string_code, Length: 1102, dtype: int64

In [46]:
df_grants.groupby(["string_code", "funding_instrument"]).size().reset_index().rename(columns={0: "count"}).sort_values(by='count', ascending=False).head()

Unnamed: 0,string_code,funding_instrument,count
181,2000,Project funding (Div. I-III),6072
286,3100,Project funding (Div. I-III),4997
183,200020,Project funding (Div. I-III),3506
188,200021,Project funding (Div. I-III),3306
292,31003A,Project funding (Div. I-III),2677


In [148]:
person['gender'] = person['gender'].astype('category')

In [170]:
assert person.gender.dtype.name == 'category', 'gender column not a category'

In [157]:
# https://github.com/zambujo/p3data/issues/6
df_grants['approved_amount'] = pd.to_numeric(df_grants['approved_amount'], errors='coerce')

In [161]:
df_grants['start_date'] = pd.to_datetime(df_grants.start_date)
df_grants['end_date'] = pd.to_datetime(df_grants.end_date)

In [169]:
assert df_grants.start_date.dtype.name == 'datetime64[ns]', 'start_date not datetime type'
assert df_grants.end_date.dtype.name == 'datetime64[ns]', 'end_date not datetime type'
df_grants[['start_date', 'end_date']].sample(3)

Unnamed: 0,start_date,end_date
39234,2007-01-10,2010-09-30
69898,2018-01-09,2020-02-29
16860,1993-03-09,1993-02-12
