# Pre-processing SNSF Public Data

In [44]:
import os
import numpy as np
import pandas as pd
import feather
import matplotlib.pyplot as plt
import seaborn as sns
import requests

ModuleNotFoundError: No module named 'feather'

## Gather

In [2]:
UPDATE_DATA = False
folder_name = 'rawdata'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

In [3]:
file_names = ["P3_GrantExport.csv", "P3_PersonExport.csv", "P3_PublicationExport.csv", "P3_GrantOutputDataExport.csv"]

if UPDATE_DATA:
    for k in file_names:
        url_grant = "http://p3.snf.ch/P3Export/" + k
        print(url_grant)
        response = requests.get(url_grant)
        assert response.status_code == 200, "status code for" + k + " not ok"
        
        with open(os.path.join(folder_name, k), mode="wb") as file:
                file.write(response.content)

In [4]:
grants = pd.read_csv(os.path.join(folder_name, file_names[0]), sep=';')
people = pd.read_csv(os.path.join(folder_name, file_names[1]), sep=';')
publications = pd.read_csv(os.path.join(folder_name, file_names[2]), sep=';')
output_data = pd.read_csv(os.path.join(folder_name, file_names[3]), sep=';')

## Assess

In [5]:
grants.sample(3)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
37494,112653,PP0012-112653,Logique Universelle,,Béziau Jean-Yves,SNSF Professorships,Careers,Institut de Logique Université de Neuchâtel,Switzerland,University of Neuchatel - NE,20100,Mathematics,"Mathematics, Natural- and Engineering Sciences",20100/10101,01.08.2006,31.07.2008,417850.0,universal logic; abstract logic; combination o...
32380,102443,B-0010-102443,Il tema dell'onore nel teatro barocco in Europa,,Stäuble Antonio,Publication grants,Science communication,Section d'Italien Faculté des Lettres Universi...,Switzerland,University of Lausanne - LA,10502,Romance languages and literature,Humanities and Social Sciences;Linguistics and...,10502,01.09.2003,30.09.2003,5700.0,
1167,1540,1000-001540,Rabbinische Gleichnisse und das Neue Testament...,,Thoma Clemens,Project funding (Div. I-III),Project funding,,,Unassignable - NA,10102,"Religious studies, Theology",Humanities and Social Sciences;Theology & reli...,10102,01.04.1985,31.03.1988,185428.0,


In [6]:
grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74519 non-null  int64 
 1   Project Number String         74519 non-null  object
 2   Project Title                 74519 non-null  object
 3   Project Title English         31400 non-null  object
 4   Responsible Applicant         74519 non-null  object
 5   Funding Instrument            74519 non-null  object
 6   Funding Instrument Hierarchy  74479 non-null  object
 7   Institution                   68860 non-null  object
 8   Institution Country           68794 non-null  object
 9   University                    74514 non-null  object
 10  Discipline Number             74519 non-null  int64 
 11  Discipline Name               74519 non-null  object
 12  Discipline Name Hierarchy     74020 non-null  object
 13  All disciplines 

In [7]:
people.sample(3)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
49,Abbaspour,Karim,male,Aquatische Ökologie Eawag,Dübendorf,80211,,100076;107468;109430,61395;103600;103881;113890;122479;138608;146430,173206.0,,,
24514,Downs,Sara,female,Institut für Sozial- und Präventivmedizin Univ...,Basel,501107,,,,,,65896;104283;104284;104288,
98486,Textor,Stephan,male,Rytec Partner AG,Münsingen,70501,,,44589,,,,


In [8]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Last Name                          111903 non-null  object
 1   First Name                         111896 non-null  object
 2   Gender                             111903 non-null  object
 3   Institute Name                     54186 non-null   object
 4   Institute Place                    54083 non-null   object
 5   Person ID SNSF                     111903 non-null  int64 
 6   OCRID                              7092 non-null    object
 7   Projects as responsible Applicant  28898 non-null   object
 8   Projects as Applicant              18934 non-null   object
 9   Projects as Partner                5300 non-null    object
 10  Projects as Practice Partner       735 non-null     object
 11  Projects as Employee               82000 non-null   

In [9]:
publications.sample(3)

Unnamed: 0,Publication ID SNSF,Project Number,Peer Review Status,Type of Publication,Title of Publication,Authors,Status,Publication Year,ISBN,DOI,...,Publisher,Editors,Journal Title,Volume,Issue / Number,Page from,Page to,Proceeding Title,Proceeding Place,Abstract
74025,{F96D37AA-E202-4DF2-91FD-5DB8441DC439},147139,Peer-reviewed,Original article (peer-reviewed),Continental weathering and redox conditions du...,"Montero-Serrano Jean-Carlos, Föllmi Karl B., ...",Published,2015.0,,10.1016/j.palaeo.2015.03.043,...,,,"Palaeogeography, Palaeoclimatology, Palaeoecology",429,,83,99,"Palaeogeography, Palaeoclimatology, Palaeoecology",,
88838,{16693A62-33A4-462A-AF7F-FD78BDCE5AA3},153038,Peer-reviewed,Original article (peer-reviewed),PAGES synthesis study on climate changes in As...,"Ge Quansheng, Zheng Jingyun, Hao Zhixin",Published,2014.0,,10.11821/dlxb201503001 ...,...,,,Acta Geographica Sinica,70,3.0,355,363,Acta Geographica Sinica,,ABSTRACT: This article in Chinese is a contrib...
124874,{825C75D1-32EF-40B1-B96A-1709C04498BD},171590,Peer-reviewed,Original article (peer-reviewed),Novel Transgenic Lines to Analyze Renal Glutat...,"Sugano Yuya, Siegfried Hugo, Merkel Erin, D...",Published,2020.0,,10.1089/zeb.2020.1862,...,,,Zebrafish,17,2.0,153,155,Zebrafish,,Reactive oxygen species (ROS) are important re...


In [10]:
publications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133541 entries, 0 to 133540
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Publication ID SNSF        133541 non-null  object 
 1   Project Number             133541 non-null  int64  
 2   Peer Review Status         133541 non-null  object 
 3   Type of Publication        128867 non-null  object 
 4   Title of Publication       133504 non-null  object 
 5   Authors                    131026 non-null  object 
 6   Status                     133541 non-null  object 
 7   Publication Year           118806 non-null  float64
 8   ISBN                       15201 non-null   object 
 9   DOI                        78871 non-null   object 
 10  Import Source              107910 non-null  object 
 11  Last Change of Outputdata  0 non-null       float64
 12  Open Access Status         133541 non-null  int64  
 13  Open Access Type           42

In [11]:
publications['Open Access Type'].value_counts()

Publisher (Gold Open Access)                                           18131
Repository (Green Open Access)                                         12349
Website                                                                10359
Green OA Embargo (Freely available via Repository after an embargo)     1791
Name: Open Access Type, dtype: int64

In [12]:
publications['Status'].value_counts()

Published    118975
Accepted      14464
NotSet          102
Name: Status, dtype: int64

In [13]:
publications['Volume'].value_counts()

8                          2078
7                          2002
9                          1851
6                          1808
10                         1705
                           ... 
9(2)                          1
Sonderheft 4                  1
II-2                          1
ISBN: 978-2-35327-134-4       1
(12)                          1
Name: Volume, Length: 3657, dtype: int64

In [14]:
publications['Issue / Number'].value_counts()

1           8688
2           6471
3           5426
4           5071
5           3638
            ... 
0000           1
36419          1
235            1
4 · 2019       1
w13767         1
Name: Issue / Number, Length: 2720, dtype: int64

In [15]:
publications[(publications.DOI.isna() == False) & (publications[['DOI', 'Project Number']].duplicated())].shape

(1724, 26)

In [16]:
output_data.sample(3)

Unnamed: 0,Project Number,Output Type,Output Title,Url,Year
13086,148042,"Media relations: print media, online media",Restoring a rare firsthand account to history,http://www.las.illinois.edu/news/article/?id=1...,2016.0
17906,156561,"Media relations: radio, television",Allaitement au travail,https://www.rjb.ch/Format-A3/Allaitement-au-tr...,2017.0
17962,156574,"New media (web, blogs, podcasts, news feeds etc.)",Why austerity is easier to implement in some c...,http://eprints.lse.ac.uk/70177/,2017.0


In [17]:
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28730 entries, 0 to 28729
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Project Number  28730 non-null  int64  
 1   Output Type     28730 non-null  object 
 2   Output Title    28726 non-null  object 
 3   Url             18712 non-null  object 
 4   Year            28487 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.1+ MB


In [18]:
output_data['Output Type'].value_counts()

Media relations: print media, online media           10709
Talks/events/exhibitions                              7201
New media (web, blogs, podcasts, news feeds etc.)     3679
Media relations: radio, television                    3623
Print (books, brochures, leaflets)                    1413
Other activities                                      1003
Video/Film                                             680
Software                                               286
Start-up                                               136
Name: Output Type, dtype: int64

#### Tidiness

##### `grants` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline`, ... `Discipline Name Hierarchy` are confusing
- Details about `Institute` out of scope
- `University` contains both long and short names: details out of scope

##### `people` (ie. `PersonExport`) table

- **`Project as...` contain mixed variables and observations for grant and role**
- Details about `Institute` out of scope

##### `publications` table

- `Authors` contains multiple observations

##### `collaborations` table

- `Types of collaboration` contains multiple observations

#### Quality

- spaces in column names

##### `grants` (ie. `GrantExport`) table

- **`Project Number` and `Project Number String` are redundant**
- **`Project Number String` encodes division information?**
- **`Responsible Applicant` not an uid**
- **`Start Date` and `End Date` string, not date type**
- **`Aproved Amount` not numeric**
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)

##### `people` (ie. `PersonExport`) table

- typo in col name for `ORCID`
- gender not categorical variable

##### `publications` table

- Missing DOIs
- `Last Change of Outputdata` empty
- `Publication Year` shows as float
- `Status`, `Peer Review Status`, `Type of Publication`, and `Open Acces Type` strings, not categories
- `Volume`, `Issue / Number`, `Page from`, `Page to` strings, not numeric
- `[..] Title` show inconsistencies re capitalization
- Duplicated entries: 1'724 duplicated non null DOIs and project numbers
Í
##### `output_data` table

- `Output Type` string, not category
- `Year` float, not integer

## Clean

In [19]:
# good practice to work on copies of the raw data
df_grants = grants.copy()
df_grants.columns = df_grants.columns.str.lower().str.replace(' ', '_')
df_people = people.copy()
df_people.columns = df_people.columns.str.lower().str.replace(' ', '_')

In [20]:
# https://github.com/zambujo/p3data/issues/4
# grantee table
grantee = (df_people
    .melt('person_id_snsf', 
          var_name='role', 
          value_name='project_number', 
          value_vars=['projects_as_responsible_applicant', 
                      'projects_as_applicant', 
                      'projects_as_partner', 
                      'projects_as_practice_partner', 
                      'projects_as_employee', 
                      'projects_as_contact_person']).dropna())
grantee = grantee.assign(project_number=grantee.project_number.str.split(';')).explode('project_number').reset_index(drop=True)
grantee['role'] = grantee.role.str[12:]
grantee.drop_duplicates(inplace=True)
grantee['project_number'] = pd.to_numeric(grantee['project_number'], errors='coerce')
grantee=grantee.dropna()
grantee['project_number'] = grantee['project_number'].astype(int)



In [21]:
# person table
person = df_people[['person_id_snsf', 'gender', 'first_name', 'last_name']]

In [22]:
grantee.info()
person.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255507 entries, 0 to 258790
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  255507 non-null  int64 
 1   role            255507 non-null  object
 2   project_number  255507 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 7.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   person_id_snsf  111903 non-null  int64 
 1   gender          111903 non-null  object
 2   first_name      111896 non-null  object
 3   last_name       111903 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.4+ MB


In [23]:
grantee.nunique()

person_id_snsf    111120
role                   6
project_number     74494
dtype: int64

In [24]:
grantee.role.value_counts()

employee                 127500
responsible_applicant     73608
applicant                 46609
partner                    5961
contact_person             1033
practice_partner            796
Name: role, dtype: int64

In [25]:
person.gender.value_counts()

male      70007
female    41896
Name: gender, dtype: int64

In [26]:
# https://github.com/zambujo/p3data/issues/1
df_grants['string_code'] = df_grants.project_number_string.str[:-6]
df_grants['string_code'] = df_grants.string_code.str.replace('[-_ ]', '')
df_grants = df_grants.drop('project_number_string', 1)

_Acceptance tests_

In [27]:
df_grants.string_code.value_counts()

2000       6072
3100       4997
200020     3506
200021     3306
31003A     2677
           ... 
CR21I3        1
PBTI33        1
314730B       1
3118          1
10FI1         1
Name: string_code, Length: 1102, dtype: int64

In [28]:
df_grants.groupby(["string_code", "funding_instrument"]).size().reset_index().rename(columns={0: "count"}).sort_values(by='count', ascending=False).head()

Unnamed: 0,string_code,funding_instrument,count
181,2000,Project funding (Div. I-III),6072
286,3100,Project funding (Div. I-III),4997
183,200020,Project funding (Div. I-III),3506
188,200021,Project funding (Div. I-III),3306
292,31003A,Project funding (Div. I-III),2677


In [29]:
person['gender'] = person['gender'].astype('category')

In [30]:
assert person.gender.dtype.name == 'category', 'gender column not a category'

In [31]:
# https://github.com/zambujo/p3data/issues/6
df_grants['approved_amount'] = pd.to_numeric(df_grants['approved_amount'], errors='coerce')

In [32]:
df_grants['start_date'] = pd.to_datetime(df_grants.start_date)
df_grants['end_date'] = pd.to_datetime(df_grants.end_date)

In [33]:
assert df_grants.start_date.dtype.name == 'datetime64[ns]', 'start_date not datetime type'
assert df_grants.end_date.dtype.name == 'datetime64[ns]', 'end_date not datetime type'
df_grants[['start_date', 'end_date']].sample(3)

Unnamed: 0,start_date,end_date
51720,2012-01-05,2014-04-30
27876,2001-01-04,2005-03-31
3230,1982-01-04,1982-09-30


In [34]:
df_grants.head()

Unnamed: 0,project_number,project_title,project_title_english,responsible_applicant,funding_instrument,funding_instrument_hierarchy,institution,institution_country,university,discipline_number,discipline_name,discipline_name_hierarchy,all_disciplines,start_date,end_date,approved_amount,keywords,string_code
0,1,Schlussband (Bd. VI) der Jacob Burckhardt-Biog...,,Kaegi Werner,Project funding (Div. I-III),Project funding,,,Unassignable - NA,10302,Swiss history,Humanities and Social Sciences;Theology & reli...,10302,1975-01-10,1976-09-30,11619.0,,1000
1,4,Batterie de tests à l'usage des enseignants po...,,Massarenti Léonard,Project funding (Div. I-III),Project funding,Faculté de Psychologie et des Sciences de l'Ed...,Switzerland,University of Geneva - GE,10104,Educational science and Pedagogy,"Humanities and Social Sciences;Psychology, edu...",10104,1975-01-10,1976-09-30,41022.0,,1000
2,5,Kritische Erstausgabe der 'Evidentiae contra D...,,Kommission für das Corpus philosophorum medii ...,Project funding (Div. I-III),Project funding,Kommission für das Corpus philosophorum medii ...,Switzerland,"Non-profit organisations (libraries, museums, ...",10101,Philosophy,Humanities and Social Sciences;Linguistics and...,10101,1976-01-03,1985-02-28,79732.0,,1000
3,6,Katalog der datierten Handschriften in der Sch...,,Burckhardt Max,Project funding (Div. I-III),Project funding,Abteilung Handschriften und Alte Drucke Univer...,Switzerland,University of Basel - BS,10302,Swiss history,Humanities and Social Sciences;Theology & reli...,10302,1975-01-10,1976-09-30,52627.0,,1000
4,7,Wissenschaftliche Mitarbeit am Thesaurus Lingu...,,Schweiz. Thesauruskommission,Project funding (Div. I-III),Project funding,Schweiz. Thesauruskommission,Switzerland,"Non-profit organisations (libraries, museums, ...",10303,Ancient history and Classical studies,Humanities and Social Sciences;Theology & reli...,10303,1976-01-01,1978-04-30,120042.0,,1000


In [35]:
df_grants[['domain','topic']] = df_grants.discipline_name_hierarchy.str.split(';', expand=True)
df_grants['domain'] = df_grants['domain'].astype('category')
df_grants['topic'] = df_grants['topic'].astype('category')
df_grants['discipline_name'] = df_grants['discipline_name'].astype('category')
# df.assign(project_number=grantee.project_number.str.split(';')).explode('project_number').reset_index(drop=True)


In [36]:
df_grants.head(3)

Unnamed: 0,project_number,project_title,project_title_english,responsible_applicant,funding_instrument,funding_instrument_hierarchy,institution,institution_country,university,discipline_number,discipline_name,discipline_name_hierarchy,all_disciplines,start_date,end_date,approved_amount,keywords,string_code,domain,topic
0,1,Schlussband (Bd. VI) der Jacob Burckhardt-Biog...,,Kaegi Werner,Project funding (Div. I-III),Project funding,,,Unassignable - NA,10302,Swiss history,Humanities and Social Sciences;Theology & reli...,10302,1975-01-10,1976-09-30,11619.0,,1000,Humanities and Social Sciences,"Theology & religious studies, history, classic..."
1,4,Batterie de tests à l'usage des enseignants po...,,Massarenti Léonard,Project funding (Div. I-III),Project funding,Faculté de Psychologie et des Sciences de l'Ed...,Switzerland,University of Geneva - GE,10104,Educational science and Pedagogy,"Humanities and Social Sciences;Psychology, edu...",10104,1975-01-10,1976-09-30,41022.0,,1000,Humanities and Social Sciences,"Psychology, educational studies"
2,5,Kritische Erstausgabe der 'Evidentiae contra D...,,Kommission für das Corpus philosophorum medii ...,Project funding (Div. I-III),Project funding,Kommission für das Corpus philosophorum medii ...,Switzerland,"Non-profit organisations (libraries, museums, ...",10101,Philosophy,Humanities and Social Sciences;Linguistics and...,10101,1976-01-03,1985-02-28,79732.0,,1000,Humanities and Social Sciences,"Linguistics and literature, philosophy"


In [37]:
df_grants[['programme', 'programme_details', 'programme_specifics']] = df_grants.funding_instrument_hierarchy.str.split(';', expand=True)
df_grants['programme'] = df_grants['programme'].astype('category')
df_grants['programme_details'] = df_grants['programme_details'].astype('category')
df_grants['programme_specifics'] = df_grants['programme_specifics'].astype('category')
df_grants['funding_instrument'] = df_grants['funding_instrument'].astype('category')


In [38]:
df_grants.funding_instrument.value_counts().head(25)

Project funding (Div. I-III)               35554
Fellowships for prospective researchers     6485
Publication grants                          3695
Scientific Conferences                      3514
Early Postdoc.Mobility                      2341
International short research visits         2175
Project funding (special)                   2018
Fellowships for advanced researchers        1954
Doc.Mobility                                1204
SNSF Professorships                         1089
Scientific Exchanges                        1052
SCOPES                                      1025
Ambizione                                    949
R'EQUIP                                      910
Swiss Priority Programmes (SPPs)             748
Advanced Postdoc.Mobility                    719
Marie Heim-Voegtlin grants                   683
DORE project funding                         448
Sinergia                                     414
ProDoc                                       362
International Explor

In [39]:
df_grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   project_number                74519 non-null  int64         
 1   project_title                 74519 non-null  object        
 2   project_title_english         31400 non-null  object        
 3   responsible_applicant         74519 non-null  object        
 4   funding_instrument            74519 non-null  category      
 5   funding_instrument_hierarchy  74479 non-null  object        
 6   institution                   68860 non-null  object        
 7   institution_country           68794 non-null  object        
 8   university                    74514 non-null  object        
 9   discipline_number             74519 non-null  int64         
 10  discipline_name               74519 non-null  category      
 11  discipline_name_hierarchy   

In [40]:
project = df_grants[['project_number', 'programme', 'funding_instrument', 'domain', 'topic', 'discipline_name', 'discipline_number', 'start_date', 'end_date', 'approved_amount']]

In [41]:
project.to_feather(os.path.join('data', 'project.feather'))
person.to_feather(os.path.join('data', 'person.feather'))
grantee.to_feather(os.path.join('data', 'grantee.feather'))

ValueError: feather does not support serializing a non-default index for the index; you can .reset_index() to make the index into column(s)