# Pre-processing SNSF Public Data

In [105]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## Gather

In [106]:
# Set UPDATE_DATA to True to re-download the raw exports; when False, the
# download loop is skipped and the files already in `folder_name` are read.
UPDATE_DATA = False
folder_name = 'rawdata'
# exist_ok=True replaces the check-then-create pattern: there is no window
# between the existence test and the creation, and a non-directory sitting at
# this path raises immediately instead of being silently accepted.
os.makedirs(folder_name, exist_ok=True)

In [109]:
# SNSF P3 exports to fetch. The loading cell indexes the first four entries by
# position (file_names[0] .. file_names[3]), so new entries go at the end.
file_names = [
    "P3_GrantExport.csv",
    "P3_PersonExport.csv",
    "P3_PublicationExport.csv",
    "P3_GrantOutputDataExport.csv",
    # NOTE(review): source of the `collaborations` table sampled in the Assess
    # section; file name assumed to follow the P3 export naming -- confirm.
    "P3_CollaborationExport.csv",
]

if UPDATE_DATA:
    for file_name in file_names:
        url = "http://p3.snf.ch/P3Export/" + file_name
        print(url)
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=60)
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if response.status_code != 200:
            raise RuntimeError(
                "status code for " + file_name + " not ok: " + str(response.status_code)
            )

        with open(os.path.join(folder_name, file_name), mode="wb") as file:
            file.write(response.content)

In [111]:
# The exports are read as semicolon-separated files from `folder_name`.
grants = pd.read_csv(os.path.join(folder_name, file_names[0]), sep=';')
people = pd.read_csv(os.path.join(folder_name, file_names[1]), sep=';')
publications = pd.read_csv(os.path.join(folder_name, file_names[2]), sep=';')
output_data = pd.read_csv(os.path.join(folder_name, file_names[3]), sep=';')
# `collaborations` is sampled in the Assess section, so it has to be loaded
# here for the notebook to run top to bottom on a fresh kernel.
# NOTE(review): file name assumed to follow the P3 export naming -- confirm
# that this file is present in the raw data folder.
collaborations = pd.read_csv(os.path.join(folder_name, "P3_CollaborationExport.csv"), sep=';')

## Assess

In [112]:
# Peek at three randomly chosen grant rows.
grants.sample(n=3)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
6743,7909,3200-007909,Duration of immunity after rubella vaccination...,,Just Max,Project funding (Div. I-III),Project funding,Universitäts-Kinderspital beider Basel UKBB,Switzerland,University of Basel - BS,30808,Infectious Diseases,Biology and Medicine;Preventive Medicine (Epid...,30808,01.10.1983,30.09.1984,29230.0,
64796,168917,P0BSP1_168917,Eye tracking and episodic memory,Eye tracking and episodic memory,Fehlmann Bernhard,Doc.CH,Careers,Division of Cognitive Neuroscience Institut fü...,Switzerland,University of Basel - BS,10105,Psychology,"Humanities and Social Sciences;Psychology, edu...",10105,01.09.2016,31.08.2020,234054.0,attention; encoding; eye tracking; memory; emo...
25333,57100,2100-057100,Numerical Study of Flame Instabilites,,Monkewitz Peter,Project funding (Div. I-III),Project funding,EPFL / STI-ISE Institut des Sciences de l'Energie,Switzerland,EPF Lausanne - EPFL,20503,Fluid Dynamics,"Mathematics, Natural- and Engineering Sciences...",20503,01.10.1999,31.12.2002,124078.0,DIFFUSION FLAME INSTABILI; TIES; STABILITY ANA...


In [113]:
# Print the per-column dtypes and non-null counts of the raw grants table.
grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74519 non-null  int64 
 1   Project Number String         74519 non-null  object
 2   Project Title                 74519 non-null  object
 3   Project Title English         31400 non-null  object
 4   Responsible Applicant         74519 non-null  object
 5   Funding Instrument            74519 non-null  object
 6   Funding Instrument Hierarchy  74479 non-null  object
 7   Institution                   68860 non-null  object
 8   Institution Country           68794 non-null  object
 9   University                    74514 non-null  object
 10  Discipline Number             74519 non-null  int64 
 11  Discipline Name               74519 non-null  object
 12  Discipline Name Hierarchy     74020 non-null  object
 13  All disciplines 

In [114]:
# Peek at three randomly chosen person rows.
people.sample(n=3)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
15938,Cárdenas,Livia,female,Departement Geschichte Philosophisch-Historisc...,Basel,615874,,,,,,162971,
102953,Vianin,Pascal,male,Section 'E. Minkowski' Dépt. universitaire psy...,Lausanne,46565,,62947,,,,33812;43581;50896;57239,
68783,Müller,Jean-Pierre,male,Institut d'informatique et Intelligence artifi...,Neuchâtel 7,11364,,27037;34319;38049;40936;46916;49794;51523;5405...,5576;27021;30156;33903;37345;39370;43217;50578...,,,,


In [115]:
# Print the per-column dtypes and non-null counts of the raw people table.
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Last Name                          111903 non-null  object
 1   First Name                         111896 non-null  object
 2   Gender                             111903 non-null  object
 3   Institute Name                     54186 non-null   object
 4   Institute Place                    54083 non-null   object
 5   Person ID SNSF                     111903 non-null  int64 
 6   OCRID                              7092 non-null    object
 7   Projects as responsible Applicant  28898 non-null   object
 8   Projects as Applicant              18934 non-null   object
 9   Projects as Partner                5300 non-null    object
 10  Projects as Practice Partner       735 non-null     object
 11  Projects as Employee               82000 non-null   

In [116]:
# Peek at three randomly chosen publication rows.
publications.sample(n=3)

Unnamed: 0,Publication ID SNSF,Project Number,Peer Review Status,Type of Publication,Title of Publication,Authors,Status,Publication Year,ISBN,DOI,...,Publisher,Editors,Journal Title,Volume,Issue / Number,Page from,Page to,Proceeding Title,Proceeding Place,Abstract
100232,{EE578A32-62BB-416C-A0FC-72550D628349},157476,Peer-reviewed,Original article (peer-reviewed),Increased interleukin-27 cytokine expression i...,"Lalive Patrice H, Kreutzfeldt Mario, Devergn...",Published,2017.0,,10.1186/s12974-017-0919-1,...,,,Journal of neuroinflammation,14.0,1.0,144,144,Journal of neuroinflammation,,Multiple sclerosis (MS) is an autoimmune disor...
48869,{B0B63B49-95C6-4318-98E3-56DEA0148516},138217,Peer-reviewed,Original article (peer-reviewed),Tunnel-Junction Thermometry Down to Millikelvi...,"Feshchenko A. V., Casparis L., Khaymovich I....",Published,2015.0,,10.1103/PhysRevApplied.4.034001 ...,...,,,Phys. Rev. Applied,4.0,,34001,34001,Phys. Rev. Applied,,
83425,{5993CD1F-C8FA-4AEA-A145-80F3CCC2EDB0},150446,Not peer-reviewed,Original article (non peer-reviewed),Kulturelles Erbe bauen. Architektur und Idenit...,MintaAnna,Published,2018.0,,,...,,,kunst und kirche,,3.0,4,13,kunst und kirche,,


In [117]:
# Print the per-column dtypes and non-null counts of the raw publications table.
publications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133541 entries, 0 to 133540
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Publication ID SNSF        133541 non-null  object 
 1   Project Number             133541 non-null  int64  
 2   Peer Review Status         133541 non-null  object 
 3   Type of Publication        128867 non-null  object 
 4   Title of Publication       133504 non-null  object 
 5   Authors                    131026 non-null  object 
 6   Status                     133541 non-null  object 
 7   Publication Year           118806 non-null  float64
 8   ISBN                       15201 non-null   object 
 9   DOI                        78871 non-null   object 
 10  Import Source              107910 non-null  object 
 11  Last Change of Outputdata  0 non-null       float64
 12  Open Access Status         133541 non-null  int64  
 13  Open Access Type           42

In [15]:
# Frequency of each open-access type (missing values are not counted).
publications.loc[:, 'Open Access Type'].value_counts()

Publisher (Gold Open Access)                                           18131
Repository (Green Open Access)                                         12349
Website                                                                10359
Green OA Embargo (Freely available via Repository after an embargo)     1791
Name: Open Access Type, dtype: int64

In [16]:
# Frequency of each publication status.
publications.Status.value_counts()

Published    118975
Accepted      14464
NotSet          102
Name: Status, dtype: int64

In [17]:
# Frequency of each distinct `Volume` value.
publications.Volume.value_counts()

8                             2078
7                             2002
9                             1851
6                             1808
10                            1705
                              ... 
7368                             1
75                               1
8627                             1
Volume 88,  7 October 2015       1
909                              1
Name: Volume, Length: 3657, dtype: int64

In [18]:
# Frequency of each distinct `Issue / Number` value.
publications.loc[:, 'Issue / Number'].value_counts()

1                                 8688
2                                 6471
3                                 5426
4                                 5071
5                                 3638
                                  ... 
347                                  1
2134                                 1
eaah6817                             1
Winter Braids VI (Lille, 2016)       1
504                                  1
Name: Issue / Number, Length: 2720, dtype: int64

In [50]:
# Shape of the rows that have a DOI and whose (DOI, project number) pair
# already occurred in an earlier row.
publications[
    publications.DOI.notna()
    & publications.duplicated(subset=['DOI', 'Project Number'])
].shape

(1724, 26)

In [118]:
# Peek at three randomly chosen collaboration rows.
collaborations.sample(n=3)

Unnamed: 0,Project Number,Group/Person,Types of collaboration,Country,Project Start Date,Project End Date
43436,163488,Hosei University,"in-depth/constructive exchanges on approaches,...",Japan,01.09.2015,30.11.2015
59618,189575,"Lilian Edwards, Newcastle Law School, Newcastle","in-depth/constructive exchanges on approaches,...",Great Britain and Northern Ireland,01.08.2019,31.10.2019
48474,169191,"Prof. Aurélien Thomas, University of Lausanne,...","in-depth/constructive exchanges on approaches,...",Switzerland,01.10.2016,31.10.2019


In [120]:
# Peek at three randomly chosen output-data rows.
output_data.sample(n=3)

Unnamed: 0,Project Number,Output Type,Output Title,Url,Year
4041,131285,Video/Film,Projection d’un film documentaire didactique p...,,2010.0
14955,151539,Talks/events/exhibitions,Sentier thématique - A la découverte d'un mond...,http://www.sciencesnaturelles.ch/service/leisu...,2016.0
20141,159997,Talks/events/exhibitions,"Soirée Société Suisse de la SEP, Environnement...",,2016.0


In [125]:
# Print the per-column dtypes and non-null counts of the raw output-data table.
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28730 entries, 0 to 28729
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Project Number  28730 non-null  int64  
 1   Output Type     28730 non-null  object 
 2   Output Title    28726 non-null  object 
 3   Url             18712 non-null  object 
 4   Year            28487 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.1+ MB


In [126]:
# Frequency of each output type.
output_data.loc[:, 'Output Type'].value_counts()

Media relations: print media, online media           10709
Talks/events/exhibitions                              7201
New media (web, blogs, podcasts, news feeds etc.)     3679
Media relations: radio, television                    3623
Print (books, brochures, leaflets)                    1413
Other activities                                      1003
Video/Film                                             680
Software                                               286
Start-up                                               136
Name: Output Type, dtype: int64

#### Tidiness

##### `grants` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline Number`, `Discipline Name`, and `Discipline Name Hierarchy` are confusing
- Details about `Institution` out of scope
- `University` contains both long and short names: details out of scope

##### `people` (ie. `PersonExport`) table

- **`Projects as ...` columns mix variables and observations for grant and role**
- Details about `Institute` out of scope

##### `publications` table

- `Authors` contains multiple observations

##### `collaborations` table

- `Types of collaboration` contains multiple observations

#### Quality

- spaces in column names

##### `grants` (ie. `GrantExport`) table

- **`Project Number` and `Project Number String` are redundant**
- **`Project Number String` encodes division information?**
- **`Responsible Applicant` not an uid**
- **`Start Date` and `End Date` string, not date type**
- **`Approved Amount` not numeric**
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)

##### `people` (ie. `PersonExport`) table

- typo in column name: `OCRID` should be `ORCID`
- gender not categorical variable

##### `publications` table

- Missing DOIs
- `Last Change of Outputdata` empty
- `Publication Year` shows as float
- `Status`, `Peer Review Status`, `Type of Publication`, and `Open Access Type` strings, not categories
- `Volume`, `Issue / Number`, `Page from`, `Page to` strings, not numeric
- `[..] Title` show inconsistencies re capitalization
- Duplicated entries: 1'724 duplicated non null DOIs and project numbers

##### `output_data` table

- `Output Type` string, not category
- `Year` float, not integer

## Clean

In [127]:
# Work on a renamed copy so the raw `grants` frame stays untouched: column
# names become lower-case with underscores in place of spaces.
df_grants = grants.rename(columns=lambda name: name.lower().replace(' ', '_'))
df_grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   project_number                74519 non-null  int64 
 1   project_number_string         74519 non-null  object
 2   project_title                 74519 non-null  object
 3   project_title_english         31400 non-null  object
 4   responsible_applicant         74519 non-null  object
 5   funding_instrument            74519 non-null  object
 6   funding_instrument_hierarchy  74479 non-null  object
 7   institution                   68860 non-null  object
 8   institution_country           68794 non-null  object
 9   university                    74514 non-null  object
 10  discipline_number             74519 non-null  int64 
 11  discipline_name               74519 non-null  object
 12  discipline_name_hierarchy     74020 non-null  object
 13  all_disciplines 

---

In [128]:
# https://github.com/zambujo/p3data/issues/1
# Derive `string_code` from `project_number_string`: drop the last six
# characters, then strip the remaining '-', '_' and ' ' characters.
df_grants = (
    df_grants
    .assign(
        string_code=lambda frame: (
            frame.project_number_string
            .str[:-6]
            # regex=True must be explicit: pandas >= 2.0 treats the pattern as
            # a literal string by default, which would leave the values as is.
            .str.replace(r'[-_ ]', '', regex=True)
        )
    )
    # Keyword form: the positional `axis` argument was removed in pandas 2.0.
    .drop(columns='project_number_string')
)

_Acceptance tests_

In [129]:
# Frequency of each derived string code.
df_grants['string_code'].value_counts()

2000      6072
3100      4997
200020    3506
200021    3306
31003A    2677
          ... 
IZJFZ2       1
PBLU1        1
IZRJZ2       1
106011       1
IZSAZ1       1
Name: string_code, Length: 1102, dtype: int64

In [130]:
# Five most frequent (string_code, funding_instrument) combinations.
(
    df_grants
    .groupby(["string_code", "funding_instrument"])
    .size()
    .reset_index(name="count")
    .sort_values(by='count', ascending=False)
    .head()
)

Unnamed: 0,string_code,funding_instrument,count
181,2000,Project funding (Div. I-III),6072
286,3100,Project funding (Div. I-III),4997
183,200020,Project funding (Div. I-III),3506
188,200021,Project funding (Div. I-III),3306
292,31003A,Project funding (Div. I-III),2677
