# Pre-processing SNSF Public Data

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## Gather

In [2]:
# Set UPDATE_DATA to True to re-download the raw exports; otherwise the
# files already present in `folder_name` are used.
UPDATE_DATA = False
folder_name = 'rawdata'
# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(folder_name, exist_ok=True)

In [5]:
# File names of the SNSF P3 CSV exports. Later cells pick entries by
# position, so the order of this list must not change.
file_names = [
    "P3_GrantExport.csv",
    "P3_GrantExport_with_abstracts.csv",
    "P3_PersonExport.csv",
    "P3_PublicationExport.csv",
    "P3_GrantOutputDataExport.csv",
    "P3_CollaborationExport.csv",
]

# Re-download every export only when explicitly requested via UPDATE_DATA.
if UPDATE_DATA:
    for file_name in file_names:
        url = "http://p3.snf.ch/P3Export/" + file_name
        print(url)
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=60)
        # Raise explicitly: an `assert` would be stripped under `python -O`
        # and the file would then be overwritten with an error page.
        if response.status_code != 200:
            raise RuntimeError(
                "status code for " + file_name + " not ok: "
                + str(response.status_code)
            )

        with open(os.path.join(folder_name, file_name), mode="wb") as out_file:
            out_file.write(response.content)

In [7]:
# Load the semicolon-separated exports into DataFrames. The positions refer
# to `file_names`; position 1 (grant export with abstracts) is not loaded here.
grants, people, publications, output_data, collaborations = (
    pd.read_csv(os.path.join(folder_name, file_names[position]), sep=';')
    for position in (0, 2, 3, 4, 5)
)

## Assess

In [9]:
# Preview three randomly sampled rows of `grants` (no random_state is
# passed, so the rows differ between runs).
grants.sample(3)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
68095,176683,IZSEZ0_176683,Intergenerational knowledge transfer as a mean...,Intergenerational knowledge transfer as a mean...,Burmeister Anne,Scientific Exchanges,Science communication,Arbeits- und Organisationspsychologie Institut...,Switzerland,University of Berne - BE,10605,Applied psychology,"Humanities and Social Sciences;Psychology, edu...",10605,01.01.2018,28.02.2018,7500.00,Age diversity; Knowledge transfer; Intergenera...
24623,55644,2100-055644,Predictive Control of Nonlinear Non-minimum Ph...,,Bonvin Dominique,Project funding (Div. I-III),Project funding,Laboratoire d'automatique EPFL - STI - IGM - LA1,Switzerland,EPF Lausanne - EPFL,20511,Other disciplines of Engineering Sciences,"Mathematics, Natural- and Engineering Sciences...",20511,01.04.2001,31.03.2004,115142.00,PREDICTIVE CONTROL; NON-MINIMUM PHASE; NONLINE...
13467,30335,81ZH-030335,Analysis of PSI Integrin Function in Drosophila.,,Wehrli Marcel,Fellowships for prospective researchers,Careers;Fellowships,UNI: MRC Laboratory of Molecular Biology C a...,Great Britain and Northern Ireland,Institution abroad - IACH,30102,Molecular Biology,Biology and Medicine;Basic Biological Research,30102,01.02.1991,31.01.1992,data not included in P3,


In [10]:
# Column names, non-null counts and dtypes of `grants`.
grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Project Number                74519 non-null  int64 
 1   Project Number String         74519 non-null  object
 2   Project Title                 74519 non-null  object
 3   Project Title English         31400 non-null  object
 4   Responsible Applicant         74519 non-null  object
 5   Funding Instrument            74519 non-null  object
 6   Funding Instrument Hierarchy  74479 non-null  object
 7   Institution                   68860 non-null  object
 8   Institution Country           68794 non-null  object
 9   University                    74514 non-null  object
 10  Discipline Number             74519 non-null  int64 
 11  Discipline Name               74519 non-null  object
 12  Discipline Name Hierarchy     74020 non-null  object
 13  All disciplines 

In [11]:
# Preview three randomly sampled rows of `people` (no random_state is
# passed, so the rows differ between runs).
people.sample(3)

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
11589,Bott,Markus,male,,,84996,,,,,,47136.0,
607,Aebi,Max,male,Institut für Evaluative Forschung in Orthopädi...,Bern,518682,,115859.0,,,,,
55131,Laedermann,Jean-Pascal,male,Institut de Radiophysique Département de Radio...,Lausanne,110891,,,58248.0,,,109986.0,


In [12]:
# Column names, non-null counts and dtypes of `people`.
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111903 entries, 0 to 111902
Data columns (total 13 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Last Name                          111903 non-null  object
 1   First Name                         111896 non-null  object
 2   Gender                             111903 non-null  object
 3   Institute Name                     54186 non-null   object
 4   Institute Place                    54083 non-null   object
 5   Person ID SNSF                     111903 non-null  int64 
 6   OCRID                              7092 non-null    object
 7   Projects as responsible Applicant  28898 non-null   object
 8   Projects as Applicant              18934 non-null   object
 9   Projects as Partner                5300 non-null    object
 10  Projects as Practice Partner       735 non-null     object
 11  Projects as Employee               82000 non-null   

In [13]:
# Preview three randomly sampled rows of `publications` (no random_state is
# passed, so the rows differ between runs).
publications.sample(3)

Unnamed: 0,Publication ID SNSF,Project Number,Peer Review Status,Type of Publication,Title of Publication,Authors,Status,Publication Year,ISBN,DOI,...,Publisher,Editors,Journal Title,Volume,Issue / Number,Page from,Page to,Proceeding Title,Proceeding Place,Abstract
72170,{31B2FBC2-8F21-436B-86AE-4B80F7FECF3A},146725,Peer-reviewed,Original article (peer-reviewed),Development of autoimmune pancreatitis is inde...,"Seleznik Gitta M, Reding Theresia, Peter Luk...",Published,2018.0,,10.1136/gutjnl-2016-313458,...,,,Gut,67.0,9.0,1663.0,1673.0,Gut,,ObjectiveChronic pancreatitis (CP) and autoimm...
37751,{C75375CE-871A-4B08-8A4C-951A844DAB04},134797,Peer-reviewed,Original article (peer-reviewed),New constraints on dust emission and UV attenu...,"{Schaerer} D., {Boone} F., {Zamojski} M., {...",Accepted,,,,...,,,ArXiv e-prints,,,,,ArXiv e-prints,,
116294,{0D6319F7-F117-42CC-BD6E-81683EF3AC5F},165837,Peer-reviewed,,Fecal microbiota transplantation: a promising ...,"Delaune Vaihere, Orci Lorenzo A, Lacotte Sté...",Published,2018.0,,10.1080/14712598.2018.1518424,...,,,Expert opinion on biological therapy,18.0,10.0,1061.0,1071.0,Expert opinion on biological therapy,,Non alcoholic fatty liver disease (NAFLD) has ...


In [14]:
# Column names, non-null counts and dtypes of `publications`.
publications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133541 entries, 0 to 133540
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Publication ID SNSF        133541 non-null  object 
 1   Project Number             133541 non-null  int64  
 2   Peer Review Status         133541 non-null  object 
 3   Type of Publication        128867 non-null  object 
 4   Title of Publication       133504 non-null  object 
 5   Authors                    131026 non-null  object 
 6   Status                     133541 non-null  object 
 7   Publication Year           118806 non-null  float64
 8   ISBN                       15201 non-null   object 
 9   DOI                        78871 non-null   object 
 10  Import Source              107910 non-null  object 
 11  Last Change of Outputdata  0 non-null       float64
 12  Open Access Status         133541 non-null  int64  
 13  Open Access Type           42

In [15]:
# Frequency of each `Open Access Type` value (value_counts() skips missing
# values by default).
publications['Open Access Type'].value_counts()

Publisher (Gold Open Access)                                           18131
Repository (Green Open Access)                                         12349
Website                                                                10359
Green OA Embargo (Freely available via Repository after an embargo)     1791
Name: Open Access Type, dtype: int64

In [16]:
# Frequency of each publication `Status` value.
publications['Status'].value_counts()

Published    118975
Accepted      14464
NotSet          102
Name: Status, dtype: int64

In [17]:
# Frequency of each distinct `Volume` value, to check whether the column
# is consistently numeric.
publications['Volume'].value_counts()

8                             2078
7                             2002
9                             1851
6                             1808
10                            1705
                              ... 
7368                             1
75                               1
8627                             1
Volume 88,  7 October 2015       1
909                              1
Name: Volume, Length: 3657, dtype: int64

In [18]:
# Frequency of each distinct `Issue / Number` value, to check whether the
# column is consistently numeric.
publications['Issue / Number'].value_counts()

1                                 8688
2                                 6471
3                                 5426
4                                 5071
5                                 3638
                                  ... 
347                                  1
2134                                 1
eaah6817                             1
Winter Braids VI (Lille, 2016)       1
504                                  1
Name: Issue / Number, Length: 2720, dtype: int64

In [50]:
# Shape of the rows that have a non-null DOI and whose (DOI, Project Number)
# pair repeats an earlier row.
publications[
    publications.DOI.notna()
    & publications.duplicated(subset=['DOI', 'Project Number'])
].shape

(1724, 26)

In [19]:
# Preview three randomly sampled rows of `collaborations` (no random_state
# is passed, so the rows differ between runs).
collaborations.sample(3)

Unnamed: 0,Project Number,Group/Person,Types of collaboration,Country,Project Start Date,Project End Date
10625,138096,"Pontificia Universidad Católica de Chile, Sant...",Publication,Chile,01.11.2011,28.02.2013
36138,157789,"Benoît Rihoux, Comparatiev Political Sciences,...","in-depth/constructive exchanges on approaches,...",Belgium,01.01.2016,31.12.2021
10856,138213,LMIS4/EPFL,"in-depth/constructive exchanges on approaches,...",Switzerland,01.10.2011,30.09.2013


In [20]:
# Column names, non-null counts and dtypes of `collaborations`.
collaborations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60324 entries, 0 to 60323
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Project Number          60324 non-null  int64 
 1   Group/Person            60324 non-null  object
 2   Types of collaboration  60324 non-null  object
 3   Country                 60324 non-null  object
 4   Project Start Date      60324 non-null  object
 5   Project End Date        60322 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.8+ MB


In [21]:
# Frequency of each distinct `Types of collaboration` string; the output
# shows several ';'-separated types packed into one value.
collaborations['Types of collaboration'].value_counts()

in-depth/constructive exchanges on approaches, methods or results;Publication                                                                                                      18300
in-depth/constructive exchanges on approaches, methods or results                                                                                                                  16506
in-depth/constructive exchanges on approaches, methods or results;Publication;Research Infrastructures                                                                              6305
in-depth/constructive exchanges on approaches, methods or results;Publication;Research Infrastructures;Exchange of personnel                                                        3260
in-depth/constructive exchanges on approaches, methods or results;Research Infrastructures                                                                                          3078
Publication                                                                

In [22]:
# Frequency of each distinct `Group/Person` string, to inspect how
# consistently partners are written.
collaborations['Group/Person'].value_counts()

EPFL                                                                                                    92
University of Geneva                                                                                    80
ETH Zurich                                                                                              72
University of Zurich                                                                                    64
ETH Zürich                                                                                              56
                                                                                                        ..
UNI ZH, Kunshist. Inst. Prof. Hans B. Thomson, Ostasiatische KGS                                         1
Prof. Oliver Rheinbach, University of Freiberg                                                           1
William Wisden, Imperial College London                                                                  1
Prof. De Klerk, Ghent University     

In [23]:
# Preview three randomly sampled rows of `output_data` (no random_state is
# passed, so the rows differ between runs).
output_data.sample(3)

Unnamed: 0,Project Number,Output Type,Output Title,Url,Year
24724,170435,Talks/events/exhibitions,art installation: non narratedportraits,http://centrodememoriahistorica.gov.co/museo/o...,2017.0
5710,135192,"Media relations: print media, online media",Sturzgefahr! Hilft dieses Hormon?,,2016.0
12023,146533,"Media relations: print media, online media",Can diet control cancer development?,http://www.sciencenutshell.com/can-diet-contro...,2014.0


In [24]:
# Column names, non-null counts and dtypes of `output_data`.
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28730 entries, 0 to 28729
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Project Number  28730 non-null  int64  
 1   Output Type     28730 non-null  object 
 2   Output Title    28726 non-null  object 
 3   Url             18712 non-null  object 
 4   Year            28487 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.1+ MB


In [25]:
# Frequency of each `Output Type` value.
output_data['Output Type'].value_counts()

Media relations: print media, online media           10709
Talks/events/exhibitions                              7201
New media (web, blogs, podcasts, news feeds etc.)     3679
Media relations: radio, television                    3623
Print (books, brochures, leaflets)                    1413
Other activities                                      1003
Video/Film                                             680
Software                                               286
Start-up                                               136
Name: Output Type, dtype: int64

#### Tidiness

##### `grants` (ie. `GrantExport`) table

- `Funding Instrument`, `Funding Instrument Hierarchy` are confusing
- `Discipline`, ... `Discipline Name Hierarchy` are confusing
- Details about `Institution` out of scope
- `University` contains both long and short names: details out of scope

##### `people` (ie. `PersonExport`) table

- **`Projects as ...` columns contain mixed variables and observations for grant and role**
- Details about `Institute` out of scope

##### `publications` table

- `Authors` contains multiple observations

##### `collaborations` table

- `Types of collaboration` contains multiple observations

#### Quality

- spaces in column names

##### `grants` (ie. `GrantExport`) table

- **`Project Number` and `Project Number String` are redundant**
- **`Project Number String` encodes division information?**
- **`Responsible Applicant` not an uid**
- **`Start Date` and `End Date` string, not date type**
- **`Approved Amount` not numeric**
- `Project Title English` often redundant or null
- `Institution` free text? if yes, is it relevant? better named as department?
- `Keywords` not consistent (see keyword extraction from abstracts)

##### `people` (ie. `PersonExport`) table

- typo in column name: `OCRID` should be `ORCID`
- gender not categorical variable

##### `publications` table

- Missing DOIs
- `Last Change of Outputdata` empty
- `Publication Year` shows as float
- `Status`, `Peer Review Status`, `Type of Publication`, and `Open Access Type` strings, not categories
- `Volume`, `Issue / Number`, `Page from`, `Page to` strings, not numeric
- `[..] Title` show inconsistencies re capitalization
- Duplicated entries: 1'724 duplicated non null DOIs and project numbers

##### `collaborations` table

- `Types of collaboration` string, not category
- `Switzerland` should not be a valid `Types of collaboration` value
- `[.] Date` string, not dates
- `Group/Person` encoding seems inconsistent ("," vs "/", "Prof", "Dr")

##### `output_data` table

- `Output Type` string, not category
- `Year` float, not integer

## Clean

In [98]:
# Work on a renamed copy of the raw table: column names are lower-cased and
# their spaces replaced by underscores; `grants` itself stays untouched.
df_grants = grants.rename(columns=lambda name: name.lower().replace(' ', '_'))
df_grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74519 entries, 0 to 74518
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   project_number                74519 non-null  int64 
 1   project_number_string         74519 non-null  object
 2   project_title                 74519 non-null  object
 3   project_title_english         31400 non-null  object
 4   responsible_applicant         74519 non-null  object
 5   funding_instrument            74519 non-null  object
 6   funding_instrument_hierarchy  74479 non-null  object
 7   institution                   68860 non-null  object
 8   institution_country           68794 non-null  object
 9   university                    74514 non-null  object
 10  discipline_number             74519 non-null  int64 
 11  discipline_name               74519 non-null  object
 12  discipline_name_hierarchy     74020 non-null  object
 13  all_disciplines 

---

In [99]:
# https://github.com/zambujo/p3data/issues/1
# Derive `string_code` from `project_number_string`: drop the last six
# characters, then strip any '-', '_' or ' ' characters from what is left.
# regex=True is explicit because pandas 2.0 changed the default of
# Series.str.replace to regex=False, which would treat '[-_ ]' as a literal
# string and leave the separators in place.
df_grants['string_code'] = (
    df_grants.project_number_string
    .str[:-6]
    .str.replace(r'[-_ ]', '', regex=True)
)

In [100]:
# acceptance tests
# The clean-up must have removed every '-', '_' and ' ' from the code.
assert not df_grants.string_code.str.contains(r'[-_ ]', regex=True).any(), \
    "string_code still contains separator characters"

# Five most frequent (string_code, funding_instrument) combinations.
(
    df_grants
    .groupby(["string_code", "funding_instrument"])
    .size()
    .reset_index(name="count")
    .sort_values(by='count', ascending=False)
    .head()
)

Unnamed: 0,string_code,funding_instrument,count
181,2000,Project funding (Div. I-III),6072
286,3100,Project funding (Div. I-III),4997
183,200020,Project funding (Div. I-III),3506
188,200021,Project funding (Div. I-III),3306
292,31003A,Project funding (Div. I-III),2677
