# Data Prep for Lab 3
Construct the data which will be used in the machine learning notebook.

In [1]:
# General-purpose imports.
# NOTE: `%pylab inline` also pulls numpy and matplotlib names into the interactive
# namespace (see the "Populating the interactive namespace" message in the output).
%pylab inline
import datetime
import numpy as np
import os
import six
import warnings
import matplotlib.pyplot as plt
import re

# pandas-related imports
from __future__ import print_function
import pandas as pd
import scipy
import sklearn

# Record linkage package; `clean` is used further down to tidy project titles.
import recordlinkage as rl
from recordlinkage.preprocessing import clean
from recordlinkage.preprocessing import phonetic

# CSV file reading-related imports
import csv

# sqlalchemy and psycopg2 are SQL connection packages (only sqlalchemy is imported here)
from sqlalchemy import create_engine


# Timestamp the run so the output shows when the notebook was executed.
print( "Imports loaded at " + str( datetime.datetime.now() ) )

Populating the interactive namespace from numpy and matplotlib
Imports loaded at 2018-03-08 12:27:47.275061


#### The goal is to create a dataset that contains info on grants, publications, etc. for the machine learning lecture

In [2]:
## Load the projects table from the project database
# A PostgreSQL URL with three '/' leaves host, port, username, and password at their defaults
engine = create_engine('postgresql:///ada_pub')

QUERY = '''
SELECT *
FROM projects ;
'''
# Run the query over the connection and collect the result in a DataFrame
prj = pd.read_sql_query(sql=QUERY, con=engine)

In [3]:
# Look at the first 100 project numbers to see which identifier formats occur
prj['project_number'].head(100)

0        EPD05044
1        EPD05008
2        EPD05040
3        EPD05041
4        EPD05042
5        EPD05037
6        EPD05038
7        EPD05039
8        EPD05027
9        EPD05022
10       EPD05023
11    ARS-0410179
12       EPD05024
13       EPD05019
14       EPD05020
15       EPD05021
16       EPD05013
17       EPD05016
18       EPD05017
19       EPD05018
20       EPD05015
21       EPD05012
22       EPD05009
23       EPD05010
24        X832342
25        X832228
26    R829479C022
27    R829479C025
28    R829479C030
29    R829479C026
         ...     
70    ARS-0409188
71    ARS-0409376
72    ARS-0409382
73    ARS-0409459
74    ARS-0410193
75    ARS-0409544
76    ARS-0409570
77    ARS-0409636
78    ARS-0409639
79    ARS-0409642
80    ARS-0409644
81    ARS-0409662
82    ARS-0409718
83    ARS-0409771
84    ARS-0410195
85    ARS-0409782
86    ARS-0409819
87    ARS-0409832
88    ARS-0409855
89    ARS-0409873
90    ARS-0409874
91    ARS-0409878
92    ARS-0409880
93    ARS-0409946
94    ARS-

#### Variables to be constructed from project file
* number of researchers
* duration
* agency
* department
* Organization
* City
* total cost
* length of title
* year

In [4]:
# Restrict the projects table to the columns needed to build the variables listed above
project_columns = [
    'project_number', 'project_title', 'department', 'agency',
    'project_start_date', 'project_end_date',
    'contact_pi_project_leader', 'other_pis',
    'organization_name', 'organization_state',
    'fy', 'fy_total_cost',
]
prj = prj[project_columns]

In [5]:
# One end date is off: it is recorded with the year 3018 and is mapped to 2018 here.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on `prj['project_end_date']`: that chained in-place call operates on a temporary
# Series and does not reliably update `prj` (it is a no-op under pandas Copy-on-Write).
prj['project_end_date'] = prj['project_end_date'].replace('6/30/3018', '6/30/2018')

In [6]:
# The start/end dates arrive as strings; parse each one into a datetime column
for raw_col, parsed_col in (('project_start_date', 'project_start'),
                            ('project_end_date', 'project_end')):
    prj[parsed_col] = pd.to_datetime(prj[raw_col])

In [7]:
# Sanity check: rows that had an end-date string but ended up without a parsed end date
end_not_parsed = prj['project_end'].isnull() & prj['project_end_date'].notnull()
prj[end_not_parsed].head()

Unnamed: 0,project_number,project_title,department,agency,project_start_date,project_end_date,contact_pi_project_leader,other_pis,organization_name,organization_state,fy,fy_total_cost,project_start,project_end


In [8]:
# Sanity check: rows that had a start-date string but ended up without a parsed start date
start_not_parsed = prj['project_start'].isnull() & prj['project_start_date'].notnull()
prj[start_not_parsed].head()

Unnamed: 0,project_number,project_title,department,agency,project_start_date,project_end_date,contact_pi_project_leader,other_pis,organization_name,organization_state,fy,fy_total_cost,project_start,project_end


In [9]:
# With both dates parsed, the project duration is simply end minus start
prj['duration'] = prj['project_end'].sub(prj['project_start'])

In [10]:
# Spot-check the parsed dates next to the derived duration
prj.loc[:, ['project_start', 'project_end', 'duration']].head()

Unnamed: 0,project_start,project_end,duration
0,2005-03-01,2005-08-31,183 days
1,2005-03-01,2005-08-31,183 days
2,2005-03-01,2005-08-31,183 days
3,2005-03-01,2005-08-31,183 days
4,2005-03-01,2005-08-31,183 days


In [11]:
# Number of researchers = contact PI + other PIs.
# `other_pis` holds ';'-separated entries, so count the entries in the string
# and add one for the contact PI.
other_pi_entries = prj['other_pis'].str.split(';')
prj['pi_count'] = other_pi_entries.str.len() + 1

In [12]:
# A missing count means there are no other PIs. In that case we want to replace the
# missing value with 1, because every grant has one main PI.
prj['pi_count'] = prj['pi_count'].fillna(value=1)

In [13]:
# Compare the derived PI count with the raw PI fields for the first 20 projects
prj.loc[:, ['pi_count', 'other_pis', 'contact_pi_project_leader', 'project_title']].head(20)

Unnamed: 0,pi_count,other_pis,contact_pi_project_leader,project_title
0,1.0,,"AGUILAR, ZORAIDA P.",A HYBRID PATHOGEN DETECTION SYSTEM
1,1.0,,"BOSE, ANIMESH",NOVEL MATERIAL AND PROCESS FOR REPLACEMENT OF ...
2,1.0,,"SILVER, JOEL A.",LOW-COST FEEDBACK POLLUTANT CONTROL FOR SMALL ...
3,1.0,,"PETERSON, KRISTEN",FIELD-PORTABLE FLUORESCENCE SENSOR FOR POLYCYC...
4,1.0,,"ELLIOTT, JEANNINE",CHROMATE-FREE NANOPARTICLE CORROSION INHIBITOR...
5,1.0,,"HSU, FU-CHIH",DEVELOPMENT OF A NOVEL VIRUS CAPTURE SYSTEM US...
6,2.0,"HAERLE, LOUIS","MLSNA, TODD",WIRELESS DECONTAMINATION GAS MONITOR
7,1.0,,"WU, XIAOQUN",A RETROFIT AND LOW-COST SMALL INDUSTRIAL BOILE...
8,1.0,,"HULL, MATTHEW",MAGNETITE NANOPARTICLES FOR ENHANCED ENVIRONME...
9,1.0,,"GOSAU, JAN-MICHAEL",A CATALYTIC EXHAUST PURIFYING SYSTEM FOR RESTA...


In [14]:
# Length of title
# Special characters have to go first; recordlinkage's clean() does that.
# Letter case is left untouched (lowercase=False) and remove_brackets is switched on.
prj['project_title'] = clean(
    prj['project_title'],
    lowercase=False,
    remove_brackets=True,
)

In [15]:
# Title length in words. Split on runs of whitespace (no separator argument) rather
# than on a single ' ': splitting on ' ' yields empty tokens for leading, trailing or
# repeated spaces and counts an empty title as one word.
prj['tit_len'] = prj['project_title'].str.split().str.len()

In [16]:
# Spot-check the cleaned titles against their word counts
prj.loc[:, ['project_title', 'tit_len']].head()

Unnamed: 0,project_title,tit_len
0,A HYBRID PATHOGEN DETECTION SYSTEM,5
1,NOVEL MATERIAL AND PROCESS FOR REPLACEMENT OF ...,9
2,LOW COST FEEDBACK POLLUTANT CONTROL FOR SMALL ...,9
3,FIELD PORTABLE FLUORESCENCE SENSOR FOR POLYCYC...,8
4,CHROMATE FREE NANOPARTICLE CORROSION INHIBITOR...,7


#### Only keep the variables we need from this file

In [17]:
# Non-missing values per column, to see how complete each variable is
prj.count()

project_number               894014
project_title                894014
department                   894014
agency                       894014
project_start_date           778532
project_end_date             793081
contact_pi_project_leader    893919
other_pis                     98771
organization_name            893492
organization_state           880642
fy                           894014
fy_total_cost                727360
project_start                778532
project_end                  793081
duration                     778156
pi_count                     894014
tit_len                      894014
dtype: int64

In [18]:
# Keep the identifier plus the constructed and categorical variables; the raw date
# strings, PI name fields and title text are dropped here
keep_columns = [
    'project_number', 'department', 'agency',
    'project_start', 'project_end',
    'organization_name', 'organization_state',
    'fy', 'fy_total_cost',
    'duration', 'pi_count', 'tit_len',
]
prj = prj.loc[:, keep_columns]

In [19]:
# Preview the reduced project table
prj.head()

Unnamed: 0,project_number,department,agency,project_start,project_end,organization_name,organization_state,fy,fy_total_cost,duration,pi_count,tit_len
0,EPD05044,EPA,EPA,2005-03-01,2005-08-31,VEGRANDIS INC.,,2005,70000.0,183 days,1.0,5
1,EPD05008,EPA,EPA,2005-03-01,2005-08-31,"MATERIALS PROCESSING, INC.",,2005,69916.0,183 days,1.0,9
2,EPD05040,EPA,EPA,2005-03-01,2005-08-31,SOUTHWEST SCIENCES INC,,2005,70000.0,183 days,1.0,9
3,EPD05041,EPA,EPA,2005-03-01,2005-08-31,SOUTHWEST SCIENCES INC,,2005,70000.0,183 days,1.0,8
4,EPD05042,EPA,EPA,2005-03-01,2005-08-31,TDA RESEARCH INC,,2005,70000.0,183 days,1.0,7


#### Now start preparing publication data

In [20]:
QUERY = '''
SELECT *
FROM publications_hhs ;
'''
# Run the query over the existing connection and collect the result in a DataFrame
pub = pd.read_sql_query(sql=QUERY, con=engine)

In [21]:
# Non-missing values per column of the HHS publications table
pub.count()

affiliation            731687
author_list           1054259
country               1026318
issn                  1043121
journal_issue          945078
journal_title         1054427
journal_title_abbr    1054427
journal_volume        1044966
lang                  1054427
page_number           1038409
pmc_id                 808443
pmid                  1054427
pub_date              1044500
pub_title             1054376
pub_year              1054427
project_number        1054427
dtype: int64

Variables we want to create for each project are:
* number of publications total
* number of publications (just dummy english/other)
* number of publications by Journal (US journal vs other)

In [22]:
# Order the publications by project number, then by publication year
pub = pub.sort_values(by=['project_number', 'pub_year'])
pub.head()

Unnamed: 0,affiliation,author_list,country,issn,journal_issue,journal_title,journal_title_abbr,journal_volume,lang,page_number,pmc_id,pmid,pub_date,pub_title,pub_year,project_number
1024452,,"Cooper, Jennifer; Zimmerman, Wendy",United States,1525-1446,1,"Public health nursing (Boston, Mass.)",Public Health Nurs,33,eng,53-64,,26354189,2016 Jan-Feb,The Evaluation of a Regional Faith Community N...,2016,B01DP009025
1030873,,"Schuch, Laura; Curtis, Jacqueline W; Curtis, A...",United States,1468-2869,1,Journal of urban health : bulletin of the New ...,J Urban Health,93,eng,36-52,4794455.0,26666248,2016 Feb,Breaking Out of Surveillance Silos: Integrativ...,2016,B01DP009042
735605,"Bristow Medical Center, 700 W. 7th St., Bristo...","Brandenburg, Mark A; Subera, Layne; Doran-Redu...",United States,0030-1876,10,The Journal of the Oklahoma State Medical Asso...,J Okla State Med Assoc,106,eng,391-7,,24404674,2013 Oct,Opioid prescribing guidelines for Oklahoma eme...,2013,B01DP009043
755924,,"Subera, Layne E; Brandenburg, Mark A; Doran-Re...",United States,0030-1876,8,The Journal of the Oklahoma State Medical Asso...,J Okla State Med Assoc,107,eng,416-30,,25326945,2014 Aug,Opioid prescribing guidelines for Oklahoma hea...,2014,B01DP009043
359272,"Centers for Disease Control and Prevention, Na...","Dube, Shanta R; Cook, Michelle L; Edwards, Val...",United States,1545-1151,3,Preventing chronic disease,Prev Chronic Dis,7,eng,A52,2879984.0,20394691,2010 May,Health-related outcomes of adverse childhood e...,2010,B01DP009053


In [23]:
# Language dummies: one flag for English publications, one for everything else
is_english = pub['lang'] == "eng"
pub['engl_pub'] = np.where(is_english, 1, 0)
pub['other_pub'] = np.where(is_english, 0, 1)

In [24]:
# Group the country variable into two regional dummies.
# Membership is tested with isin() against explicit lists instead of a long chain of
# `==` comparisons. Compared with the earlier chain, "Ireland" is now spelled
# correctly (it was "Irland", which can never match) and the duplicate "Malta" entry
# is gone.
# NOTE(review): countries not listed here fall into neither group — confirm the
# European list is as complete as intended.
north_america_countries = ["United States", "Canada"]
european_countries = [
    "England", "Greece", "Switzerland", "Netherlands", "Germany", "Spain",
    "Ireland", "Denmark", "France", "Scotland", "Norway", "Czech Republic",
    "Italy", "Poland", "Finland", "Austria", "Sweden", "Belgium",
    "Bulgaria", "Hungary", "Romania", "Bosnia and Herzegovina",
    "Slovenia", "Slovakia", "United Kingdom", "Portugal", "Serbia",
    "Malta", "Lithuania", "Wales", "Iceland",
]

# Missing countries are not in either list, so they get 0 in both dummies.
pub['NorthAmerica_pub'] = np.where(pub['country'].isin(north_america_countries), 1, 0)
pub['Europe_pub'] = np.where(pub['country'].isin(european_countries), 1, 0)

In [25]:
# Inspect the new dummies next to the journal they were derived for
pub.loc[:, ['project_number', 'journal_title', 'engl_pub', 'other_pub',
            'NorthAmerica_pub', 'Europe_pub']].head(100)

Unnamed: 0,project_number,journal_title,engl_pub,other_pub,NorthAmerica_pub,Europe_pub
1024452,B01DP009025,"Public health nursing (Boston, Mass.)",1,0,1,0
1030873,B01DP009042,Journal of urban health : bulletin of the New ...,1,0,1,0
735605,B01DP009043,The Journal of the Oklahoma State Medical Asso...,1,0,1,0
755924,B01DP009043,The Journal of the Oklahoma State Medical Asso...,1,0,1,0
359272,B01DP009053,Preventing chronic disease,1,0,1,0
1005910,C06RR020539,Clinical cancer research : an official journal...,1,0,1,0
570480,C06RR028655,Leukemia research,1,0,0,1
702061,C06RR029965,Prenatal diagnosis,1,0,0,1
734826,C06RR029965,JAMA neurology,1,0,1,0
861745,C06RR029965,Fertility and sterility,1,0,1,0


In [26]:
# Now let's generate the per-project publication counts using groupby.
# The dummy columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple of names is deprecated and no longer
# supported in pandas 2.0.
# agg(['sum']) is kept as-is, so the result still has two-level columns
# (variable name, 'sum').
pub_outcome = pub.groupby(by=['project_number'])[
    ['engl_pub', 'other_pub', 'NorthAmerica_pub', 'Europe_pub']
].agg(['sum'])

In [27]:
# Turn the project_number group index back into a regular column
pub_outcome = pub_outcome.reset_index()

In [28]:
# Total publications per project = English publications + publications in other languages
pub_outcome['total_pub'] = pub_outcome['engl_pub'].add(pub_outcome['other_pub'])
pub_outcome.count()

project_number           113121
engl_pub          sum    113121
other_pub         sum    113121
NorthAmerica_pub  sum    113121
Europe_pub        sum    113121
total_pub                113121
dtype: int64

In [29]:
# Preview the per-project publication counts
pub_outcome.head(100)

Unnamed: 0_level_0,project_number,engl_pub,other_pub,NorthAmerica_pub,Europe_pub,total_pub
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,sum,sum,sum,Unnamed: 6_level_1
0,B01DP009025,1,0,1,0,1
1,B01DP009042,1,0,1,0,1
2,B01DP009043,2,0,2,0,2
3,B01DP009053,1,0,1,0,1
4,C06RR020539,1,0,1,0,1
5,C06RR028655,1,0,0,1,1
6,C06RR029965,5,1,3,3,6
7,C06RR030414,16,2,9,9,18
8,C06RR030651,2,1,2,1,3
9,D43CA153707,2,0,2,0,2


In [30]:
# Distribution of the total number of publications per project
pub_outcome.total_pub.describe()

count    113121.000000
mean          9.321231
std          30.748991
min           1.000000
25%           2.000000
50%           4.000000
75%          10.000000
max        6669.000000
Name: total_pub, dtype: float64

In [31]:
QUERY = '''
SELECT project_number, title
FROM publications_other ;
'''
# Run the query over the existing connection and collect the result in a DataFrame
pub2 = pd.read_sql_query(sql=QUERY, con=engine)

In [32]:
# Generate an indicator used to count publications: 1 when the title is present,
# 0 when it is missing.
# notnull() is used for the missing-value test because comparing against NaN with
# `!=` is always True (NaN is not equal to anything, including itself), so the
# previous `!= np.NaN` check flagged every row regardless of its title.
pub2['pub_count'] = np.where(pub2['title'].notnull(), 1, 0)

In [33]:
# Only the project identifier and the counting indicator are needed from here on
pub2 = pub2.loc[:, ['project_number', 'pub_count']]

In [34]:
# Number of publication rows per project number, most frequent first
pub2['project_number'].value_counts()

USFS-0000615    1320
USFS-0000558    1122
USFS-0000563     960
USFS-0000693     648
0822283          643
1157698          589
USFS-0000690     588
0833178          582
1002410          576
USFS-0000561     570
0819885          568
1300426          568
0822838          548
0820521          514
1410972          505
USFS-0000592     504
0903787          494
USFS-0000559     468
1211067          465
0814251          456
0749028          452
USFS-0000625     450
USFS-0000560     444
1430152          440
USFS-0000708     432
1125897          424
USFS-0000567     420
USFS-0000624     414
1205960          407
0855405          402
                ... 
1059160            1
1360509            1
0940520            1
1351129            1
1203947            1
1125698            1
0938120            1
1224868            1
1510296            1
1661756            1
1418858            1
0967023            1
1510293            1
1243622            1
0926848            1
1303378            1
1537372      

In [35]:
# Per-project publication count: sum the indicator within each project number, then
# move project_number from the index back into a column. The count column is named
# 'sum' at this point.
pub2_outcome = (
    pub2.groupby(by=['project_number'])['pub_count']
    .agg(['sum'])
    .reset_index()
)

In [36]:
# Preview the per-project counts.
# NOTE(review): the first rows of the output show project_number values that start
# with tabs and look like fragments of author lists — confirm how the source table
# was parsed.
pub2_outcome.head(100)

Unnamed: 0,project_number,sum
0,\t\t Clark,1
1,"\t\t Torbj{\""o}rn and Blaha",1
2,\t\t Levine and Julien Tierny and Valerio Pasc...,1
3,"\t""",14
4,"\tHans-Georg""",2
5,"\tRavi and Guido Sch\""{a}fer""",2
6,\tand Moriarty,1
7,\tat the Electron-Collecting Interface of P3HT...,1
8,\t{Bouvier},5
9,\t{Buckley},8


#### Now merge outcome file to project file
We use a left join so that records which appear only in the left dataframe are kept. These are our projects which don't have publications.

In [37]:
# Left join on project_number: every project row is kept, and projects without a
# match in pub2_outcome get a missing count.
# NOTE(review): only pub2_outcome (publications_other) is merged here; the
# pub_outcome table built from publications_hhs is not joined in any visible cell —
# confirm that is intended.
learn = pd.merge(prj, pub2_outcome, how='left', on='project_number')

In [38]:
# Give the aggregated 'sum' column a descriptive name
learn = learn.rename(columns={'sum': 'pub_count'})

In [39]:
# Preview the merged table; pub_count is still missing for projects without a match
learn.head(1000)

Unnamed: 0,project_number,department,agency,project_start,project_end,organization_name,organization_state,fy,fy_total_cost,duration,pi_count,tit_len,pub_count
0,EPD05044,EPA,EPA,2005-03-01,2005-08-31,VEGRANDIS INC.,,2005,70000.0,183 days,1.0,5,
1,EPD05008,EPA,EPA,2005-03-01,2005-08-31,"MATERIALS PROCESSING, INC.",,2005,69916.0,183 days,1.0,9,
2,EPD05040,EPA,EPA,2005-03-01,2005-08-31,SOUTHWEST SCIENCES INC,,2005,70000.0,183 days,1.0,9,
3,EPD05041,EPA,EPA,2005-03-01,2005-08-31,SOUTHWEST SCIENCES INC,,2005,70000.0,183 days,1.0,8,
4,EPD05042,EPA,EPA,2005-03-01,2005-08-31,TDA RESEARCH INC,,2005,70000.0,183 days,1.0,7,
5,EPD05037,EPA,EPA,2005-03-01,2005-08-31,SCIENTIFIC METHODS INC,,2005,69930.0,183 days,1.0,12,
6,EPD05038,EPA,EPA,2005-03-01,2005-08-31,SEACOAST SCIENCE INC,,2005,66204.0,183 days,2.0,4,
7,EPD05039,EPA,EPA,2005-03-01,2005-08-31,SORPTION TECHNOLOGIES INC.,,2005,69990.0,183 days,1.0,12,
8,EPD05027,EPA,EPA,2005-03-01,2005-08-31,LUNA INNOVATIONS INC,,2005,69939.0,183 days,1.0,6,
9,EPD05022,EPA,EPA,2005-03-01,2005-08-31,"CLEAR SKIES UNLIMITED, INC.",,2005,70000.0,183 days,1.0,7,


In [40]:
# A missing count means we found no publications for the project in our database,
# so replace the missing values with zero
learn['pub_count'] = learn['pub_count'].fillna(0)

In [41]:
# Distribution of the publication count across all project rows
learn['pub_count'].describe()

count    894014.000000
mean          0.778203
std           9.868726
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        1320.000000
Name: pub_count, dtype: float64

In [42]:
# Drop every row that has a missing value in any column
learn = learn.dropna(axis=0, how='any')

In [43]:
# Summary statistics of the final dataset.
# NOTE(review): the output shows a negative minimum duration (-1939 days), i.e. at
# least one project whose end date precedes its start date — confirm whether such
# rows should be kept.
learn.describe()

Unnamed: 0,fy,fy_total_cost,duration,pi_count,tit_len,pub_count
count,683212.0,683212.0,683212,683212.0,683212.0,683212.0
mean,2011.84834,459479.1,2207 days 19:46:04.806238,1.271961,9.442429,0.950052
std,2.589219,1317437.0,2268 days 21:49:00.578435,0.819105,5.487699,10.456771
min,2004.0,1.0,-1939 days +00:00:00,1.0,1.0,0.0
25%,2010.0,148051.0,1081 days 00:00:00,1.0,6.0,0.0
50%,2012.0,299886.0,1641 days 00:00:00,1.0,8.0,0.0
75%,2014.0,432000.0,1947 days 00:00:00,1.0,11.0,0.0
max,2016.0,281122100.0,18627 days 00:00:00,39.0,54.0,1320.0


In [44]:
# Show the current working directory (IPython automagic form of %pwd)
pwd

u'/wingrdp/homedirs/deh341/ada_pub'

In [45]:
# Write the final dataset to the shared directory as a UTF-8 encoded CSV file
output_path = '/wingrdp/gscratch/share/wagner2018/machinelearning.csv'
learn.to_csv(output_path, encoding='utf-8')