# Data Prep for Lab 2

In [1]:
# general use imports
%pylab inline
import datetime
import numpy as np
import os
import six
import warnings
import matplotlib.pyplot as plt
import re

# pandas-related imports
from __future__ import print_function
import pandas as pd
import scipy
import sklearn

# record linkage package
import recordlinkage as rl
from recordlinkage.standardise import clean
from recordlinkage.standardise import phonetic

# CSV file reading-related imports
import csv

# sqlalchemy an psycopg2 are sql connection packages
from sqlalchemy import create_engine


print( "Imports loaded at " + str( datetime.datetime.now() ) )

Populating the interactive namespace from numpy and matplotlib
Imports loaded at 2018-02-22 22:14:22.619895


## Prepare Inventor Data

In [23]:
# For Inventor Data
inventor=pd.read_csv('~/FederalReporter/rawinventor.tsv',delimiter='\t',error_bad_lines=False, encoding='utf8')

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
inventor = inventor[['patent_id','name_first','name_last']]

In [29]:
inventor.tail()

Unnamed: 0,patent_id,name_first,name_last
13176606,9761374,Stephen John,Rigby
13176607,9774823,Guy,Gadnir
13176608,9773716,Chen-Chung,Lai
13176609,9748941,Dong Yun,Jung
13176610,9756967,Thomas,Vallette


In [2]:
patent=pd.read_csv('~/FederalReporter/patent.tsv',delimiter='\t',encoding='utf-8',error_bad_lines=False)

Skipping line 4232501: expected 10 fields, saw 11

Skipping line 4267898: expected 10 fields, saw 11

Skipping line 4327680: expected 10 fields, saw 11
Skipping line 4368028: expected 10 fields, saw 11
Skipping line 4377090: expected 10 fields, saw 11

Skipping line 4426415: expected 10 fields, saw 11

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# keep only patents 2004-2016
patent['fy']= patent.date.str.split('-').str.get(0)

In [5]:
# Create variable with TRUE if nationality is USA
american = patent['country'] == "US"

# Create variable with TRUE if patent are year 2004 - 2016
time = patent['fy'] > "2003"

# Select all cases where nationality is USA and age is greater than 50
patent = patent[american & time]

In [8]:
patent = patent[['id','country','fy','title','kind', 'num_claims']]

In [6]:
patent.shape

(3416258, 7)

In [15]:
patent.head()

Unnamed: 0,id,country,fy,title,kind,num_claims
2727898,6671884,US,2004,Method for defining areas of a protective garm...,B1,14.0
2727899,6671885,US,2004,Headwear for securing articles,B2,29.0
2727900,6671886,US,2004,Decorative face mask for use at sporting events,B1,18.0
2727901,6671887,US,2004,Weighted accessory for garments,B1,3.0
2727902,6671888,US,2004,Belt system,B2,6.0


In [32]:
inventor_merge = patent.merge(inventor, left_on=['id'], right_on=['patent_id'], how='inner')

In [33]:
inventor_merge.head(100)

Unnamed: 0,id,country,fy,title,kind,num_claims,patent_id,name_first,name_last
0,7617974,US,2009,Automatic gate and associated method for permi...,B2,49.0,7617974,Daniel,Vandyck
1,7617974,US,2009,Automatic gate and associated method for permi...,B2,49.0,7617974,Hervé,Gosselin
2,7617974,US,2009,Automatic gate and associated method for permi...,B2,49.0,7617974,François,Royen
3,7617975,US,2009,Service providing system and method,B2,7.0,7617975,Masanori,Ito
4,7617975,US,2009,Service providing system and method,B2,7.0,7617975,Naohisa,Kobayashi
5,7617975,US,2009,Service providing system and method,B2,7.0,7617975,Chikashi,Okamoto
6,7617975,US,2009,Service providing system and method,B2,7.0,7617975,Yoshishige,Narita
7,7617975,US,2009,Service providing system and method,B2,7.0,7617975,Kenji,Wada
8,7617976,US,2009,Shopping receptacle for receiving interactive ...,B2,20.0,7617976,Kia,Silverbrook
9,7617977,US,2009,Ticketing system for personal rapid transit,B2,20.0,7617977,Donald H.,"Mathews, III"


In [42]:
%cd ~
%cd FederalReporter
%pwd

/gpfs1/cusp/deh341
/gpfs1/cusp/deh341/FederalReporter


u'/gpfs1/cusp/deh341/FederalReporter'

In [50]:
# save as csv file
inventor_merge.to_csv('inventor.csv', encoding='utf-8') 

## Prepare Grants Data 

### Inspect Data

In [52]:
## For Grants Data
# create postgresql connection - three '/' indicate to use default host, port, username, and password
engine = create_engine('postgresql:///ada_pub')

QUERY = '''
SELECT contact_pi_project_leader , fy , organization_country, organization_zip, project_title, project_id 
FROM projects ;
'''
# here we pass the query and the connection to the pd.read_sql() function 
grants=pd.read_sql_query(QUERY,con=engine)
grants.contact_pi_project_leader.str.encode('utf-8')

0              MITCHELL, CHARLES DEBEAUX
1                 BERTAGNOLLI, MONICA M.
2                      GOLD, MARIELLE C.
3                      WUETHRICH, MARCEL
4            BAUER, NICHOLAS CHRISTOPHER
5                       KOUSA, YOUSSEF A
6                        BUTTE, NANCY F.
7                      HUDIS, CLIFFORD A
8                     SANDMEYER, SUZANNE
9                   NOTTERMAN, DANIEL A.
10                     BURTON, DENNIS R.
11                       FUCHS, SERGE Y.
12                        WELLER, JOSHUA
13                      SHAPIRO, JENESSA
14                         RIO, DONALD C
15                 LAROSE, JESSICA GOKEE
16                           ALCID, ERIC
17                     PASQUALE, ELENA B
18                      PRICE, CYNTHIA J
19                HERRERA, MARIA MARCELA
20                         SUNG, PATRICK
21                           ROMEO, TONY
22                       GALLANT, JACK L
23                 JESCHKE, MARC GERHARD
24              

In [53]:
# Check dataframe
grants.head(100)

Unnamed: 0,contact_pi_project_leader,fy,organization_country,organization_zip,project_title,project_id
0,"MITCHELL, CHARLES DEBEAUX",2013,UNITED STATES,331462926,MICROBIAL TRANSLOCATION & ALTERATIONS IN GUT M...,526114
1,"BERTAGNOLLI, MONICA M.",2013,UNITED STATES,021156110,SUPPORT FOR SPECIMEN BANKING IN ACOSOG,526115
2,"GOLD, MARIELLE C.",2013,UNITED STATES,972393098,MECHANISMS OF RECOGNITION BY MYCOBACTERIUM TUB...,526116
3,"WUETHRICH, MARCEL",2013,UNITED STATES,537151218,REGULATION OF VACCINE-INDUCED ANTI-FUNGAL T17 ...,526117
4,"BAUER, NICHOLAS CHRISTOPHER",2013,UNITED STATES,303224250,DNA DAMAGE-INDUCED REGULATION OF BASE EXCISION...,526118
5,"KOUSA, YOUSSEF A",2013,UNITED STATES,488242600,ROLE OF IRF6 IN PALATAL EPITHELIUM,526119
6,"BUTTE, NANCY F.",2013,UNITED STATES,770303411,NOVEL MODELS TO PREDICT ENERGY EXPENDITURE AND...,526120
7,"HUDIS, CLIFFORD A",2013,UNITED STATES,100656007,CANCER AND LEUKEMIA GROUP B,526121
8,"SANDMEYER, SUZANNE",2013,UNITED STATES,926173213,TY3 VIRUSLIKE PARTICLE MORPHOGENESIS AND HOST ...,526122
9,"NOTTERMAN, DANIEL A.",2013,UNITED STATES,170332360,RECIPROCAL GENETIC-ENVIRONMENTAL INTERACTIONS ...,526123


In [54]:
# Checking the attributes of the variables in the loaded dataframe
grants.dtypes

contact_pi_project_leader    object
fy                            int64
organization_country         object
organization_zip             object
project_title                object
project_id                    int64
dtype: object

In [55]:
# Looking at the names which have the maximum frequencies associated with them
grants['contact_pi_project_leader'].value_counts()

,                                    3887
HEIMBROOK, DAVID                      335
AZADI, PARASTOO                       306
STEWART, DAVID J.                     232
WEINER, GEORGE J.                     219
LAKATTA, EDWARD                       207
MCCORMICK, FRANK PATRICK              202
DEPINHO, RONALD ANTHONY               189
LERMAN, CARYN E.                      186
WICHA, MAX S.                         184
LE BEAU, MICHELLE M                   182
MARKLEY, JOHN LUTE                    172
GOLDMAN, ISRAEL DAVID                 171
WEINER, LOUIS M.                      170
CARBONE, MICHELE                      166
WOODLAND, DAVID L.                    161
DUNNAM, CURT R                        161
BRADY, KATHLEEN T.                    160
GERSON, STANTON L.                    160
PARTRIDGE, EDWARD E.                  155
LIPTON, STUART A                      150
PESTELL, RICHARD G                    149
ROBERTSON, ANDREW D                   149
COSTELLO, CATHERINE E.            

### Preprocessing

In [56]:
# Cleaning names (using the record linkage package tool, see imports)
# Clean removes any characters such as '-', '.', '/', '\', ':', brackets of all types. 
grants['contact_pi_project_leader']=clean(grants['contact_pi_project_leader'], lowercase=False, remove_brackets=False)

  _depr_warn()


In [57]:
# Remove Whitespaces
grants.rename(columns=lambda x: x.strip(), inplace=True)

In [58]:
# Getting the first character in the cleaned name variable
grants['name_last'] = grants.contact_pi_project_leader.str.split(' ').str.get(0)
grants['name_first'] = grants.contact_pi_project_leader.str.split(' ').str.get(1)
grants['name_middle'] = grants.contact_pi_project_leader.str.split(' ').str.get(2)

In [59]:
# check dataframe
grants[['contact_pi_project_leader', 'name_last', 'name_first', 'name_middle']].head(50)

Unnamed: 0,contact_pi_project_leader,name_last,name_first,name_middle
0,MITCHELL CHARLES DEBEAUX,MITCHELL,CHARLES,DEBEAUX
1,BERTAGNOLLI MONICA M,BERTAGNOLLI,MONICA,M
2,GOLD MARIELLE C,GOLD,MARIELLE,C
3,WUETHRICH MARCEL,WUETHRICH,MARCEL,
4,BAUER NICHOLAS CHRISTOPHER,BAUER,NICHOLAS,CHRISTOPHER
5,KOUSA YOUSSEF A,KOUSA,YOUSSEF,A
6,BUTTE NANCY F,BUTTE,NANCY,F
7,HUDIS CLIFFORD A,HUDIS,CLIFFORD,A
8,SANDMEYER SUZANNE,SANDMEYER,SUZANNE,
9,NOTTERMAN DANIEL A,NOTTERMAN,DANIEL,A


In [60]:
# Extracting ZIPCODE info

grants['zip']=grants['organization_zip'].str.extract('(\d{5})', expand=True)
 
# \d ---- tells that we need a digit
# \d{5} ---- tells that we need 5 digits consecutively
# () enclosing brackets tell that we need to extract this information in the new variable
grants[['organization_zip', 'zip']].head()

Unnamed: 0,organization_zip,zip
0,331462926,33146
1,21156110,2115
2,972393098,97239
3,537151218,53715
4,303224250,30322


In [61]:
grants = grants[['name_last', 'name_first', 'name_middle' , 'fy' , 'organization_country', 'zip', 'project_title', 'project_id']]
grants.head()

Unnamed: 0,name_last,name_first,name_middle,fy,organization_country,zip,project_title,project_id
0,MITCHELL,CHARLES,DEBEAUX,2013,UNITED STATES,33146,MICROBIAL TRANSLOCATION & ALTERATIONS IN GUT M...,526114
1,BERTAGNOLLI,MONICA,M,2013,UNITED STATES,2115,SUPPORT FOR SPECIMEN BANKING IN ACOSOG,526115
2,GOLD,MARIELLE,C,2013,UNITED STATES,97239,MECHANISMS OF RECOGNITION BY MYCOBACTERIUM TUB...,526116
3,WUETHRICH,MARCEL,,2013,UNITED STATES,53715,REGULATION OF VACCINE-INDUCED ANTI-FUNGAL T17 ...,526117
4,BAUER,NICHOLAS,CHRISTOPHER,2013,UNITED STATES,30322,DNA DAMAGE-INDUCED REGULATION OF BASE EXCISION...,526118


In [63]:
# save as csv file
grants.to_csv('grants.csv', encoding=('utf-8')) 