# Data Preparation and Database Load

This notebook reads in all the individual files that were downloaded from Federal RePORTER (https://federalreporter.nih.gov/FileDownload) and creates three final datasets: Project, Abstract, and Publications. The datasets contain information on projects, abstracts, and publications by year. The data are organized in a longitudinal format, with one observation per year.

In [1]:
# general use imports
import datetime
import glob
import inspect
import numpy
import os
import six
import warnings
warnings.filterwarnings('ignore')

# pandas-related imports
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine

# CSV file reading-related imports
import csv

# database interaction imports
import psycopg2

In [2]:
# Remember the directory the kernel was started in, and echo it for the record
notebook_home_directory = os.getcwd()
print("Home directory = {}".format(notebook_home_directory))

Home directory = /wingrdp/homedirs/deh341/ada_pub


In [3]:
# Folders holding the raw downloads, the crosswalk tables and the output files
data_directory_prj = "~/FederalReporter/projects"
data_directory_abs = "~/FederalReporter/abstracts"
data_directory_pub = "~/FederalReporter/publications"
data_directory_key = "~/FederalReporter/key"
data_directory_out = "~/FederalReporter/out"

# Echo every configured folder, one line per entry
print("\n".join(
    "Data directory for {} = {}".format(label, location)
    for label, location in (
        ("Project Info", data_directory_prj),
        ("Abstract Info", data_directory_abs),
        ("Publication Info", data_directory_pub),
        ("Crosswalk Info", data_directory_key),
        ("Output files", data_directory_out),
    )
))

Data directory for Project Info = ~/FederalReporter/projects
Data directory for Abstract Info = ~/FederalReporter/abstracts
Data directory for Publication Info = ~/FederalReporter/publications
Data directory for Crosswalk Info = ~/FederalReporter/key
Data directory for Output files = ~/FederalReporter/out


# Project Files

In [4]:
# Make the projects folder the working directory: the next cell globs for
# "*.csv" with a relative pattern, so it only sees files in the current folder.
%cd $data_directory_prj

/gpfs1/cusp/deh341/FederalReporter/projects


In [5]:
# Combine all the yearly project files into one dataframe.
# sorted() fixes the file order, because glob returns names in arbitrary order;
# this keeps the row order of the combined frame reproducible between runs.
prj_csv_paths = sorted(glob.glob("*.csv"))  # every csv file in the working directory

# Read the files lazily and stack them with a single concat. The former
# per-file DataFrame.append re-copied the accumulated frame on every iteration
# and no longer exists in pandas 2.0+.
df_prj = (
    pd.concat(
        (pd.read_csv(csv_path, low_memory=False) for csv_path in prj_csv_paths),
        ignore_index=True,
    )
    if prj_csv_paths
    else pd.DataFrame()  # no files found: same empty frame the loop used to leave behind
)

In [6]:
# Non-null count per column. The leading blanks in the printed column names
# come from the raw CSV headers; they are stripped in the next cell.
df_prj.count()

PROJECT_ID                     894014
 PROJECT_TERMS                 894013
 PROJECT_TITLE                 894014
 DEPARTMENT                    894014
 AGENCY                        894014
 IC_CENTER                     724197
 PROJECT_NUMBER                894014
 PROJECT_START_DATE            778532
 PROJECT_END_DATE              793081
 CONTACT_PI_PROJECT_LEADER     893919
 OTHER_PIS                      98771
 CONGRESSIONAL_DISTRICT        830210
 DUNS_NUMBER                   883561
 ORGANIZATION_NAME             893492
 ORGANIZATION_CITY             889995
 ORGANIZATION_STATE            880642
 ORGANIZATION_ZIP              852989
 ORGANIZATION_COUNTRY          890167
 BUDGET_START_DATE             679756
 BUDGET_END_DATE               679737
 CFDA_CODE                     744484
 FY                            894014
 FY_TOTAL_COST                 727360
 FY_TOTAL_COST_SUB_PROJECTS    134513
dtype: int64

In [7]:
# Trim the stray blanks around every column name
df_prj.columns = [column_name.strip() for column_name in df_prj.columns]

In [8]:
# Normalise the column names to lower case
df_prj.columns = [column_name.lower() for column_name in df_prj.columns]

In [9]:
# Look at the first 5 entries to confirm the trimmed, lower-cased column names
df_prj.head()

Unnamed: 0,project_id,project_terms,project_title,department,agency,ic_center,project_number,project_start_date,project_end_date,contact_pi_project_leader,...,organization_city,organization_state,organization_zip,organization_country,budget_start_date,budget_end_date,cfda_code,fy,fy_total_cost,fy_total_cost_sub_projects
0,584395,Address; Aflatoxins; Arachis hypogaea; Area; ...,DEVELOP AND TRANSFER IRRIGATED AND NON-IRRIGAT...,USDA,ARS,,ARS-0407399,10/22/2003,10/21/2008,"SORENSEN, RONALD B",...,DAWSON,GA,39842,UNITED STATES,,,10.001,2004,,
1,584396,base; Benign; Chemicals; cold temperature; De...,SMALL FRUIT PRODUCTION SYSTEMS,USDA,ARS,,ARS-0407571,11/28/2003,11/27/2008,"TAKEDA, FUMIOMI",...,KEARNEYSVILLE,WV,25430,UNITED STATES,,,10.001,2004,,
2,584397,Acer; Alder plant; Biochemical; Breeding; Cli...,GENETIC IMPROVEMENT OF LANDSCAPE TREES FOR SUP...,USDA,ARS,,ARS-0407833,11/19/2003,10/31/2008,"OLSEN, RICHARD T",...,WASHINGTON,DC,20002,UNITED STATES,,,10.001,2004,,
3,584398,Adjuvant; Agriculture; Bacillus thuringiensis...,IMPROVING CROP PROTECTION TECHNOLOGY FOR HORTI...,USDA,ARS,,ARS-0407835,10/29/2003,10/28/2008,"DERKSEN, RICHARD C",...,WOOSTER,OH,44691,UNITED STATES,,,10.001,2004,,
4,584399,Anthocyanins; Area; Berry; Characteristics; C...,PRODUCTION SYSTEMS TO PROMOTE YIELD AND QUALIT...,USDA,ARS,,ARS-0407838,11/16/2003,11/15/2008,"TARARA, JULIE M",...,CORVALLIS,OR,97331,UNITED STATES,,,10.001,2004,,


In [10]:
# Save dataframe as csv for data ingestion.
# The file name is relative, so the file is written into the output folder
# that the %cd below switches to.
# NOTE(review): to_csv is called with its defaults, which also write the
# dataframe index as an extra unnamed column, whereas the database load at the
# end of the notebook passes index=False - confirm consumers of the CSV expect it.
%cd $data_directory_out
df_prj.to_csv('FedExPrj0416.csv') 

/gpfs1/cusp/deh341/FederalReporter/out


# Abstract Files

In [11]:
# Make the abstracts folder the working directory: the next cell globs for
# "*.csv" with a relative pattern, so it only sees files in the current folder.
%cd $data_directory_abs

/gpfs1/cusp/deh341/FederalReporter/abstracts


In [12]:
# Combine all the yearly abstract files into one dataframe.
# sorted() fixes the file order, because glob returns names in arbitrary order;
# this keeps the row order of the combined frame reproducible between runs.
abs_csv_paths = sorted(glob.glob("*.csv"))  # every csv file in the working directory

# Read the files lazily and stack them with a single concat. The former
# per-file DataFrame.append re-copied the accumulated frame on every iteration
# and no longer exists in pandas 2.0+.
df_abs = (
    pd.concat(
        (pd.read_csv(csv_path, low_memory=False) for csv_path in abs_csv_paths),
        ignore_index=True,
    )
    if abs_csv_paths
    else pd.DataFrame()  # no files found: same empty frame the loop used to leave behind
)

In [13]:
# Non-null count per column. The leading blank in front of ABSTRACT comes from
# the raw CSV header; it is stripped in the next cell.
df_abs.count()

PROJECT_ID    860328
 ABSTRACT     853082
dtype: int64

In [14]:
# Trim the stray blanks around every column name
df_abs.columns = [column_name.strip() for column_name in df_abs.columns]

In [15]:
# Normalise the column names to lower case
df_abs.columns = [column_name.lower() for column_name in df_abs.columns]

In [16]:
# Look at the first 5 entries to confirm the trimmed, lower-cased column names
df_abs.head()

Unnamed: 0,project_id,abstract
0,584419,"Objective(s): Conduct yearly, accurate nationw..."
1,584415,Objective(s): Develop new sweetpotato geneotyp...
2,584411,Objective(s): The broad objective of this proj...
3,584406,Objective(s): The primary goal for this projec...
4,584402,Objective(s): Relate effects of hydrothermal t...


In [17]:
# Save dataframe as csv for data ingestion.
# The file name is relative, so the file is written into the output folder
# that the %cd below switches to.
# NOTE(review): to_csv is called with its defaults, which also write the
# dataframe index as an extra unnamed column - confirm consumers expect it.
%cd $data_directory_out
df_abs.to_csv('FedExAbs0416.csv')

/gpfs1/cusp/deh341/FederalReporter/out


# Publication Files

Here we will have one file for each agency that contains the publication info. We cannot combine them because the data structure is not the same in each of the data files.

## From Reporter 

In [18]:
# Make the publications folder the working directory: the next cell globs for
# "RePORTER*.csv" with a relative pattern, so it only sees files in this folder.
%cd $data_directory_pub

/gpfs1/cusp/deh341/FederalReporter/publications


In [19]:
# Combine all the yearly publication files into one dataframe, but here only
# use the NIH files, i.e. the ones whose name starts with "RePORTER".
# sorted() fixes the file order, because glob returns names in arbitrary order;
# this keeps the row order of the combined frame reproducible between runs.
hhs_csv_paths = sorted(glob.glob("RePORTER*.csv"))

# Read the files lazily and stack them with a single concat. The former
# per-file DataFrame.append re-copied the accumulated frame on every iteration
# and no longer exists in pandas 2.0+.
df_pub_hhs = (
    pd.concat(
        (pd.read_csv(csv_path, low_memory=False) for csv_path in hhs_csv_paths),
        ignore_index=True,
    )
    if hhs_csv_paths
    else pd.DataFrame()  # no files found: same empty frame the loop used to leave behind
)

In [20]:
# Preview the first 5 rows of the combined RePORTER publication data
df_pub_hhs.head()

Unnamed: 0,AFFILIATION,AUTHOR_LIST,COUNTRY,ISSN,JOURNAL_ISSUE,JOURNAL_TITLE,JOURNAL_TITLE_ABBR,JOURNAL_VOLUME,LANG,PAGE_NUMBER,PMC_ID,PMID,PUB_DATE,PUB_TITLE,PUB_YEAR
0,"Division of Epidemiology, Mayo Clinic College ...","Yang, P; Bamlet, W R; Ebbert, J O; Taylor, W R...",England,0143-3334,10,Carcinogenesis,Carcinogenesis,25,eng,1935-44,,15192016,2004 Oct,Glutathione pathway genes and lung cancer risk...,2004
1,"Department of Molecular and Cellular Biology, ...","Kleckner, Nancy; Zickler, Denise; Jones, Garet...",United States,0027-8424,34,Proceedings of the National Academy of Science...,Proc Natl Acad Sci U S A,101,eng,12592-7,515102.0,15299144,2004 Aug 24,A mechanical basis for chromosome function.,2004
2,"Department of Human Genetics, The University o...","Weiss, Lauren A; Veenstra-Vanderweele, Jeremy;...",England,1018-4813,11,European journal of human genetics : EJHG,Eur J Hum Genet,12,eng,949-54,,15292919,2004 Nov,Genome-wide association study identifies ITGB3...,2004
3,Laboratory of Environmental and Genetic Toxico...,"Wise, Sandra S; Elmore, Lynne W; Holt, Shawn E...",Netherlands,0300-8177,1-2,Molecular and cellular biochemistry,Mol Cell Biochem,255,eng,103-11,,14971651,2004 Jan,Telomerase-mediated lifespan extension of huma...,2004
4,"Medizinische Hochschule Hannover, Hannover, Ge...","Battmer, Rolf D; Dillier, Norbert; Lai, Wai K;...",England,1499-2027,,International journal of audiology,Int J Audiol,43 Suppl 1,eng,S10-5,,15732376,2004 Dec,Evaluation of the neural response telemetry (N...,2004


In [21]:
# Non-null count per column. Unlike the project and abstract files, the
# column names printed here carry no leading blanks.
df_pub_hhs.count()

AFFILIATION            868575
AUTHOR_LIST           1204841
COUNTRY               1174463
ISSN                  1193313
JOURNAL_ISSUE         1082145
JOURNAL_TITLE         1205036
JOURNAL_TITLE_ABBR    1205036
JOURNAL_VOLUME        1189621
LANG                  1205036
PAGE_NUMBER           1183554
PMC_ID                 862179
PMID                  1205036
PUB_DATE              1194236
PUB_TITLE             1204980
PUB_YEAR              1205036
dtype: int64

In [22]:
# Number of publication records per publication year, most frequent year first
df_pub_hhs.PUB_YEAR.value_counts()

2013    112714
2014    112397
2015    111879
2012    108703
2011    104253
2010     98430
2009     93792
2016     91589
2008     87502
2007     78427
2006     73052
2005     68496
2004     63802
Name: PUB_YEAR, dtype: int64

In [23]:
# The publication data has no project identifier column, so the link from
# PMID to PROJECT_NUMBER has to come from the link tables in the crosswalk folder.
# Switch there so the relative "*.csv" glob in the next cell finds them.
%cd $data_directory_key

/gpfs1/cusp/deh341/FederalReporter/key


In [24]:
# Combine all the yearly PMID-to-PROJECT_NUMBER link files into one dataframe.
# sorted() fixes the file order, because glob returns names in arbitrary order.
# The order matters here: only one project number per PMID is kept further
# down, so an arbitrary file order would make that choice vary between runs.
key_csv_paths = sorted(glob.glob("*.csv"))  # every csv file in the working directory

# Read the files lazily and stack them with a single concat. The former
# per-file DataFrame.append re-copied the accumulated frame on every iteration
# and no longer exists in pandas 2.0+.
df_pub_key = (
    pd.concat(
        (pd.read_csv(csv_path, low_memory=False) for csv_path in key_csv_paths),
        ignore_index=True,
    )
    if key_csv_paths
    else pd.DataFrame()  # no files found: same empty frame the loop used to leave behind
)

In [25]:
# Non-null count per column. The leading blank in front of PROJECT_NUMBER
# comes from the raw CSV header; it is stripped in the next cell.
df_pub_key.count()

PMID               2497774
 PROJECT_NUMBER    2497774
dtype: int64

In [26]:
# Trim the stray blanks around every column name
df_pub_key.columns = [column_name.strip() for column_name in df_pub_key.columns]

In [27]:
# Order the link table by PMID. A stable sort (mergesort) keeps rows that share
# a PMID in their original relative order; the default quicksort gives no such
# guarantee, which would make "the first project number per PMID" (used by the
# de-duplication further down) arbitrary.
df_pub_key = df_pub_key.sort_values(by=['PMID'], kind='mergesort')

In [28]:
# Check the sorted link table. Note that the same PMID can appear on several
# rows, each with a different project number.
df_pub_key.head()

Unnamed: 0,PMID,PROJECT_NUMBER
34372,12576299,R01HL054926
34807,12716654,R01AI035796
38510,12719981,R01DA013261
8088,12730154,P50GM021681
20072,12730154,T32GM008593


In [29]:
# A PMID can be listed under several project numbers. Keep only the first row
# per PMID, so every publication is linked to exactly one project number
# (the first one found in FederalReporter).
df_pub_key = df_pub_key.drop_duplicates(subset='PMID', keep='first')

In [30]:
# Attach the project number to each publication via a left join on PMID,
# so publications without a link are kept (with a missing PROJECT_NUMBER)
df_pub_hhs_key = pd.merge(df_pub_hhs, df_pub_key, how='left', on='PMID')

In [31]:
# Non-null counts after the left merge: PROJECT_NUMBER is filled for fewer rows
# than PMID, i.e. some publications found no match in the link table.
df_pub_hhs_key.count()

AFFILIATION            868575
AUTHOR_LIST           1204841
COUNTRY               1174463
ISSN                  1193313
JOURNAL_ISSUE         1082145
JOURNAL_TITLE         1205036
JOURNAL_TITLE_ABBR    1205036
JOURNAL_VOLUME        1189621
LANG                  1205036
PAGE_NUMBER           1183554
PMC_ID                 862179
PMID                  1205036
PUB_DATE              1194236
PUB_TITLE             1204980
PUB_YEAR              1205036
PROJECT_NUMBER        1054427
dtype: int64

In [32]:
# Publications without a project number cannot be linked to a project,
# so keep only the rows where PROJECT_NUMBER is present
df_pub_hhs_key = df_pub_hhs_key[df_pub_hhs_key['PROJECT_NUMBER'].notnull()]

In [33]:
# Normalise the column names to lower case
df_pub_hhs_key.columns = [column_name.lower() for column_name in df_pub_hhs_key.columns]

In [34]:
# Preview the linked publications; rows that had no project number are gone,
# which is why the index labels are no longer consecutive
df_pub_hhs_key.head()

Unnamed: 0,affiliation,author_list,country,issn,journal_issue,journal_title,journal_title_abbr,journal_volume,lang,page_number,pmc_id,pmid,pub_date,pub_title,pub_year,project_number
0,"Division of Epidemiology, Mayo Clinic College ...","Yang, P; Bamlet, W R; Ebbert, J O; Taylor, W R...",England,0143-3334,10.0,Carcinogenesis,Carcinogenesis,25,eng,1935-44,,15192016,2004 Oct,Glutathione pathway genes and lung cancer risk...,2004,R01CA080127
1,"Department of Molecular and Cellular Biology, ...","Kleckner, Nancy; Zickler, Denise; Jones, Garet...",United States,0027-8424,34.0,Proceedings of the National Academy of Science...,Proc Natl Acad Sci U S A,101,eng,12592-7,515102.0,15299144,2004 Aug 24,A mechanical basis for chromosome function.,2004,R01HG003143
2,"Department of Human Genetics, The University o...","Weiss, Lauren A; Veenstra-Vanderweele, Jeremy;...",England,1018-4813,11.0,European journal of human genetics : EJHG,Eur J Hum Genet,12,eng,949-54,,15292919,2004 Nov,Genome-wide association study identifies ITGB3...,2004,R01HG001645
4,"Medizinische Hochschule Hannover, Hannover, Ge...","Battmer, Rolf D; Dillier, Norbert; Lai, Wai K;...",England,1499-2027,,International journal of audiology,Int J Audiol,43 Suppl 1,eng,S10-5,,15732376,2004 Dec,Evaluation of the neural response telemetry (N...,2004,P50DC000242
5,"Institute for Behavioral Research, University ...","Tinney, Shannon M; Oser, Carrie B; Johnson, J ...",United States,1094-3412,4.0,The journal of behavioral health services & re...,J Behav Health Serv Res,31,eng,403-17,,15602141,2004 Oct-Dec,Predominantly female caseloads: identifying or...,2004,R01DA013110


In [35]:
# Save dataframe as csv for data ingestion.
# The file name is relative, so the file is written into the output folder
# that the %cd below switches to.
# NOTE(review): to_csv is called with its defaults, which also write the
# dataframe index as an extra unnamed column - confirm consumers expect it.
%cd $data_directory_out
df_pub_hhs_key.to_csv('FedExPubHHS0416.csv')

/gpfs1/cusp/deh341/FederalReporter/out


## From Federal Reporter

Now we can do the same with the other publication data as well. We will keep each of them in its own file.

In [36]:
# Go back to the publications folder: the next cell globs for
# "FedRePORTER*.csv" with a relative pattern, so it only sees files in this folder.
%cd $data_directory_pub

/gpfs1/cusp/deh341/FederalReporter/publications


In [37]:
# Combine all the yearly "FedRePORTER*.csv" publication files into one dataframe.
# Some rows contain more fields than the header declares; such rows are skipped
# rather than aborting the whole load, as before.
def _read_csv_skipping_bad_lines(csv_path):
    """Read one CSV file, skipping (and reporting) rows with too many fields."""
    try:
        # Spelling used by pandas 1.3 and later
        return pd.read_csv(csv_path, low_memory=False, on_bad_lines='warn')
    except TypeError:
        # Older pandas rejects the unknown keyword; use the original spelling
        return pd.read_csv(csv_path, low_memory=False, error_bad_lines=False)


# sorted() fixes the file order, because glob returns names in arbitrary order;
# this keeps the row order of the combined frame reproducible between runs.
other_csv_paths = sorted(glob.glob("FedRePORTER*.csv"))

# Read the files lazily and stack them with a single concat. The former
# per-file DataFrame.append re-copied the accumulated frame on every iteration
# and no longer exists in pandas 2.0+.
df_pub_other = (
    pd.concat(
        (_read_csv_skipping_bad_lines(csv_path) for csv_path in other_csv_paths),
        ignore_index=True,
    )
    if other_csv_paths
    else pd.DataFrame()  # no files found: same empty frame the loop used to leave behind
)

b'Skipping line 285: expected 3 fields, saw 12\nSkipping line 1437: expected 3 fields, saw 4\nSkipping line 3588: expected 3 fields, saw 5\nSkipping line 5287: expected 3 fields, saw 5\nSkipping line 7092: expected 3 fields, saw 5\nSkipping line 9873: expected 3 fields, saw 5\n'
b'Skipping line 341: expected 3 fields, saw 7\nSkipping line 626: expected 3 fields, saw 7\nSkipping line 680: expected 3 fields, saw 7\nSkipping line 777: expected 3 fields, saw 8\nSkipping line 837: expected 3 fields, saw 11\nSkipping line 1239: expected 3 fields, saw 9\nSkipping line 1354: expected 3 fields, saw 7\nSkipping line 1355: expected 3 fields, saw 7\nSkipping line 1375: expected 3 fields, saw 7\nSkipping line 1376: expected 3 fields, saw 7\nSkipping line 1403: expected 3 fields, saw 8\nSkipping line 1445: expected 3 fields, saw 13\nSkipping line 1531: expected 3 fields, saw 6\nSkipping line 1532: expected 3 fields, saw 6\nSkipping line 1753: expected 3 fields, saw 6\nSkipping line 1754: expected 3 

In [38]:
# Non-null count per column. The leading blanks in front of TITLE and
# AUTHORS_LIST come from the raw CSV header; they are stripped in the next cell.
df_pub_other.count()

COMMON_PROJECT_NUMBER    552510
 TITLE                   552415
 AUTHORS_LIST            551525
dtype: int64

In [39]:
# Trim the stray blanks around the column names and lower-case them in one pass
df_pub_other.columns = [column_name.strip().lower() for column_name in df_pub_other.columns]
# Give the identifier the same name it has in the other datasets
df_pub_other.columns = [
    'project_number' if column_name == 'common_project_number' else column_name
    for column_name in df_pub_other.columns
]

In [40]:
# Preview the first 5 rows to confirm the cleaned column names and the
# renamed project_number identifier
df_pub_other.head()

Unnamed: 0,project_number,title,authors_list
0,ARS-0422661,Calibration and validation of the SWAT model f...,
1,ARS-0423023,The draft genome of whitefly Bemisia tabaci ME...,
2,ARS-0422661,Spatial uniformity in sensitivity coefficient ...,
3,ARS-0422661,Soil heat flux calculation for sunlit and shad...,Paul D. Colaizzi;Steven R. Evett;Nurit Agam;Ro...
4,ARS-0423057,Bioassay conditions for infection of Pinus rad...,Timothy L. Widmer;Stephen C. Dodge


In [41]:
# Save dataframe as csv for data ingestion.
# The file name is relative, so the file is written into the output folder
# that the %cd below switches to.
# NOTE(review): to_csv is called with its defaults, which also write the
# dataframe index as an extra unnamed column - confirm consumers expect it.
%cd $data_directory_out
df_pub_other.to_csv('FedExPubOTH0416.csv')

/gpfs1/cusp/deh341/FederalReporter/out


# Load data in Database

In [42]:
# Create the SQLAlchemy engine shared by all database cells below.
# The URL names only the target database (ada_pub); it carries no host,
# user name or password.
engine = create_engine('postgresql:///ada_pub')

In [43]:
# Ingest the linked RePORTER publications into the publications_hhs table.
# if_exists='replace' overwrites a table left over from an earlier run;
# index=False keeps the dataframe index out of the table.
df_pub_hhs_key.to_sql('publications_hhs', engine, index=False, if_exists='replace')

In [44]:
# Ingest the Federal Reporter publications into the publications_other table.
# if_exists='replace' overwrites a table left over from an earlier run;
# index=False keeps the dataframe index out of the table.
df_pub_other.to_sql('publications_other', engine, index=False, if_exists='replace')

In [45]:
# Ingest the abstracts into the abstracts table.
# if_exists='replace' overwrites a table left over from an earlier run;
# index=False keeps the dataframe index out of the table.
df_abs.to_sql('abstracts', engine, index=False, if_exists='replace')

In [47]:
# Ingest the projects into the projects table.
# if_exists='replace' overwrites a table left over from an earlier run;
# index=False keeps the dataframe index out of the table.
df_prj.to_sql('projects', engine, index=False, if_exists='replace')

# Test queries

In [None]:
# Read the projects table back through pandas and preview the first rows
a = pd.read_sql_query(
    sql='select * FROM projects',
    con=engine,
)
a.head()

In [None]:
# Read the abstracts table back through pandas and preview the first rows
b = pd.read_sql_query(
    sql='select * FROM abstracts',
    con=engine,
)
b.head()

In [None]:
# Read the publications_hhs table back through pandas and preview the first rows
c = pd.read_sql_query(
    sql='select * FROM publications_hhs',
    con=engine,
)
c.head()

In [None]:
# Read the publications_other table back through pandas.
# Only the first rows are previewed, like for the three tables above,
# instead of rendering the whole frame.
d=pd.read_sql_query('select * FROM publications_other',con=engine)
d.head()

In [None]:
# Use sqlalchemy directly: count the rows of the projects table.
sql_string = "SELECT COUNT( * ) AS row_count FROM projects;"

# Run the statement on an explicit connection so that it is released as soon
# as the rows are fetched. Wrapping the string in text() is the form accepted
# by both SQLAlchemy 1.x and 2.x; Engine.execute() with a raw string was
# removed in 2.0. Result streaming is dropped: the query returns a single row.
with engine.connect() as connection:
    query_result = connection.execute(sqlalchemy.text(sql_string))
    result_list = query_result.fetchall()
result_list