## Wrangling Calculations for Tech Report 

df: raw FR data from DSPG 2020 (Sean's data pull)  
df_meta: raw FR data - abstracts from DSPG 2020, metadata from Kathryn's XML pull in February 2021. This should be the dataset we use going forward.

Using this script to check if df_meta and df will result in the same filtered dataset. 

In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 75) 

In [40]:
# reading in raw data

# set data types
cols = ['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 
        'PROJECT_NUMBER', 'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 
        'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 
        'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 
        'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS']
dtypes = {col: 'str' for col in cols}
dtypes["FY_TOTAL_COST"] = 'float'
dtypes["FY_TOTAL_COST_SUB_PROJECTS"] = 'float'

df = pd.read_csv('/home/kjl5t/dspg20RnD/data/original/working_federal_reporter_2020.csv', dtype = dtypes, engine='python')
print(df.shape)

(1156137, 26)


In [41]:
# reading in fixed metadata

# set data types
cols = ['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 
        'PROJECT_NUMBER', 'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 
        'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 
        'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 
        'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS']
dtypes = {col: 'str' for col in cols}
dtypes["FY_TOTAL_COST"] = 'float'
dtypes["FY_TOTAL_COST_SUB_PROJECTS"] = 'float'

# dtype = dtypes,
df_meta = pd.read_csv('../../../data/prd/Federal_RePORTER/FR_raw_2021FEB24.csv', dtype = dtypes, engine='python')
print(df_meta.shape)

(1156137, 26)


In [4]:
df[df['PROJECT_ID'] == '1e+05']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
10004,100000.0,Dr. David Lai is awarded an NSF Astronomy and ...,2008,adaptive optics; Astronomy; Award; base; Calif...,CHEMICAL SIGNATURES OF RECENT MERGER EVENTS IN...,NSF,NSF,,802292,9/1/2008,8/31/2011,"LAI, DAVID K",,17,,"LAI, DAVID K",SANTA CRUZ,CA,95064-1077,UNITED STATES,,,47.049,2008,231000.0,


In [5]:
df_meta[df_meta['PROJECT_ID'] == '100000']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
10004,100000,Dr. David Lai is awarded an NSF Astronomy and ...,2008,adaptive optics; Astronomy; Award; base; Cali...,CHEMICAL SIGNATURES OF RECENT MERGER EVENTS IN...,NSF,NSF,,802292,9/1/2008,8/31/2011,"LAI, DAVID K",,17,,"LAI, DAVID K",SANTA CRUZ,CA,95064-1077,UNITED STATES,,,47.049,2008,231000.0,


In [6]:
# find different elements in each set

A = set(df["PROJECT_ID"])
B = set(df_meta["PROJECT_ID"])

In [7]:
A.difference(B)

{'1e+05',
 '1e+06',
 '2e+05',
 '3e+05',
 '4e+05',
 '5e+05',
 '7e+05',
 '8e+05',
 '9e+05'}

In [8]:
B.difference(A)

{'100000',
 '1000000',
 '200000',
 '300000',
 '400000',
 '500000',
 '700000',
 '800000',
 '900000'}

In [42]:
# fix the scientific notation indices in df

sci_not = ['1e+05', '1e+06', '2e+05', '3e+05', '4e+05', '5e+05', '7e+05', '8e+05', '9e+05']
fixed = ['100000', '1000000', '200000', '300000', '400000', '500000', '700000', '800000', '900000']

for i, proj_id in enumerate(sci_not):
    idx = np.where(df["PROJECT_ID"] == proj_id)
    df['PROJECT_ID'][idx[0][0]] = fixed[i]
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PROJECT_ID'][idx[0][0]] = fixed[i]


In [43]:
# find different elements in each set

A = set(df["PROJECT_ID"])
B = set(df_meta["PROJECT_ID"])

print(A.difference(B))
print(B.difference(A))

set()
set()


In [11]:
df[df["PROJECT_ID"] == '200000']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
118663,200000,DESCRIPTION (provided by applicant): Parkinso...,2009,Affect; Age; Age of Onset; age related; aging ...,DJ-1 AND ITS INTERACTIONS WITH PARKIN AND ALPH...,HHS,NIH,,5F31GM082026-02,1/1/2008,12/31/2011,"RAMSEY, CHENERE PIERCE",,2,42250712,UNIVERSITY OF PENNSYLVANIA,PHILADELPHIA,PA,191046205,UNITED STATES,,,93.859,2009,41176.0,


In [12]:
df_meta[df_meta["PROJECT_ID"] == '200000']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
118663,200000,DESCRIPTION (provided by applicant): Parkinso...,2009,Affect; Age; Age of Onset; age related; aging...,DJ-1 AND ITS INTERACTIONS WITH PARKIN AND ALPH...,HHS,NIH,NIGMS,5F31GM082026-02,1/1/2008,12/31/2011,"RAMSEY, CHENERE PIERCE",,2,42250712,UNIVERSITY OF PENNSYLVANIA,PHILADELPHIA,PA,191046205,UNITED STATES,1/1/2009,12/31/2009,93.859,2009,41176.0,


## Project ID's are all the same, dataframes have the same length

In [13]:
# are df and df meta the same?

df.equals(df_meta)

False

In [14]:
sum(df.index == df_meta.index)

1156137

### Dataframes are not the same but their indices are

In [15]:
# check column by column

print(df.columns)
print(df_meta.columns)
print(df.columns == df_meta.columns)

Index(['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE',
       'DEPARTMENT', 'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER',
       'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER',
       'OTHER_PIS', 'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER',
       'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE',
       'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE',
       'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 'FY_TOTAL_COST',
       'FY_TOTAL_COST_SUB_PROJECTS'],
      dtype='object')
Index(['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE',
       'DEPARTMENT', 'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER',
       'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER',
       'OTHER_PIS', 'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER',
       'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE',
       'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE',
       'BUDGET_END_DATE', 'CFDA_CODE',

### dataframes have same columns and in same order

In [16]:
# checking each column for equality - NaN's won't report as equal

for col in df.columns:
    s = sum(df[col] == df_meta[col])
    print(f"{col}: {len(df)-s}")
    

PROJECT_ID: 0
ABSTRACT: 42380
FY.x: 35035
PROJECT_TERMS: 1156137
PROJECT_TITLE: 163
DEPARTMENT: 0
AGENCY: 0
IC_CENTER: 622428
PROJECT_NUMBER: 1
PROJECT_START_DATE: 154926
PROJECT_END_DATE: 146332
CONTACT_PI_PROJECT_LEADER: 181
OTHER_PIS: 1101813
CONGRESSIONAL_DISTRICT: 74510
DUNS_NUMBER: 14633
ORGANIZATION_NAME: 14575
ORGANIZATION_CITY: 5969
ORGANIZATION_STATE: 17030
ORGANIZATION_ZIP: 49018
ORGANIZATION_COUNTRY: 5738
BUDGET_START_DATE: 659663
BUDGET_END_DATE: 659714
CFDA_CODE: 216350
FY.y: 29
FY_TOTAL_COST: 287698
FY_TOTAL_COST_SUB_PROJECTS: 1082117


### Goal: find number of duplicates - need the same abstract, project title and start date between dataframes to keep the same filtered abstracts

### remove null abstracts

In [44]:
l1 = len(df)
df = df[~df['ABSTRACT'].isnull()]
l2 = len(df)

print(l1-l2, "null ABSTRACTs removed")

42380 null ABSTRACTs removed


In [45]:
l1 = len(df_meta)
df_meta = df_meta[~df_meta['ABSTRACT'].isnull()]
l2 = len(df_meta)

print(l1-l2, "null ABSTRACTs removed")

42380 null ABSTRACTs removed


In [46]:
len(df)

1113757

#### project titles: differences bc df was read in from CSVs, df_meta from XML.

In [47]:
# strip white space from character columns

for col in df.columns[0:24]:
    print(col)
    df[col] = df[col].str.strip()
    df_meta[col] = df_meta[col].str.strip()

PROJECT_ID
ABSTRACT
FY.x
PROJECT_TERMS
PROJECT_TITLE
DEPARTMENT
AGENCY
IC_CENTER
PROJECT_NUMBER
PROJECT_START_DATE
PROJECT_END_DATE
CONTACT_PI_PROJECT_LEADER
OTHER_PIS
CONGRESSIONAL_DISTRICT
DUNS_NUMBER
ORGANIZATION_NAME
ORGANIZATION_CITY
ORGANIZATION_STATE
ORGANIZATION_ZIP
ORGANIZATION_COUNTRY
BUDGET_START_DATE
BUDGET_END_DATE
CFDA_CODE
FY.y


In [21]:
for col in df.columns:
    s = sum(df[col] == df_meta[col])
    print(f"{col}: {len(df)-s}")

PROJECT_ID: 0
ABSTRACT: 0
FY.x: 0
PROJECT_TERMS: 26124
PROJECT_TITLE: 57
DEPARTMENT: 0
AGENCY: 0
IC_CENTER: 596842
PROJECT_NUMBER: 0
PROJECT_START_DATE: 150344
PROJECT_END_DATE: 142110
CONTACT_PI_PROJECT_LEADER: 76
OTHER_PIS: 970531
CONGRESSIONAL_DISTRICT: 69297
DUNS_NUMBER: 12076
ORGANIZATION_NAME: 12565
ORGANIZATION_CITY: 5577
ORGANIZATION_STATE: 12198
ORGANIZATION_ZIP: 46190
ORGANIZATION_COUNTRY: 5363
BUDGET_START_DATE: 633748
BUDGET_END_DATE: 633799
CFDA_CODE: 205097
FY.y: 29
FY_TOTAL_COST: 276557
FY_TOTAL_COST_SUB_PROJECTS: 1042708


### all abstracts are the same, 57 titles differ

In [22]:
same_title = (df['PROJECT_TITLE'].values == df_meta['PROJECT_TITLE'].values)

In [23]:
np.where(same_title == False)

(array([ 90756, 203339, 203342, 203363, 203696, 203945, 203974, 208639,
        208640, 208641, 208645, 208646, 208650, 309610, 309611, 314275,
        314277, 314281, 314282, 314283, 314284, 314285, 314286, 314288,
        314290, 314291, 314295, 314297, 314299, 314300, 314304, 314305,
        314307, 314311, 314313, 314314, 314315, 314316, 314317, 314319,
        314320, 314325, 314326, 314327, 314328, 314332, 314333, 314335,
        407433, 407489, 492587, 492604, 492631, 579002, 579018, 579049,
        579907]),)

In [24]:
df["PROJECT_TITLE"].iloc[492587]

'FACILITATING EMPLOYMENT FOR YOUTH WITH AUTISM: A REPLICATION STUDY OF AN INTERNSHIP\r\nMODEL TO IDENTIFY EVIDENCE BASED PRACTICES'

In [25]:
df_meta["PROJECT_TITLE"].iloc[492587]

'FACILITATING EMPLOYMENT FOR YOUTH WITH AUTISM: A REPLICATION STUDY OF AN INTERNSHIP\nMODEL TO IDENTIFY EVIDENCE BASED PRACTICES'

In [26]:
# \r character seems to be the problem - whitespace - comes thru on CSV, not XML

count = 0

for i, title in enumerate(df['PROJECT_TITLE']):
    if '\r' in title:
        count = count + 1
        
print(count)

57


In [48]:
# replace \r in df titles to match with df_meta

df['PROJECT_TITLE'] = df['PROJECT_TITLE'].str.replace('\r', '')

In [49]:
for col in df.columns:
    s = sum(df[col] == df_meta[col])
    print(f"{col}: {len(df)-s}")

PROJECT_ID: 0
ABSTRACT: 0
FY.x: 0
PROJECT_TERMS: 26124
PROJECT_TITLE: 0
DEPARTMENT: 0
AGENCY: 0
IC_CENTER: 596842
PROJECT_NUMBER: 0
PROJECT_START_DATE: 150344
PROJECT_END_DATE: 142110
CONTACT_PI_PROJECT_LEADER: 76
OTHER_PIS: 970531
CONGRESSIONAL_DISTRICT: 69297
DUNS_NUMBER: 12076
ORGANIZATION_NAME: 12565
ORGANIZATION_CITY: 5577
ORGANIZATION_STATE: 12198
ORGANIZATION_ZIP: 46190
ORGANIZATION_COUNTRY: 5363
BUDGET_START_DATE: 633748
BUDGET_END_DATE: 633799
CFDA_CODE: 205097
FY.y: 29
FY_TOTAL_COST: 276557
FY_TOTAL_COST_SUB_PROJECTS: 1042708


### project titles are the same between the dataframes

In [29]:
# change NaN's to '' for character columns so we can compare df and df_meta column

for col in df.columns[0:24]:
    print(col)
    df[col] = df[col].fillna("")
    df_meta[col] = df_meta[col].fillna("")

PROJECT_ID
ABSTRACT
FY.x
PROJECT_TERMS
PROJECT_TITLE
DEPARTMENT
AGENCY
IC_CENTER
PROJECT_NUMBER
PROJECT_START_DATE
PROJECT_END_DATE
CONTACT_PI_PROJECT_LEADER
OTHER_PIS
CONGRESSIONAL_DISTRICT
DUNS_NUMBER
ORGANIZATION_NAME
ORGANIZATION_CITY
ORGANIZATION_STATE
ORGANIZATION_ZIP
ORGANIZATION_COUNTRY
BUDGET_START_DATE
BUDGET_END_DATE
CFDA_CODE
FY.y


In [30]:
for col in df.columns:
    s = sum(df[col] == df_meta[col])
    print(f"{col}: {len(df)-s}")

PROJECT_ID: 0
ABSTRACT: 0
FY.x: 0
PROJECT_TERMS: 22582
PROJECT_TITLE: 0
DEPARTMENT: 0
AGENCY: 0
IC_CENTER: 404975
PROJECT_NUMBER: 0
PROJECT_START_DATE: 0
PROJECT_END_DATE: 0
CONTACT_PI_PROJECT_LEADER: 33
OTHER_PIS: 25
CONGRESSIONAL_DISTRICT: 29
DUNS_NUMBER: 29
ORGANIZATION_NAME: 10939
ORGANIZATION_CITY: 29
ORGANIZATION_STATE: 29
ORGANIZATION_ZIP: 29
ORGANIZATION_COUNTRY: 29
BUDGET_START_DATE: 381125
BUDGET_END_DATE: 381112
CFDA_CODE: 11757
FY.y: 29
FY_TOTAL_COST: 276557
FY_TOTAL_COST_SUB_PROJECTS: 1042708


In [34]:
same = (df['ORGANIZATION_COUNTRY'].values == df_meta['ORGANIZATION_COUNTRY'].values)
np.where(same == False)

(array([  16224,   50513,   72729,   73366,   73389,   86793,  125878,
         153315,  185531,  186137,  215397,  232011,  266954,  291642,
         295964,  296496,  299010,  304690,  378951,  455937,  525428,
         633407,  937662,  937664, 1031979, 1031981, 1078894, 1080488,
        1105393]),)

In [35]:
df.iloc[16224]

PROJECT_ID                                                               152724
ABSTRACT                      DESCRIPTION (provided by applicant): Cell migr...
FY.x                                                                       2008
PROJECT_TERMS                 Actinin; Actins; Address; Adhesions; Adhesives...
PROJECT_TITLE                 DETERMINANTS OF MYOGENIC AND NEURONAL MEMBRANE...
DEPARTMENT                                                                  HHS
AGENCY                                                                      NIH
IC_CENTER                                                                      
PROJECT_NUMBER                                                  5R01GM023244-33
PROJECT_START_DATE                                                     1/1/1988
PROJECT_END_DATE                                                     12/31/2010
CONTACT_PI_PROJECT_LEADER                          HORWITZ, ALAN F. RICK",","05
OTHER_PIS                               

In [37]:
df_meta.iloc[16224]

PROJECT_ID                                                               152724
ABSTRACT                      DESCRIPTION (provided by applicant): Cell migr...
FY.x                                                                       2008
PROJECT_TERMS                 Actinin; Actins; Address; Adhesions; Adhesives...
PROJECT_TITLE                 DETERMINANTS OF MYOGENIC AND NEURONAL MEMBRANE...
DEPARTMENT                                                                  HHS
AGENCY                                                                      NIH
IC_CENTER                                                                 NIGMS
PROJECT_NUMBER                                                  5R01GM023244-33
PROJECT_START_DATE                                                     1/1/1988
PROJECT_END_DATE                                                     12/31/2010
CONTACT_PI_PROJECT_LEADER                               HORWITZ, ALAN F. "RICK"
OTHER_PIS                               

### df_meta has correct data with parsing issues fixed from names in ""

### Project Start Dates - need to check for how they were filled in since these determine duplicates, Project Start Dates and FY.x the same between dataframes

df_meta has more budget start dates bc of issues with how the CSV df was read in (missing data in first n entries caused those columns to be read in as missing)

In [31]:
same = (df['BUDGET_START_DATE'].values == df_meta['BUDGET_START_DATE'].values)
np.where(same == False)

(array([  13984,   13985,   13986, ..., 1078894, 1080488, 1105393]),)

In [32]:
df.iloc[13984]

PROJECT_ID                                                               150680
ABSTRACT                      DESCRIPTION (provided by applicant):  Gastropr...
FY.x                                                                       2008
PROJECT_TERMS                 Address; Adverse effects; Adverse event; Affec...
PROJECT_TITLE                 NEW DIAGNOSIS AND TREATMENT APPROACHES FOR PAT...
DEPARTMENT                                                                  HHS
AGENCY                                                                      NIH
IC_CENTER                                                                      
PROJECT_NUMBER                                                  1U01DK074035-01
PROJECT_START_DATE                                                    9/30/2008
PROJECT_END_DATE                                                      4/30/2009
CONTACT_PI_PROJECT_LEADER                                   MCCALLUM, RICHARD W
OTHER_PIS                               

In [33]:
df_meta.iloc[13984]

PROJECT_ID                                                               150680
ABSTRACT                      DESCRIPTION (provided by applicant):  Gastropr...
FY.x                                                                       2008
PROJECT_TERMS                 Address; Adverse effects; Adverse event; Affec...
PROJECT_TITLE                 NEW DIAGNOSIS AND TREATMENT APPROACHES FOR PAT...
DEPARTMENT                                                                  HHS
AGENCY                                                                      NIH
IC_CENTER                                                                 NIDDK
PROJECT_NUMBER                                                  1U01DK074035-01
PROJECT_START_DATE                                                    9/30/2008
PROJECT_END_DATE                                                      4/30/2009
CONTACT_PI_PROJECT_LEADER                                   MCCALLUM, RICHARD W
OTHER_PIS                               

### Fill in missing start dates in each dataframe and compare

Previously filled in missing start dates to compare data frames...rereading data, removing null abstracts and skipping to here

In [50]:
print(sum(df["PROJECT_START_DATE"].isna()))
print(100*sum(df["PROJECT_START_DATE"].isna())/len(df))

150344
13.498815271194704


In [51]:
print(sum(df_meta["PROJECT_START_DATE"].isna()))
print(100*sum(df_meta["PROJECT_START_DATE"].isna())/len(df_meta))

150344
13.498815271194704


In [52]:
# Fill missing values in Project Start Date with Budget Start Date
df['PROJECT_START_DATE'] = df['PROJECT_START_DATE'].fillna(df['BUDGET_START_DATE'])
df_meta['PROJECT_START_DATE'] = df_meta['PROJECT_START_DATE'].fillna(df_meta['BUDGET_START_DATE'])

In [53]:
print(sum(df["PROJECT_START_DATE"].isna()))
print(100*sum(df["PROJECT_START_DATE"].isna())/len(df))

80080
7.190078266623689


In [54]:
print(sum(df_meta["PROJECT_START_DATE"].isna()))
print(100*sum(df_meta["PROJECT_START_DATE"].isna())/len(df_meta))

41813
3.7542300519772267


In [60]:
100*(150344-41813)/len(df)

9.744585219217477

### differences in how many filled in with budget start date

In [55]:
#If START date is still missing, fill start date with FY
df['PROJECT_START_DATE'] = df['PROJECT_START_DATE'].fillna(df['FY.x'])
df_meta['PROJECT_START_DATE'] = df_meta['PROJECT_START_DATE'].fillna(df_meta['FY.x'])

In [56]:
print(sum(df["PROJECT_START_DATE"].isna()))
print(100*sum(df["PROJECT_START_DATE"].isna())/len(df))

0
0.0


In [57]:
print(sum(df_meta["PROJECT_START_DATE"].isna()))
print(100*sum(df_meta["PROJECT_START_DATE"].isna())/len(df_meta))

0
0.0


In [61]:
100*(41813-0)/len(df)

3.7542300519772267

In [58]:
len(df)

1113757

In [63]:
# number of projects in df and df_meta with a different start date

len(df) - sum(df['PROJECT_START_DATE'] == df_meta['PROJECT_START_DATE'])

38267

### Number of Duplicates in dataset

In [65]:
# save NON-duplicated rows and the LAST occurrance of duplicated rows
df_dedup = df[~df.duplicated(subset=['ABSTRACT',  'PROJECT_TITLE', 'PROJECT_START_DATE'], keep='last')]

In [66]:
print(f"Original: {len(df)}")
print(f"Deduplicated: {len(df_dedup)}")
print(f"Number of Duplicates: {len(df) - len(df_dedup)}")

Original: 1113757
Deduplicated: 699672
Number of Duplicates: 414085


In [67]:
# save NON-duplicated rows and the LAST occurrance of duplicated rows
df_meta_dedup = df_meta[~df_meta.duplicated(subset=['ABSTRACT',  'PROJECT_TITLE', 'PROJECT_START_DATE'], keep='last')]

In [68]:
print(f"Original: {len(df_meta)}")
print(f"Deduplicated: {len(df_meta_dedup)}")
print(f"Number of Duplicates: {len(df_meta) - len(df_meta_dedup)}")

Original: 1113757
Deduplicated: 699652
Number of Duplicates: 414105


### Difference between dataframes

In [72]:
# find different elements in each set

A = set(df_dedup["PROJECT_ID"])
B = set(df_meta_dedup["PROJECT_ID"])

In [73]:
print(A.difference(B))
print(B.difference(A))

{'451642', '454030', '448329', '73965', '448332', '448871', '421304', '485259', '451645', '448382', '456322', '448869', '493323', '497247', '73759', '450729', '453614', '521180', '478601', '448868', '456444', '531808', '448864', '448867', '438471', '449696', '480840', '52941', '52937', '488268', '451649', '447131', '448865', '517345', '450909', '481661', '52940', '495296', '451653', '447135', '66291', '481461', '477264', '536301', '449694', '450727', '491957', '513417', '529483', '453617', '403610', '520136', '448872', '532143', '481217', '499682', '505984', '52936', '448333', '453619', '477091', '52933', '477076', '450730', '445586', '514311', '489081', '506329', '450731', '479281', '39022', '456447', '478881', '447134', '451644', '490453', '501241', '448252', '449697', '311114', '516385', '73964', '234933', '66289', '52934', '479435', '490645', '513414', '511913', '66288', '447128', '453616', '451647', '483600', '515249', '449693', '73749', '449698', '497932', '55254', '451648', '449

In [89]:
df[df['PROJECT_ID'] == '451642']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
394176,451642,The Data Compilation and Bioinformatics Shared...,2011,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,,5P30CA023100-27 (9001),2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2011,,


In [90]:
df_meta[df_meta['PROJECT_ID'] == '451642']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
394176,451642,The Data Compilation and Bioinformatics Shared...,2011,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-27 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2011,,380131.0


In [91]:
df[df['PROJECT_TITLE'] == "DATA COMPILATION"]

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
74218,246024,Data Compilation and BioinformaticsThe Data Co...,2008,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,,5P30CA023100-24 (9001),2008,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2008,,
189986,339732,Data Compilation and BioinformaticsThe Data Co...,2009,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,,5P30CA023100-25 (9001),2009,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2009,,
295898,402481,The Data Compilation and Bioinformatics Shared...,2010,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,,5P30CA023100-26 (9001),2010,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2010,,
394176,451642,The Data Compilation and Bioinformatics Shared...,2011,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,,5P30CA023100-27 (9001),2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2011,,
470983,66299,The Data Compilation and Bioinformatics Shared...,2012,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,3P30CA023100-27S7 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2012,,200699.0
470985,66301,The Data Compilation and Bioinformatics Shared...,2012,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,3P30CA023100-27S8 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2012,,725058.0


In [92]:
df_dedup[df_dedup['PROJECT_TITLE'] == "DATA COMPILATION"]

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
74218,246024,Data Compilation and BioinformaticsThe Data Co...,2008,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,,5P30CA023100-24 (9001),2008,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2008,,
189986,339732,Data Compilation and BioinformaticsThe Data Co...,2009,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,,5P30CA023100-25 (9001),2009,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2009,,
295898,402481,The Data Compilation and Bioinformatics Shared...,2010,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,,5P30CA023100-26 (9001),2010,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2010,,
394176,451642,The Data Compilation and Bioinformatics Shared...,2011,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,,5P30CA023100-27 (9001),2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,,,,2011,,
470985,66301,The Data Compilation and Bioinformatics Shared...,2012,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,3P30CA023100-27S8 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2012,,725058.0


In [93]:
df_meta[df_meta['PROJECT_TITLE'] == "DATA COMPILATION"]

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
74218,246024,Data Compilation and BioinformaticsThe Data Co...,2008,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-24 (9001),5/1/2008,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2008,4/30/2009,,2008,,379725.0
189986,339732,Data Compilation and BioinformaticsThe Data Co...,2009,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-25 (9001),5/1/2009,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2009,4/30/2010,,2009,,412545.0
295898,402481,The Data Compilation and Bioinformatics Shared...,2010,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-26 (9001),5/1/2010,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2010,4/30/2011,,2010,,410937.0
394176,451642,The Data Compilation and Bioinformatics Shared...,2011,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-27 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2011,,380131.0
470983,66299,The Data Compilation and Bioinformatics Shared...,2012,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,3P30CA023100-27S7 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2012,,200699.0
470985,66301,The Data Compilation and Bioinformatics Shared...,2012,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,3P30CA023100-27S8 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2012,,725058.0


In [94]:
df_meta_dedup[df_meta_dedup['PROJECT_TITLE'] == "DATA COMPILATION"]

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
74218,246024,Data Compilation and BioinformaticsThe Data Co...,2008,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-24 (9001),5/1/2008,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2008,4/30/2009,,2008,,379725.0
189986,339732,Data Compilation and BioinformaticsThe Data Co...,2009,Address; Archives; Area; behavior measurement;...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-25 (9001),5/1/2009,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2009,4/30/2010,,2009,,412545.0
295898,402481,The Data Compilation and Bioinformatics Shared...,2010,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,5P30CA023100-26 (9001),5/1/2010,,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2010,4/30/2011,,2010,,410937.0
470985,66301,The Data Compilation and Bioinformatics Shared...,2012,Archives; behavior measurement; Biochemical; B...,DATA COMPILATION,HHS,NIH,NCI,3P30CA023100-27S8 (9001),5/1/2011,4/30/2014,"CARSON, DENNIS",,52,804355790,UNIVERSITY OF CALIFORNIA SAN DIEGO,LA JOLLA,CA,920930934,UNITED STATES,5/1/2011,4/30/2013,,2012,,725058.0


In [78]:
# reading in raw data

# set data types
cols = ['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 
        'PROJECT_NUMBER', 'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 
        'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 
        'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 
        'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS']
dtypes = {col: 'str' for col in cols}
dtypes["FY_TOTAL_COST"] = 'float'
dtypes["FY_TOTAL_COST_SUB_PROJECTS"] = 'float'

df_og = pd.read_csv('/home/kjl5t/dspg20RnD/data/original/working_federal_reporter_2020.csv', dtype = dtypes, engine='python')
print(df_og.shape)

(1156137, 26)


In [84]:
df_og[df_og['PROJECT_ID'] == '454030']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
396135,454030,This Project is designed to pursue clinical de...,2011,Address; Adenovirus Vector; Adenoviruses; Adhe...,CLINICAL DEVELOPMENT OF CHIMP ADENOVIRUS VECTORS,HHS,NIH,,5U19AI074078-05 (0001),,8/31/2015,"ERTL, HILDEGUND C. J.",,2,75524595,WISTAR INSTITUTE,PHILADELPHIA,PA,191044265,UNITED STATES,,,,2011,,


In [80]:
df_og.iloc[394176] # start date was just missing, but budget date filled in for df_meta

PROJECT_ID                                                               451642
ABSTRACT                      The Data Compilation and Bioinformatics Shared...
FY.x                                                                       2011
PROJECT_TERMS                 Archives; behavior measurement; Biochemical; B...
PROJECT_TITLE                                                  DATA COMPILATION
DEPARTMENT                                                                  HHS
AGENCY                                                                      NIH
IC_CENTER                                                                   NaN
PROJECT_NUMBER                                           5P30CA023100-27 (9001)
PROJECT_START_DATE                                                          NaN
PROJECT_END_DATE                                                      4/30/2014
CONTACT_PI_PROJECT_LEADER                                        CARSON, DENNIS
OTHER_PIS                               

In [85]:
# reading in fixed metadata

# set data types
cols = ['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 
        'PROJECT_NUMBER', 'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 
        'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 
        'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 
        'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS']
dtypes = {col: 'str' for col in cols}
dtypes["FY_TOTAL_COST"] = 'float'
dtypes["FY_TOTAL_COST_SUB_PROJECTS"] = 'float'

# dtype = dtypes,
df_meta_og = pd.read_csv('../../../data/prd/Federal_RePORTER/FR_raw_2021FEB24.csv', dtype = dtypes, engine='python')
print(df_meta_og.shape)

(1156137, 26)


In [86]:
df_meta_og[df_meta_og['PROJECT_ID'] == '454030']

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
396135,454030,This Project is designed to pursue clinical de...,2011,Address; Adenovirus Vector; Adenoviruses; Adh...,CLINICAL DEVELOPMENT OF CHIMP ADENOVIRUS VECTORS,HHS,NIH,NIAID,5U19AI074078-05 (0001),,8/31/2015,"ERTL, HILDEGUND C. J.",,2,75524595,WISTAR INSTITUTE,PHILADELPHIA,PA,191044265,UNITED STATES,9/1/2011,8/31/2013,,2011,,204266.0
