# Cleaning the dataset 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string
import time

pd.set_option('display.max_columns', 50)

### Read in Raw Dataset

In [2]:
# reading in fixed metadata

# set data types
cols = ['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 
        'PROJECT_NUMBER', 'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 
        'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 
        'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 
        'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS']
dtypes = {col: 'str' for col in cols}
dtypes["FY_TOTAL_COST"] = 'float'
dtypes["FY_TOTAL_COST_SUB_PROJECTS"] = 'float'

df = pd.read_csv('../../../data/prd/Federal_RePORTER/FR-2021DEC/FR_raw_2021DEC17.csv', dtype = dtypes, engine='python')
print(df.shape)

(1262655, 26)


In [3]:
df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science;...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,
1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; de...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,
2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,
3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Ch...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,
4,90000,Amphibian populations around the world are exp...,2008,Amphibia; Central America; Communicable Disea...,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,NSF,NSF,,815315,10/1/2008,9/30/2011,"ZAMUDIO, KELLY R",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.074,2008,370996.0,


#### some brief eda

In [4]:
len(pd.unique(df["PROJECT_ID"]))

1262655

In [5]:
df.dtypes

PROJECT_ID                     object
ABSTRACT                       object
FY.x                           object
PROJECT_TERMS                  object
PROJECT_TITLE                  object
DEPARTMENT                     object
AGENCY                         object
IC_CENTER                      object
PROJECT_NUMBER                 object
PROJECT_START_DATE             object
PROJECT_END_DATE               object
CONTACT_PI_PROJECT_LEADER      object
OTHER_PIS                      object
CONGRESSIONAL_DISTRICT         object
DUNS_NUMBER                    object
ORGANIZATION_NAME              object
ORGANIZATION_CITY              object
ORGANIZATION_STATE             object
ORGANIZATION_ZIP               object
ORGANIZATION_COUNTRY           object
BUDGET_START_DATE              object
BUDGET_END_DATE                object
CFDA_CODE                      object
FY.y                           object
FY_TOTAL_COST                 float64
FY_TOTAL_COST_SUB_PROJECTS    float64
dtype: objec

In [6]:
df.isnull().sum()

PROJECT_ID                          0
ABSTRACT                        42536
FY.x                            35162
PROJECT_TERMS                   62547
PROJECT_TITLE                       1
DEPARTMENT                          0
AGENCY                              0
IC_CENTER                      219780
PROJECT_NUMBER                      0
PROJECT_START_DATE             165214
PROJECT_END_DATE               156627
CONTACT_PI_PROJECT_LEADER       25128
OTHER_PIS                     1104459
CONGRESSIONAL_DISTRICT         103654
DUNS_NUMBER                     18276
ORGANIZATION_NAME                2135
ORGANIZATION_CITY               34339
ORGANIZATION_STATE              45975
ORGANIZATION_ZIP                77403
ORGANIZATION_COUNTRY            34111
BUDGET_START_DATE              285688
BUDGET_END_DATE                285880
CFDA_CODE                      217919
FY.y                                0
FY_TOTAL_COST                  227630
FY_TOTAL_COST_SUB_PROJECTS    1076820
dtype: int64

In [7]:
temp = df[df['FY_TOTAL_COST'].isnull()]
len(temp)

227630

In [8]:
temp['DEPARTMENT'].value_counts()

HHS     193009
USDA     18101
VA       14754
NSF        883
DOD        538
EPA        175
NASA       170
Name: DEPARTMENT, dtype: int64

In [9]:
# are FY.x and FY.y equal?  YES except for the entries where FY.x is NULL

temp = df[df["FY.x"] != df["FY.y"]]

In [10]:
print(len(temp))
temp['FY.x'].isnull().sum()

# CONCLUSION - Use FY.y for FY, drop FY.x

35162


35162

In [11]:
# FY.y is the reliable fiscal year information so we rename this column to FY

df = df.rename(columns={'FY.y': 'FY'})

In [12]:
df = df.drop(columns = ['FY.x'])

### Remove null abstracts

In [13]:
#remove rows with NULL abstracts

l1 = len(df)
df = df[~df.ABSTRACT.isnull()]
l2 = len(df)

print(l1-l2, "null ABSTRACTs removed")

42536 null ABSTRACTs removed


### Aggregate Counts

In [14]:
# strip white space from character columns

for col in df.columns[0:23]:
    print(col)
    df[col] = df[col].str.strip()

PROJECT_ID
ABSTRACT
PROJECT_TERMS
PROJECT_TITLE
DEPARTMENT
AGENCY
IC_CENTER
PROJECT_NUMBER
PROJECT_START_DATE
PROJECT_END_DATE
CONTACT_PI_PROJECT_LEADER
OTHER_PIS
CONGRESSIONAL_DISTRICT
DUNS_NUMBER
ORGANIZATION_NAME
ORGANIZATION_CITY
ORGANIZATION_STATE
ORGANIZATION_ZIP
ORGANIZATION_COUNTRY
BUDGET_START_DATE
BUDGET_END_DATE
CFDA_CODE
FY


**Aggregate funding amounts for rows with same ABSTRACT/TITLE/FY**

In [15]:
# Checking into duplicates first to examine FY_TOTAL_COST 

dup = df[df.duplicated(subset=['ABSTRACT',  'PROJECT_TITLE', 'FY'], keep=False)]
print(len(dup))

127229


In [16]:
dup.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
4,90000,Amphibian populations around the world are exp...,Amphibia; Central America; Communicable Diseas...,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,NSF,NSF,,815315,10/1/2008,9/30/2011,"ZAMUDIO, KELLY R",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.074,2008,370996.0,
12,89994,"Over the past century, rapid growth of human p...",Address; Affect; Area; base; Behavior; Biodive...,CNH: COLLABORATIVE RESEARCH: INTEGRATED DYNAMI...,NSF,NSF,,814260,10/1/2008,3/31/2012,"PLANTINGA, ANDREW J",,5,53599908,OREGON STATE UNIVERSITY,CORVALLIS,OR,97331-8507,UNITED STATES,,,47.075,2008,144965.0,
13,89993,Amphibian populations around the world are exp...,Amphibia; Central America; Communicable Diseas...,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,NSF,NSF,,815288,10/1/2008,3/31/2009,"LIPS, KAREN R",,12,939007555,SOUTHERN ILLINOIS UNIVERSITY CARBONDALE,CARBONDALE,IL,62901-4709,UNITED STATES,,,47.074,2008,,
18,90014,"Phylogenetic trees, which depict the genealogi...",Area; Biodiversity; Biological; Biology; Commu...,COLLABORATIVE RESEARCH: PHYLOGENETIC TREES FOR...,NSF,NSF,,830012,10/1/2008,9/30/2011,"FERNANDEZ-BACA, DAVID F","EULENSTEIN, OLIVER",4,5309844,IOWA STATE UNIVERSITY,AMES,IA,50011-2207,UNITED STATES,,,47.074,2008,800000.0,
26,89973,This project involves the study of Galois grou...,Algorithms; analog; Area; base; Case Study; de...,ARBOREAL GALOIS REPRESENTATIONS AND APPLICATIO...,NSF,NSF,,852826,8/20/2008,7/31/2012,"JONES, RAPHAEL F",,3,41509506,COLLEGE OF THE HOLY CROSS,WORCESTER,MA,01610-2395,UNITED STATES,,,47.049,2008,84353.0,


In [17]:
# look at specific dupicate group examples

#dup[dup['ABSTRACT'] == dup["ABSTRACT"].iloc[0]]

temp = df[df['PROJECT_TITLE'] == 'REDUCING HOSPITAL READMISSION RATES BY IMPLEMENTING AN INPATIENT TOBACCO CESSATION SERVICE DRIVEN BY INTERACTIVE-VOICE RECOGNITION TECHNOLOGY'] 
temp

# TAKEAWAY - Abstracts are basically the same for this example.  Differs in a junk phrase and the start and minor
# punctuation, capitalization.  If FY had been the same though, these wouldn't have been counted as duplicates.

Unnamed: 0,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
706622,753743,﻿There is good reason to believe that providin...,Abstinence; Acute; Adverse effects; catalyst; ...,REDUCING HOSPITAL READMISSION RATES BY IMPLEME...,HHS,AHRQ,AHRQ,1R21HS023863-01,5/1/2015,4/30/2017,"CARTMELL, KATHLEEN BUFORD",,6,183710748,MEDICAL UNIVERSITY OF SOUTH CAROLINA,CHARLESTON,SC,294035120,UNITED STATES,5/1/2015,4/30/2016,93.226,2015,,
793244,841133,﻿ DESCRIPTION (provided by applicant): There...,,REDUCING HOSPITAL READMISSION RATES BY IMPLEME...,HHS,AHRQ,AHRQ,5R21HS023863-02,5/1/2015,4/30/2018,"CARTMELL, KATHLEEN BUFORD",,6,183710748,MEDICAL UNIVERSITY OF SOUTH CAROLINA,CHARLESTON,SC,294035120,UNITED STATES,5/1/2016,4/30/2018,93.226,2016,,


In [None]:
# OVERALL TAKEAWAY - sum total cost for each group of duplicates

**Aggregate counts for unique ORGANIZATION_NAMEs, PIs, and PROJECT_IDs in rows with same Abstract/Title/FY**

In [18]:
# group df by abstract/title/FY (exact matches)
all_grp = df.groupby(['ABSTRACT', 'PROJECT_TITLE', 'FY'], dropna=False)

In [19]:
# for each unique ABSTRACT/TITLE/FY in df, count unique Organizations, unique PIs, and group size
agg_df = all_grp.agg(ORG_COUNT=('ORGANIZATION_NAME', 'nunique'),
                     PI_COUNT=('CONTACT_PI_PROJECT_LEADER', 'nunique'),
                     FY_TOTAL_COST_SUM=('FY_TOTAL_COST', 'sum'),
                     NUM_RECORDS=('PROJECT_ID', 'count')
                    )

# merge to bring in the aggregated information
merged = df.merge(agg_df, how='left', on=['ABSTRACT', 'PROJECT_TITLE', 'FY'])

In [20]:
agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ORG_COUNT,PI_COUNT,FY_TOTAL_COST_SUM,NUM_RECORDS
ABSTRACT,PROJECT_TITLE,FY,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"! Project Summary Oral health problems (i.e. missing teeth, dental caries, and periodontal diseases) accumulate throughoutthe life span but occur with increasing frequency in later life. These problems arise in the context of increasingnumbers of older adults retaining their natural teeth and can seriously impact the individual's life, causing painas well as impairment of important functions such as speaking, chewing, and swallowing. Individuals withdementia have been shown to have significantly more oral plaque, more severe periodontal disease, morecaries, and fewer teeth than cognitively intact older adults. Our current study shows that even individuals withmild dementia have an increased risk of poor oral health mainly due to brushing their teeth less frequently thancognitively normal older individuals. Maintaining good oral hygiene is a critical step in preventing deteriorationof oral health. But individuals with mild dementia and their informal caregivers, who provide supervision orassistance with other daily activities, often neglect oral hygiene. Few interventions have addressed oral healthproblems among older adults in general and among individuals with dementia in particular, and to ourknowledge, no oral health interventions have been conducted among community-dwelling individuals with milddementia. To address this gap in knowledge we propose an adaptive intervention focused on mild dementiabecause individuals with this level of impairment still retain sufficient cognitive ability to perform oral hygienetasks successfully with minimal assistance. Guided by the Adaptive Leadership Framework, our intervention isdesigned to work with those with mild dementia and caregivers to identify technical and adaptive challenges inoral care and assist them to improve the demented individuals' ability to engage in effective oral care. Overthe past several years, we have conducted preliminary studies that support our proposed intervention. As thenext step toward our long-term goal of conducting an intervention study, we are proposing a one-year planningproject with the following aims: 1) Finalize the behavioral intervention protocol; 2) Establish projectinfrastructure for the future U01 project, and 3) Prepare a U01 application for the intervention. This project willprovide the infrastructure and the intervention materials to facilitate swift implementation of a future U01 project.The aim of the U01 application will be to conduct an innovative caregiver- and technology-assisted oral carebehavior intervention for individuals with mild dementia.. The project, if successful, will create an innovativeapproach to assist individuals with mild dementia and their caregivers in implementing a cooperative oralhygiene care plan to prevent deterioration of oral health, which in turn, will help individuals with dementiamaintain independence and quality of life for an extended period of time.PUBLIC HEALTH RELEVANCE: Oral health worsens in individuals with dementia. The growing number ofindividuals with dementia in the U.S. makes geriatric oral health an increasingly important public health issuethat this study will address. The intervention we propose for this study could be incorporated into clinicalpractice by developing toolkits that dental or non-dental health professionals could use to educate and trainolder individuals with and without cognitive impairment to maintain and improve oral health. !!",DEVELOPING AN ORAL HEALTH INTERVENTION FOR INDIVIDUALS WITH MILD DEMENTIA,2014,1,1,242036.0,1
"! Project Summary: The goal of the CVM Vet-LRN Veterinary Diagnostic Laboratory # Program is to complement, develop, and use university, state and federal veterinary $ diagnostic laboratory resources and expertise in order to increase national laboratory% capacity to detect, respond to and recover from a catastrophic animal food/feed & contamination event, either microbial or chemical. The program's funding would allow' provision for equipment, supplies, and personnel; training in standardized testing ( methodologies; participation in proficiency testing in those methodologies; participation) in method enhancement activities to extend analysis capability; and analysis of * surveillance and emergency outbreak samples. Nationwide standardization of validated!+ laboratory tests, competent laboratory analysts, uniform laboratory quality systems, and!! electronic communication of laboratory data are essential to the FDA's mission to protect!# food safety and security nationwide with the ultimate goal of protecting public health. The!$ Washington Animal Disease Diagnostic Laboratory (WADDL) at Washington State!% University is well positioned to effectively function as a regional laboratory in the CVM!& Vet-LRN Veterinary Diagnostic Laboratory Program. Since 1974 WADDL has provided!' full laboratory diagnostic services, including pathology, microbiology and toxicology, to!( veterinarians and animal owners throughout the Pacific Northwest. WADDL actively!) participates in multiple national laboratory networks (USDA-NAHLN, CDC-LRN) and is!* well practiced in standardized methods, equipment, proficiency testing, and electronic#+ reporting. Furthermore, WADDL has a mature laboratory quality system based upon#! international ISO 17025 laboratory standards, being fully accredited by the American## Association of Veterinary Laboratory Diagnosticians laboratory accreditation program.#$ Finally, as a Select Agent approved laboratory, WADDL has didicated BSL-3 laboratory#% space with personnel highly trained in maximal biosecurity procedures. Participation by#& WSU-WADDL in the Vet-LRN Veterinary Diagnostic Laboratory Program would provide#' FDA-CVM with increased laboratory capacity and capability in the Pacific Northwest#( region of the USA in the event of animal food or drug related illnesses or other large#) scale animal food/feed emergency events by leveraging current high quality, state#* funded laboratory resources and expertise.",FDA VET-LRN VETERINARY DIAGNOSTIC LABORATORY PROGRAM,2012,1,1,16500.0,1
"! Project Summary: The goal of the CVM Vet-LRN Veterinary Diagnostic Laboratory # Program is to complement, develop, and use university, state and federal veterinary $ diagnostic laboratory resources and expertise in order to increase national laboratory% capacity to detect, respond to and recover from a catastrophic animal food/feed & contamination event, either microbial or chemical. The program's funding would allow' provision for equipment, supplies, and personnel; training in standardized testing ( methodologies; participation in proficiency testing in those methodologies; participation) in method enhancement activities to extend analysis capability; and analysis of * surveillance and emergency outbreak samples. Nationwide standardization of validated!+ laboratory tests, competent laboratory analysts, uniform laboratory quality systems, and!! electronic communication of laboratory data are essential to the FDA's mission to protect!# food safety and security nationwide with the ultimate goal of protecting public health. The!$ Washington Animal Disease Diagnostic Laboratory (WADDL) at Washington State!% University is well positioned to effectively function as a regional laboratory in the CVM!& Vet-LRN Veterinary Diagnostic Laboratory Program. Since 1974 WADDL has provided!' full laboratory diagnostic services, including pathology, microbiology and toxicology, to!( veterinarians and animal owners throughout the Pacific Northwest. WADDL actively!) participates in multiple national laboratory networks (USDA-NAHLN, CDC-LRN) and is!* well practiced in standardized methods, equipment, proficiency testing, and electronic#+ reporting. Furthermore, WADDL has a mature laboratory quality system based upon#! international ISO 17025 laboratory standards, being fully accredited by the American## Association of Veterinary Laboratory Diagnosticians laboratory accreditation program.#$ Finally, as a Select Agent approved laboratory, WADDL has didicated BSL-3 laboratory#% space with personnel highly trained in maximal biosecurity procedures. Participation by#& WSU-WADDL in the Vet-LRN Veterinary Diagnostic Laboratory Program would provide#' FDA-CVM with increased laboratory capacity and capability in the Pacific Northwest#( region of the USA in the event of animal food or drug related illnesses or other large#) scale animal food/feed emergency events by leveraging current high quality, state#* funded laboratory resources and expertise.",FDA VET-LRN VETERINARY DIAGNOSTIC LABORATORY PROGRAM,2013,1,1,16500.0,1
"! Project Summary: The goal of the CVM Vet-LRN Veterinary Diagnostic Laboratory # Program is to complement, develop, and use university, state and federal veterinary $ diagnostic laboratory resources and expertise in order to increase national laboratory% capacity to detect, respond to and recover from a catastrophic animal food/feed & contamination event, either microbial or chemical. The program's funding would allow' provision for equipment, supplies, and personnel; training in standardized testing ( methodologies; participation in proficiency testing in those methodologies; participation) in method enhancement activities to extend analysis capability; and analysis of * surveillance and emergency outbreak samples. Nationwide standardization of validated!+ laboratory tests, competent laboratory analysts, uniform laboratory quality systems, and!! electronic communication of laboratory data are essential to the FDA's mission to protect!# food safety and security nationwide with the ultimate goal of protecting public health. The!$ Washington Animal Disease Diagnostic Laboratory (WADDL) at Washington State!% University is well positioned to effectively function as a regional laboratory in the CVM!& Vet-LRN Veterinary Diagnostic Laboratory Program. Since 1974 WADDL has provided!' full laboratory diagnostic services, including pathology, microbiology and toxicology, to!( veterinarians and animal owners throughout the Pacific Northwest. WADDL actively!) participates in multiple national laboratory networks (USDA-NAHLN, CDC-LRN) and is!* well practiced in standardized methods, equipment, proficiency testing, and electronic#+ reporting. Furthermore, WADDL has a mature laboratory quality system based upon#! international ISO 17025 laboratory standards, being fully accredited by the American## Association of Veterinary Laboratory Diagnosticians laboratory accreditation program.#$ Finally, as a Select Agent approved laboratory, WADDL has didicated BSL-3 laboratory#% space with personnel highly trained in maximal biosecurity procedures. Participation by#& WSU-WADDL in the Vet-LRN Veterinary Diagnostic Laboratory Program would provide#' FDA-CVM with increased laboratory capacity and capability in the Pacific Northwest#( region of the USA in the event of animal food or drug related illnesses or other large#) scale animal food/feed emergency events by leveraging current high quality, state#* funded laboratory resources and expertise.",FDA VET-LRN VETERINARY DIAGNOSTIC LABORATORY PROGRAM,2014,1,1,16500.0,1
"! Project Summary: The goal of the CVM Vet-LRN Veterinary Diagnostic Laboratory # Program is to complement, develop, and use university, state and federal veterinary $ diagnostic laboratory resources and expertise in order to increase national laboratory% capacity to detect, respond to and recover from a catastrophic animal food/feed & contamination event, either microbial or chemical. The program's funding would allow' provision for equipment, supplies, and personnel; training in standardized testing ( methodologies; participation in proficiency testing in those methodologies; participation) in method enhancement activities to extend analysis capability; and analysis of * surveillance and emergency outbreak samples. Nationwide standardization of validated!+ laboratory tests, competent laboratory analysts, uniform laboratory quality systems, and!! electronic communication of laboratory data are essential to the FDA's mission to protect!# food safety and security nationwide with the ultimate goal of protecting public health. The!$ Washington Animal Disease Diagnostic Laboratory (WADDL) at Washington State!% University is well positioned to effectively function as a regional laboratory in the CVM!& Vet-LRN Veterinary Diagnostic Laboratory Program. Since 1974 WADDL has provided!' full laboratory diagnostic services, including pathology, microbiology and toxicology, to!( veterinarians and animal owners throughout the Pacific Northwest. WADDL actively!) participates in multiple national laboratory networks (USDA-NAHLN, CDC-LRN) and is!* well practiced in standardized methods, equipment, proficiency testing, and electronic#+ reporting. Furthermore, WADDL has a mature laboratory quality system based upon#! international ISO 17025 laboratory standards, being fully accredited by the American## Association of Veterinary Laboratory Diagnosticians laboratory accreditation program.#$ Finally, as a Select Agent approved laboratory, WADDL has didicated BSL-3 laboratory#% space with personnel highly trained in maximal biosecurity procedures. Participation by#& WSU-WADDL in the Vet-LRN Veterinary Diagnostic Laboratory Program would provide#' FDA-CVM with increased laboratory capacity and capability in the Pacific Northwest#( region of the USA in the event of animal food or drug related illnesses or other large#) scale animal food/feed emergency events by leveraging current high quality, state#* funded laboratory resources and expertise.",FDA VET-LRN VETERINARY DIAGNOSTIC LABORATORY PROGRAM,2015,1,1,21500.0,2


In [21]:
agg_df['NUM_RECORDS'].sum()

1220119

In [22]:
len(df)

1220119

In [23]:
merged.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,FY_TOTAL_COST_SUM,NUM_RECORDS
0,89996,"This is a project to explore Game-based, Metap...",Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,1999467.0,1
1,89997,Institution: Franklin Institute Science Museum...,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,1799699.0,1
2,89998,Through programs (including small group conver...,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,1505858.0,1
3,89999,In partnership with the American Chemical Soci...,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,51000.0,1
4,90000,Amphibian populations around the world are exp...,Amphibia; Central America; Communicable Diseas...,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,NSF,NSF,,815315,10/1/2008,9/30/2011,"ZAMUDIO, KELLY R",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.074,2008,370996.0,,2,2,370996.0,2


### Deduplication

In [24]:
print(sum(merged['ABSTRACT'].isnull()))
print(sum(merged['PROJECT_TITLE'].isnull()))
print(sum(merged['FY'].isnull()))

0
1
0


In [25]:
# save NON-duplicated rows and the LAST occurrance of duplicated rows
dedup = merged[~merged.duplicated(subset=['ABSTRACT',  'PROJECT_TITLE', 'FY'], keep='last')]

In [26]:
print(f"Original: {len(df)}")
print(f"Deduplicated: {len(dedup)}")
print(f"Number of Duplicates: {len(df) - len(dedup)}")

Original: 1220119
Deduplicated: 1148217
Number of Duplicates: 71902


In [27]:
dedup['NUM_RECORDS'].sum()

1220119

In [28]:
#save final (deduplicated) dataframe as "df" to fit downstream code
df = dedup

### Dataset setup

##### Helper function used throughout

In [29]:
# drops strings with length 0 - indices printed out, find with df.loc[ix]

def drop_empties(df, col):
    
    l1 = len(df)
    ix = df[df[col].apply(len)==0].index
    print(ix)
    df.drop(ix,axis=0,inplace=True)
    l2 = len(df)
    
    print(f"dropped {l1-l2}")
    
    return df

In [30]:
# strip leading and trailing whitespace, save in a working abstract column that will be updated as text is cleaned
# strip() already ran above

# Note: we cannot lower case abstracts up front - capitalization is needed to find POS in preprocessing
 
#df = df.assign(working_abstract = [abstract.strip() for abstract in df["ABSTRACT"]])
df["working_abstract"] = df["ABSTRACT"]
df = drop_empties(df, "working_abstract")

wa = 'working_abstract'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["working_abstract"] = df["ABSTRACT"]


Int64Index([], dtype='int64')
dropped 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [31]:
def remove_short_abstracts(df, limit):

    # Remove abstracts with length < limit. 150 seems like a good cutoff, but it does lose some useful information.
    
    # what do we want to do for the cutoff?  -- SOLUTION UNTIL FURTHER EXPLORATION -> keep the same
    
    df['nchar']=df['working_abstract'].apply(len)
    l1 = len(df)
    df=df.loc[df['nchar']>=limit]
    l2 = len(df)
    
    print(l1-l2, "short abstracts removed")
    
    return df

df = remove_short_abstracts(df,limit=150)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['nchar']=df['working_abstract'].apply(len)


4308 short abstracts removed


In [32]:
len(df)

1143909

### Cleaning Strategy:
1. Remove abstracts with all non-alphanumeric characters.
2. Remove non-alphanumeric characters from the start and end of abstracts
3. Remove other non-readable abstracts. (REMOVAL ABSTRACTS FOUND BY INSPECTION)
4. Remove "junk" starting strings and ending strings
5. Remove "junk" strings in the middle 


##### Helper functions for cleaning

In [33]:
def strip_nonalnum(word):
    
    # function strips non-alphanumeric characters from the beginning and end of a string
    # adapted from: https://stackoverflow.com/questions/22650506/how-to-rermove-non-alphanumeric-characters-at-the-beginning-or-end-of-a-string
        
    if not word:
        return word  # nothing to strip
    if (len(word) == 1) and (not word[0].isalnum()):  
        return ""
    for start, c in enumerate(word):
        if c.isalnum():
            break
    for end, c in enumerate(word[::-1]):
        if c.isalnum():
            break
                  
    return word[start:len(word) - end]


In [34]:
def remove_phrase(x, phrase,loc='Start'):
    
    # returns x with phrase removed. location can be "Start" of string, "End" of string, or 
    # "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instance
    # CASE info - this function assumes phrase is lower case, but that x is not.
    
    assert loc in ['Start','End']
    
    if loc=='End':
        if x.lower().endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.lower().startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    else:
        return 'Error'
    

In [35]:
# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([96789, 668610, 756384], dtype='int64')
dropped 3


In [36]:
# remove abstracts that are not readable: THIS NEEDS TO BE UPDATED BY HAND FOR EVERY NEW DATASET
# For example: index = 883794: ¢ £/¥ ƒ § ¤ ƒ “ ƒ « ...

df = df.assign(Start_Char = df['working_abstract'].apply(lambda x: x[0]))
ix = df[df['Start_Char'] == 'ƒ'].index
print(ix)

Int64Index([883794, 993838], dtype='int64')


In [37]:
df.drop(index = ix, inplace = True)

In [38]:
# "junk" phrases at start to remove

start_phrases=['abstract', 'summary', 'proposal', 'description', 'narrative', 
               'technical abstract',
               'non technical abstract', 
               'non- technical abstract',
               'non-technical abstract',                      
               'nontechnical abstract',
               'technical summary', 
               'nontechnical summary',
               'non-technical summary',
               'non-technical description',
               'description (provided by the applicant)',
               'description (provided by investigator)',  
               'description (provided by applicant)',
               'project summary/abstract',
               'proposal abstract',
               'research abstract',
               'project summary',
               'research summary',
               'project description'
               'see instructions):',
               'for center application (provided by the investigator):',
               'objective(s)',      
               'exceed the space provided',
               'provided by applicant',
               'provided by candidate']


In [39]:
#Remove found start phrases

for phrase in start_phrases:
    temp = df[wa].apply(remove_phrase,args=[phrase,'Start'])
    df = df.assign(working_abstract = temp) 

# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([], dtype='int64')
dropped 0


In [40]:
#Repeated start phrase removal in case the order of project summary/abstract varies

for phrase in start_phrases:
    temp = df[wa].apply(remove_phrase,args=[phrase,'Start'])
    df = df.assign(working_abstract = temp)  
    
df = drop_empties(df, "working_abstract")

Int64Index([], dtype='int64')
dropped 0


In [41]:
#starting_exact_phrases to remove

#'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
#'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'

temp = df[wa].apply(lambda x: x.replace('This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.',
                                       ''))
df = df.assign(working_abstract = temp) 

expression=re.compile('This subproject is one of many research subprojects.*not necessarily the institution for the investigator.')
temp = df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

expression=re.compile('This subproject is one of many research subprojects.*to the subproject or subproject staff.')
temp = df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

In [42]:
# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([], dtype='int64')
dropped 0


In [43]:
end_phrases = ['(end of abstract',
               'end of abstract', 
               '(abstract end',  
               '(end of abstract',
               '(end 0f abstract',
               '(end of absract',
               '(abstract below',
               '(end of reviewers\' comment',
               '(end abstract',
               'performance site ========================================section end',
               'key personnel ========================================section end',
               '[summary truncated at 7800 characters', 
               'this award reflects nsf\'s statutory mission and has been deemed worthy of support through evaluation using the foundation\'s intellectual merit and broader impacts review criteria',
               'project description page 6', 'page 1 of 1', 'project summary/abstract page 6',
               'project description page 7', 'project summary/abstract page 7', 'pag 1 o 1', 
               'page 2 number pages consecutively at the bottom throughout form page 2']


In [44]:
# end phrase removal

for phrase in end_phrases:
    temp = df[wa].apply(remove_phrase,args=[phrase,'End'])
    df = df.assign(working_abstract = temp) 

# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([], dtype='int64')
dropped 0


In [45]:
# "junk" removal within text body - not necessarily at the start or end

# 'Enter the text here that' ending with 'lines of text.'
expression=re.compile('Enter the text here that.*lines of text')
temp=df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

expression=re.compile('PHS .*?Continuation Format Page')
temp=df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

expression=re.compile('OMB No .*?Continuation Format Page')
temp=df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

temp=df[wa].replace('Project Summary/Abstract','')
df = df.assign(working_abstract = temp)

In [46]:
"""If it starts with 'one page and must contain',
This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to 
thepublic. It should be a self-contained description of the project and should contain a statement of objectives 
and methods to be employed. It should be informative to other persons working in the same or related fields and 
insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any 
proprietary/confidential information.* Please click the add attachment button to complete this entry." plus some 
attachments, which includes tracking number, twice: following the second trackign number, there is a grant number
followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, 
all of which can be discarded
"""

expression1=re.compile('one page and must.*?Tracking Number.*?(Tracking Number)')
expression2=re.compile('Project Narrative File.*')

def fix_abstract(abstract):
    if abstract.startswith('one page and must contain'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract

temp=df[wa].apply(fix_abstract)
df = df.assign(working_abstract = temp)

In [47]:
# removal of phrase at end

expression=re.compile('Project Summary/Abstract Page.*')

def remove_contact_pd(x):
    
    """removes clause at end that tends to occur: eg Project Summary/Abstract Page 222Contact PD/PI: Sampson, HughNarrative ("""
    
    if x.startswith('Contact PD/PI'):
        return re.sub(expression,'',x)
    else:
        return x
    
temp=df[wa].apply(remove_contact_pd) 
df = df.assign(working_abstract = temp)

In [48]:
# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([], dtype='int64')
dropped 0


In [49]:
len(df)

1143904

In [50]:
df.to_pickle("../../../data/prd/Paper/FR_clean_22DEC21.pkl")
