# Cleaning/Processing our dataset - part 1  

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string
import time

pd.set_option('display.max_columns', 30)

### Read in Raw Dataset

In [2]:
# reading in raw data

# set data types
cols = ['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 
        'PROJECT_NUMBER', 'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 
        'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 
        'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 
        'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS']
dtypes = {col: 'str' for col in cols}
dtypes["FY_TOTAL_COST"] = 'float'
dtypes["FY_TOTAL_COST_SUB_PROJECTS"] = 'float'

df = pd.read_csv('../../data/original/working_federal_reporter_2020.csv', dtype = dtypes, engine='python')
print(df.shape)

(1156137, 26)


In [3]:
df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,
1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,
2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,
3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,
4,90000,Amphibian populations around the world are exp...,2008,Amphibia; Central America; Communicable Diseas...,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,NSF,NSF,,815315,10/1/2008,9/30/2011,"ZAMUDIO, KELLY R",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.074,2008,370996.0,


### Remove null abstracts, fill in missing data, and deal with duplicate abstracts

In [4]:
#remove rows with NULL abstracts

l1 = len(df)
df = df[~df.ABSTRACT.isnull()]
l2 = len(df)

print(l1-l2, "null ABSTRACTs removed")

42380 null ABSTRACTs removed


In [5]:
# drop abstracts with values of "ABSTRACT NOT PROVIDED" and "No abstract provided "

l1 = len(df)
df = df[df.ABSTRACT != 'ABSTRACT NOT PROVIDED']
df = df[df.ABSTRACT != 'No abstract provided']
l2 = len(df)

print(l1-l2, "ABSTRACT NOT PROVIDED and \"No abstract provided\" removed")

1324 ABSTRACT NOT PROVIDED and "No abstract provided" removed


In [6]:
# FY.x is the reliable fiscal year information so we rename this column to FY

df = df.rename(columns={'FY.x': 'FY'})

**Since we are tracking emerging abstract trends**, we will fill in missing information for:  
- PROJECT_START_DATE
- PROJECT_END_DATE.

In [7]:
# Fill missing values in Project Start/End Date with Budget Start/End Date
df['PROJECT_START_DATE'] = df['PROJECT_START_DATE'].fillna(df['BUDGET_START_DATE'])
df['PROJECT_END_DATE'] = df['PROJECT_END_DATE'].fillna(df['BUDGET_END_DATE'])

#df.isnull().sum()

#If START date is still missing, fill both start and end date with FY
df['PROJECT_START_DATE'] = df['PROJECT_START_DATE'].fillna(df['FY'])
df['PROJECT_END_DATE'] = df['PROJECT_END_DATE'].fillna(df['FY'])

#df.isnull().sum()

**Aggregate counts for unique ORGANIZATION_NAMEs in rows with duplicated Abstract/Title/Project_Start_Date**

In [8]:
# group df by abstract/title/start_date (exact matches)
all_grp = df.groupby(['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'])

# for each unique ABSTRACT/TITLE/START_DATE in df, count unique Organizations
unique_all = all_grp.agg({'ORGANIZATION_NAME' : 'nunique'}) 

#rename column as "count" to be different than original column
unique_all = unique_all.rename(columns={'ORGANIZATION_NAME': 'ORG_COUNT'})

# merge df with "unique_all" to bring in the unique Organization counts for each "duplicate" group
merged1 = df.merge(unique_all, left_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'], 
                   right_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'])

**Aggregate counts for unique PIs in rows with duplicated Abstract/Title/Project_Start_Date**

In [9]:
# for each unique ABSTRACT/TITLE/START_DATE in whole df, count unique PIs
unique_pi = all_grp.agg({'CONTACT_PI_PROJECT_LEADER' : 'nunique'}) 

#rename column as "count" to be different than original column
unique_pi = unique_pi.rename(columns={'CONTACT_PI_PROJECT_LEADER': 'PI_COUNT'})

# merge data frame with "unique_all" to bring in the unique PI counts for each "duplicate" group
merged2 = merged1.merge(unique_pi, left_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'], 
                        right_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'])

In [10]:
# Sort merged data so that duplicated rows occur in order of earliest to latest END date
merged = merged2.sort_values(['PROJECT_END_DATE'])

In [11]:
# save NON-duplicated rows and the LAST occurrance of duplicated rows
dedup = merged[~merged.duplicated(subset=['ABSTRACT',  'PROJECT_TITLE', 'PROJECT_START_DATE'], keep='last')]

In [12]:
#save final (deduplicated) dataframe as "df" to fit downstream code
df = dedup

In [13]:
len(df)

698600

### Dataset setup

##### Helper function used throughout

In [14]:
def drop_empties(df, col):
    
    l1 = len(df)
    ix = df[df[col].apply(len)==0].index
    print(ix)
    df.drop(ix,axis=0,inplace=True)
    l2 = len(df)
    
    print(f"dropped {l1-l2}")
    
    return df

In [15]:
# strip leading and trailing whitespace, save in a working abstract column that will be updated as text is cleaned

# Note: we cannot lower case abstracts up front - capitalization is needed to find POS in preprocessing
 
df = df.assign(working_abstract = [abstract.strip() for abstract in df["ABSTRACT"]])
df = drop_empties(df, "working_abstract")

wa = 'working_abstract'

Int64Index([], dtype='int64')
dropped 0


In [16]:
df = df.assign(Start_Char = df['working_abstract'].apply(lambda x: x[0]))

In [17]:
def remove_short_abstracts(df, limit):

    # Remove abstracts with length < limit. 150 seems like a good cutoff, but it does lose some useful information.
    
    # what do we want to do for the cutoff?  -- SOLUTION UNTIL FURTHER EXPLORATION -> keep the same
    
    df['nchar']=df['working_abstract'].apply(len)
    l1 = len(df)
    df=df.loc[df['nchar']>=limit]
    l2 = len(df)
    
    print(l1-l2, "short abstracts removed")
    
    return df

df = remove_short_abstracts(df,limit=150)

2414 short abstracts removed


### Cleaning Strategy:
1. Remove abstracts with all non-alphanumeric characters.
2. Remove non-alphanumeric characters from the start and end of abstracts
3. Remove other non-readable abstracts. (REMOVAL ABSTRACTS FOUND BY INSPECTION)
4. Remove "junk" starting strings and ending strings
5. Remove "junk" strings in the middle 
6. Remove title and organization name from abstracts


##### Helper functions for cleaning

In [18]:
def strip_nonalnum(word):
    
    # function strips non-alphanumeric characters from the beginning and end of a string
    # adapted from: https://stackoverflow.com/questions/22650506/how-to-rermove-non-alphanumeric-characters-at-the-beginning-or-end-of-a-string
        
    if not word:
        return word  # nothing to strip
    if (len(word) == 1) and (not word[0].isalnum()):  
        return ""
    for start, c in enumerate(word):
        if c.isalnum():
            break
    for end, c in enumerate(word[::-1]):
        if c.isalnum():
            break
                  
    return word[start:len(word) - end]


In [19]:
def remove_phrase(x, phrase,loc='Start'):
    
    # returns x with phrase removed. location can be "Start" of string, "End" of string, or 
    # "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instance
    # CASE info - this function assumes phrase is lower case, but that x is not.
    
    assert loc in ['Start','End']
    
    if loc=='End':
        if x.lower().endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.lower().startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    else:
        return 'Error'
    

In [20]:
# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([179023, 738276, 835501], dtype='int64')
dropped 3


In [21]:
# remove abstracts that are not readable: THIS NEEDS TO BE UPDATED BY HAND FOR EVERY NEW DATASET
# For example: index = 490684: ¢ £/¥ ƒ § ¤ ƒ “ ƒ « ...

df = df.assign(Start_Char = df['working_abstract'].apply(lambda x: x[0]))
ix = df[df['Start_Char'] == 'ƒ'].index
df.drop(index = ix, inplace = True)

In [22]:
# "junk" phrases at start to remove

start_phrases=['abstract', 'summary', 'proposal', 'description', 'narrative', 
               'technical abstract',
               'non technical abstract', 
               'non- technical abstract',
               'non-technical abstract',                      
               'nontechnical abstract',
               'technical summary', 
               'nontechnical summary',
               'non-technical summary',
               'non-technical description',
               'description (provided by the applicant)',
               'description (provided by investigator)',  
               'description (provided by applicant)',
               'project summary/abstract',
               'proposal abstract',
               'research abstract',
               'project summary',
               'research summary',
               'project description'
               'see instructions):',
               'for center application (provided by the investigator):',
               'objective(s)',      
               'exceed the space provided',
               'provided by applicant',
               'provided by candidate']


In [23]:
#Remove found start phrases

for phrase in start_phrases:
    temp = df[wa].apply(remove_phrase,args=[phrase,'Start'])
    df = df.assign(working_abstract = temp) 

# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([], dtype='int64')
dropped 0


In [24]:
#Repeated start phrase removal in case the order of project summary/abstract varies

for phrase in start_phrases:
    temp = df[wa].apply(remove_phrase,args=[phrase,'Start'])
    df = df.assign(working_abstract = temp)  


In [25]:
#starting_exact_phrases to remove

#'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
#'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'

temp = df[wa].apply(lambda x: x.replace('This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.',
                                       ''))
df = df.assign(working_abstract = temp) 

expression=re.compile('This subproject is one of many research subprojects.*not necessarily the institution for the investigator.')
temp = df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

expression=re.compile('This subproject is one of many research subprojects.*to the subproject or subproject staff.')
temp = df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

In [26]:
# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

# update Start_Char column in df
df = df.assign(Start_Char = df['working_abstract'].apply(lambda x: x[0]))

Int64Index([], dtype='int64')
dropped 0


In [27]:
end_phrases = ['(end of abstract',
               'end of abstract', 
               '(abstract end',  
               '(end of abstract',
               '(end 0f abstract',
               '(end of absract',
               '(abstract below',
               '(end of reviewers\' comment',
               '(end abstract',
               'performance site ========================================section end',
               'key personnel ========================================section end',
               '[summary truncated at 7800 characters', 
               'this award reflects nsf\'s statutory mission and has been deemed worthy of support through evaluation using the foundation\'s intellectual merit and broader impacts review criteria',
               'project description page 6', 'page 1 of 1', 'project summary/abstract page 6',
               'project description page 7', 'project summary/abstract page 7', 'pag 1 o 1', 
               'page 2 number pages consecutively at the bottom throughout form page 2']


In [28]:
# end phrase removal

for phrase in end_phrases:
    temp = df[wa].apply(remove_phrase,args=[phrase,'End'])
    df = df.assign(working_abstract = temp) 

# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")
    
# update Last Char column in df
df = df.assign(LAST_CHAR = df['working_abstract'].apply(lambda x: x[0]))

Int64Index([], dtype='int64')
dropped 0


In [29]:
# "junk" removal within text body - not necessarily at the start or end

# 'Enter the text here that' ending with 'lines of text.'
expression=re.compile('Enter the text here that.*lines of text')
temp=df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

expression=re.compile('PHS .*?Continuation Format Page')
temp=df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

expression=re.compile('OMB No .*?Continuation Format Page')
temp=df[wa].apply(lambda x: re.sub(expression,'',x))
df = df.assign(working_abstract = temp)

temp=df[wa].replace('Project Summary/Abstract','')
df = df.assign(working_abstract = temp)

In [30]:
"""If it starts with 'one page and must contain',
This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to 
thepublic. It should be a self-contained description of the project and should contain a statement of objectives 
and methods to be employed. It should be informative to other persons working in the same or related fields and 
insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any 
proprietary/confidential information.* Please click the add attachment button to complete this entry." plus some 
attachments, which includes tracking number, twice: following the second trackign number, there is a grant number
followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, 
all of which can be discarded
"""

expression1=re.compile('one page and must.*?Tracking Number.*?(Tracking Number)')
expression2=re.compile('Project Narrative File.*')

def fix_abstract(abstract):
    if abstract.startswith('one page and must contain'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract

temp=df[wa].apply(fix_abstract)
df = df.assign(working_abstract = temp)

In [31]:
# removal of phrase at end or beginning?

expression=re.compile('Project Summary/Abstract Page.*')

def remove_contact_pd(x):
    
    """removes clause at end that tends to occur: eg Project Summary/Abstract Page 222Contact PD/PI: Sampson, HughNarrative ("""
    
    if x.startswith('Contact PD/PI'):
        return re.sub(expression,'',x)
    else:
        return x
    
temp=df[wa].apply(remove_contact_pd) 
df = df.assign(working_abstract = temp)

In [32]:
# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

Int64Index([], dtype='int64')
dropped 0


In [33]:
def remove_title_org(record):
    
    # This function removes project titles and organization names from abstracts
    
    """ ignores case to remove multi-word phrases in a particular order, especially those likely to run into other words,
    e.g. Institution university of washingtonPI mary williams. This doesn't work when titles or insititutions have escape characters in them, which is a bummer
    see for example ENHANCING THE USE OF NASA EARTH SCIENCE RESULTS / DATA / AND TECHNOLOGY BY ENGAGING THE FEDERATION OF EARTH SCIENCE INFORMATION PARTNERS COMMUNITIES OF
    PRACTICE IN TARGET AREAS OF INTEREST TO NASA THE FEDERATION OF EARTH SCIENCE INFORMATION PARTNERS (''FED"""
    
    title=record['PROJECT_TITLE']
    
    try:
        new_abstract=re.sub(title,'',record[wa],flags=re.IGNORECASE)      
        return re.sub(record['ORGANIZATION_NAME'],'',new_abstract,flags=re.IGNORECASE)   
    except:
        try:
            return re.sub(record['ORGANIZATION_NAME'],'',record[wa],flags=re.IGNORECASE)   
        except:
            return record[wa]
        
        
temp=df.apply(lambda x: remove_title_org(x),axis=1)
df = df.assign(working_abstract = temp)

In [34]:
# strip non-alphanum characters from the beginning and end of each abstract

temp = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

df = df.assign(Start_Char = df[wa].apply(lambda x:x[0]))
df = df.assign(LAST_CHAR = df[wa].apply(lambda x:x[-1]))
df = df.assign(nchar = df[wa].apply(lambda x: len(x)))

Int64Index([ 12392, 463597, 998105, 828642, 303523,  12493,  11956,  13988,
             11786, 503566,
            ...
            910560, 183650, 386735, 719763, 920987, 301488, 199090, 292405,
            909636, 919062],
           dtype='int64', length=5327)
dropped 5327


In [35]:
len(df)

690855

In [36]:
df.to_pickle("../../data/working/clean_data_7-20.pkl")
