In [164]:
#Set up Environment

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%pylab inline
from string import ascii_letters
import sys
import re
#Preprocessing
import nltk
import os
import string

Populating the interactive namespace from numpy and matplotlib


In [165]:
#Visualize data
raw_df=pd.read_csv('/sfs/qumulo/qhome/sc2pg/src/prnd/publicrd/data/prd/DigitalOcean_Backup/public_rd/working/federal_reporter/abstracts_federal_reporter_combined.csv',engine='python')

print('Columns')
print(raw_df.columns)
print('Descriptive Stats')
print(raw_df.describe())
print('Length')
print(len(raw_df))

Columns
Index(['PROJECT_ID', 'ABSTRACT', 'FY', 'FIRST_CHAR', 'LAST_CHAR', 'DEPARTMENT',
       'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER', 'PROJECT_TITLE',
       'PROJECT_TERMS', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS',
       'ORGANIZATION_NAME', 'CFDA_CODE', 'FY_TOTAL_COST'],
      dtype='object')
Descriptive Stats
         PROJECT_ID             FY  FY_TOTAL_COST
count  5.500880e+05  550088.000000   4.256850e+05
mean   4.980384e+05    2012.247477   4.509841e+05
std    3.262050e+05       3.183765   1.727112e+06
min    1.008600e+04    2008.000000   1.000000e+00
25%    1.790538e+05    2009.000000   1.390020e+05
50%    4.880660e+05    2012.000000   2.917820e+05
75%    7.817102e+05    2015.000000   4.500000e+05
max    1.101940e+06    2018.000000   3.227983e+08
Length
550088


In [166]:
##############
#Remove nulls and duplicates
#Currently removes only duplicates based on ABSTRACTS and only in the same YEAR
#The rationale here is that we may do year-by-year modelling and don't want to exclude projects
#But if we do all-in-one modelling (e.g. across all years), we will want to reconsider
#Also will want to do additional duplicate check once abstracts are cleaned
###############

#Drop projects with identical abstracts and year. Different year could indicate additional funding sent to this project.
df=raw_df.loc[pd.notnull(raw_df['ABSTRACT'])]
df.drop_duplicates(subset=['ABSTRACT','FY'],inplace=True) 
print('Length'+str(len(df)))

####################
#Check for additional duplicates
#Note that the project id isnt necessarily identical for each transaction on same grant--e.g. one number could be added, so this isnt that strict
#and why checking astract is needed
#####################
print('Project ID duplicates:')
vc=df['PROJECT_ID'].value_counts()
print(vc[vc>1])

Length550074
Project ID duplicates:
Series([], Name: PROJECT_ID, dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [167]:
################
#Function for removing any text we don't like at start, end, or anywhere within a string
#CASE SENSITIVE
################

def remove_phrase(x, phrase,loc='Start'):
    """returns x with phrase removed. location can be "Start" of string, "End" of string, or "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instane"""
    assert loc in ['Start','End','Anywhere_All','Anywhere_First']
    if loc=='End':
        if x.endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    elif loc=='Anywhere_All':
        return x.replace(phrase,'')
    elif loc=='Anywhere_First':
        return x.replace(phrase,'',1)
    else:
        return 'Error'
    


In [168]:
#Define a new series which is an abstract that keeps the raw text, but can be continuously manipulated.
wa='working_abstract'
df[wa]=df['ABSTRACT'].apply(str.strip)

#An illustrative case that Fedearl Reporter is NOT all R&D and why semantic content, rather than simple tagging, is better
#print(df.loc[df['ABSTRACT']== 'Not R&D, do not report',[wa,'PROJECT_TERMS']])
#print(df.loc[df['ABSTRACT']== 'Technical support services contract, not R&D',[wa,'PROJECT_TERMS']])


#Remove too short abstracts
df['nchar']=df[wa].apply(len)
limit=150 #Less than 150 chars is not an abstract
#[x for x in df.loc[df['nchar']<=limit,'ABSTRACT'].head(40)]
#This seems like a good cutoff, but it does lose some useful information.
df=df.loc[df['nchar']>=limit]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [169]:
####################
#2.1--phrases noticed through investigation of starting characters, as well as those identified in R Analysis
######################

start_phrases=['****TECHNICAL ABSTRACT****','****Technical Abstract****',
               '****Non Technical Abstract****','*** Non- Technical Abstract ***','**Non-Technical Abstract**',
               '*****NON-TECHNICAL ABSTRACT*****','***** NON-TECHNICAL ABSTRACT *****',
               '****NONTECHNICAL ABSTRACT****','****Non-Technical Abstract****','*Non-technical Abstract*',
               '*****NON-TECHNICAL ABSTRACT*****','****NON-TECHNICAL ABSTRACT****',
               '***NON-TECHNICAL ABSTRACT***','****Nontechnical abstract****',
               'TECHNICAL SUMMARY', 'NONTECHNICAL SUMMARY','NON-TECHNICAL SUMMARY','Non-technical description',
               'DESCRIPTION (Provided by the applicant)','DESCRIPTION (provided by investigator)',  'DESCRIPTION (provided by applicant)',
               'Project Summary/Abstract','PROJECT SUMMARY/ABSTRACT',
               'ABSTRACT','abstract','Proposal Abstract','Abstract','RESEARCH ABSTRACT',
               'PROJECT SUMMARY','Project Summary','SUMMARY','RESEARCH SUMMARY',
               'Proposal',
               'DESCRIPTION','Description','PROJECT DESCRIPTION'
               'NARRATIVE',
               '(See instructions):','\t',
              'FOR CENTER APPLICATION (provided by the investigator):','Objective(s)',      'EXCEED THE SPACE PROVIDED',
               'Provided by Applicant','Provided by applicant','provided by applicant','PROVIDED BY APPLICANT',
               'Provided by Candidate','Provided by candidate','provided by candidate','PROVIDED BY CANDIDATE']

df[wa]=df[wa].apply(str.lstrip,args=[' ?-_^. :,!;¿|()[]]#%>﻿&\''])
df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)
#Remove found phrases
for phrase in start_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']).apply(str.lstrip,args=[' :./)'])
#Repeated in case the order of project summary/abstract varies
for phrase in start_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']).apply(str.lstrip,args=[' :./)'])

    


//Examining Titles for Duplication (Instead of Project IDS)

In [170]:
##################
#Exploratory Data Analaysis at different levels to look for patterns to remove
#################



###########
#Finding repeats other than by project ID: using titles
#Title is not a good indicator of whether a project is repeated--many grants all have same title but very different abstracts and PIs
#You can run this code to see very common titles used by more than one PI. Don't rely on titles
########
"""vc=df['PROJECT_TITLE'].value_counts()
vc=vc[vc>1]
#First 50 sets that occur more than once
dup_strings=list(vc.index)

#Dataframe of common titles, which PIs use is and how many unique PIs
vc=df.loc[df['PROJECT_TITLE'].isin(dup_strings)].groupby('PROJECT_TITLE')['CONTACT_PI_PROJECT_LEADER'].unique()
common=pd.DataFrame([vc,vc.apply(len)],index=['Unique PIs','Num_Unique_PIs']).T
#Titles that appear more than once and are used by at least 2 PIS
common=common.loc[common['Num_Unique_PIs']>1]"""

###############33
#2: Pulling out duplicate first sentences
#Pull out starts that are absolutely high (not relatively by CFDA)
#If you run this BEFORE taking out all the starts in the cell above, you get very different results.
#Many of these are documented below, but we do not remove them since they are content.
################

#df['First 50']=df[wa].apply(lambda x: x[:50])
#vc=df['First 50'].value_counts()
#vc[vc>2]

##########################
#2: Starts
#Pull out starts that are proportionally high to check for uninformative/duplicate sentences
#Show how frequent certain first chars are by cfda by grouping, relative frequnecyy
#Again based on First Char, repeat with first actual sentence
##########################

#Put together a table that groups first char by CFDA code and instead of counts, see how frequently it occurs within that CFDA code
"""rel_freq=pd.DataFrame(df.groupby('CFDA_CODE')['First 50'].value_counts())
rel_freq=rel_freq.join(df['CFDA_CODE'].value_counts(),on='CFDA_CODE')
rel_freq['Relative Frequency']=rel_freq['First 50']/rel_freq['CFDA_CODE']

#Limit analysis to those items that occur at least 3 times and in at least 1% of abstracts (arbitrary)
a=rel_freq['First 50']>3 
b=rel_freq['Relative Frequency']>.01
rel_freq.loc[(a&b)] #Some first sentences are a very good indicator of a particular CFDA, suggesting they have particular formatting rules that would be a give away for the
#CFDA code, rather than the actual research conducted
#rel_freq.loc[(a&b)].to_csv('Common First 50.csv')
rel_freq.loc[(a&b)]"""

#############
#Demonsrate that individuals tend to reuse the same sentences across grants
#There are over 70,000 unique first 50 characters that are used more than once by the same PI
#############

"""vc=df.groupby('CONTACT_PI_PROJECT_LEADER')['First 50'].value_counts()
vc[vc>1]"""


"vc=df.groupby('CONTACT_PI_PROJECT_LEADER')['First 50'].value_counts()\nvc[vc>1]"

//Examining First Few Characters to Find Tags (Instead of sentences, because common first sentences have same first few characters!)

In [171]:
###########
#Grouping by first 50, we can pull out first_sentences that are used by more than one PI, suggesting this is not just a PI thing, but a grant level or in some case prject  thing
#Identifying more sentences to be examined and removed
#########

"""vc=df['First 50'].value_counts()
vc=vc[vc>1]
#First 50 sets that occur more than once
dup_strings=list(vc.index)

#Dataframe of common first phrases, which PIs use is and how many unique PIs
vc=df.loc[df['First 50'].isin(dup_strings)].groupby('First 50')['CONTACT_PI_PROJECT_LEADER'].unique()
common=pd.DataFrame([vc,vc.apply(len)],index=['Unique PIs','Num_Unique_PIs']).T
common=common.loc[common['Num_Unique_PIs']>1]
print(common.sort_values(by=['Num_Unique_PIs'],ascending=False))

#Here is a case where multiple investigators use very similar first sentences
for row in df.loc[df['First 50']=='Land-use change is a primary driver of the losses '].iterrows():
    print(row[1]['ABSTRACT'])"""

"vc=df['First 50'].value_counts()\nvc=vc[vc>1]\n#First 50 sets that occur more than once\ndup_strings=list(vc.index)\n\n#Dataframe of common first phrases, which PIs use is and how many unique PIs\nvc=df.loc[df['First 50'].isin(dup_strings)].groupby('First 50')['CONTACT_PI_PROJECT_LEADER'].unique()\ncommon=pd.DataFrame([vc,vc.apply(len)],index=['Unique PIs','Num_Unique_PIs']).T\ncommon=common.loc[common['Num_Unique_PIs']>1]\nprint(common.sort_values(by=['Num_Unique_PIs'],ascending=False))\n\n#Here is a case where multiple investigators use very similar first sentences\nfor row in df.loc[df['First 50']=='Land-use change is a primary driver of the losses '].iterrows():\n    print(row[1]['ABSTRACT'])"

In [172]:
#############################
#Issues identified through looking at either first character or first 50 characters
#To finish this, add in longer phrases identified in thousands, rather than scrolling through things
##############################

#Starting regular expressions to remove:
"""Some numbers of (*s, non or not) (space, -, or not), (technical abstract), (some number of stars)
some numbers (with . between) followed by abstract
(abstract) (of whichever capitalization, within parentheses)
(numbers + .)repeated endign with either + 'Project Summary/Abstract'
PHS 398/2590 + anything followed by + Page Continuation Format Page
If it starts with 'one page and must contain',
#This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. 
It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.
It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. 
This Abstract must not include any proprietary/confidential information.* 
Please click the add attachment button to complete this entry." plus some attachments, which includes tracking number, twice:
following the second trackign number, there is a grant number followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, all of which can be discarded

'Enter the text here tha' ending with 'lines of text.'
'Close FormNextPrint PageAbout OMB Number'] #This is usually ended with "Project summary", so anything between those 2 can be delete, and ended with a clause starting with 'Close FormProject' and ending in'Narrative File'
#If ends in 'Description,', then go to last instance of PERFORMANCE (for Performance SITES), otherwise "KEY PERSONNEL", upper case, and cut all that follows
#Starting characters to remove
#[',',';','\n','\t','&','-','!']

#starting_exact_phrases to remove
'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'


#############
#Some exampes of junk we cant remove--not regular expressiony or even if low info, is some info
#############
#Occurs often, but not about content of grants
'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.'
#The presence of 'American Recovery and Reinvestment Act' is a strong indicator of CFDA_CODE 47.082. This could present problems moving forward
'Program Director/Principal Investigator (Last, First, Middle): ', #What follows is usually a mix of existing info, but before a description that is usefull,  but not going to be possible to regular expression
'IGF::OT::IGF', #Not truncated but usually abstract as a whole is lacking in inofrmation: #Keep
#Variations of abstracts ending with Page 3, 4, etc.

#####################
#Based on analysis of CFDA numbers, these are starting phrases that make up a disproportionate amount of a CFDA number, but do not provide
#Information on the grant itself beyond the name. The impetus to remove them is that they may signal a grant OPPORTUNITY (i.e. CFDA ), rather 
#Than broad content found in numerous topics
##The presence of 'American Recovery and Reinvestment Act' is a strong indicator of CFDA_CODE 47.082. This could present problems moving forward
###############################################

#Examples:
'The broader impact/commercial potential of this I-Corps project'
'The broader impact/commercial potential of this project',
'The broader impact/commercial potential of this PFI project',
'The broader impact/commercial potential of this Small Business Innovation Research (SBIR) Phase I',
'The broader impact/commercial potential of this Small Business Innovation Research (SBIR) Phase II',
'The broader impacts/commercial potential of this Small Business Innovation Research (SBIR) Phase I',
'The broader impact/commercial potential of this Small Business Innovation Research (SBIR)',
'The broader impact/commercial potential of this Small Business Technology Transfer (STTR) Phase II',
'The broader impact/commercial potential of this Small Business Technology Transfer (STTR) Phase I',
'The broader impact/commercial potential of this Small Business Technology Transfer (STTR)',
'The broader impact/commercial potential of this Small Business Technology Transfer Phase II',
'The broader impact/commercial potential of this Small Business Technology Transfer Phase I',
'The broader impact/commercial potential of this Small Business Technology Transfer',
'The broader impact/commercial potential of this Small Business Innovation Research Phase II',
'The broader impact/commercial potential of this Small Business Innovation Research Phase I',
'The broader impact/commercial potential of this Small Business Innovation Research',
'This award is funded under the American Recovery and Reinvestment Act of 2009 (Public Law 111-5).',
'This award is made as part of the FY 2018 Mathematical Sciences Postdoctoral Research Fellowships',
'This application addresses broad Challenge Area'"""




'Some numbers of (*s, non or not) (space, -, or not), (technical abstract), (some number of stars)\nsome numbers (with . between) followed by abstract\n(abstract) (of whichever capitalization, within parentheses)\n(numbers + .)repeated endign with either + \'Project Summary/Abstract\'\nPHS 398/2590 + anything followed by + Page Continuation Format Page\nIf it starts with \'one page and must contain\',\n#This is an NIH thing and there aren\'t that many of them, but come from 3 different cfda\nit will start with "one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. \nIt should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.\nIt should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. \nThis Abstract must not include any proprietary/confidential information.* \nPlease click

//Looking just at the first character to check for truncation or start errors

In [173]:
#################
#First characters of abstracts (after some cleaning)
#Items with lowercase are not necessarily truncated, but certainly can be
#Often, a lowercase indicates possible truncation,e.g. 'n the last decade' but it can also be, e.g. 'von Hippel-Lindau Dissease', gamma-crystallins,?-lactam
#Characters that should be removed as a result of individual analysis of each unique, non uppercase starter:[',',';','\n','\t','&','-','!']
#. is usually just a leftover from abstract removal
#################
#First character assignment, to see whether all abstracts are now starting with upper case letters (in the best case scenario)
df['Start Char']=df[wa].apply(lambda x: x[0])
vc=df['Start Char'].value_counts()
#print(vc.loc[[x for x in vc.index if not x in string.ascii_uppercase+string.ascii_lowercase+'0123456789[(']])
####################
#2.2 Starting with first letter, do we see things are often completely surrounded by some sort of punctuation, in which case we can 
#make sure there isn't some starting clause we're missing
###################
df['Start Char']=df[wa].apply(lambda x:x[0])

#print(df.loc[df['Start Char']=='-','LAST_CHAR'].value_counts()) #Again--not starting and ending with '-'
#Although * is a common start, the whole thing isn't surrounded by *s
#print(df.loc[df['Start Char']=='*','LAST_CHAR'].value_counts())
#Although * is a common start, the whole thing isn't surrounded by *s
#print(df.loc[df['Start Char']=='=','LAST_CHAR'].value_counts())

#bad_starts=[x for x in vc.index if not x in string.ascii_uppercase]


df[wa]=df[wa].apply(str.lstrip,args=['?-*_^. :,!;=¿|]#%>&-\t\n']) #Often, sentences will start with - or *, but they indicate other quality issues and don't end with them,so it's okay to remove them



In [174]:
#'Enter the text here tha' ending with 'lines of text.'
expression=re.compile('Enter the text here that.*lines of text')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

expression=re.compile('PHS .*?Continuation Format Page')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))
expression=re.compile('OMB No .*?Continuation Format Page')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))



df[wa]=df[wa].replace('Project Summary/Abstract','')

"""If it starts with 'one page and must contain',
#This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. 
It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.
It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. 
This Abstract must not include any proprietary/confidential information.* 
Please click the add attachment button to complete this entry." plus some attachments, which includes tracking number, twice:
following the second trackign number, there is a grant number followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, all of which can be discarded
"""
expression1=re.compile('one page and must.*?Tracking Number.*?(Tracking Number)')
expression2=re.compile('Project Narrative File.*')
def fix_abstract(abstract):
    if abstract.startswith('one page and must contain'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract

df[wa]=df[wa].apply(fix_abstract)


df[wa]=df[wa].apply(lambda x: x.lstrip(',;\n\t&-!'))
df=df.loc[df[wa].apply(len)>0]

#starting_exact_phrases to remove
#'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
#'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'
df[wa]=df[wa].apply(lambda x: x.replace('This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.',
                                       ''))

expression=re.compile('This subproject is one of many research subprojects.*not necessarily the institution for the investigator.')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))
expression=re.compile('This subproject is one of many research subprojects.*to the subproject or subproject staff.')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

def remove_long_phrase(record):
    """ ignores case to remove multi-word phrases in a particular order, especially those likely to run into other words,
    e.g. Institution university of washingtonPI mary williams. This doesn't work when titles or insititutions have escape characters in them, which is a bummer
    see for example ENHANCING THE USE OF NASA EARTH SCIENCE RESULTS / DATA / AND TECHNOLOGY BY ENGAGING THE FEDERATION OF EARTH SCIENCE INFORMATION PARTNERS COMMUNITIES OF
    PRACTICE IN TARGET AREAS OF INTEREST TO NASA THE FEDERATION OF EARTH SCIENCE INFORMATION PARTNERS (''FED"""
    title=record['PROJECT_TITLE']
    try:
        new_abstract=re.sub(title,'',record[wa],flags=re.IGNORECASE)      
        return re.sub(record['ORGANIZATION_NAME'],'',new_abstract,flags=re.IGNORECASE)   
    except:
        try:
            return re.sub(record['ORGANIZATION_NAME'],'',record[wa],flags=re.IGNORECASE)   
        except:
            return record[wa]

df[wa]=df.apply(lambda x: remove_long_phrase(x),axis=1)

expression=re.compile('Project Summary/Abstract Page.*')

def remove_contact_pd(x):
    """removes clause at end that tends to occur: eg Project Summary/Abstract Page 222Contact PD/PI: Sampson, HughNarrative ("""
    if x.startswith('Contact PD/PI'):
        return re.sub(expression,'',x)
    else:
        return x
df[wa]=df[wa].apply(remove_contact_pd)        
        



In [175]:
#Drop those that are now length 0 (ie were all punctuation or removable phrases
df.drop(list(df[df[wa].apply(lambda x: len(x)==0)].index),inplace=True)
df['Start Char']=df[wa].apply(lambda x:x[0])
df['LAST_CHAR']=df[wa].apply(lambda x:x[0])
#df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)

In [176]:
#####################
#Additional expressions we could choose to remove
#Identify abstracts with excessive amounts of other fields to uncover additional bad abstract types
#If we wanted to be on the safe side, some EDA makes me think we could remove anything with more than 3 or 4 of these fields. It's where they start getting wonky.
###################

fields=['Principal Investigator','Program Director','Attachment','Instructions','Lines',
        'Space Provided','Performance Site','Organization','Key Personnel']
all_fields=fields.copy()
all_fields.extend([x.lower() for x in fields])
all_fields.extend([x.upper() for x in fields])
all_fields.extend(['PI','Form','Page','Title','.pdf','.doc'])

def count_up_fields(abstract):
    count=0
    for field in all_fields:
        if field in abstract:
            count+=1
    return count

df['Field Count']=df[wa].apply(count_up_fields)


###########################
#Additional expressions we could remove, but there is a small possibility of some information being lost
##########################

#Issues: 'Close FormNextPrint PageAbout OMB Number']#This is usually ended with "Project summary", 
#so anything between those 2 can be delete, and #ended with a clause starting with 'Close FormProject' and ending in'Narrative File'

#expression1=re.compile('Close FormNext.*?Project Summary')
#expression2=re.compile('Close FormProject.*Narrative File')
def fix_abstract(abstract):
    if abstract.startswith('Close FormNext'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract
#df[wa]=df[wa].apply(fix_abstract)

#If ends in 'Description,', then go to last instance of PERFORMANCE (for Performance SITES), otherwise "KEY PERSONNEL", upper case, and cut all that follows

#expression1=re.compile('PERFORMANCE.*Description,$')
#expression2=re.compile('KEY PERSONNEL.*Description,$')

def apply_expressions(abstract):
    if abstract.endswith('Description,'):
        if re.search(expression1,abstract) != None:
            return re.sub(expression1,'',abstract)
        else:
            return re.sub(expression2,'',abstract)
    else:
        return abstract
    
#df[wa]=df[wa].apply(apply_expressions)

#expression1=re.compile('PERFORMANCE.*Page 3$')
#expression2=re.compile('KEY PERSONNEL.*Page 3,$')

def apply_expressions(abstract):
    if abstract.endswith('Description,'):
        if re.search(expression1,abstract) != None:
            return re.sub(expression1,'',abstract)
        else:
            return re.sub(expression2,'',abstract)
    else:
        return abstract
    
#df[wa]=df[wa].apply(apply_expressions)



In [177]:
df.to_csv('FRAbstractsSqueakyClean.csv')

In [178]:
#df[df['FY']==2008].to_csv('FedReporterAbstracts2008InProgressCleaning.csv')
#df.to_csv('FRAbstractsInProgressCleaning.csv')