In [1]:
#Set up Environment

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%pylab inline
from string import ascii_letters
import sys
import re
#Preprocessing
import nltk

raw_df=pd.read_csv('abstracts_federal_reporter_combined.csv',engine='python')


Populating the interactive namespace from numpy and matplotlib


FileNotFoundError: [Errno 2] No such file or directory: 'abstracts_federal_reporter_combined.csv'

In [5]:
#Visualize data

print('Columns')
print(raw_df.columns)
print('Descriptive Stats')
print(raw_df.describe())
print('Length')
print(len(raw_df))

Columns
Index(['PROJECT_ID', 'ABSTRACT', 'FY', 'FIRST_CHAR', 'LAST_CHAR', 'DEPARTMENT',
       'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER', 'PROJECT_TITLE',
       'PROJECT_TERMS', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS',
       'ORGANIZATION_NAME', 'CFDA_CODE', 'FY_TOTAL_COST'],
      dtype='object')
Descriptive Stats
         PROJECT_ID             FY  FY_TOTAL_COST
count  5.500880e+05  550088.000000   4.256850e+05
mean   4.980384e+05    2012.247477   4.509841e+05
std    3.262050e+05       3.183765   1.727112e+06
min    1.008600e+04    2008.000000   1.000000e+00
25%    1.790538e+05    2009.000000   1.390020e+05
50%    4.880660e+05    2012.000000   2.917820e+05
75%    7.817102e+05    2015.000000   4.500000e+05
max    1.101940e+06    2018.000000   3.227983e+08
Length
550088


In [8]:
##############
#Remove nulls and duplicates
#Currently removes only duplicates based on ABSTRACTS and only in the same YEAR
#The rationale here is that we may do year-by-year modelling and don't want to exclude projects
#But if we do all-in-one modelling (e.g. across all years), we will want to reconsider
#Also will want to do additional duplicate check once abstracts are cleaned
###############

df=raw_df.loc[pd.notnull(raw_df['ABSTRACT'])]
df.drop_duplicates(subset=['ABSTRACT','FY'],inplace=True) #Drop projects with identical abstracts and year. Different year could indicate additional funding sent to this project.
print('Length'+str(len(df)))

####################
#Check for additional duplicates
#Note that the project id isnt necessarily identical for each transaction on same grant--e.g. one number could be added, so this isnt that strict and why checking astract is needed
#####################
print('Project ID duplicates')
vc=df['PROJECT_ID'].value_counts()
print(vc[vc>1])

Length550074
Project ID duplicates
Series([], Name: PROJECT_ID, dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [4]:
################
#Function for removing any text we don't like at start, end, or anywhere within a string
################

def remove_phrase(x, phrase,loc='Start'):
    """returns x with phrase removed. location can be "Start" of string, "End" of string, or "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instane"""
    assert loc in ['Start','End','Anywhere_All','Anywhere_First']
    if loc=='End':
        if x.endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    elif loc=='Anywhere_All':
        return x.replace(phrase,'')
    elif loc=='Anywhere_First':
        return x.replace(phrase,'',1)
    else:
        return 'Error'
    
#Testing phrases
"""
x='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'
y='nothing in common but a phrase to remov nothing in common'
print(remove_phrase(x,'phrase to remove',loc='Start'))
print(remove_phrase(x,'phrase to remove',loc='End'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))
print(remove_phrase(y,'phrase to remove',loc='Start'))
print(remove_phrase(y,'phrase to remove',loc='End'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))
"""

"\nx='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'\ny='nothing in common but a phrase to remov nothing in common'\nprint(remove_phrase(x,'phrase to remove',loc='Start'))\nprint(remove_phrase(x,'phrase to remove',loc='End'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))\nprint(remove_phrase(y,'phrase to remove',loc='Start'))\nprint(remove_phrase(y,'phrase to remove',loc='End'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))\n"

In [9]:
#Define a new series which is an abstract that keeps the raw text, but can be continuously manipulated.
wa='working_abstract'
df[wa]=df['ABSTRACT'].apply(str.strip)

#Remove too short abstracts
df['nchar']=df[wa].apply(len)
limit=150 #Less than 150 chars is not an abstract
df=df.loc[df['nchar']>=limit]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
#Data Cleaning Plan:

#1 KATHRYN
#Start by searching for truncation and end tags
#1.1 Identify last character. If it's not a sentence-ending punctuation, manual investigation. 
#1.2Pull out last sentences and clauses to check for any end-based tags
#1.3 Check for truncation

#2: SAM
#Then go to the first few sentences
#What if it's actually just short?
#2.1. Pull out starting tags that don't provide information--e.g. description, abstract
#2.2 Check abstracts that start with a non-alphabetical character

#3: SEAN
#Finally, flag abstracts that contain information thats not appropriate in an abstract
#3.1 Check for the phrases "Prinicipal Investigator", title of project, "truncated"

#4 Review of common bigrams/bigrams tuning

In [6]:
##########################
#2: Starts
#Pull out starts that are proportionally high to check for uninformative/duplicate sentences
#Show how frequent certain first chars are by cfda by grouping, relative frequnecyy
#Again based on First Char, repeat with first actual sentence
##########################

#Put together a table that groups first char by CFDA code and instead of counts, see how frequently it occurs within that CFDA code
attempt_1=pd.DataFrame(df.groupby('CFDA_CODE')['FIRST_CHAR'].value_counts())
attempt_2=df['CFDA_CODE'].value_counts()
attempt_3=attempt_1.join(attempt_2,on='CFDA_CODE')
attempt_3['Relative Frequency']=attempt_3['FIRST_CHAR']/attempt_3['CFDA_CODE']
mapper={'FIRST_CHAR':'Absolute Frequency','CFDA_CODE':'Frequency of CFDA Number'}
attempt_3.rename(columns=mapper)
attempt_3.drop('Frequency of CFDA Number',axis=1)

#Limit analysis to those items that occur at least 3 times and in at least 1% of abstracts (arbitrary)
a=attempt_3['Absolute Frequency']>3 
b=attempt_3['Relative Frequency']>.01
attempt_3.loc[(a&b)] #Some first sentences are a very good indicator of a particular CFDA, suggesting they have particular formatting rules that would be a give away for the
#CFDA code, rather than the actual research conducted

Unnamed: 0_level_0,Unnamed: 1_level_0,FIRST_CHAR,CFDA_CODE,Relative Frequency
CFDA_CODE,FIRST_CHAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
47.041,The broader impact/commercial potential of this Small Business Innovation Research (SBIR) Phase I pr,346,23165,0.014936
47.079,The International Research Fellowship Program enables U.S. scientists and engineers to conduct nine,70,2225,0.031461
47.079,"This Pan-American Advanced Studies Institutes (PASI) award, jointly supported by the NSF and the Dep",33,2225,0.014831
47.08,This award facilitates scientific research using the large new computational resource named Blue Wat,7,496,0.014113
47.08,"This award facilitates scientific research using the large, new, computational resource named Blue W",5,496,0.010081
47.08,"This proposal is for a provisional allocation of time on the Blue Waters computer system, due to bec",7,326,0.021472
47.082,This award is funded under the American Recovery and Reinvestment Act of 2009 (Public Law 111-5). Th,517,4622,0.111856
47.082,This award is funded under the American Recovery and Reinvestment Act of 2009 (Public Law 111-5).The,502,4622,0.108611
47.082,This award is funded under the American Recovery and Reinvestment Act of 2009 (Public Law 111-5).Thi,499,4622,0.107962
47.082,This award is funded under the American Recovery and Reinvestment Act of 2009 (Public Law 111-5).Wit,117,4622,0.025314


In [13]:
###############
#2: Starts
#Investigate CFDAs with identical first chars that occur very frequently
#47.041 seems to contain a lot of variations of the same first char
###############
print(df.loc[df['CFDA_CODE']=='47.041','FIRST_CHAR'].value_counts())

#Conclusion: This is an instance of near identical language that notes a grant rather than the research funding--e.g. SBIR

The broader impact/commercial potential of this Small Business Innovation Research (SBIR) Phase I pr    346
The broader impact/commercial potential of this Small Business Innovation Research (SBIR) Phase II p    218
The broader impact/commercial potential of this Small Business Innovation Research (SBIR) project is     89
The broader impact/commercial potential of this Small Business Technology Transfer (STTR) Phase I pr     62
The Planning Grants for Engineering Research Centers competition was run as a pilot solicitation wit     58
The broader impact/commercial potential of this Small Business Innovation Research Phase I project i     41
The broader impact/commercial potential of this Small Business Innovation Research (SBIR) project wi     28
The broader impact/commercial potential of this Small Business Innovation Research Phase II project      25
This award provides funding for a three year standard award to support a Research Experiences for Te     22
The broader impact/commercia

In [7]:
#2.1--phrases noticed through investigation of starting characters, as well as those identified in R Analysis

start_phrases=['****TECHNICAL ABSTRACT****','****Technical Abstract****',
               '****Non Technical Abstract****','*** Non- Technical Abstract ***','**Non-Technical Abstract**',
          '*****NON-TECHNICAL ABSTRACT*****','***** NON-TECHNICAL ABSTRACT *****'
          '****NONTECHNICAL ABSTRACT****','****Non-Technical Abstract****','*Non-technical Abstract*',
               '*****NON-TECHNICAL ABSTRACT*****','****NON-TECHNICAL ABSTRACT****',
               '***NON-TECHNICAL ABSTRACT***','****Nontechnical abstract****'
               'DESCRIPTION (provided by applicant):','DESCRIPTION (provided by applicant)',
               'Project Summary/Abstract','PROJECT SUMMARY/ABSTRACT','ABSTRACT',
               'PROJECT SUMMARY','Project Summary','/ASBTRACT','/ Proposal','/ SUMMARY','/ DESCRIPTION','/PROJECT SUMMARY',
               '/ PROJECT SUMMARY','/Abstract:','/ABSTRACT:','/ABSTRACT','/ ABSTRACT:',
               '/ ABSTRACT','/Abstract','/ Abstract','/Description','/SUMMARY','/PROJECT SUMMARY',
              '/ RESEARCH SUMMARY','/PROJECT SUMMARY','/abstract','/Proposal Abstract',
               '/DESCRIPTION','/PROJECT DESCRIPTION','/PROJECT SUMMARY','/NARRATIVE','/RESEARCH ABSTRACT','/ PROJECT DESCRIPTION']
df[wa]=df[wa].apply(str.lstrip,args=['?-_^. :,!;¿|]#%>&'])
df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)
#Remove found phrases
for phrase in start_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']).apply(str.lstrip,args=[' :'])

df['Start Char']=df[wa].apply(lambda x: x[0])

In [None]:
#INVESTIGATING ENDINGS OF ABSTRACTS FOR END TAGS AND TRUNCATION
#1.1Identify last character. If it's not a sentence-ending punctuation, manual investigation. 
#1.2Pull out last sentences and clauses to check for any end-based tags
#1.3Check out for truncation.

In [11]:
#1.1 Identify last character. If it's not a sentence-ending punctuation, manual investigation. 

punctuation=['!','?','.']

#Three types of bad ends:
#Numbers
numbers=range(10)
#Alphabet
alpha=ascii_letters
#Other
bad_ends=['#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '/',':', ';', '<', '=', '>', '[', '\\', ']', '^', '_', '`','{', '|', '}', '~', '�']


#Dealing with "other" last-chars
df.loc[df['LAST_CHAR'].isin(bad_ends),'LAST_CHAR'].value_counts()

)    1403
]     493
/     471
,     452
:     403
*     262
;     127
-     110
(      81
'      66
_      24
\      23
^      17
=      17
�      13
%      11
}       9
&       7
+       4
|       3
>       3
`       3
#       3
$       2
[       2
<       2
{       1
~       1
Name: LAST_CHAR, dtype: int64

In [None]:
#1.1 Look at ending character for characters that are non-ending punctuation--ignoring numbers and alpha for now, which are likely cut off

for x in ['}','=','~','{','[','$','<','#','>','`']:#'-',
    mini=df.loc[df['LAST_CHAR']==x]
    print(len(mini))
    for y in mini.iterrows():
        print(y[1]['ABSTRACT'])


truncation_last_chars=['&','(',':','<','>','[','{','|','\\'] #\ seems like a sign of actual cut off by system, not just characters stopping

"""Chars to investigate for contained clauses
) #Often times these contain things like the center that submitted, e.g. "(Lucas Annual Report and ISMRM 2011 Abstracts)"
] #Very often related to NIH with format '[Corresponds to KewalRamani Project 2 in the October 2011 site visit report of the HIV Drug Resistance Program]  ', plus also :'[summary truncated at 7800 characters]'
=#Most of the time its a string for =========SECTION END======, which also usually starts with exceed the space provided
"""

"""CHars to investigate
# Looks like a formatting error, no contained clauses, fine to keep
$ Looks like some sort of formatting  error, fine to keep
% #Sometimes grants end with % effort--no clear pattern
': #Often occurs at the end following full paragraph, but not always--no clear pattern
): is very often at the end of paragraphs of full sentences, so that's likely okay. #However, unlike the ***, ')' is sometimes joined by a '(' forming a full sentence, so it's not wise to apply a removal at all here except key phrases found in EDA
Don'y need to remove, would mess up structure in some cases and not clear cut whatthose would be

*:  *** isn't a surrounding tag, it tends to just be stand alone at end, when it does surround it, just surrounds "technical abstract" or "figures wont reproduce here see attachment". Can be removed
-#Sometimes truncations, usually strings of ____, which will be removed in post processing. Not indicative of pattern
< #All are truncated
> #Mostly truncated
+ #All truncated
/ websites, usually.
; (esp preceded by numbers?)
= #Most of the time its a string for =========SECTION END======, which also usually starts with exceed the space provided
](and what proceeds should be pulled out
^ look at individually
_ #Sometimes truncations, usually strings of ____, which will be removed in post processing. Not indicative of pattern
`#Not truncated
~ ##Looks like just a formatting error, fine to keep
} #No particular pattern, besides somtimes PIs using this to break up the text. If proceeded by valid punctuation, fine.
�#Truncation not clear--if what proceeds this char or series of this char is not punctuation or bad char, this is likely cut off.
"""

In [9]:
#1.2
#Ending phrases noticed through manual investigation of last character
print('End phrases to remove: ')
for phrase in ['(End of Abstract)',"End of Abstract", '(Abstract end)' "(END OF ABSTRACT)", '(End of abstract.)','(Abstract End)','(End 0f Abstract)','(End of Abstract.)','(End of Absract)',
               '(Abstract below)','(End of Reviewers\' Comment)','(End Abstract)','(End of abstract)','(End of abstract)',
               'PERFORMANCE SITE ========================================Section End===========================================',
                'KEY PERSONNEL ========================================Section End===========================================']:   
    print(phrase)
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'End'])
df['LAST_CHAR']=df[wa].apply(lambda x: x[-1])

End phrases to remove: 
(End of Abstract)
End of Abstract
(Abstract end)(END OF ABSTRACT)
(End of abstract.)
(Abstract End)
(End 0f Abstract)
(End of Abstract.)
(End of Absract)
(Abstract below)
(End of Reviewers' Comment)
(End Abstract)
(End of abstract)
(End of abstract)


In [12]:
#y='*' #Generally if last char is '*', it comes in as '***' following a complete sentence, so unlikely to be cut-of'

#Fixes '***' if that makes the last character a punctuation ending mark, otherwise should remove
y='*' 
entries_ending_right=df.loc[df['LAST_CHAR']==y]
entries_ending_right['new_last_char_possible']=entries_ending_right.apply(lambda x:remove_phrase(x[wa],'***',loc='End')[-1],axis=1)
entries_to_fix=list(entries_ending_right[entries_ending_right['new_last_char_possible'].isin(punctuation)]['PROJECT_ID'])

df[wa]=df.apply(lambda x: remove_phrase(x[wa],'***','End') if x['PROJECT_ID'] in entries_to_fix else x[wa],axis=1)
df['LAST_CHAR']=df[wa].apply(lambda x: x[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
#1.3 Remove any words esp at end, if they are entirely composed of non alpha numeric non punctuation characters
#Don't need to do this, because will not show up in model. Only an issue if lots of common last phrases that are masked by some punctuation.

In [None]:
#Clear signs of truncation
trunc_ends=['[summary truncated at 7800 characters]']
trunc_starts=['EXCEED THE SPACE PROVIDED']

In [34]:
#df[df['FY']==2008].to_csv('FedReporterAbstracts2008InProgressCleaning.csv')
#df.to_csv('FRAbstractsInProgressCleaning.csv')