In [1]:
#Set up Environment

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%pylab inline
from string import ascii_letters
import sys
import re
#Preprocessing
import nltk
import os


Populating the interactive namespace from numpy and matplotlib


In [2]:
#Visualize data
raw_df=pd.read_csv('/sfs/qumulo/qhome/sc2pg/src/prnd/publicrd/data/prd/DigitalOcean_Backup/public_rd/working/federal_reporter/abstracts_federal_reporter_combined.csv',engine='python')

print('Columns')
print(raw_df.columns)
print('Descriptive Stats')
print(raw_df.describe())
print('Length')
print(len(raw_df))

Columns
Index(['PROJECT_ID', 'ABSTRACT', 'FY', 'FIRST_CHAR', 'LAST_CHAR', 'DEPARTMENT',
       'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER', 'PROJECT_TITLE',
       'PROJECT_TERMS', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS',
       'ORGANIZATION_NAME', 'CFDA_CODE', 'FY_TOTAL_COST'],
      dtype='object')
Descriptive Stats
         PROJECT_ID             FY  FY_TOTAL_COST
count  5.500880e+05  550088.000000   4.256850e+05
mean   4.980384e+05    2012.247477   4.509841e+05
std    3.262050e+05       3.183765   1.727112e+06
min    1.008600e+04    2008.000000   1.000000e+00
25%    1.790538e+05    2009.000000   1.390020e+05
50%    4.880660e+05    2012.000000   2.917820e+05
75%    7.817102e+05    2015.000000   4.500000e+05
max    1.101940e+06    2018.000000   3.227983e+08
Length
550088


In [3]:
##############
#Remove nulls and duplicates
#Currently removes only duplicates based on ABSTRACTS and only in the same YEAR
#The rationale here is that we may do year-by-year modelling and don't want to exclude projects
#But if we do all-in-one modelling (e.g. across all years), we will want to reconsider
#Also will want to do additional duplicate check once abstracts are cleaned
###############

#Drop projects with identical abstracts and year. Different year could indicate additional funding sent to this project.
df=raw_df.loc[pd.notnull(raw_df['ABSTRACT'])]
df.drop_duplicates(subset=['ABSTRACT','FY'],inplace=True) 
print('Length'+str(len(df)))

####################
#Check for additional duplicates
#Note that the project id isnt necessarily identical for each transaction on same grant--e.g. one number could be added, so this isnt that strict and why checking astract is needed
#####################
print('Project ID duplicates:')
vc=df['PROJECT_ID'].value_counts()
print(vc[vc>1])

Length550074
Project ID duplicates:
Series([], Name: PROJECT_ID, dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [4]:
################
#Function for removing any text we don't like at start, end, or anywhere within a string
#CASE SENSITIVE
################

def remove_phrase(x, phrase,loc='Start'):
    """returns x with phrase removed. location can be "Start" of string, "End" of string, or "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instane"""
    assert loc in ['Start','End','Anywhere_All','Anywhere_First']
    if loc=='End':
        if x.endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    elif loc=='Anywhere_All':
        return x.replace(phrase,'')
    elif loc=='Anywhere_First':
        return x.replace(phrase,'',1)
    else:
        return 'Error'
    
#Testing phrases
"""
x='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'
y='nothing in common but a phrase to remov nothing in common'
print(remove_phrase(x,'phrase to remove',loc='Start'))
print(remove_phrase(x,'phrase to remove',loc='End'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))
print(remove_phrase(y,'phrase to remove',loc='Start'))
print(remove_phrase(y,'phrase to remove',loc='End'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))
"""

"\nx='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'\ny='nothing in common but a phrase to remov nothing in common'\nprint(remove_phrase(x,'phrase to remove',loc='Start'))\nprint(remove_phrase(x,'phrase to remove',loc='End'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))\nprint(remove_phrase(y,'phrase to remove',loc='Start'))\nprint(remove_phrase(y,'phrase to remove',loc='End'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))\n"

In [11]:
"""
def replace_case_insensitive(list_of_replacements):
    if type(list_of_replacements)!=list:
        raise TypeError #Must be a list
    for i in list_of_replacements:
"""        

'\ndef replace_case_insensitive(list_of_replacements):\n    if type(list_of_replacements)!=list:\n        raise TypeError #Must be a list\n    for i in list_of_replacements:\n'

In [5]:
#Define a new series which is an abstract that keeps the raw text, but can be continuously manipulated.
wa='working_abstract'
df[wa]=df['ABSTRACT'].apply(str.strip)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
#An illustrative case that Fedearl Reporter is NOT all R&D and why semantic content, rather than simple tagging, is better
print(df.loc[df['ABSTRACT']== 'Not R&D, do not report'])

print(df.loc[df['ABSTRACT']== 'Technical support services contract, not R&D'])


#Remove too short abstracts
df['nchar']=df[wa].apply(len)
limit=150 #Less than 150 chars is not an abstract
#[x for x in df.loc[df['nchar']<=limit,'ABSTRACT'].head(40)]
#This seems like a good cutoff, but it does lose some useful information.
df=df.loc[df['nchar']>=limit]

        PROJECT_ID                ABSTRACT    FY              FIRST_CHAR  \
452095      851077  Not R&D, do not report  2016  Not R&D, do not report   

       LAST_CHAR DEPARTMENT AGENCY IC_CENTER       PROJECT_NUMBER  \
452095         t        HHS    NIH     NICHD  275201400092U-7-0-2   

       PROJECT_TITLE                        PROJECT_TERMS  \
452095  IGF::OT::IGF  Reporting; research and development   

       CONTACT_PI_PROJECT_LEADER OTHER_PIS          ORGANIZATION_NAME  \
452095            DEATON, LAUREN       NaN  COURTESY ASSOCIATES, LLC:   

       CFDA_CODE  FY_TOTAL_COST        working_abstract  
452095       NaN        25000.0  Not R&D, do not report  
        PROJECT_ID                                      ABSTRACT    FY  \
300896      723344  Technical support services contract, not R&D  2012   

                                          FIRST_CHAR LAST_CHAR DEPARTMENT  \
300896  Technical support services contract, not R&D         D        HHS   

       AGENCY IC_C

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
####################
#2.1--phrases noticed through investigation of starting characters, as well as those identified in R Analysis
######################

start_phrases=['****TECHNICAL ABSTRACT****','****Technical Abstract****',
               '****Non Technical Abstract****','*** Non- Technical Abstract ***','**Non-Technical Abstract**',
               '*****NON-TECHNICAL ABSTRACT*****','***** NON-TECHNICAL ABSTRACT *****',
               '****NONTECHNICAL ABSTRACT****','****Non-Technical Abstract****','*Non-technical Abstract*',
               '*****NON-TECHNICAL ABSTRACT*****','****NON-TECHNICAL ABSTRACT****',
               '***NON-TECHNICAL ABSTRACT***','****Nontechnical abstract****',
               'TECHNICAL SUMMARY', 'NONTECHNICAL SUMMARY','NON-TECHNICAL SUMMARY','Non-technical description',
               'DESCRIPTION (Provided by the applicant)','DESCRIPTION (provided by investigator)',  'DESCRIPTION (provided by applicant)',
               'Project Summary/Abstract','PROJECT SUMMARY/ABSTRACT',
               'ABSTRACT','abstract','Proposal Abstract','Abstract','RESEARCH ABSTRACT',
               'PROJECT SUMMARY','Project Summary','SUMMARY','RESEARCH SUMMARY',
               'Proposal',
               'DESCRIPTION','Description','PROJECT DESCRIPTION'
               'NARRATIVE',
               '(See instructions):','\t',
              'FOR CENTER APPLICATION (provided by the investigator):','Objective(s)',      'EXCEED THE SPACE PROVIDED',
               'Provided by Applicant','Provided by applicant','provided by applicant','PROVIDED BY APPLICANT',
               'Provided by Candidate','Provided by candidate','provided by candidate','PROVIDED BY CANDIDATE']

df[wa]=df[wa].apply(str.lstrip,args=[' ?-_^. :,!;¿|()[]]#%>﻿&\''])
df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)
#Remove found phrases
for phrase in start_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']).apply(str.lstrip,args=[' :./)'])
#Repeated in case the order of project summary/abstract varies
for phrase in start_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']).apply(str.lstrip,args=[' :./)'])

    


In [15]:

#Then go to the first few sentences
#2.1. Pull out starting tags that don't provide information--e.g. description, abstract
#2.2 Check abstracts that start with a non-alphabetical character

//Examining Titles for Duplication (Instead of Project IDS)

In [8]:
###########
#Grouping by first 50, we can pull out first_sentences that are used by more than one PI, suggesting this is not just a PI thing, but a grant level thing
#Titles
#########

vc=df['PROJECT_TITLE'].value_counts()
vc=vc[vc>1]
#First 50 sets that occur more than once
dup_strings=list(vc.index)
print(dup_strings[0:5])

#Dataframe of common first phrases, which PIs use is and how many unique PIs
vc=df.loc[df['PROJECT_TITLE'].isin(dup_strings)].groupby('PROJECT_TITLE')['CONTACT_PI_PROJECT_LEADER'].unique()
common=pd.DataFrame([vc,vc.apply(len)],index=['Unique PIs','Num_Unique_PIs']).T
common=common.loc[common['Num_Unique_PIs']>1]

###############
#Finding repeats other than by project ID: using titles
#Title is not a good indicator of whether a project is repeated--many grants all have same title but very different abstracts and PIs
###############
vc=common.sort_values(by=['Num_Unique_PIs'],ascending=False)['Num_Unique_PIs']
vc=vc.loc[[x for x in vc.index if not 'CORE' in x]]
vc=vc.loc[vc<50]
df.loc[df['PROJECT_TITLE']=='MOLECULAR MECHANISMS REGULATING SKELETAL MUSCLE GROWTH AND DIFFERENTIATION',wa]

['ADMINISTRATIVE CORE', 'POSTDOCTORAL RESEARCH FELLOWSHIP', 'ADMINISTRATION', 'CLINICAL CORE', 'ADMINISTRATIVE']


232219    Skeletal muscle is critical for life and is al...
232256    A wide variety of cells exist in skeletal musc...
232492    This study will investigate the influence of t...
232852    The overall rationale is to understand the cel...
232857    Meat, derived from skeletal muscle, is one of ...
232884    This investigation of muscle proteolysis will ...
232892    Cellular control of muscle type will be define...
232918    Skeletal muscle is the most commercially impor...
232941    How muscle cells grow is poorly understood. Im...
232977    The overall goal of this cooperative, multi-st...
232999    The importance of muscle as a food is exemplif...
233013    Sustainable livestock production depends on ef...
233148    The overall goal of this cooperative, multi-st...
233216    Muscle growth in meat producing animals is reg...
233242    Skeletal muscle is an extremely plastic tissue...
233546    The long-term goal of this research is to impr...
233559    This project will focus on gen

//Examining First Few Characters to Find Tags (Instead of sentences, because common first sentences have same first few characters!)

In [9]:
###############33
#2: Pulling out duplicate first sentences
#Starting with 'First_char', which doesn't work correctly but does tell us some stuff
#Pull out starts that are absolutely high (not relatively by CFDA)
#If you run this BEFORE taking out all the starts in the cell above, you get very different results.
#Looking at these, all of them seem relevant, for the most part
################

df['First 50']=df[wa].apply(lambda x: x[:50])
vc=df['First 50'].value_counts()
vc[vc>2]

This subproject is one of many research subproject    24173
This award is funded under the American Recovery a     3687
This Small Business Innovation Research (SBIR) Pha     1485
The broader impact/commercial potential of this Sm      946
This Small Business Innovation Research Phase I pr      649
The broader impact/commercial potential of this I-      446
With this award from the Major Research Instrument      250
The broader impact/commercial potential of this pr      238
Program Director/Principal Investigator (Last, Fir      232
This application addresses broad Challenge Area (0      232
This action funds an NSF Postdoctoral Research Fel      207
This PFI: AIR Technology Translation project focus      201
The research objective of this Faculty Early Caree      171
With this award, the Chemistry of Life Processes P      168
With support from the Chemical Measurement and Ima      162
This award funds the research activities of Profes      152
This Small Business Technology Transfer 

In [10]:
##########################
#2: Starts
#Pull out starts that are proportionally high to check for uninformative/duplicate sentences
#Show how frequent certain first chars are by cfda by grouping, relative frequnecyy
#Again based on First Char, repeat with first actual sentence
##########################

#Put together a table that groups first char by CFDA code and instead of counts, see how frequently it occurs within that CFDA code
rel_freq=pd.DataFrame(df.groupby('CFDA_CODE')['First 50'].value_counts())
rel_freq=rel_freq.join(df['CFDA_CODE'].value_counts(),on='CFDA_CODE')
rel_freq['Relative Frequency']=rel_freq['First 50']/rel_freq['CFDA_CODE']

#Limit analysis to those items that occur at least 3 times and in at least 1% of abstracts (arbitrary)
a=rel_freq['First 50']>3 
b=rel_freq['Relative Frequency']>.01
rel_freq.loc[(a&b)] #Some first sentences are a very good indicator of a particular CFDA, suggesting they have particular formatting rules that would be a give away for the
#CFDA code, rather than the actual research conducted
rel_freq.loc[(a&b)].to_csv('Common First 50.csv')
rel_freq.loc[(a&b)]

Unnamed: 0_level_0,Unnamed: 1_level_0,First 50,CFDA_CODE,Relative Frequency
CFDA_CODE,First 50,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10.001,The long-term objective of this project is to deve,21,1648,0.012743
43.007,THE AMERICAN SOCIETY FOR GRAVITATIONAL AND SPACE R,5,261,0.019157
47.041,This Small Business Innovation Research (SBIR) Pha,1449,23165,0.062551
47.041,The broader impact/commercial potential of this Sm,946,23165,0.040837
47.041,This Small Business Innovation Research Phase I pr,560,23165,0.024174
47.041,The broader impact/commercial potential of this I-,426,23165,0.01839
47.041,The broader impact/commercial potential of this pr,238,23165,0.010274
47.049,With this award from the Major Research Instrument,249,24484,0.01017
47.074,This action funds an NSF Postdoctoral Research Fel,198,12381,0.015992
47.076,The Louis Stokes Alliances for Minority Participat,98,9468,0.010351


In [20]:
#CFDAs that have a large proportion of repeated first 50 characters


#Passes for not having a lot of repeats--that is, under 11
#93.849, 93.94 , 93.945, 93.946, '77','47.080',43.007,77.   , 84.305,93.433, 93.448, 93.701, 93.077, 93.095, 93.135,
#       93.142, 93.143, 93.315,47.083,
#Issues already identified for rectification: 93.283 (one page), 93.333, 93.371,(subprojects),93.389 (subprojects)
#Repetitive word choice, but relevant and Not acceptable to remove: 93.281, 47.041, 47.049, 84.324, 47.074, 47.076, 47.079, 47.08 ,  47.082,
"""
for cfda in cfdas_repetitive:
    print(cfda)
    vc=df.loc[df['CFDA_CODE']==cfda,'First 50'].value_counts()
    print(vc[vc>10])
"""
#10:01: all start with 'Objective(s)', remove


"\nfor cfda in cfdas_repetitive:\n    print(cfda)\n    vc=df.loc[df['CFDA_CODE']==cfda,'First 50'].value_counts()\n    print(vc[vc>10])\n"

In [11]:
#############
#Demonsrate that individuals tend to reuse the same sentences across grants
#There are over 70,000 unique first 50 characters that are used more than once by the same PI
#############

vc=df.groupby('CONTACT_PI_PROJECT_LEADER')['First 50'].value_counts()
vc[vc>1]


CONTACT_PI_PROJECT_LEADER  First 50                                          
,                          The goal of this contract is to provide support of    22
                           Through partnership with the Office of Minority He    17
                           The primary goal of this project is to provide sup    12
                           The goal of this interagency agreement is to provi    10
                           The overall Women's Health Initiative (WHI) progra    10
                           This interagency transfer to the United States Dep    10
                           The National Cancer Institute (NCI) will deploy an     9
                           To provide support to the National Center for Heal     8
                           The NICHD, as delegated by the NIH, is responsible     7
                           The National Health and Nutrition Examination Surv     7
                           The National Longitudinal Survey of Youth 1979 (NL     

In [12]:
###########
#Grouping by first 50, we can pull out first_sentences that are used by more than one PI, suggesting this is not just a PI thing, but a grant level thing
#Identifying more sentences to be examined and removed
#########

vc=df['First 50'].value_counts()
vc=vc[vc>1]
#First 50 sets that occur more than once
dup_strings=list(vc.index)
print(dup_strings[0:5])

#Dataframe of common first phrases, which PIs use is and how many unique PIs
vc=df.loc[df['First 50'].isin(dup_strings)].groupby('First 50')['CONTACT_PI_PROJECT_LEADER'].unique()
common=pd.DataFrame([vc,vc.apply(len)],index=['Unique PIs','Num_Unique_PIs']).T
common=common.loc[common['Num_Unique_PIs']>1]
print(common.sort_values(by=['Num_Unique_PIs'],ascending=False))

#Here is a case where multiple investigators use a near, but not identical, abstract
for row in df.loc[df['First 50']=='Land-use change is a primary driver of the losses '].iterrows():
    print(row[1]['ABSTRACT'])

['This subproject is one of many research subproject', 'This award is funded under the American Recovery a', 'This Small Business Innovation Research (SBIR) Pha', 'The broader impact/commercial potential of this Sm', 'This Small Business Innovation Research Phase I pr']
                                                                                           Unique PIs  \
First 50                                                                                                
This subproject is one of many research subproject  [LEWIS, HENRY, SOLIMAN, KARAM F.A., REDDA, KIN...   
This award is funded under the American Recovery a  [GUSTIN, MAE S, GEEN, ALEXANDER VAN, SCHREIER,...   
This Small Business Innovation Research (SBIR) Pha  [CAMERON, SETH, GAITAS, ANGELO, FRIGO, MATTEO,...   
The broader impact/commercial potential of this Sm  [HUDGINS, DANIEL, KERBY, MATTHEW B, CHAN, SERE...   
This Small Business Innovation Research Phase I pr  [KARKAR, VICTOR, ATIYA, AMIR F, KAGANOVE, STEV.

//Looking just at the first character to check for truncation or start errors

In [13]:
#2.2
import string

#################
#First characters of abstracts (after some cleaning)
#Items with lowercase are not necessarily truncated, but certainly can be
#Often, a lowercase indicates possible truncation,e.g. 'n the last decade' but it can also be, e.g. 'von Hippel-Lindau Dissease', gamma-crystallins,?-lactam
#Characters that should be removed as a result of individual analysis of each unique, non uppercase starter:[',',';','\n','\t','&','-','!']
#. is usually just a leftover from abstract removal
#################
#First character assignment, to see whether all abstracts are now starting with upper case letters (in the best case scenario)
df['Start Char']=df[wa].apply(lambda x: x[0])
vc=df['Start Char'].value_counts()
print(vc.loc[[x for x in vc.index if not x in string.ascii_uppercase+string.ascii_lowercase+'0123456789[(']])


bad_starts=[x for x in vc.index if not x in string.ascii_uppercase]
#for start in bad_starts:
    #print(df.loc[df[wa].apply(lambda x: x[0]==start),wa])
    

df[wa]=df[wa].apply(str.lstrip,args=['?-_^. :,!;¿|]#%>&-\t\n'])
#Drop those that are now length 0 (ie were all punctuation)
df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)
#Define new starting char
df['Start Char']=df[wa].apply(lambda x:x[0])


–    812
-    507
#    226
,    171
*     56
“     50
—     45
!     42
?     35
¿     19
_     18
      18
&     17
=     15
'     15
α     14
;     10
γ     10
\     10
`      8
ß      8
β      8
$      7
~      6
<      4
�      4
      3
|      3
‐      2
​      2
•      1
¬      1
{      1
]      1
+      1
      1
δ      1
¢      1
Α      1
­      1
¨      1
Name: Start Char, dtype: int64


IndexError: index 0 is out of bounds for axis 0 with size 0

In [14]:
####################
#2.2 Starting with first letter, do we see things are often completely surrounded by some sort of punctuation, in which case we can 
#make sure there isn't some starting clause we're missing
###################
df['Start Char']=df[wa].apply(lambda x:x[0])

print(df.loc[df['Start Char']=='-','LAST_CHAR'].value_counts()) #Again--not starting and ending with '-'
#Although * is a common start, the whole thing isn't surrounded by *s
print(df.loc[df['Start Char']=='*','LAST_CHAR'].value_counts())

Series([], Name: LAST_CHAR, dtype: int64)
.    51
1     1
!     1
)     1
d     1
s     1
Name: LAST_CHAR, dtype: int64


In [15]:
#^wp.*php$
import re
my_string='n One page and must contain a summary boop doop. extra stuff'
ex=re.compile('one page and must contain a summary.*doop',re.IGNORECASE)
print(ex.search(my_string))
print(len(my_string))

<_sre.SRE_Match object; span=(2, 47), match='One page and must contain a summary boop doop'>
60


In [28]:
df.loc[df[wa].apply(lambda x: 'one page and must' in x),wa].iloc[2]

"one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any proprietary/confidential information.* Please click the add attachment button to complete this entry.5192-2007_CDC_Project_Abstract.pdfTracking Number: GRANT00387973AttachmentsProjectAbstractAddAttachmentFile Name Mime Type5192-2007_CDC_Project_Abstract.pdf application/pdfTracking Number: GRANT00387973National Fragile X Foundation11th International Fragile X Conference Proposal AbstractPublic Health Conference Support Program - Cycle AFunding Opportunity #: CDC-PA-HM08-801NCBDDD Topic Area: 300.1 Birth DefectsThe National Fragile X Foundation's biennial International C

In [None]:
'^start phrase.*end phrase$

In [72]:
df.loc[df[wa].apply(lambda x: 'PHS ' in x),wa].iloc[2]

"Principal Investigator/Program Director (Last, First, Middle):      Rajewsky, KlaUSAbstract for Research Plan Extension Period    Maintaining the overall title of the grant, the focus of the work will shift to later phases of B cell development,reaching from the naive, mature B cell to the germinal center (GC) reaction and memory B cell generation, maintenanceand response. In addition, while we expect to bring the analysis of the role of BCR specificity in the formation andmaintenance of the Bl and B2 subsets to completion within the first funding period, we expect that the experimentsproposed for the extension period will further contribute to the understanding of this problem by identifying criticalintracellular signaling cascades.    Our work in the first funding period has demonstrated that the maintenance of mature B cells requires, apart from anNFicB-mediated signal through the BAFF-R (Sasaki et al., 2006), a maintenance signal through the BCR, involving theIgo/p cytoplasmic tai

In [16]:
#'Enter the text here tha' ending with 'lines of text.'
expression=re.compile('Enter the text here that.*lines of text')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))


In [75]:
#I am not convinced this works
expression=re.compile('PHS .*Page Continuation Format Page')
for x in df.loc[df[wa].apply(lambda x: 'PHS ' in x),wa].apply(lambda x: re.sub(expression,'',x)).head(10):
    print(x)


NEW INSIGHT FROM OLD IMAGES:  REANALYSIS OF LUNAR ORBITER PHOTOGRAPHS  We propose a modest study of whether better results can be obtained from the earliest, high-quality lunar images, from the Lunar Orbiter missions more than 40 years ago. In many respe
Fenway Community Health (FCH), a Boston community-health center, which has conducted HIV/AIDS research for more than two decades, proposes to establish a Clinical Trials Unit (based on its experience coordinating HIVNET and HPTN consortia), which has been invited to be affiliated with two HIV/AIDS Clinical Trial Leadership Groups: Prevention of HIV (HPTN) and Microbicides (MTN). FCH will oversee the Administrative Component of the CTU and will coordinate the activities of three Clinical Research Sites in southeastern New England including: FCH, Latin American Health Institute (LHI, Boston, MA) and The Miriam Hospital (TMH, Providence, Rl). The CTU proposes to enroll a racially and ethnically diverse cohort of participants in the follow

In [None]:
#############################
#Issues identified through looking at either first character, first 50 characters, or first X number of characters
#To finish this, add in longer phrases identified in thousands, rather than scrolling through things
##############################

#Starting regular expressions to remove:
Some numbers of (*s, non or not) (space, -, or not), (technical abstract), (some number of stars)
some numbers (with . between) followed by abstract
(abstract) (of whichever capitalization, within parentheses)


#Completed:
"PHS 398/2590 + anything followed by + Page Continuation Format Page"

#If ends in 'Description,', then go to last instance of PERFORMANCE (for Performance SITES), otherwise "KEY PERSONNEL", upper case, and cut all that follows
'Close FormNextPrint PageAbout OMB Number'] #This is usually ended with "Project summary", so anything between those 2 can be delete, and ended with a clause starting with 'Close FormProject' and ending in'Narrative File'

'Enter the text here tha' ending with 'lines of text.'
'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'
#Starting characters to remove
[',',';','\n','\t','&','-','!']
(numbers + .)repeated endign with either + 'Project Summary/Abstract' #This was just replaced, since numbers are removed
"""If it starts with 'one page and must contain',
#This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. 
It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.
It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. 
This Abstract must not include any proprietary/confidential information.* 
Please click the add attachment button to complete this entry." plus some attachments, which includes tracking number, twice:
following the second trackign number, there is a grant number followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, all of which can be discarded
"""




In [215]:
expression=re.compile('PHS .*?Page Continuation Format Page')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))
expression=re.compile('OMB No .*?Page Continuation Format Page')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

In [None]:
if first token is abstract, summary, remove.


In [201]:
df[wa]=df[wa].replace('Project Summary/Abstract','')

0         This is a project to explore Game-based, Metap...
1         Institution: Franklin Institute Science Museum...
2         Through programs (including small group conver...
3         In partnership with the American Chemical Soci...
4         Amphibian populations around the world are exp...
5         The Center for Molecular Interfacing (CMI) wil...
6         DRU: Integrated optimization of evacuation and...
7         The Flora of China (FOC) is an international c...
9         The overall goal of this project is to reconst...
10        MAPPING / CHARACTERIZATION / AND ANALYSIS OF C...
11        The proposed workshops will bring together a m...
12        Over the past century, rapid growth of human p...
13        The United States over the past two decades ha...
14        The Ohio State University College of Education...
15        This empirical study examines the variety of m...
16        This multidisciplinary project draws upon soci...
17        Phylogenetic trees, which depi

In [87]:
#Starting regular expressions to remove:
Some numbers of (*s, non or not) (space, -, or not), (technical abstract), (some number of stars)


Help on method_descriptor:

lstrip(...)
    S.lstrip([chars]) -> str
    
    Return a copy of the string S with leading whitespace removed.
    If chars is given and not None, remove characters in chars instead.



In [None]:
some numbers (with . between) followed by abstract


In [None]:
(abstract) (of whichever capitalization, within parentheses)


In [None]:
(numbers + .)repeated endign with either + 'Project Summary/Abstract'



In [None]:
"PHS 398/2590 + anything followed by + Page Continuation Format Page"


In [None]:
"""If it starts with 'one page and must contain',
#This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. 
It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.
It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. 
This Abstract must not include any proprietary/confidential information.* 
Please click the add attachment button to complete this entry." plus some attachments, which includes tracking number, twice:
following the second trackign number, there is a grant number followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, all of which can be discarded
"""
expression1=re.compile('one page and must.*?Tracking Number.*?(Tracking Number)')
expression2=re.compile('Project Narrative File.*')
def fix_abstract(abstract):
    if abstract.startswith('one page and must contain'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract

df[wa]=df[wa].apply(fix_abstract)

In [None]:
'Close FormNextPrint PageAbout OMB Number']
#This is usually ended with "Project summary", 
#so anything between those 2 can be delete, and 
#ended with a clause starting with 'Close FormProject' and ending in'Narrative File'

expression1=re.compile('Close FormNext.*?Project Summary')
expression2=re.compile('Close FormProject.*Narrative File')
def fix_abstract(abstract):
    if abstract.startswith('Close FormNext'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract
df[wa]=df[wa].apply(fix_abstract)

In [180]:
#If ends in 'Description,', then go to last instance of PERFORMANCE (for Performance SITES), otherwise "KEY PERSONNEL", upper case, and cut all that follows

expression1=re.compile('PERFORMANCE.*Description,$')
expression2=re.compile('KEY PERSONNEL.*Description,$')

def apply_expressions(abstract):
    if abstract.endswith('Description,'):
        if re.search(expression1,abstract) != None:
            return re.sub(expression1,'',abstract)
        else:
            return re.sub(expression2,'',abstract)
    else:
        return abstract
    
df[wa]=df[wa].apply(apply_expressions)

In [98]:
df[wa]=df[wa].apply(lambda x: x.lstrip(',;\n\t&-!'))
df=df.loc[df[wa].apply(len)>0]

//Issues identified

In [None]:
#############################
#Issues identified through looking at either first character, first 50 characters, or first X number of characters
#To finish this, add in longer phrases identified in thousands, rather than scrolling through things
##############################

#Starting regular expressions to remove:
Some numbers of (*s, non or not) (space, -, or not), (technical abstract), (some number of stars)
some numbers (with . between) followed by abstract
(abstract) (of whichever capitalization, within parentheses)
(numbers + .)repeated endign with either + 'Project Summary/Abstract'
"PHS 398/2590 + anything followed by + Page Continuation Format Page"
"""If it starts with 'one page and must contain',
#This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. 
It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.
It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. 
This Abstract must not include any proprietary/confidential information.* 
Please click the add attachment button to complete this entry." plus some attachments, which includes tracking number, twice:
following the second trackign number, there is a grant number followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, all of which can be discarded
"""
'Enter the text here tha' ending with 'lines of text.'
'Close FormNextPrint PageAbout OMB Number'] #This is usually ended with "Project summary", so anything between those 2 can be delete, and ended with a clause starting with 'Close FormProject' and ending in'Narrative File'
#If ends in 'Description,', then go to last instance of PERFORMANCE (for Performance SITES), otherwise "KEY PERSONNEL", upper case, and cut all that follows
#Starting characters to remove
[',',';','\n','\t','&','-','!']

#starting_exact_phrases to remove
'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'


#############
#Some exampes of junk we cant remove--not regular expressiony or even if low info, is some info
#############
#Occurs often, but not about content of grants
'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.'
#The presence of 'American Recovery and Reinvestment Act' is a strong indicator of CFDA_CODE 47.082. This could present problems moving forward
'Program Director/Principal Investigator (Last, First, Middle): ', #What follows is usually a mix of existing info, but before a description that is usefull,  but not going to be possible to regular expression
'IGF::OT::IGF', #Not truncated but usually abstract as a whole is lacking in inofrmation: #Keep
#Variations of abstracts ending with Page 3, 4, etc.

#####################
#Based on analysis of CFDA numbers, these are starting phrases that make up a disproportionate amount of a CFDA number, but do not provide
#Information on the grant itself beyond the name. The impetus to remove them is that they may signal a grant OPPORTUNITY (i.e. CFDA ), rather 
#Than broad content found in numerous topics
##The presence of 'American Recovery and Reinvestment Act' is a strong indicator of CFDA_CODE 47.082. This could present problems moving forward
###############################################

#Examples:
"""'The broader impact/commercial potential of this I-Corps project'
'The broader impact/commercial potential of this project',
'The broader impact/commercial potential of this PFI project',
'The broader impact/commercial potential of this Small Business Innovation Research (SBIR) Phase I',
'The broader impact/commercial potential of this Small Business Innovation Research (SBIR) Phase II',
'The broader impacts/commercial potential of this Small Business Innovation Research (SBIR) Phase I',
'The broader impact/commercial potential of this Small Business Innovation Research (SBIR)',
'The broader impact/commercial potential of this Small Business Technology Transfer (STTR) Phase II',
'The broader impact/commercial potential of this Small Business Technology Transfer (STTR) Phase I',
'The broader impact/commercial potential of this Small Business Technology Transfer (STTR)',
'The broader impact/commercial potential of this Small Business Technology Transfer Phase II',
'The broader impact/commercial potential of this Small Business Technology Transfer Phase I',
'The broader impact/commercial potential of this Small Business Technology Transfer',
'The broader impact/commercial potential of this Small Business Innovation Research Phase II',
'The broader impact/commercial potential of this Small Business Innovation Research Phase I',
'The broader impact/commercial potential of this Small Business Innovation Research',
  'This award is funded under the American Recovery and Reinvestment Act of 2009 (Public Law 111-5).',
'This award is made as part of the FY 2018 Mathematical Sciences Postdoctoral Research Fellowships',
'This application addresses broad Challenge Area'
"""




In [17]:
#starting_exact_phrases to remove
#'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
#'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'
df[wa]=df[wa].apply(lambda x: x.replace('This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.',
                                       ''))

expression=re.compile('This subproject is one of many research subprojects.*not necessarily the institution for the investigator.')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))
expression=re.compile('This subproject is one of many research subprojects.*to the subproject or subproject staff.')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

In [12]:
df['LAST_CHAR']=df[wa].apply(lambda x: x[-1])

In [13]:
df['Start Char']=df[wa].apply(lambda x: x[0])
df.loc[df['Start Char']=='(','LAST_CHAR'].value_counts() #None are surrounded


.    3096
)      42
s      22
e      15
y       9
1       7
?       6
!       5
d       4
S       4
t       4
l       4
n       3
,       2
a       2
_       2
f       2
'       1
C       1
R       1
k       1
D       1
r       1
-       1
T       1
]       1
w       1
o       1
/       1
(       1
c       1
Name: LAST_CHAR, dtype: int64

In [18]:
#####################
#Identify abstracts with excessive amounts of other fields to uncover additional bad abstract types
###################

fields=['Principal Investigator','Program Director','Attachment','Instructions','Lines',
        'Space Provided','Performance Site','Organization','Key Personnel']
all_fields=fields.copy()
all_fields.extend([x.lower() for x in fields])
all_fields.extend([x.upper() for x in fields])
all_fields.extend(['PI','Form','Page','Title'])
def count_up_fields(abstract):
    count=0
    for field in all_fields:
        if field in abstract:
            count+=1
    return count
df['Field Count']=df[wa].apply(count_up_fields)
df['Field Count'].describe()

['Principal Investigator', 'Program Director', 'Attachment', 'Instructions', 'Lines', 'Space Provided', 'Performance Site', 'Organization', 'Key Personnel']
['Principal Investigator', 'Program Director', 'Attachment', 'Instructions', 'Lines', 'Space Provided', 'Performance Site', 'Organization', 'Key Personnel', 'principal investigator', 'program director', 'attachment', 'instructions', 'lines', 'space provided', 'performance site', 'organization', 'key personnel', 'PRINCIPAL INVESTIGATOR', 'PROGRAM DIRECTOR', 'ATTACHMENT', 'INSTRUCTIONS', 'LINES', 'SPACE PROVIDED', 'PERFORMANCE SITE', 'ORGANIZATION', 'KEY PERSONNEL']


In [25]:
for x,y in df.loc[df['Field Count']>5].iterrows():
    print(y[wa])
    print(y['CONTACT_PI_PROJECT_LEADER'])
    print()

CBET-0801375    Title: Travel Grant related to a Research Proposal Writing Tutorial for the NSF Workshop at the November 2007 AIChE Annual MeetingPI:  James M LeeInstitution: Washington State UniversityThis is an individual travel grant that provides  funds for the principal investigator (PI) to work with a NSF program officer, Dr. John Regalbuto, to help prepare a presentation and participate in the  Proposal Writing Tutorial  for the NSF Workshop to be held at the 2007 AIChE Annual Meeting in Salt Lake City, Utah (November 4-9, 2007). For over 25 years, the Division of Chemical, Biological, Environmental, and Transport System (CBET) has held workshops at the AIChE Annual Meeting to introduce individual programs and NSF-wide initiatives, explain organizational changes, and provide opportunities for interactive communication between program directors and faculty. This year, CBET has changed the workshop format to include the  Proposal Writing Tutorial,  and  Interactive Breakout Panels

In [13]:
example='CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN D; REESE, DEBBIE D.'

fields_to_replace=[]

#Contact PI
i='REESE, DEBBIE D'
print([x for x in i.replace(',','').split() if len(x)>1])
fields_to_replace.extend([x for x in i.replace(',','').replace('.','').split() if len(x)>1])

#Other PIs
#For each pi, which are split by semicolons, and format is last,first;  #Sometimes a middle initial
example='CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN D; REESE, DEBBIE D.'
example_pis=[]
for i in example.split(';'):
    i=i.strip(' .')
    i=i.replace(',','')#.split()
    example_pis.extend([x for x in i.split() if len(x)>1])
print(example_pis)
fields_to_replace.extend(example_pis)

#organization
example='NATIONAL INSTITUTES OF HEALTH'
fields_to_replace.append(example)

#Title
example='Title'
fields_to_replace.append(example)

print(fields_to_replace)

##############
#one could do this by running custom regular exprsesions on original abstract, to account for variations in capitalization
#Or you can run it on the abstracts following the lowercase function but before any other type of preproccesing, including splitting tokens and removing too short BECAUSE
#1. If you run this after removing too short words (e.g. of) you would never match items like  'national institutes of health'
#2. If you tokenize, you have to either check for sequenes of tokens (which is very bad, given the number of unique sequqneces for names)
#Or you remove any subtokens, which is bad for a phrase like "National institutes of health", since "health" is a token we want to retain IN GENERAL
#So the best way to run this is to apply your own lowercase, remove custom stop words, and then continue reguolar preprocessing with tokenizing, lemmatizing, etc.
#############################################
def remove_embedded(record):
    """Designates stopwords for a particular abstract that contain embedded info e.g. author name or title and removes them from a lowercase abstract"""
    fields_to_replace=[]
    #Main PI
    #Adds all words in the pis names, excluding initials (hence why the commas and periods must be replaced)
    fields_to_replace.extend([x for x in record['CONTACT_PI_PROJECT_LEADER'].replace(',','').replace('.','').split() if len(x)>1])
    
    #Additional PIs
    #For each pi, which are split by semicolons, and format is last,first;  #Sometimes a middle initial
    for i in record['OTHER_PIS'].split(';'):
        i=i.strip() #Remove whitespace
        i=i.replace('.','')#Periods for initials
        i=i.replace(',','')#Commas between last, first
        fields_to_replace.extend([x for x in i.split() if len(x)>1])

['REESE', 'DEBBIE']
['CARTER', 'BEVERLY', 'WOOD', 'CHARLES', 'HITT', 'BEN', 'REESE', 'DEBBIE']
['REESE', 'DEBBIE', 'CARTER', 'BEVERLY', 'WOOD', 'CHARLES', 'HITT', 'BEN', 'REESE', 'DEBBIE', 'NATIONAL INSTITUTES OF HEALTH', 'Title']


In [14]:
df.columns

Index(['PROJECT_ID', 'ABSTRACT', 'FY', 'FIRST_CHAR', 'LAST_CHAR', 'DEPARTMENT',
       'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER', 'PROJECT_TITLE',
       'PROJECT_TERMS', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS',
       'ORGANIZATION_NAME', 'CFDA_CODE', 'FY_TOTAL_COST', 'working_abstract',
       'nchar'],
      dtype='object')

In [63]:
for x in df.loc[df[wa].apply(str.startswith,args=['Contact PD/PI']),wa].head(30):
    print(x)
    print()

Contact PD/PI: Guo, Ju-TaoTITLEEvaluation of therapeutic benefits of HBV nucleocapsid assembly inhibitorsABSTRACTThis is a proposal to determine the feasibility and therapeutic benefits of newly discovered benzamidederivatives (BAs) as mono-therapeutic agents or in combination with nucleoside analogues for the treatment ofchronic hepatitis B. BAs were identified in our laboratory as inhibitors of hepatitis B virus (HBV) pregenomic(pg) RNA encapsidation, which is essential for the subsequent viral DNA synthesis. They are mechanisticallydistinct from, and should thus complement, the currently FDA-approved antiviral medications. In addition,inhibition of pgRNA encapsidation, or the nucleocapsid assembly, should not only preclude HBV genomereplication and virion production, it might also disrupt the metabolism of HBV pgRNA-reverse transcriptase(RT) complex and core protein, which could consequentially interfere with the host innate antiviral immuneresponse and cccDNA function in the infect

In [None]:
#df[df['FY']==2008].to_csv('FedReporterAbstracts2008InProgressCleaning.csv')
#df.to_csv('FRAbstractsInProgressCleaning.csv')