In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#from string import ascii_letters
import re
import nltk
import string
import time

pd.set_option('display.max_columns', 30)

### Read in Raw Dataset

In [36]:
# reading in raw data

# set data types
cols = ['PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 
        'PROJECT_NUMBER', 'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 
        'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME', 'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 
        'ORGANIZATION_ZIP', 'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE', 'CFDA_CODE', 'FY.y', 
        'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS']
dtypes = {col: 'str' for col in cols}
dtypes["FY_TOTAL_COST"] = 'float'
dtypes["FY_TOTAL_COST_SUB_PROJECTS"] = 'float'

df = pd.read_csv('../../data/original/working_federal_reporter_2020.csv', dtype = dtypes, engine='python')
print(df.shape)

(1156137, 26)


In [3]:
df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,FY.x,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,
1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,
2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,
3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,
4,90000,Amphibian populations around the world are exp...,2008,Amphibia; Central America; Communicable Diseas...,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,NSF,NSF,,815315,10/1/2008,9/30/2011,"ZAMUDIO, KELLY R",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.074,2008,370996.0,


In [4]:
df.dtypes

PROJECT_ID                     object
ABSTRACT                       object
FY.x                           object
PROJECT_TERMS                  object
PROJECT_TITLE                  object
DEPARTMENT                     object
AGENCY                         object
IC_CENTER                      object
PROJECT_NUMBER                 object
PROJECT_START_DATE             object
PROJECT_END_DATE               object
CONTACT_PI_PROJECT_LEADER      object
OTHER_PIS                      object
CONGRESSIONAL_DISTRICT         object
DUNS_NUMBER                    object
ORGANIZATION_NAME              object
ORGANIZATION_CITY              object
ORGANIZATION_STATE             object
ORGANIZATION_ZIP               object
ORGANIZATION_COUNTRY           object
BUDGET_START_DATE              object
BUDGET_END_DATE                object
CFDA_CODE                      object
FY.y                           object
FY_TOTAL_COST                 float64
FY_TOTAL_COST_SUB_PROJECTS    float64
dtype: objec

In [38]:
df['ABSTRACT'].value_counts()[0:10]

ABSTRACT NOT PROVIDED                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

### Remove null abstracts, fill in missing data, and deal with duplicate abstracts

In [3]:
# summary of missing information

df.isnull().sum()

PROJECT_ID                          0
ABSTRACT                        42380
FY.x                            35035
PROJECT_TERMS                    4022
PROJECT_TITLE                       0
DEPARTMENT                          0
AGENCY                              0
IC_CENTER                      622428
PROJECT_NUMBER                      0
PROJECT_START_DATE             154926
PROJECT_END_DATE               146332
CONTACT_PI_PROJECT_LEADER          45
OTHER_PIS                     1012458
CONGRESSIONAL_DISTRICT          74483
DUNS_NUMBER                     14565
ORGANIZATION_NAME                1658
ORGANIZATION_CITY                5926
ORGANIZATION_STATE              17001
ORGANIZATION_ZIP                48995
ORGANIZATION_COUNTRY             5705
BUDGET_START_DATE              659654
BUDGET_END_DATE                659705
CFDA_CODE                      201355
FY.y                               19
FY_TOTAL_COST                  287698
FY_TOTAL_COST_SUB_PROJECTS    1082117
dtype: int64

In [4]:
#remove rows with NULL abstracts

l1 = len(df)
df = df[~df.ABSTRACT.isnull()]
l2 = len(df)

print(l1-l2, "null ABSTRACTs removed")

42380 null ABSTRACTs removed


In [5]:
# drop abstracts with values of "ABSTRACT NOT PROVIDED" and "No abstract provided "

l1 = len(df)
df = df[df.ABSTRACT != 'ABSTRACT NOT PROVIDED']
df = df[df.ABSTRACT != 'No abstract provided']
l2 = len(df)

print(l1-l2, "ABSTRACT NOT PROVIDED and \"No abstract provided\" removed")

1324 ABSTRACT NOT PROVIDED and "No abstract provided" removed


In [6]:
df.isnull().sum()

PROJECT_ID                          0
ABSTRACT                            0
FY.x                                0
PROJECT_TERMS                    3536
PROJECT_TITLE                       0
DEPARTMENT                          0
AGENCY                              0
IC_CENTER                      595857
PROJECT_NUMBER                      0
PROJECT_START_DATE             149391
PROJECT_END_DATE               141193
CONTACT_PI_PROJECT_LEADER          43
OTHER_PIS                      969227
CONGRESSIONAL_DISTRICT          69263
DUNS_NUMBER                     12047
ORGANIZATION_NAME                1626
ORGANIZATION_CITY                5548
ORGANIZATION_STATE              12162
ORGANIZATION_ZIP                46167
ORGANIZATION_COUNTRY             5341
BUDGET_START_DATE              632754
BUDGET_END_DATE                632805
CFDA_CODE                      192234
FY.y                               19
FY_TOTAL_COST                  275416
FY_TOTAL_COST_SUB_PROJECTS    1041539
dtype: int64

In [6]:
# FY.x is the reliable fiscal year information so we rename this column to FY

df = df.rename(columns={'FY.x': 'FY'})

**Since we are tracking emerging abstract trends**, we will fill in missing information for:  
- PROJECT_START_DATE
- PROJECT_END_DATE.

In [7]:
# Fill missing values in Project Start/End Date with Budget Start/End Date
df['PROJECT_START_DATE'] = df['PROJECT_START_DATE'].fillna(df['BUDGET_START_DATE'])
df['PROJECT_END_DATE'] = df['PROJECT_END_DATE'].fillna(df['BUDGET_END_DATE'])

#df.isnull().sum()

#If START date is still missing, fill both start and end date with FY
df['PROJECT_START_DATE'] = df['PROJECT_START_DATE'].fillna(df['FY'])
df['PROJECT_END_DATE'] = df['PROJECT_END_DATE'].fillna(df['FY'])

In [11]:
df.isnull().sum()

PROJECT_ID                          0
ABSTRACT                            0
FY                                  0
PROJECT_TERMS                    3536
PROJECT_TITLE                       0
DEPARTMENT                          0
AGENCY                              0
IC_CENTER                      595857
PROJECT_NUMBER                      0
PROJECT_START_DATE                  0
PROJECT_END_DATE                    0
CONTACT_PI_PROJECT_LEADER          43
OTHER_PIS                      969227
CONGRESSIONAL_DISTRICT          69263
DUNS_NUMBER                     12047
ORGANIZATION_NAME                1626
ORGANIZATION_CITY                5548
ORGANIZATION_STATE              12162
ORGANIZATION_ZIP                46167
ORGANIZATION_COUNTRY             5341
BUDGET_START_DATE              632754
BUDGET_END_DATE                632805
CFDA_CODE                      192234
FY.y                               19
FY_TOTAL_COST                  275416
FY_TOTAL_COST_SUB_PROJECTS    1041539
dtype: int64

In [26]:
len(df)

1112433

**Aggregate counts for unique ORGANIZATION_NAMEs in rows with duplicated Abstract/Title/Project_Start_Date**

In [8]:
# group df by abstract/title/start_date (exact matches)
all_grp = df.groupby(['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'])

# for each unique ABSTRACT/TITLE/START_DATE in df, count unique Organizations
unique_all = all_grp.agg({'ORGANIZATION_NAME' : 'nunique'}) 

#rename column as "count" to be different than original column
unique_all = unique_all.rename(columns={'ORGANIZATION_NAME': 'ORG_COUNT'})

# merge df with "unique_all" to bring in the unique Organization counts for each "duplicate" group
merged1 = df.merge(unique_all, left_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'], 
                   right_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'])

In [30]:
unique_all.ORG_COUNT.value_counts()

1     676002
2      18132
3       2075
0       1443
4        556
5        182
6         66
7         40
8         33
9         24
11         8
13         8
10         7
12         6
15         4
19         3
14         2
16         2
18         2
28         1
17         1
20         1
22         1
31         1
Name: ORG_COUNT, dtype: int64

In [25]:
len(merged1)

1112433

**Aggregate counts for unique PIs in rows with duplicated Abstract/Title/Project_Start_Date**

In [9]:
# for each unique ABSTRACT/TITLE/START_DATE in whole df, count unique PIs
unique_pi = all_grp.agg({'CONTACT_PI_PROJECT_LEADER' : 'nunique'}) 

#rename column as "count" to be different than original column
unique_pi = unique_pi.rename(columns={'CONTACT_PI_PROJECT_LEADER': 'PI_COUNT'})

# merge data frame with "unique_all" to bring in the unique PI counts for each "duplicate" group
merged2 = merged1.merge(unique_pi, left_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'], 
                        right_on=['ABSTRACT', 'PROJECT_TITLE', 'PROJECT_START_DATE'])

In [29]:
unique_pi.PI_COUNT.value_counts()

1     680681
2      14608
3       2277
4        589
5        197
6         63
0         43
7         41
9         29
8         28
13         7
11         6
12         6
10         5
16         5
15         4
19         3
14         2
17         2
21         1
18         1
20         1
28         1
Name: PI_COUNT, dtype: int64

In [31]:
merged2.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT
0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1
1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1
2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1
3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1
4,90000,Amphibian populations around the world are exp...,2008,Amphibia; Central America; Communicable Diseas...,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,NSF,NSF,,815315,10/1/2008,9/30/2011,"ZAMUDIO, KELLY R",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.074,2008,370996.0,,2,2


In [32]:
len(merged2)

1112433

In [10]:
# Sort merged data so that duplicated rows occur in order of earliest to latest END date
merged = merged2.sort_values(['PROJECT_END_DATE'])

In [35]:
merged.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT
17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,5F32AG027647-03,12/1/2005,1/1/2008,"LIEBERMAN, RAQUEL L",,7,30811269,BRIGHAM AND WOMEN'S HOSPITAL,BOSTON,MA,21156110,UNITED STATES,,,93.866,2008,3483.0,,1,1
38727,160592,DESCRIPTION (provided by applicant): Contract...,2008,70-kDa Ribosomal Protein S6 Kinases; Adrenergi...,G PROTEIN-COUPLED RECEPTOR REGULATION IN AIRWA...,HHS,NIH,,2R01HL058506-10A2,8/1/1997,1/1/2009,"PENN, RAYMOND B.",,5,937727907,WAKE FOREST UNIVERSITY HEALTH SCIENCES,WINSTON-SALEM,NC,271570001,UNITED STATES,,,93.838,2008,384375.0,,2,1
111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,5K99HD056157-02,9/1/2007,1/1/2009,"KAUFFMAN, ALEXANDER S",,7,605799469,UNIVERSITY OF WASHINGTON,SEATTLE,WA,981959472,UNITED STATES,,,93.865,2008,39175.0,,1,1
22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,5F32GM076971-02,1/2/2007,1/1/2009,"MACK, ERIC T",,5,82359691,HARVARD UNIVERSITY,CAMBRIDGE,MA,21385319,UNITED STATES,,,93.859,2008,49646.0,,1,1
35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,5F31NR009611-03,4/1/2006,1/1/2009,"HELMREICH, REBECCA J",,9,800771594,UNIVERSITY OF TEXAS HLTH SCI CTR HOUSTON,HOUSTON,TX,770305400,UNITED STATES,,,93.361,2008,20406.0,,1,1


In [36]:
len(merged)

1112433

In [11]:
# save NON-duplicated rows and the LAST occurrance of duplicated rows
dedup = merged[~merged.duplicated(subset=['ABSTRACT',  'PROJECT_TITLE', 'PROJECT_START_DATE'], keep='last')]

In [27]:
#save final (deduplicated) dataframe as "df" to fit downstream code
df = dedup

In [38]:
len(dedup)

698600

In [39]:
dedup.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT
17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,5F32AG027647-03,12/1/2005,1/1/2008,"LIEBERMAN, RAQUEL L",,7.0,30811269,BRIGHAM AND WOMEN'S HOSPITAL,BOSTON,MA,21156110,UNITED STATES,,,93.866,2008,3483.0,,1,1
111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,5K99HD056157-02,9/1/2007,1/1/2009,"KAUFFMAN, ALEXANDER S",,7.0,605799469,UNIVERSITY OF WASHINGTON,SEATTLE,WA,981959472,UNITED STATES,,,93.865,2008,39175.0,,1,1
22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,5F32GM076971-02,1/2/2007,1/1/2009,"MACK, ERIC T",,5.0,82359691,HARVARD UNIVERSITY,CAMBRIDGE,MA,21385319,UNITED STATES,,,93.859,2008,49646.0,,1,1
35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,5F31NR009611-03,4/1/2006,1/1/2009,"HELMREICH, REBECCA J",,9.0,800771594,UNIVERSITY OF TEXAS HLTH SCI CTR HOUSTON,HOUSTON,TX,770305400,UNITED STATES,,,93.361,2008,20406.0,,1,1
371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,0219605,10/1/2009,1/1/2010,"CHARLTON, B.",,,53599908,OREGON STATE UNIVERSITY,CORVALLIS,OR,97331,UNITED STATES,,,10.203,2010,,,1,1


In [52]:
dedup.PROJECT_START_DATE.value_counts()[0:51]

2013         12021
2009         10926
2011         10741
2008         10716
2010         10672
9/1/2009      7713
9/1/2012      6570
9/1/2010      6540
9/1/2011      6299
9/1/2015      6175
9/1/2014      6038
7/1/2010      5989
9/1/2016      5916
9/1/2008      5536
9/1/2018      5483
9/1/2017      5312
7/1/2009      5092
7/1/2012      5009
9/30/2009     4887
8/1/2009      4850
7/1/2015      4830
7/1/2016      4805
7/1/2018      4730
7/1/2011      4632
7/1/2014      4582
9/1/2013      4529
7/1/2017      4328
8/1/2014      4252
8/1/2012      4197
8/1/2008      4171
8/1/2018      4098
8/1/2017      4050
5/1/2010      4039
7/1/2008      3831
8/1/2015      3808
8/1/2010      3671
2012          3656
8/1/2011      3620
8/1/2016      3504
2015          3429
4/1/2011      3416
4/1/2010      3344
7/1/2019      3323
2014          3293
2016          3292
4/1/2008      3278
2017          3264
6/1/2009      3263
4/1/2012      3241
2018          3174
2019          3166
Name: PROJECT_START_DATE, dtype

In [53]:
3174+3166+3264+3293+3293+3429+3656+10672+10716+10741+10926+12021

78351

In [17]:
len(df)

698600

### Dataset setup

##### Helper function used throughout

In [29]:
def drop_empties(df, col):
    
    l1 = len(df)
    ix = df[df[col].apply(len)==0].index
    print(ix)
    df.drop(ix,axis=0,inplace=True)
    l2 = len(df)
    
    print(f"dropped {l1-l2}")
    
    return df

In [35]:
# strip leading and trailing whitespace, save in a working abstract column that will be updated as text is cleaned

# Note: we cannot lower case abstracts up front - capitalization is needed to find POS in preprocessing
#df["working_abstract"] = 

temp = [abstract.strip() for abstract in df["ABSTRACT"]] 
df = df.assign(working_abstract = temp)

df = drop_empties(df, "working_abstract")

wa = 'working_abstract'

In [15]:
df['Start Char']=df['working_abstract'].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,Start Char
17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,5F32AG027647-03,12/1/2005,1/1/2008,"LIEBERMAN, RAQUEL L",,7.0,30811269,BRIGHAM AND WOMEN'S HOSPITAL,BOSTON,MA,21156110,UNITED STATES,,,93.866,2008,3483.0,,1,1,The multiprotein complex y-secretase proteolyt...,T
111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,5K99HD056157-02,9/1/2007,1/1/2009,"KAUFFMAN, ALEXANDER S",,7.0,605799469,UNIVERSITY OF WASHINGTON,SEATTLE,WA,981959472,UNITED STATES,,,93.865,2008,39175.0,,1,1,DESCRIPTION (provided by applicant): The Kis...,D
22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,5F32GM076971-02,1/2/2007,1/1/2009,"MACK, ERIC T",,5.0,82359691,HARVARD UNIVERSITY,CAMBRIDGE,MA,21385319,UNITED STATES,,,93.859,2008,49646.0,,1,1,DESCRIPTION (provided by applicant): The objec...,D
35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,5F31NR009611-03,4/1/2006,1/1/2009,"HELMREICH, REBECCA J",,9.0,800771594,UNIVERSITY OF TEXAS HLTH SCI CTR HOUSTON,HOUSTON,TX,770305400,UNITED STATES,,,93.361,2008,20406.0,,1,1,Obesity is the cause of many adverse pregnancy...,O
371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,0219605,10/1/2009,1/1/2010,"CHARLTON, B.",,,53599908,OREGON STATE UNIVERSITY,CORVALLIS,OR,97331,UNITED STATES,,,10.203,2010,,,1,1,Local potato advisory groups have expressed in...,L


In [18]:
df['nchar']=df['working_abstract'].apply(len)
t=df.loc[df['nchar']<150]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
100*(len(t)/len(df))

0.34554823933581447

In [52]:
t['ABSTRACT'][100:128]

448032         Project Summaryn/a
448561    NINDS EPILEPSY DATABASE
450791           Alcohol:  NESARC
452095     Not R&D, do not report
453984                  Abstract:
461581             Background and
462158       IRB Manager Software
462185                    Alcohol
464125                     #NAME?
477487                          ?
478689              Not required.
480567                        NIH
483984     ABSTRACTNot Applicable
487825     Teleconference meeting
499406        Project Summary:N/A
501461          Alcohol treatment
514157      No abstract required.
516063      Document not required
516575        abstract 11/30/2017
517033          Project SummaryNA
519282            Not Applicable.
523397             Not Applicable
524542      No abstract available
526784      See Other Attachments
535169      Summary and Narrative
535580     CQM services for FFRDC
535910         Project SummaryN/A
536242                     #NAME?
Name: ABSTRACT, dtype: object

In [32]:
t['ABSTRACT'][88746]

'This project will examine the role of antioxidant enzymes in providing ozone tolerance.'

In [34]:
t['PROJECT_TITLE'][88746]

'CHARACTERIZATION AND MECHANISMS OF PLANT RESPONSES TO OZONE IN THE U.S'

In [38]:
len(df[df['nchar']<250])

2348

### Cleaning Strategy:
1. Remove abstracts with all non-alphanumeric characters.
2. Remove non-alphanumeric characters from the start and end of abstracts
3. Remove other non-readable abstracts. (REMOVAL ABSTRACTS FOUND BY INSPECTION)
4. Remove "junk" starting strings and ending strings
5. Remove "junk" strings in the middle 
6. Remove title, organization name, and pis from abstracts


##### Helper functions for cleaning

In [21]:
def strip_nonalnum(word):
    
    # function strips non-alphanumeric characters from the beginning and end of a string
    # adapted from: https://stackoverflow.com/questions/22650506/how-to-rermove-non-alphanumeric-characters-at-the-beginning-or-end-of-a-string
        
    if not word:
        return word  # nothing to strip
    if (len(word) == 1) and (not word[0].isalnum()):  
        return ""
    for start, c in enumerate(word):
        if c.isalnum():
            break
    for end, c in enumerate(word[::-1]):
        if c.isalnum():
            break
                  
    return word[start:len(word) - end]


In [22]:
def remove_phrase(x, phrase,loc='Start'):
    
    # returns x with phrase removed. location can be "Start" of string, "End" of string, or 
    # "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instance
    # CASE info - this function assumes phrase is lower case, but that x is not.
    
    assert loc in ['Start','End']
    
    if loc=='End':
        if x.lower().endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.lower().startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    else:
        return 'Error'
    

In [23]:
# strip non-alphanum characters from the beginning and end of each abstract

df["working_abstract"] = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = drop_empties(df, "working_abstract")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Int64Index([855707, 179023, 738276, 835501, 930384, 890029, 925191], dtype='int64')
dropped 7


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [36]:
# remove abstracts that are not readable: THIS NEEDS TO BE UPDATED BY HAND FOR EVERY NEW DATASET
# For example: index = 490684: ¢ £/¥ ƒ § ¤ ƒ “ ƒ « ...

df["Start Char"] = df['working_abstract'].apply(lambda x: x[0])
ix = df[df['Start Char'] == 'ƒ'].index
df.drop(index = ix, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [31]:
'²'.isalnum()

True

In [35]:
df[df['Start Char'] == 'γ']  # interesting case showing a repeated abstract but with different start dates

Unnamed: 0,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,Start Char
761225,747301,γδ T cells represent a unique group of lymphoc...,2015,adaptive immunity; Alleles; Antigens; base; Ce...,GENETIC DISSECTION OF ID3-MEDIATED PATHWAYS IN...,HHS,NIH,NIAID,5P01AI102853-02 (7439),5/1/2015,4/30/2016,"ZHUANG, YUAN",,13,64367329,INSTITUTE FOR CANCER RESEARCH,PHILADELPHIA,PA,191112434,UNITED STATES,5/1/2015,4/30/2016,,2015,,445860.0,1,1,γδ T cells represent a unique group of lymphoc...,γ
886539,863876,γδ T cells represent a unique group of lymphoc...,2016,adaptive immunity; Alleles; Antigens; base; Ce...,GENETIC DISSECTION OF ID3-MEDIATED PATHWAYS IN...,HHS,NIH,NIAID,5P01AI102853-03 (7439),5/1/2016,4/30/2017,"ZHUANG, YUAN",,13,64367329,INSTITUTE FOR CANCER RESEARCH,PHILADELPHIA,PA,191112434,UNITED STATES,5/1/2016,4/30/2017,,2016,,486748.0,1,1,γδ T cells represent a unique group of lymphoc...,γ
981662,985345,γδ T cells represent a unique group of lymphoc...,2017,adaptive immune response; Alleles; Antigens; C...,GENETIC DISSECTION OF ID3-MEDIATED PATHWAYS IN...,HHS,NIH,NIAID,5P01AI102853-04 (7439),5/1/2017,4/30/2018,"ZHUANG, YUAN",,13,64367329,INSTITUTE FOR CANCER RESEARCH,PHILADELPHIA,PA,191112434,UNITED STATES,5/1/2017,4/30/2018,,2017,,384469.0,1,1,γδ T cells represent a unique group of lymphoc...,γ
698501,648993,γδ T cells represent a unique group of lymphoc...,2014,Alleles; Antigens; base; Cell Lineage; Cells; ...,GENETIC DISSECTION OF ID3-MEDIATED PATHWAYS IN...,HHS,NIH,NIAID,1P01AI102853-01A1 (7439),5/15/2014,4/30/2019,"ZHUANG, YUAN",,13,64367329,INSTITUTE FOR CANCER RESEARCH,PHILADELPHIA,PA,191112434,UNITED STATES,5/15/2014,4/30/2015,,2014,,,1,1,γδ T cells represent a unique group of lymphoc...,γ
1019773,1024518,γδ T cells represent a unique group of lymphoc...,2018,adaptive immune response; Alleles; Antigens; C...,GENETIC DISSECTION OF ID3-MEDIATED PATHWAYS IN...,HHS,NIH,NIAID,5P01AI102853-05 (7439),5/1/2018,4/30/2019,"ZHUANG, YUAN",,13,64367329,INSTITUTE FOR CANCER RESEARCH,PHILADELPHIA,PA,191112434,UNITED STATES,5/1/2018,4/30/2019,,2018,,385710.0,1,1,γδ T cells represent a unique group of lymphoc...,γ


In [29]:
df.loc[955744, "working_abstract"]

'ƒ § ¤ ƒ “ ƒ « ¤ ‹ › ﬁ ﬂ –¢† « ¤ ﬁ “‡ﬁ · ‹ ¶ ﬁ › † « ¤ “ ¤¢ ƒ ¶ · ﬁ ¶ ﬂ ƒ ¶ ‚ « ¶ ‹ « ¶ ƒ „ ”ƒ “ ƒ – ﬁ ﬁ § ¤ “ › ƒ ‹ † “ ¤ ‚ › ‚ ¤ “ ‹ † “ ‹ ƒ ¶ ‹ ƒ – –»‚ … /¥ ‰ ¿¥`« ¶ ¤ „ « ƒ¥« ﬁ ‹ ﬁ › › “ ¤ « ” ƒ ^ ﬁ ¶ « ‚ –~¤ ¯ ƒ ˘ . ¨· ﬁ –¢ﬂ › ﬁ « ¤ · ‚¥« ﬁ¥§ ¶ ﬁ ﬂ ‹ ƒ † † « – † ‚ ﬁ · › ﬁ ﬂ†‚‚ƒ“‚ﬁ¶‚ ‡¥¥ ¥ ´ ‹ﬁ“·· § – ƒ ‚ ° ﬂ ƒ ‚ ¤¢ “ “ ﬁ ¸ ƒ – ˘ . ¨ ‚ · ﬁ ¶ § ¶ ƒ ﬂ ¤ ‹ « ƒ ﬂ „ « ” ƒ « RA“ Gﬂ ¤ T‚ ‹ Oﬁ P¸ ƒ ¶ ƒ ﬂ ˘ . ¨ · –~¤ ¯ ƒ–ƒ‚·ﬁ¶ﬂƒ‚¤“°†“ﬂﬂƒ‚¤“¶¤„ﬁ‚̋¤«‹ƒ‚̋¤«ﬂƒ‚¤¶ƒﬂ˛ﬁ¶ƒ‚‹ƒ“«§¶ﬁ§ƒ¶«¤ƒ‚¥ˇ'

In [37]:
dict(df["Start Char"].value_counts())

{'D': 236243,
 'T': 182833,
 'P': 73793,
 'A': 44309,
 'I': 19379,
 'C': 19002,
 'S': 18308,
 'O': 13985,
 'W': 13058,
 'M': 11028,
 'B': 9689,
 'N': 8350,
 'R': 7522,
 'E': 6637,
 'F': 5155,
 'H': 5115,
 '1': 4031,
 'G': 4000,
 'U': 3891,
 'L': 3398,
 'V': 1659,
 '0': 1423,
 '7': 844,
 'K': 715,
 '2': 583,
 'J': 502,
 'Q': 411,
 'i': 369,
 'a': 236,
 '6': 205,
 '3': 199,
 'Y': 182,
 '9': 154,
 'X': 146,
 '5': 121,
 'Z': 119,
 '4': 96,
 't': 91,
 '8': 89,
 'c': 76,
 'p': 74,
 'b': 70,
 'l': 58,
 'h': 58,
 'n': 44,
 'e': 43,
 'o': 41,
 's': 40,
 'm': 36,
 'r': 32,
 'f': 32,
 'v': 29,
 'g': 18,
 'd': 17,
 'y': 11,
 'u': 10,
 'w': 10,
 'j': 6,
 'γ': 5,
 'α': 4,
 'β': 3,
 'z': 1,
 'x': 1,
 'δ': 1,
 '²': 1,
 'q': 1}

In [38]:
# "junk" phrases at start to remove

start_phrases=['abstract', 'summary', 'proposal', 'description', 'narrative', 
               'technical abstract',
               'non technical abstract', 
               'non- technical abstract',
               'non-technical abstract',                      
               'nontechnical abstract',
               'technical summary', 
               'nontechnical summary',
               'non-technical summary',
               'non-technical description',
               'description (provided by the applicant)',
               'description (provided by investigator)',  
               'description (provided by applicant)',
               'project summary/abstract',
               'proposal abstract',
               'research abstract',
               'project summary',
               'research summary',
               'project description'
               'see instructions):',
               'for center application (provided by the investigator):',
               'objective(s)',      
               'exceed the space provided',
               'provided by applicant',
               'provided by candidate']
                
 # [hrd #######]


In [39]:
#Remove found start phrases

for phrase in start_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']) 

# strip non-alphanum characters from the beginning and end of each abstract

df["working_abstract"] = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = drop_empties(df, "working_abstract")    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Int64Index([896234, 835221, 837285, 896226, 881341, 854895, 370437, 479457,
            966890, 172952, 879838],
           dtype='int64')
dropped 11


In [40]:
#Repeated start phrase removal in case the order of project summary/abstract varies

for phrase in start_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start'])  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [41]:
#starting_exact_phrases to remove

#'This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.'
#'This subproject is one of many research subprojects utilizing theresources provided by a Center grant funded by NIH/NCRR. The subproject andinvestigator (PI) may have received primary funding from another NIH source,and thus could be represented in other CRISP entries. The institution listed isfor the Center, which is not necessarily the institution for the investigator.'

df[wa]=df[wa].apply(lambda x: x.replace('This subproject represents an estimate of the percentage of the CTSA funding that isbeing utilized for a broad area of research (AIDS research, pediatric research, orclinical trials).  The Total Cost listed is only an estimate of the amount of CTSAinfrastructure going towards this area of research, not direct funding provided bythe NCRR grant to the subproject or subproject staff.',
                                       ''))

expression=re.compile('This subproject is one of many research subprojects.*not necessarily the institution for the investigator.')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

expression=re.compile('This subproject is one of many research subprojects.*to the subproject or subproject staff.')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [42]:
# strip non-alphanum characters from the beginning and end of each abstract

df["working_abstract"] = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = drop_empties(df, "working_abstract")
    
# update Start Char column in df
df['Start Char']=df[wa].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Int64Index([766880, 210938, 464990, 768772,  66012, 621828, 776084, 802916,
            173457, 260127, 644375, 644228],
           dtype='int64')
dropped 12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [43]:
end_phrases = ['(end of abstract',
               'end of abstract', 
               '(abstract end',  
               '(end of abstract',
               '(end 0f abstract',
               '(end of absract',
               '(abstract below',
               '(end of reviewers\' comment',
               '(end abstract',
               'performance site ========================================section end',
               'key personnel ========================================section end',
               '[summary truncated at 7800 characters', 
               'this award reflects nsf\'s statutory mission and has been deemed worthy of support through evaluation using the foundation\'s intellectual merit and broader impacts review criteria',
               'project description page 6', 'page 1 of 1', 'project summary/abstract page 6',
               'project description page 7', 'project summary/abstract page 7', 'pag 1 o 1', 
               'page 2 number pages consecutively at the bottom throughout form page 2']


In [44]:
# end phrase removal

for phrase in end_phrases:
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'End'])  

# strip non-alphanum characters from the beginning and end of each abstract

df["working_abstract"] = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = drop_empties(df, "working_abstract")
    
# update Last Char column in df
df['LAST_CHAR']=df[wa].apply(lambda x: x[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Int64Index([], dtype='int64')
dropped 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [45]:
# "junk" removal within text body - not necessarily at the start or end

# 'Enter the text here that' ending with 'lines of text.'
expression=re.compile('Enter the text here that.*lines of text')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

expression=re.compile('PHS .*?Continuation Format Page')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

expression=re.compile('OMB No .*?Continuation Format Page')
df[wa]=df[wa].apply(lambda x: re.sub(expression,'',x))

df[wa]=df[wa].replace('Project Summary/Abstract','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [46]:
"""If it starts with 'one page and must contain',
This is an NIH thing and there aren't that many of them, but come from 3 different cfda
it will start with "one page and must contain a summary of the proposed activity suitable for dissemination to 
thepublic. It should be a self-contained description of the project and should contain a statement of objectives 
and methods to be employed. It should be informative to other persons working in the same or related fields and 
insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any 
proprietary/confidential information.* Please click the add attachment button to complete this entry." plus some 
attachments, which includes tracking number, twice: following the second trackign number, there is a grant number
followed by the actual content" 

At the end of these files, they all end in 'Project Narrative File'(last instance) followed by more attachments, 
all of which can be discarded
"""

expression1=re.compile('one page and must.*?Tracking Number.*?(Tracking Number)')
expression2=re.compile('Project Narrative File.*')

def fix_abstract(abstract):
    if abstract.startswith('one page and must contain'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract

df[wa]=df[wa].apply(fix_abstract)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [47]:
# removal of phrase at end or beginning?

expression=re.compile('Project Summary/Abstract Page.*')

def remove_contact_pd(x):
    
    """removes clause at end that tends to occur: eg Project Summary/Abstract Page 222Contact PD/PI: Sampson, HughNarrative ("""
    
    if x.startswith('Contact PD/PI'):
        return re.sub(expression,'',x)
    else:
        return x
    
df[wa]=df[wa].apply(remove_contact_pd) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
# strip non-alphanum characters from the beginning and end of each abstract

df["working_abstract"] = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = drop_empties(df, "working_abstract")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Int64Index([752337, 709331, 927252], dtype='int64')
dropped 3


In [49]:
def remove_title_org(record):
    
    # This function removes project titles and organization names from abstracts
    
    """ ignores case to remove multi-word phrases in a particular order, especially those likely to run into other words,
    e.g. Institution university of washingtonPI mary williams. This doesn't work when titles or insititutions have escape characters in them, which is a bummer
    see for example ENHANCING THE USE OF NASA EARTH SCIENCE RESULTS / DATA / AND TECHNOLOGY BY ENGAGING THE FEDERATION OF EARTH SCIENCE INFORMATION PARTNERS COMMUNITIES OF
    PRACTICE IN TARGET AREAS OF INTEREST TO NASA THE FEDERATION OF EARTH SCIENCE INFORMATION PARTNERS (''FED"""
    
    title=record['PROJECT_TITLE']
    
    try:
        new_abstract=re.sub(title,'',record[wa],flags=re.IGNORECASE)      
        return re.sub(record['ORGANIZATION_NAME'],'',new_abstract,flags=re.IGNORECASE)   
    except:
        try:
            return re.sub(record['ORGANIZATION_NAME'],'',record[wa],flags=re.IGNORECASE)   
        except:
            return record[wa]
        
        
df[wa]=df.apply(lambda x: remove_title_org(x),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [52]:
# strip non-alphanum characters from the beginning and end of each abstract

df["working_abstract"] = [strip_nonalnum(abstract) for abstract in df["working_abstract"]]
df = drop_empties(df, "working_abstract")

df['Start Char']=df[wa].apply(lambda x:x[0])
df['LAST_CHAR']=df[wa].apply(lambda x:x[-1])
df['nchar']=df[wa].apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Int64Index([ 12392, 463597, 998105, 828642, 303523,  12493,  11956,  13988,
             11786, 503566,
            ...
            386735, 719763, 920987, 301488, 199090, 294437, 292405, 652474,
            909636, 919062],
           dtype='int64', length=5524)
dropped 5524


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [145]:
len(df[df['nchar'] < 50])

1202

In [146]:
temp = df[df['nchar'] < 50]
print(len(temp))
print(len(temp.working_abstract.unique()))

1202
484


In [147]:
temp.working_abstract.value_counts()[0:50]

Not Provided                                         77
N/A                                                  45
No abstract provided                                 42
is needed, use Project                               31
Abstract not available                               30
TBD                                                  28
not yet available                                    27
alcohol                                              20
A range of systems will be studied                   19
NOT AVAILABLE                                        15
not provided                                         15
Not Applicable                                       15
not available                                        14
No ABS                                               13
Not required, per RFA-DK-13-007                      10
RELEVANCE (See instructions                          10
NA                                                   10
No abstract available                           

In [127]:
len(temp.working_abstract.unique())

436

In [128]:
temp.working_abstract.unique()

array(['Services to the NCI by the up', 'alcohol',
       'IGF::OT::IGF LOX EXPANSION CYCLE (LEC', 'BARD',
       'Bldg. 560, Building Generator/ATS', '1',
       'Synthesis of (3.7, 1.5 and 2.0 mg', 'Synthesis of  (1.5 mg',
       'Synthesis of four thio linked mannosides',
       'Synthesis of  for Cu-free click reactions',
       'Synthesis of BSA-MI-4   HNAc',
       'Synthesis of  modified with biotin', 'Synthesis of',
       'Mass spectrometric analysis of 20 samples',
       'Synthesis of four mouse vaccination samples',
       'Synthesis of , 0.5 mg',
       'Synthesis of Fmoc ¿-D-GlcNAc(OAc)3 Ser-OH',
       'Synthesis of 50 mg of', 'Synthesis of  (~ 1 mg each',
       'Synthesis of , ~ 0.05 mg each',
       'Synthesis of 18B10.C7(3), 0.09 mg', 'Synthesis of , 50 mg',
       'is needed, use Project', 'is needed,use Project', 'N/A',
       'No abstract required', 'Document not required',
       'RELEVANCE (See instructions',
       'Please see Research Strategy section',
      

In [None]:
df = df[df.ABSTRACT != 'Not Provided']
df = df[df.ABSTRACT != 'N/A']
df = df[df.ABSTRACT != 'No abstract provided']
df = df[df.ABSTRACT != 'Abstract not available']
df = df[df.ABSTRACT != 'TBD']
df = df[df.ABSTRACT != 'not yet available']
df = df[df.ABSTRACT != 'Per instructions in PAR-14-021, this section left blank']
df = df[df.ABSTRACT != 'Not Applicable']
df = df[df.ABSTRACT != 'not provided']
df = df[df.ABSTRACT != 'NOT AVAILABLE']
df = df[df.ABSTRACT != 'not available']
df = df[df.ABSTRACT != 'UCSF LIVER CENTERFor the project summary/abstract please refer to the OVERALL/OVERVIEW section ']
df = df[df.ABSTRACT != 'Please see the Speciﬁc Aims for this Component. The abstract for this application is inthe Overall Component']
df = df[df.ABSTRACT != 'No ABS']

is needed, use Project
No abstract available 
NOT INCLUDED
Document not required
Not provided by applicant 
OVERALL\t 3
See Overall Component-Project Summary
No abstract required


In [89]:
len('Please see the Speciﬁc Aims for this Component. The abstract for this application is inthe Overall Component')

108

In [17]:
def remove_short_abstracts(df, limit):

    # Remove abstracts with length < limit. 150 seems like a good cutoff, but it does lose some useful information.
    
    # what do we want to do for the cutoff?  -- SOLUTION UNTIL FURTHER EXPLORATION -> do not remove short abstracts
    
    df['nchar']=df['working_abstract'].apply(len)
    l1 = len(df)
    df=df.loc[df['nchar']>=limit]
    l2 = len(df)
    
    print(l1-l2, "short abstracts removed")
    
    return df

In [5]:
df = remove_short_abstracts(df,limit=150)

1270 short abstracts removed


In [25]:
df.to_pickle("./clean_data_7-7-20.pkl")


In [8]:
# NOT RUNNING THIS

#####################
#Additional expressions we could choose to remove
#Identify abstracts with excessive amounts of other fields to uncover additional bad abstract types
#If we wanted to be on the safe side, some EDA makes me think we could remove anything with more than 
# 3 or 4 of these fields. It's where they start getting wonky.
###################

fields=['Principal Investigator','Program Director','Attachment','Instructions','Lines',
        'Space Provided','Performance Site','Organization','Key Personnel']
all_fields=fields.copy()
all_fields.extend([x.lower() for x in fields])
all_fields.extend([x.upper() for x in fields])
all_fields.extend(['PI','Form','Page','Title','.pdf','.doc'])

def count_up_fields(abstract):
    count=0
    for field in all_fields:
        if field in abstract:
            count+=1
    return count

df['Field Count']=df[wa].apply(count_up_fields)


In [176]:
# NOT RUNNING THIS

###########################
#Additional expressions we could remove, but there is a small possibility of some information being lost
##########################

#Issues: 'Close FormNextPrint PageAbout OMB Number']#This is usually ended with "Project summary", 
#so anything between those 2 can be delete, and #ended with a clause starting with 'Close FormProject' and ending in'Narrative File'

#expression1=re.compile('Close FormNext.*?Project Summary')
#expression2=re.compile('Close FormProject.*Narrative File')

def fix_abstract(abstract):
    if abstract.startswith('Close FormNext'):
        abstract=re.sub(expression1,'',abstract)
        return re.sub(expression2,'',abstract)
    else:
        return abstract

#df[wa]=df[wa].apply(fix_abstract)

#If ends in 'Description,', then go to last instance of PERFORMANCE (for Performance SITES), otherwise "KEY PERSONNEL", upper case, and cut all that follows

#expression1=re.compile('PERFORMANCE.*Description,$')
#expression2=re.compile('KEY PERSONNEL.*Description,$')

def apply_expressions(abstract):
    if abstract.endswith('Description,'):
        if re.search(expression1,abstract) != None:
            return re.sub(expression1,'',abstract)
        else:
            return re.sub(expression2,'',abstract)
    else:
        return abstract
    
#df[wa]=df[wa].apply(apply_expressions)

#expression1=re.compile('PERFORMANCE.*Page 3$')
#expression2=re.compile('KEY PERSONNEL.*Page 3,$')

def apply_expressions(abstract):
    if abstract.endswith('Description,'):
        if re.search(expression1,abstract) != None:
            return re.sub(expression1,'',abstract)
        else:
            return re.sub(expression2,'',abstract)
    else:
        return abstract
    
#df[wa]=df[wa].apply(apply_expressions)

