# Creation of Test and Training Sets 

### Using the same agency distribution as in the original data

In [1]:
import time
import importlib

import pandas as pd
import numpy as np
import random

In [2]:
import TextCleaning

In [3]:
raw_df=pd.read_csv('../../data/prd/RND Topic Modelling/abstracts_federal_reporter_combined.csv',engine='python')

In [4]:
# Junk starting phrases noticed through investigation of starting characters, as well as those identified in R Analysis

start_phrases=['****TECHNICAL ABSTRACT****','****Technical Abstract****',
               '****Non Technical Abstract****','*** Non- Technical Abstract ***','**Non-Technical Abstract**',
          '*****NON-TECHNICAL ABSTRACT*****','***** NON-TECHNICAL ABSTRACT *****'
          '****NONTECHNICAL ABSTRACT****','****Non-Technical Abstract****','*Non-technical Abstract*',
               '*****NON-TECHNICAL ABSTRACT*****','****NON-TECHNICAL ABSTRACT****',
               '***NON-TECHNICAL ABSTRACT***','****Nontechnical abstract****'
               'DESCRIPTION (provided by applicant):','DESCRIPTION (provided by applicant)',
               'Project Summary/Abstract','PROJECT SUMMARY/ABSTRACT','ABSTRACT',
               'PROJECT SUMMARY','Project Summary','/ASBTRACT','/ Proposal','/ SUMMARY','/ DESCRIPTION','/PROJECT SUMMARY',
               '/ PROJECT SUMMARY','/Abstract:','/ABSTRACT:','/ABSTRACT','/ ABSTRACT:',
               '/ ABSTRACT','/Abstract','/ Abstract','/Description','/SUMMARY','/PROJECT SUMMARY',
              '/ RESEARCH SUMMARY','/PROJECT SUMMARY','/abstract','/Proposal Abstract',
               '/DESCRIPTION','/PROJECT DESCRIPTION','/PROJECT SUMMARY','/NARRATIVE','/RESEARCH ABSTRACT','/ PROJECT DESCRIPTION',
              'EXCEED THE SPACE PROVIDED',
              'one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any proprietary/confidential information.* Please click the add attachment button to complete this entry.']

#Ending phrases noticed through manual investigation of last character

end_phrases = ['(End of Abstract)',"End of Abstract", '(Abstract end)' "(END OF ABSTRACT)", '(End of abstract.)','(Abstract End)','(End 0f Abstract)','(End of Abstract.)','(End of Absract)',
               '(Abstract below)','(End of Reviewers\' Comment)','(End Abstract)','(End of abstract)','(End of abstract)',
               'PERFORMANCE SITE ========================================Section End===========================================',
                'KEY PERSONNEL ========================================Section End===========================================',
               '[summary truncated at 7800 characters]', 
               'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.',
               'Project Description Page 6', 'Page 1 of 1', 'Project Summary/Abstract Page 6',
               'Project Description Page 7', 'Project Summary/Abstract Page 7', 'Pag 1 o 1', 
               'Page 2 Number pages consecutively at the bottom throughout Form Page 2',
               'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.']

In [5]:
df = TextCleaning.remove_nulls(raw_df, "ABSTRACT")
df = TextCleaning.remove_duplicates(df)
df = TextCleaning.create_working_abstract_col(df)

3 nulls in  ABSTRACT . These rows removed.
11 duplicate abstracts removed
0 project ID duplicates - not removed


In [6]:
df = TextCleaning.remove_junk_start(df, 'working_abstract', start_phrases)

****TECHNICAL ABSTRACT****
****Technical Abstract****
****Non Technical Abstract****
*** Non- Technical Abstract ***
**Non-Technical Abstract**
*****NON-TECHNICAL ABSTRACT*****
***** NON-TECHNICAL ABSTRACT *********NONTECHNICAL ABSTRACT****
****Non-Technical Abstract****
*Non-technical Abstract*
*****NON-TECHNICAL ABSTRACT*****
****NON-TECHNICAL ABSTRACT****
***NON-TECHNICAL ABSTRACT***
****Nontechnical abstract****DESCRIPTION (provided by applicant):
DESCRIPTION (provided by applicant)
Project Summary/Abstract
PROJECT SUMMARY/ABSTRACT
ABSTRACT
PROJECT SUMMARY
Project Summary
/ASBTRACT
/ Proposal
/ SUMMARY
/ DESCRIPTION
/PROJECT SUMMARY
/ PROJECT SUMMARY
/Abstract:
/ABSTRACT:
/ABSTRACT
/ ABSTRACT:
/ ABSTRACT
/Abstract
/ Abstract
/Description
/SUMMARY
/PROJECT SUMMARY
/ RESEARCH SUMMARY
/PROJECT SUMMARY
/abstract
/Proposal Abstract
/DESCRIPTION
/PROJECT DESCRIPTION
/PROJECT SUMMARY
/NARRATIVE
/RESEARCH ABSTRACT
/ PROJECT DESCRIPTION
EXCEED THE SPACE PROVIDED
one page and must contain a 

In [7]:
df = TextCleaning.remove_junk_end(df, 'working_abstract', end_phrases) 

(End of Abstract)
End of Abstract
(Abstract end)(END OF ABSTRACT)
(End of abstract.)
(Abstract End)
(End 0f Abstract)
(End of Abstract.)
(End of Absract)
(Abstract below)
(End of Reviewers' Comment)
(End Abstract)
(End of abstract)
(End of abstract)
[summary truncated at 7800 characters]
This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria.
Project Description Page 6
Page 1 of 1
Project Summary/Abstract Page 6
Project Description Page 7
Project Summary/Abstract Page 7
Pag 1 o 1
Page 2 Number pages consecutively at the bottom throughout Form Page 2
This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria.


In [8]:
# create document set to process if not using entire set

#idx = random.sample(range(len(df['working_abstract'])), 50000)
#docs = df['working_abstract'].iloc[idx]

#nasa_df = df[df['DEPARTMENT']=='NASA']
#docs = df['working_abstract']
#len(docs)

df.reset_index(inplace = True)
df.rename(columns={'index':'original index'}, inplace=True)

In [10]:
tokened_abstracts = TextCleaning.tokenize(df['working_abstract'])
df['tokened_abstracts'] = tokened_abstracts

Time to tokenize abstracts 421.9240608215332 seconds


In [11]:
stopWords = TextCleaning.create_stopwords()
tokened_docs_nostop = TextCleaning.remove_stopwords(df['tokened_abstracts'], stopWords)

df['tokened_docs_nostop'] = tokened_docs_nostop

In [13]:
tns_bi_tri_docs = TextCleaning.add_bi_tri_grams(df['tokened_docs_nostop'])
df['tns_bi_tri_docs'] = tns_bi_tri_docs

In [15]:
lemma_docs = TextCleaning.lemmatize(df['tns_bi_tri_docs'])
df['lemma_abstracts'] = lemma_docs

Time to lemmatize: 4811.499058008194 seconds


In [16]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,...,OTHER_PIS,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST,working_abstract,Start Char,tokened_abstracts,tokened_docs_nostop,tns_bi_tri_docs,lemma_abstracts
0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,...,"CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",WHEELING JESUIT UNIVERSITY,47.076,1999467.0,"This is a project to explore Game-based, Metap...",T,"[this, project, explore, game, based, metaphor...","[explore, game, based, metaphor, enhanced, gam...","[explore, game, based, metaphor, enhanced, gam...","[explore, game, base, metaphor, enhanced, game..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,...,"ELINICH, KAREN; YOON, SUSAN",FRANKLIN INSTITUTE,47.076,1799699.0,Institution: Franklin Institute Science Museum...,I,"[institution, franklin, institute, science, mu...","[franklin, institute, science, museum, snyder,...","[franklin_institute, science, museum, snyder, ...","[franklin_institute, science, museum, snyder, ..."
2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,...,"CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0,Through programs (including small group conver...,T,"[through, programs, including, small, group, c...","[programs, small, group, conversations, citize...","[programs, small, group, conversations, citize...","[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,...,"MILLER, BRADLEY; BOWMAN, KATHERINE",INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0,In partnership with the American Chemical Soci...,I,"[partnership, with, the, american, chemical, s...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, ac,..."
4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,...,,CORNELL UNIVERSITY ITHACA,47.074,370996.0,Amphibian populations around the world are exp...,A,"[amphibian, populations, around, the, world, a...","[amphibian, populations, around, world, experi...","[amphibian, populations, around_world, experie...","[amphibian, population, around_world, experien..."


In [19]:
# save this df

df.to_pickle("./entire_data_set.pkl")

In [7]:
# import saved data

df = pd.read_pickle("./entire_data_set.pkl")

### Create Train and Test Sets

#### 8 Agencies - HHS, NSF, USDA, NASA, DOD, VA, EPA, ED

In [8]:
dept_percent = df['DEPARTMENT'].value_counts()/len(df)
dept_percent.index.tolist()

['HHS', 'NSF', 'USDA', 'NASA', 'DOD', 'VA', 'EPA', 'ED']

In [23]:
# create train and test sets

agency_list = dept_percent.index.tolist() #['HHS', 'NSF', 'USDA', 'NASA', 'DOD', 'VA', 'EPA', 'ED']

train_idx = []
test_idx = []
split_percent = 0.6

for agency in agency_list:

    # find indices for specific department
    idx = df[df['DEPARTMENT']==agency].index.tolist()

    # find a random subset of indices from each agency
    # first 80% of indices: train set
    # last 20% of indices: test set
    
    rand_idx = random.sample(idx, len(idx))
    test_train_split = int(split_percent*len(idx))
    
    train_idx.append(rand_idx[:test_train_split])
    test_idx.append(rand_idx[test_train_split:])

# create one list from list of lists for train_idx and test_idx
flat_train_idx = [ix for idx_list in train_idx for ix in idx_list]
flat_test_idx = [ix for idx_list in test_idx for ix in idx_list]
        
# create train and test sets
train_df = df.loc[flat_train_idx]
test_df = df.loc[flat_test_idx]


In [3]:
# save/read train & test df's

#train_df.to_pickle("./train_df.pkl")
#test_df.to_pickle("./test_df.pkl")

train_df = pd.read_pickle("./train_df.pkl")
test_df = pd.read_pickle("./test_df.pkl")

In [4]:
train_df['DEPARTMENT'].value_counts()/len(train_df)

HHS     0.682531
NSF     0.211805
USDA    0.049633
NASA    0.029097
DOD     0.012635
VA      0.008835
EPA     0.002820
ED      0.002645
Name: DEPARTMENT, dtype: float64

In [5]:
test_df['DEPARTMENT'].value_counts()/len(test_df)

HHS     0.682500
NSF     0.211800
USDA    0.049637
NASA    0.029104
DOD     0.012643
VA      0.008835
EPA     0.002827
ED      0.002654
Name: DEPARTMENT, dtype: float64

In [6]:
test_df['DEPARTMENT'].value_counts()

HHS     75088
NSF     23302
USDA     5461
NASA     3202
DOD      1391
VA        972
EPA       311
ED        292
Name: DEPARTMENT, dtype: int64

In [19]:
dept_percent

HHS     0.682525
NSF     0.211804
USDA    0.049634
NASA    0.029098
DOD     0.012637
VA      0.008835
EPA     0.002821
ED      0.002647
Name: DEPARTMENT, dtype: float64

If we decide to use an evenly split test/train set:

The agency with the least amount of abstracts is ED with 1456.  
80%-20% split:  
- training set: 1164
- testing set: 292 

So, the largest training set we could have is 1164 abstracts from each of the 8 agencies: 1164\*8 = 9,312  
and the largest test set we could have is 292\*8 = 2,336.