### Corpus Filtering Method by Eads et al.

**Testing on the theme of AI**

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

##### Data Ingestion

In [2]:
# Load the pickled table of project metadata and tokenized abstracts, then move
# its existing index into a regular "index" column so rows are numbered 0..n-1.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.

df = (
    pd.read_pickle("../../data/prd/Tech-Report/FR_meta_and_final_tokens_21SEPT14.pkl")
    .reset_index()
)

In [3]:
# Print the frame's (rows, columns) shape, then display its first rows.
print(df.shape)
df.head()

(696093, 31)


Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,final_tokens
0,0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"[project, explore, game, base, metaphor, enhan..."
1,1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,"[institution, franklin_institute, science, mus..."
2,2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,"[program, small, group, conversation, citizen,..."
3,3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,"[partnership, american, chemical, society, acs..."
4,4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,"[center, molecular, interfacing, cmi, enable, ..."


In [4]:
# Number of missing (NaN) values in each column of the full corpus.
df.isna().sum()

index                              0
original index                     0
PROJECT_ID                         0
ABSTRACT                           0
FY                                 0
PROJECT_TERMS                  13006
PROJECT_TITLE                      0
DEPARTMENT                         0
AGENCY                             0
IC_CENTER                     175386
PROJECT_NUMBER                     0
PROJECT_START_DATE                 0
PROJECT_END_DATE              128963
CONTACT_PI_PROJECT_LEADER         41
OTHER_PIS                     601917
CONGRESSIONAL_DISTRICT         63655
DUNS_NUMBER                     9370
ORGANIZATION_NAME               1376
ORGANIZATION_CITY               5244
ORGANIZATION_STATE              8719
ORGANIZATION_ZIP               45394
ORGANIZATION_COUNTRY            5034
BUDGET_START_DATE             227230
BUDGET_END_DATE               227292
CFDA_CODE                     170273
FY.y                               0
FY_TOTAL_COST                 179217
F

##### Step 1 - short intuitive keyword list

In [None]:
# skip for now

#ai_keywords = ['artificial_intelligence']

##### Step 2 - LDA on full corpus

**Note:** During DSPG, many topic models with large numbers of topics were tested, but none produced a topic unambiguously related to AI. So for this step, we first filter the corpus to NSF projects with CFDA code 47.070 (Computer and Information Science and Engineering) and then compute a topic model on that subset only.

In [5]:
# Number of rows for each distinct AGENCY value, most frequent first.
df['AGENCY'].value_counts()

NIH        503425
NSF        121718
NIFA        25625
NASA        16005
CDMRP        7593
VA           5529
ARS          3950
ALLCDC       3108
AHRQ         2739
IES          1764
FDA          1558
EPA          1540
NIDILRR       810
ACF           283
FS            194
CNRM          131
DVBIC          99
CCCRP          22
Name: AGENCY, dtype: int64

In [6]:
# filter corpus - keep NSF projects only; the CFDA 47.070 restriction is
# applied in a separate cell below, after inspecting the NSF subset

df_nsf_cs = df.loc[df['AGENCY'].eq('NSF')]

In [7]:
# Number of missing (NaN) values in each column of the current df_nsf_cs subset.
df_nsf_cs.isna().sum()

index                              0
original index                     0
PROJECT_ID                         0
ABSTRACT                           0
FY                                 0
PROJECT_TERMS                      2
PROJECT_TITLE                      0
DEPARTMENT                         0
AGENCY                             0
IC_CENTER                     121718
PROJECT_NUMBER                     0
PROJECT_START_DATE                 0
PROJECT_END_DATE                   0
CONTACT_PI_PROJECT_LEADER          0
OTHER_PIS                      77912
CONGRESSIONAL_DISTRICT           716
DUNS_NUMBER                     2477
ORGANIZATION_NAME                  0
ORGANIZATION_CITY                194
ORGANIZATION_STATE               708
ORGANIZATION_ZIP                 465
ORGANIZATION_COUNTRY               4
BUDGET_START_DATE             121718
BUDGET_END_DATE               121718
CFDA_CODE                         48
FY.y                               0
FY_TOTAL_COST                    553
F

In [8]:
# Number of rows for each distinct CFDA_CODE value in the current df_nsf_cs subset.
df_nsf_cs['CFDA_CODE'].value_counts()

47.049    25434
47.041    23933
47.070    16416
47.074    12994
47.050    12683
47.075    10946
47.076     9878
47.082     4712
47.079     2626
47.078      977
47.080      853
47.083      164
47.081       54
Name: CFDA_CODE, dtype: int64

In [9]:
# Restrict the NSF subset to CFDA code 47.070 (Computer and Information
# Science and Engineering). The code is matched as the string '47.070'.
df_nsf_cs = df_nsf_cs.loc[df_nsf_cs['CFDA_CODE'].eq('47.070')]

In [10]:
# (rows, columns) of the filtered subset.
df_nsf_cs.shape

(16416, 31)

In [11]:
# The vectorizer used for LDA expects one string per document, so each
# document's token list is joined back into a single space-separated string.

docs = df_nsf_cs["final_tokens"]
text = [" ".join(tokens) for tokens in docs]

In [12]:
# create document-term matrix
#
# Vocabulary pruning: max_df=0.6 drops terms found in more than 60% of the
# documents; min_df=20 drops terms found in fewer than 20 documents.
# Optional custom stop words (currently disabled):
#   stop_wds = ['research', 'study', 'project']  ->  stop_words=stop_wds
lda_vectorizer = CountVectorizer(
    max_df=0.6,
    min_df=20,
)
lda_dtm = lda_vectorizer.fit_transform(text)

In [22]:
# LDA -- to get same results as Haleigh must use same number of cores as well as random_state
# (i.e. keep both n_jobs and random_state exactly as set here)

num_topics = 100
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    doc_topic_prior=1 / num_topics,
    topic_word_prior=0.1,
    n_jobs=19,
    random_state=1,
)
doc_top = lda_model.fit_transform(lda_dtm)  # fit the model and get the document-topic matrix
top_term = lda_model.components_            # topic-term matrix, one row per topic

In [14]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(top_term, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    Parameters
    ----------
    top_term : 2-D array
        Topic-term matrix (one row per topic, one column per vocabulary
        term). Each row must support ``.argsort()`` and integer indexing,
        e.g. a NumPy array.
    vectorizer : fitted vectorizer
        Object whose ``get_feature_names_out()`` (or, on older scikit-learn
        releases, ``get_feature_names()``) returns the vocabulary, with
        position ``i`` naming column ``i`` of ``top_term``.
    top_n : int, default 10
        Number of terms to print per topic.

    Returns
    -------
    None
        Output is written to stdout: a ``Topic <idx>:`` header per topic
        followed by one ``(term, weight)`` tuple per line, highest weight
        first.
    """
    # Resolve the vocabulary once instead of once per printed term.
    # get_feature_names() is gone from recent scikit-learn releases, so prefer
    # its replacement and fall back only when the replacement is unavailable.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # loop through each row of topic-term-matrix.  idx = row index.  topic = actual row
    for idx, topic in enumerate(top_term):
        print("\nTopic %d:" % (idx))
        # argsort() is ascending; the reversed slice takes the last top_n
        # positions, i.e. the largest weights in descending order.
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))

In [23]:
# Show the ten highest-weighted terms of every topic in the fitted model.
print_topics(top_term, lda_vectorizer, top_n=10)


Topic 0:
('simulation', 1000.1575650675231)
('system', 647.6364193990157)
('dynamics', 642.866840173841)
('model', 486.7544183671984)
('process', 434.23622805482375)
('complex', 317.56220200152524)
('study', 309.67062934616575)
('behavior', 280.8635304010139)
('change', 255.6758442262005)
('time', 253.527316145451)

Topic 1:
('school', 1691.4775555014635)
('teacher', 1162.0896609218298)
('student', 1160.3792115043263)
('course', 959.3147125763461)
('cs', 857.6459876551179)
('high', 809.4504969779451)
('computer', 764.3245051646703)
('science', 657.5728976765444)
('curriculum', 398.0560936192607)
('education', 357.89076684603623)

Topic 2:
('infrastructure', 1116.460094179321)
('system', 831.5342498229434)
('support', 466.138854165251)
('researcher', 441.95808982936296)
('computing', 421.19933902688354)
('university', 406.99637545906205)
('computer', 401.2557716607453)
('community', 371.1037577054972)
('provide', 333.62613527620175)
('experiment', 294.48343315432885)

Topic 3:
('memory