# Eads et al Method, using NSF subsetted corpus to cfda = 47.070

In [1]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import filter
#from git/dspg21RnD/wheat_filtration/wheat_filtration import keywords
#from git/dspg21RnD/wheat_filtration/wheat_filtration import filter
#import keywords

In [2]:
def total_topic_proportion(document_topics, relevant_topics):
    """Return sum of relevant topic proportions for a document.
    Arguments:
        document_topics (iterable of float): topic proportions for one document.
        relevant topics (iterable of int): a list of the numbers corresponding
            with the topics considered relevant by the user."""
    assert (len(relevant_topics) <= len(document_topics)
            )  # TODO make this the right kind of error
    return sum([document_topics[i] for i in relevant_topics])


def keyword_proportion(document, keyword_list):
    """Return percentage of words in the given doc that are present in keyword_list."""
    doc_tokens = document.split()
    num_keywords = sum(
        [1 if word in keyword_list else 0 for word in doc_tokens])
    return float(num_keywords)/len(doc_tokens)


def superkeyword_presence(document, superkeywords):
    """Return 1 if document contains any superkeywords, 0 if not."""
    for word in superkeywords:
        if word in document.split():
            return True
    return False


class FilterHelper():
    """Creates a filter object containing filter criteria such as keyword list,
    superkeyword list, total topic proportion threshold, and keyword proportion
    threshold.

    Arguments:
        topic_model (TopicModel): a TopicModel object instantiated with a corpus or
            files from a Mallet topic model.
        relevant_topics (iterable of int): a list of the numbers corresponding
            with the topics considered relevant by the user. Note that the number
            corresponding with the first topic is '0', the second topic is '1', etc.
        n_keywords: number of keywords to include in keyword list. Default is 20.
        superkeywords (iterable of str): a list of keywords which signify immediate relevance
            of the document that contains them (better wording). Default is an empty list.
        keyword_list: A list of keywords ordered by [the relevance they signify]. Default is
            a keyword list generated using the relative entropy method.
        total_topic_prop_threshold (float): the threshold of relevance for the total proportion
            of relevant topics in a document. If a document surpases the threshold, it is considered relevant.
        keyword_prop_threshold (float): the threshold of relevance for the proportion of words
            on the keyword list that appear in a document. If a document surpases the threshold,
            it is considered relevant.

    Attributes:
        topic_model (TopicModel): a TopicModel object instantiated with a corpus or
            files from a Mallet topic model.
        relevant_topics (iterable of int): a list of the numbers corresponding
            with the topics considered relevant by the user.
        superkeywords (iterable of str): a list of keywords which signify immediate relevance
            of the document that contains them (better wording). Default is an empty list.
        keyword_list: A list of keywords ordered by [the relevance they signify]. Default is
            a keyword list generated using the relative entropy method.
        total_topic_prop_threshold (float): the threshold of relevance for the total proportion
            of relevant topics in a document. If a document surpases the threshold, 
            it is considered relevant. Default is 0.25.
        keyword_prop_threshold (float): the threshold of relevance for the proportion of words
            on the keyword list that appear in a document. If a document surpases the threshold,
            it is considered relevant. Default is 0.15.

    Raises:
        RuntimeError: if user enters both keyword list and n_keywords when using the
        keyword_list setter method.
        """

    def __init__(self, topic_model, vectorizer, relevant_topics, keyword_list=None, n_keywords=100, superkeywords=[],
                 term_words = [],
                 total_topic_prop_threshold=0.25, keyword_prop_threshold=0.15):
        self._relevant_topics = relevant_topics
        if keyword_list is None:
            keyword_list = keywords.rel_ent_key_list(
                topic_model, n_keywords, relevant_topics)
        self._keyword_list = keyword_list

        lower_superkeys = [word.lower() for word in superkeywords]
        # TODO: deal with this appropriately when making lowercasing optional
        extended_superkeys = [
            word for word in vectorizer.get_feature_names() if
            word in lower_superkeys or
            any([(chunk in lower_superkeys) for chunk in word.split('_')])
        ]
        self._superkeywords = extended_superkeys

        self._total_topic_prop_threshold = total_topic_prop_threshold
        self._keyword_prop_threshold = keyword_prop_threshold
        self.term_words = term_words
        self._topic_model = topic_model
        self._vectorizer = vectorizer

    @property
    def topic_model(self):
        """Get topic_model used to create filter"""
        return self._topic_model

    @property
    def relevant_topics(self):
        """Get list of relevant topics"""
        return self._relevant_topics

    @property
    def keyword_list(self):
        """Get or set keyword list. Input either a list of keywords, or input an integer n
        to generate a keyword list containing n words."""
        return self._keyword_list

    @keyword_list.setter
    def keyword_list(self, keyword_list=None, n_keywords=None):
        if keyword_list is not None:
            self._keyword_list = keyword_list
        elif n_keywords is not None:
            self._keyword_list = keywords.rel_ent_key_list(
                self.topic_model, n_keywords, self.relevant_topics)
        else:
            raise RuntimeError(
                "Enter either a keyword list or an integer for number of keywords")

    @property
    def superkeywords(self):
        return self._superkeywords

    @superkeywords.setter
    def superkeywords(self, superkeywords):
        self._superkeywords = superkeywords

    @property
    def total_topic_prop_threshold(self):
        return self._total_topic_prop_threshold

    @total_topic_prop_threshold.setter
    def total_topic_prop_threshold(self, total_topic_prop_threshold):
        self._total_topic_prop_threshold = total_topic_prop_threshold

    @property
    def keyword_prop_threshold(self):
        return self._keyword_prop_threshold

    @keyword_prop_threshold.setter
    def keyword_prop_threshold(self, keyword_prop_threshold):
        self._keyword_prop_threshold = keyword_prop_threshold


def proportion_lists():
    """makes a matrix or list of ttp, superkeyword, and keyword proportion for the docs in corpus
    and sets the respective topic model attributes"""
    pass


def subset_quality(threshs, labeled_subset):  # also had args word_list_gen and scorefun
    """Calculate F1 score for the array of thresholds threshs
    (max topic prop, total topic prop, vocab prop, and number of words
    in vocabulary list) on labeled subset"""
    pass


def subset_info(threshs):  # seems like a cool feature to include
    """Return set of false positives, true positives, false negatives, and true negatives, as
    well as the sizes of the false neg and false pos sets, as well as the size of set
    predicted as relevant, about the subset created by the given set of thresholds
    (mtp, ttp, voc prop, and voc list length, in that order).
    This function can be edited to output any kind of info about the subset, eg the filenames."""
    pass

In [3]:
#functions for creating a topic dictionary, viewing the topics in the topic model,
#and selecting only the relevant topics based on a threshold and our keyword list.

def topic_dictionary(lda_model, lda_vectorizer, top_n = 10):
    topic_ls = {} #append keys, append the values

    for idx, topic in enumerate(lda_model.components_):  # loop through each row of H.  idx = row index.  topic = actual row

        print_list = [(lda_vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        topic_ls[idx] = print_list

    return topic_ls

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
      
def rel_ent_key_list(topic_model, vectorizer, n_top_keywords, relevant_topics):
    """Returns a list of the top n keywords based on relative entropy score
     Arguments:
       topic_model (TopicModel): a topic by vocabulary word matrix where each entry
       is the total word count for that word in that topic
       n_top_words (int): the number of keywords the method will return
       relevant_topics (iterable of int)
     Returns:
       keyword_list (iterable of str): list of the top n keywords, sorted
     """
    topic_word_matrix = topic_model.components_
    lda_vectorizer = vectorizer
    
    # Log of probabilities of vocab words
    #this works
    vocab_logs = np.log(topic_word_matrix.sum(
        axis=0) / topic_word_matrix.sum())

    # Log of probabilities of vocab words given they were in each relevant topic
    #this is being built to calculate p(w)*log[p(w)/q(w)]
    #this works
    topic_logs = np.log(topic_word_matrix[relevant_topics, :].sum(
        axis=0) / topic_word_matrix[relevant_topics, :].sum())

    # relative entropy proportions, unsorted
    #log rules: log[p(w)/q(w)] = log(p(w)) - log(q(w))
    unsorted_props = np.asarray(topic_word_matrix.sum(axis=0) /
                                topic_word_matrix.sum()) * np.asarray(topic_logs - vocab_logs)

    unsorted_props = np.matrix.flatten(unsorted_props)

    sorted_props_and_voc = sorted([(unsorted_props[i], lda_vectorizer.get_feature_names()[i]) for i in list(
        np.argpartition(unsorted_props, len(lda_vectorizer.get_feature_names()) - n_top_keywords))[-n_top_keywords:]], reverse=True)
    ordered_vocab = []
    for (_, voc) in sorted_props_and_voc:
        ordered_vocab.append(voc)
    return ordered_vocab

In [4]:
#making a filter_corpus function (copied from wheat_filtration package)
def total_topic_proportion(document_topics, relevant_topics, doc_number = 0):
    """Return sum of relevant topic proportions for a document.
    Arguments:
        document_topics (iterable of float): topic proportions for one document.
        relevant topics (iterable of int): a list of the numbers corresponding
            with the topics considered relevant by the user."""
    assert (len(relevant_topics) <= len(document_topics)
            )  # TODO make this the right kind of error
    document = document_topics[doc_number]
    topic_prop = 0
    for i in relevant_topics:
        topic_prop += document[i]    
    return topic_prop

def keyword_proportion(document, keyword_list):
    """Return percentage of words in the given doc that are present in keyword_list."""
    doc_tokens = document
    num_keywords = sum(
        [1 if word in keyword_list else 0 for word in doc_tokens])
    return float(num_keywords)/len(doc_tokens)

def superkeyword_presence(document, superkeywords):
    """Return 1 if document contains any superkeywords, 0 if not."""
    for word in superkeywords:
        if word in document:
            return True
    return False

def in_ai_phrases(abstract, ai_phrases):
    text = " ".join(abstract)
    for phrase in ai_phrases:
        if phrase in text:
            return True
    return False

def is_relevant(doc, doc_topics, filter_helper, doc_number = 0, ai_phrases = ["machine learn", "deep learn", "deep learning", "artificial intelligence", "natural language processing"]):
    """Returns a boolean for relevance of given document. A document is considered
    relevant if: it contains any superkeywords(filter_helper.superkeywords), passes
    the total topic proportion threshold(filter_helper.total_topic_prop_threshold),
    or passes the keyword proportion threshold(filter_helper.keyword_prop_threshold).
    Arguments:
        doc (string): preprocessed document from the corpus
        doc_topics (iterable of float): proportion of each topic present in the given document
        filter_helper (FilterHelper): an object containing the necessary information
            to label the relevance of the given document
    Returns:
        (bool): Representing whether or not the given document is relevant according
        to the information in filter_helper"""

    has_superkeyword = superkeyword_presence(
        doc, filter_helper.superkeywords)
    
    in_phrases = in_ai_phrases(doc, ai_phrases)
    
    passes_total_topic_thresh = total_topic_proportion(
        doc_topics, filter_helper.relevant_topics, doc_number) > (filter_helper.total_topic_prop_threshold)
    
    passes_keyword_thresh = keyword_proportion(
        doc, filter_helper.keyword_list) > filter_helper.keyword_prop_threshold

    return has_superkeyword or passes_total_topic_thresh or passes_keyword_thresh or in_phrases


def filter_corpus(abstract_column, doc_topics, filter_helper, ai_phrases = ["machine learn", "deep learn", "deep learning", "artificial intelligence", "natural language processing"]):
    subcorpus_id = []
    superkey = 0
    topic_thresh = 0
    keyword_thresh = 0
    phrases = 0
    for i, abstract in enumerate(abstract_column):
        doc = abstract 
        if is_relevant(doc, doc_topics, filter_helper, doc_number = i):
            if superkeyword_presence(doc, filter_helper.superkeywords):
                superkey += 1
            if total_topic_proportion(doc_topics, filter_helper.relevant_topics, doc_number = i) > (filter_helper.total_topic_prop_threshold):
                topic_thresh += 1
            if keyword_proportion(doc, filter_helper.keyword_list) > filter_helper.keyword_prop_threshold :
                keyword_thresh += 1
            if in_ai_phrases(doc, ai_phrases):
                phrases += 1
            subcorpus_id.append(i)
    print("Superkeyword presence: ", superkey, "\nTotal Topic Proportion: ", topic_thresh, "\nKeyword Threshold: ",
          keyword_thresh, "\nPhrase words matched: ", phrases, "\nTotal docs: ", len(subcorpus_id))
    return subcorpus_id

In [5]:
#start with the core terms from the OECD paper
core_terms = ["adaboost","artificial intelligence","artificial neural network","back propagation"
,"back propagation neural network","computational intelligence","computer vision"
,"convolutional neural network","deep belief network","deep convolutional neural network"
,"deep learn","deep neural network","elman network","elman neural network"
,"expert system","fee forward neural network","inference engine","machine intelligence"
,"machine learn","machine translation","machine vision","multilayer neural network"
,"natural language process","perceptron","random forest","rbf neural network","recurrent neural network"
,"self organize map","spike neural network","supervise learn","support vector machine"
,"svm classifier","unsupervised learn","artificial_intelligence","artificial_neural_network","back_propagation"
,"back_propagation_neural_network","computational_intelligence","computer_vision"
,"convolutional_neural_network","deep_belief_network","deep_convolutional_neural_network"
,"deep_learn","deep_neural_network","elman_network","elman_neural_network"
,"expert_system","fee_forward_neural_network","inference_engine","machine_intelligence"
,"machine_learn","machine_translation","machine_vision","multilayer_neural_network"
,"natural_language_process","random_forest","rbf_neural_network","recurrent_neural_network"
,"self_organize_map","spike_neural_network","supervise_learn","support_vector_machine"
,"svm_classifier","unsupervised_learn", "machine_learning"]

In [6]:
def relevant_topics(topic_dictionary, keyword_list, threshold = 1):
    """returns a list of the topics which contain a threshold % of the
    relevant words in the keyword list"""
    relevant_topic = []
    for key in topic_dictionary:
        relevant_words = 0
        for i in range(len(topic_dictionary[key])):
            if topic_dictionary[key][i][0] in keyword_list:
                relevant_words += 1
            else: relevant_words += 0
        if (relevant_words) >= threshold :#/ len(topic_dictionary[key]) >= threshold :
            relevant_topic.append(key)
    return relevant_topic 

In [7]:
df = pd.read_pickle("../../data/dspg21RnD/smaller-final-dataset.pkl")

In [8]:
nsf = df[df["AGENCY"] == "NSF"]
# filter where cfda = 47.070

nsf_csci = nsf[nsf["CFDA_CODE"] == "47.070"]


In [41]:
df.reset_index(inplace=True)  # I don't think I need to do this, but just in case
nsf_csci.reset_index(inplace=True)

In [42]:
tokens = nsf_csci["final_frqwds_removed"]

text = [] # text will contain the processed tokens in string form (1 string per abstract)


for abstract in tokens:
    text.append(" ".join(abstract))
    
text = pd.Series(text)

In [43]:
lda_vectorizer = CountVectorizer(max_df=0.6, min_df=20)

lda_dtm = lda_vectorizer.fit_transform(text)


In [44]:
num_topics = 100
lda_model_100 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)

doc_top_dist_100 = lda_model_100.fit_transform(lda_dtm)
top_term_dist_100 = lda_model_100.components_

In [45]:
nsfcs_dic100 = topic_dictionary(lda_model_100, lda_vectorizer, 50)

In [46]:
relevant_topics(nsfcs_dic100, core_terms, 0.04)

[15, 23, 28, 87, 97]

So, we get 5 topics when we do 1 word out of 50 ahhaha.  We only get topic 97 when we use 2 words out of 50.  I will look through these topics and add to the topics i picked out myself and decide the relevant topics, then pick out the relative entropy keyword list before making a superkeyword list.

"AI" is the 20th term.  and there are only 34 times it comes up in this topic?  Not gonna include

I will keep 27.

In [47]:
relevant_topics_HT = [27]

In [48]:
relevant_topics_HT.append(87)

In [49]:
relevant_topics_HT.append(97)

When I ran it on my own, I picked out 19, 52, 54, 76, 79, 86, 97

In [50]:
relevant_topics_HT

[27, 87, 97]

In [51]:
relevant_topics_HT.append(19)

In [52]:
relevant_topics_HT.append(52)

In [20]:
#I don't know about this, since it is just robot and not the other AI terms.

In [53]:
relevant_topics_HT.append(79)

In [54]:
relevant_topics_HT.append(86)

In [55]:
relevant_topics_HT

[27, 87, 97, 19, 52, 79, 86]

In [56]:
print_topics(lda_model_100, lda_vectorizer)


Topic 0:
('event', 730.781845371416)
('discovery', 583.8164989359399)
('domain', 329.10975423693554)
('analytics', 273.5916752530804)
('framework', 270.4499109174751)
('software', 219.42028034058512)
('scientific', 206.20389200158652)
('workflow', 183.222440789522)
('science', 159.09815332735093)
('source', 155.7388801028597)

Topic 1:
('vehicle', 1003.5540341166559)
('transportation', 374.6671616662269)
('traffic', 351.07919632386125)
('driver', 252.09186600333013)
('driving', 198.9768789225774)
('road', 197.41491396313737)
('autonomous_vehicle', 188.85933542912068)
('trajectory', 182.416660865909)
('safety', 171.8632244973694)
('vehicular', 148.5576938678973)

Topic 2:
('team', 1133.5323661560801)
('engineering', 327.3253606310934)
('technology', 316.82114344244604)
('workshop', 201.58176970036058)
('report', 196.04222913384802)
('competition', 184.7872157240752)
('individual', 165.7674683226673)
('university', 148.41031049446593)
('disability', 144.2238105266517)
('nsf', 143.824387

Ok, so with my judgement plus some that the relevant_topics function picked out, we have 7 topics that should be roughly about AI.

Creating the relative entropy keyword list:

In [57]:
rel_ent_top200 = rel_ent_key_list(lda_model_100, lda_vectorizer, 200, relevant_topics_HT)

Creating the superkeyword list:

"To create the super keyword list, we examine an expanded list -- the top 1000 words -- of high-relative-entropy-constribution words from the last step and select those words that are unambiguously related to the concept of interest, i.e. likely to be used when referring to the concept of interest and no other concepts.

creating the filter helper to see if we can start trying to filter the corpus to get some sort of sense the abstracts that are about AI

In [58]:
ai_HT_KL = ['machine_learning',  'artificial_intelligence', 'artificial_intelligence_ai',
                'convolutional_neural_network', 'recognition_asr',  'artificial_intelligence_machine_learning']

phrase = ['learning', 'learn', 'processing',  'natural', 'deep', 'intelligence', 'artificial']

In [59]:
phrase = ["machine learn", "deep learn", "deep learning", "artificial intelligence", "natural language processing"]

In [60]:
my_filter_helper = FilterHelper(topic_model = lda_model_100,
                                vectorizer = lda_vectorizer,
                               relevant_topics = relevant_topics_HT,
                               superkeywords = ai_HT_KL,
                               keyword_list = rel_ent_top200,
                               total_topic_prop_threshold = 0.25,
                               keyword_prop_threshold = 0.25)

16k rows because one for each document.  100 columns because 1 for each topic.

In [28]:
#creating a new document-topic-distribution with the full corpus
tokens2 = df["final_frqwds_removed"]

fullcorpus = [] # text will contain the processed tokens in string form (1 string per abstract)


for abstract in tokens2:
    fullcorpus.append(" ".join(abstract))
    
fullcorpus = pd.Series(fullcorpus)

newdocs = fullcorpus
new_doc_term_matrix = lda_vectorizer.transform(newdocs) 
new_doc_term_dist = lda_model_100.transform(new_doc_term_matrix)


KeyboardInterrupt: 

In [None]:
pd.DataFrame(new_doc_term_dist)

In [61]:
my_subcorpus = filter_corpus(nsf_csci["final_frqwds_removed"], doc_top_dist_100, my_filter_helper)

Superkeyword presence:  1748 
Total Topic Proportion:  1870 
Keyword Threshold:  1981 
Phrase words matched:  868 
Total docs:  3736


In [62]:
len(my_subcorpus)

3736

In [66]:
my_subcorpus[0:10]

[8, 11, 19, 21, 23, 42, 46, 48, 71, 74]

## Kathryn - adapt to full corpus

In [63]:
def nsf_filter_corpus_KL(df, doc_topics, filter_helper, ai_phrases = ["machine learn", "deep learn", "deep learning", "artificial intelligence", "natural language processing"]):
    subcorpus_id = []
    superkey = 0
    topic_thresh = 0
    keyword_thresh = 0
    phrases = 0
    for i, abstract in enumerate(df["final_frqwds_removed"]):
        doc = abstract 
        if is_relevant(doc, doc_topics, filter_helper, doc_number = i):
            if superkeyword_presence(doc, filter_helper.superkeywords):
                superkey += 1
            if total_topic_proportion(doc_topics, filter_helper.relevant_topics, doc_number = i) > (filter_helper.total_topic_prop_threshold):
                topic_thresh += 1
            if keyword_proportion(doc, filter_helper.keyword_list) > filter_helper.keyword_prop_threshold :
                keyword_thresh += 1
            if in_ai_phrases(doc, ai_phrases):
                phrases += 1
            subcorpus_id.append(i)
    print("Superkeyword presence: ", superkey, "\nTotal Topic Proportion: ", topic_thresh, "\nKeyword Threshold: ",
          keyword_thresh, "\nPhrase words matched: ", phrases, "\nTotal docs: ", len(subcorpus_id))
    return subcorpus_id

In [64]:
nsf_idx = nsf_filter_corpus_KL(nsf_csci, doc_top_dist_100, my_filter_helper)

Superkeyword presence:  1748 
Total Topic Proportion:  1870 
Keyword Threshold:  1981 
Phrase words matched:  868 
Total docs:  3736


In [65]:
nsf_idx[0:10]

[8, 11, 19, 21, 23, 42, 46, 48, 71, 74]

In [67]:
nsf_csci.head(10)

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed,PROJECT_TERMS,...,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
0,1996,11849,101844,This symposium is a premiere forum for researc...,2008,1,1,697,"[symposium, premiere, forum, interaction, comp...",Arts; Award; Collaborations; Communities; Com...,...,CHAMPAIGN,IL,61820-7406,UNITED STATES,,,47.07,2008,4651.0,
1,2067,11747,101739,This award is to the Computer Research Associa...,2008,1,1,2638,"[award, computer, association, cra, coordinate...",Address; Architecture; Award; base; Collabora...,...,WASHINGTON,DC,20036-0000,UNITED STATES,,,47.07,2008,29940.0,
2,2187,12250,102249,IIS-0808994PI: Jonathan FurnerUniversity of Ca...,2008,1,1,1682,"[iis, pi, furneruniversity, california_los, an...",Arts; Award; California; Development; Dimensi...,...,LOS ANGELES,CA,90095-1406,UNITED STATES,,,47.07,2008,25859.0,
3,2234,12405,102404,ABSTRACT0812795Vijay K. VaishnaviGa State U Re...,2008,1,1,574,"[vaishnaviga, res, fdnthis, seek, funding, enc...",computer science; design; Discipline; Funding...,...,ATLANTA,GA,30303-3999,UNITED STATES,,,47.07,2008,9000.0,
4,2235,12451,102448,The 3rd International Conference on emerging N...,2008,1,1,3025,"[3rd, international, conference, emerge, netwo...",Award; base; career; Commit; Communication; C...,...,MADISON,WI,53715-1218,UNITED STATES,,,47.07,2008,10875.0,
5,2262,12260,102258,Need-Based Sponsorship Of Student Travelto the...,2008,1,1,1622,"[need, sponsorship, student, travelto, 2008, p...",Algorithms; Area; base; Code; Communities; Co...,...,LOS ANGELES,CA,90095-1406,UNITED STATES,,,47.07,2008,14025.0,
6,2274,12334,102333,ABSTRACT0745523Thomas B. HortonU of VA A speci...,2008,1,1,936,"[va, special, day, academy, software, engineer...",Academy; career; Community Health Education; ...,...,CHARLOTTESVILLE,VA,22904-4195,UNITED STATES,,,47.07,2008,10061.0,
7,2347,9194,99160,This NSF project provides travel funds to assi...,2008,1,1,1332,"[nsf, travel, fund, assist, approximately, par...",Applications Grants; Asia; Collaborations; co...,...,ALBUQUERQUE,NM,87131-0001,UNITED STATES,,,47.07,2008,,
8,2380,10449,99165,An effective document representation is a cruc...,2008,1,1,1851,"[document, representation, crucial, text, proc...",Classification; Computer Assisted; Computers;...,...,WEST LAFAYETTE,IN,47907-2114,UNITED STATES,,,47.07,2008,,
9,2461,3134,93103,The purpose of this workshop is to initiate a ...,2008,1,1,1554,"[purpose, workshop, initiate, dialogue, higher...",Communities; Country; Education; Educational ...,...,BOULDER,CO,80301-2538,UNITED STATES,,,47.07,2008,12492.0,


In [68]:
def full_filter_corpus_KL(df, filter_helper, ai_phrases = ["machine learn", "deep learn", "deep learning", "artificial intelligence", "natural language processing"]):
    subcorpus_id = []
    superkey = 0
    topic_thresh = 0
    keyword_thresh = 0
    phrases = 0
    
    for i, abstract in enumerate(df["final_frqwds_removed"]):
        doc = abstract 
         
        # check conditions other than total_topic_proportion    
        has_superkeyword = superkeyword_presence(doc, filter_helper.superkeywords)
        in_phrases = in_ai_phrases(doc, ai_phrases)
        passes_keyword_thresh = keyword_proportion(doc, filter_helper.keyword_list) > filter_helper.keyword_prop_threshold
        
        if has_superkeyword:
            superkey += 1
        if in_phrases:
            phrases += 1
        if passes_keyword_thresh:
            keyword_thresh += 1

        if(has_superkeyword or in_phrases or passes_keyword_thresh):    
            subcorpus_id.append(i)
            
    print("Superkeyword presence: ", superkey, "\nTotal Topic Proportion: ", topic_thresh, "\nKeyword Threshold: ",
          keyword_thresh, "\nPhrase words matched: ", phrases, "\nTotal docs: ", len(subcorpus_id))
    return subcorpus_id


In [69]:
full_ids = full_filter_corpus_KL(df, my_filter_helper)

Superkeyword presence:  5430 
Total Topic Proportion:  0 
Keyword Threshold:  6412 
Phrase words matched:  3004 
Total docs:  12304


In [71]:
full_ids[1:10]

[151, 206, 207, 249, 348, 397, 561, 609, 668]

In [74]:
# find unique set of ids

proj_id_nsf = nsf_csci.loc[nsf_idx, "PROJECT_ID"]
proj_id_full = df.loc[full_ids, "PROJECT_ID"]

In [75]:
print(proj_id_nsf[0:10])
print(proj_id_full[0:10])

8      99165
11     93291
19    119775
21     95713
23    107716
42    135705
46    110591
48    135469
71    102831
74    102440
Name: PROJECT_ID, dtype: object
47       64081
151     100444
206    1130812
207     931185
249    1085438
348     906554
397     343573
561     578496
609     660278
668     810689
Name: PROJECT_ID, dtype: object


In [76]:
ai_proj_ids = np.concatenate([proj_id_nsf, proj_id_full])
ai_proj_ids = np.unique(ai_proj_ids)
len(ai_proj_ids)

12694

In [77]:
ai_corpus = df[df["PROJECT_ID"].isin(ai_proj_ids)]

In [78]:
ai_corpus.head()

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed,PROJECT_TERMS,...,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
47,47,497627,64081,DESCRIPTION (provided by applicant): Healthy o...,2012,1,1,3104,"[healthy, older, listener, decline, cognitive,...",Acoustics; Address; Adult; Affect; aging brai...,...,EVANSTON,IL,602013149,UNITED STATES,9/1/2012,1/1/2013,93.866,2012,142560.0,
151,151,10470,100444,Magnetic fields on the Sun are ultimately the ...,2008,1,1,255,"[magnetic, sun, ultimately, source, space, wea...",magnetic field; Modeling; Source; The Sun; Un...,...,PHILADELPHIA,PA,19104-2875,UNITED STATES,,,43.001,2008,440000.0,
206,206,1094356,1130812,PROJECT SUMMARY / ABSTRACTModern digital patho...,2019,1,1,3058,"[modern, digital, pathology, department, treme...",actionable mutation; anticancer research; Arc...,...,NEW YORK,NY,100654805,UNITED STATES,1/11/2019,1/10/2020,93.398,2019,45016.0,
207,207,938623,931185,PROJECT SUMMARY / ABSTRACTModern digital patho...,2017,1,1,3059,"[modern, digital, pathology, department, treme...",actionable mutation; anticancer research; Arc...,...,NEW YORK,NY,100654805,UNITED STATES,1/11/2017,1/10/2018,93.398,2017,44044.0,
249,249,999639,1085438,PROJECT SUMMARYWhile much is known about the m...,2017,1,1,2959,"[maintenance, memory, wm, prioritizeinformatio...",Aging; aging brain; Architecture; base; Bayes...,...,NEW YORK,NY,100122300,UNITED STATES,9/12/2017,9/11/2018,93.867,2017,56694.0,


In [79]:
len(ai_corpus)

12694

In [90]:
ai_corpus["ABSTRACT"].iloc[1000]

'This Small Business Innovation Research Phase I project will research sound-object recognition algorithms for use by professional and consumer audio recording and live sound engineers. Algorithms for robust off-line instrument recognition, music loop retrieval, dialog/sound effect/music recognition, and on-the-fly machine listening will also be developed. Musicians and audio engineers have access to gigabytes of audio content yet, the state of the art for finding audio content is through text queries and navigating static file hierarchies. Currently, none of the audio software manufacturers provide tools for searching for audio loops by their audio content. Additionally, recording and live sound engineers have complex organization and navigation duties, which could be solved using real-time audio analysis algorithms. If successful, this effort will enable recognizing audio content using a top-down approach - using a fleet of hierarchical machine learning classifiers, trained on statis

In [91]:
ai_corpus["is_ai_eads"] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_corpus["is_ai_eads"] = True


In [92]:
ai_corpus.head()

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed,PROJECT_TERMS,...,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,is_ai_eads
47,47,497627,64081,DESCRIPTION (provided by applicant): Healthy o...,2012,1,1,3104,"[healthy, older, listener, decline, cognitive,...",Acoustics; Address; Adult; Affect; aging brai...,...,IL,602013149,UNITED STATES,9/1/2012,1/1/2013,93.866,2012,142560.0,,True
151,151,10470,100444,Magnetic fields on the Sun are ultimately the ...,2008,1,1,255,"[magnetic, sun, ultimately, source, space, wea...",magnetic field; Modeling; Source; The Sun; Un...,...,PA,19104-2875,UNITED STATES,,,43.001,2008,440000.0,,True
206,206,1094356,1130812,PROJECT SUMMARY / ABSTRACTModern digital patho...,2019,1,1,3058,"[modern, digital, pathology, department, treme...",actionable mutation; anticancer research; Arc...,...,NY,100654805,UNITED STATES,1/11/2019,1/10/2020,93.398,2019,45016.0,,True
207,207,938623,931185,PROJECT SUMMARY / ABSTRACTModern digital patho...,2017,1,1,3059,"[modern, digital, pathology, department, treme...",actionable mutation; anticancer research; Arc...,...,NY,100654805,UNITED STATES,1/11/2017,1/10/2018,93.398,2017,44044.0,,True
249,249,999639,1085438,PROJECT SUMMARYWhile much is known about the m...,2017,1,1,2959,"[maintenance, memory, wm, prioritizeinformatio...",Aging; aging brain; Architecture; base; Bayes...,...,NY,100122300,UNITED STATES,9/12/2017,9/11/2018,93.867,2017,56694.0,,True


In [93]:
ai_corpus.to_csv("../../data/dspg21RnD/Eads_AI_abstracts-KL.csv", index = False)

In [None]:
my_subcorpus[0:10]

In [None]:
# nsf_csci dataframe of NSF 
# my_subcorpus = list of indices 
nsf_csci["index"] = range(len(nsf_csci))


In [None]:
subcorpus_df = pd.DataFrame(my_subcorpus)

In [None]:
subcorpus_df["is_ai_eads"] = True

In [None]:
subcorpus_df = subcorpus_df.rename(columns = {0: "index"})

In [None]:
subcorpus_df

In [None]:
ai_test = pd.merge(nsf_csci, subcorpus_df, on="index", how = "right")


In [None]:
ai_test


In [None]:
ai_test.to_csv("../../data/dspg21RnD/Eads_AI_abstracts.csv", index = False)

Yesterday, when I just used the top 100 or so relative entropy, I had 685,677 out of 690,814 as the subcorpus.  Today, when picking out just a couple superkeywords, I got

This was a pretty complicated method that we had to adapt to our problem, so there are a lot of things we are going to have to work out such as deciding what to do about lemmatization (if we want to fuzzy match), work on the sensitivity of including a corpus,

210907 with a shorter list on the full corpus.  11,219 within NSF CSCI dataset with the shorter list.

In [None]:
my_subcorpus[0]

In [None]:
nsf_csci["ABSTRACT"].iloc[1]

In [None]:
my_subcorpus[1]

In [None]:
nsf_csci["ABSTRACT"].iloc[5]

In [None]:
my_subcorpus[2]

In [None]:
nsf_csci["ABSTRACT"].iloc[6]