# Kojak

Can we identify the different meanings of a word from the topics in which that word appears?

We apply Latent Dirichlet Allocation (LDA) to extract distinct topics from our corpus (taken from 4000 research papers) in order to determine the contexts in which a word is used.

In [1]:
import numpy as np
import pandas as pd
import codecs
import gensim
import re
import json

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Custom lemmatizer/tokenizer, part 1: the stopword set it filters against.
# English stopwords from NLTK plus punctuation/quote tokens and "et" / "al".

stop = set(stopwords.words('english')) | {
    '.', ',', '(', ')', "'", '"', "''", '""', "``", '”', '“', '?', '!', '’', 'et', 'al',
}

def _treebank_to_wordnet(treebank_tag):
    """Translate a Penn Treebank tag into the POS value passed to the lemmatizer.

    Tags beginning with J, V, N or R map to the WordNet adjective, verb, noun
    and adverb constants respectively; any other tag falls back to 'n'.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return 'n'


def get_wordnet_pos_aux(word):
    """Tag a single word on its own and return ``(word, wordnet_pos)``.

    Kept for callers that only have one token; prefer ``get_wordnet_pos`` for
    token sequences, since a lone word gives the tagger no context.
    """
    treebank_tag = pos_tag([word])[0][1]
    return word, _treebank_to_wordnet(treebank_tag)


def get_wordnet_pos(words):
    """Return a list of ``(word, wordnet_pos)`` pairs, one per token in ``words``.

    The whole token sequence is tagged with a single ``pos_tag`` call instead
    of one call per token: the tagger then sees each word's neighbours, and the
    per-call overhead is paid once rather than once per token.

    Args:
        words: iterable of token strings (e.g. the output of ``word_tokenize``).

    Returns:
        List of ``(word, pos)`` tuples in input order; empty for empty input.
    """
    # list() so that a one-shot iterable can be handed to the tagger safely.
    tagged = pos_tag(list(words))
    return [(word, _treebank_to_wordnet(tag)) for word, tag in tagged]

class LemmaTokenizer(object):
    """Callable that tokenizes a document, drops stopwords and lemmatizes the rest.

    The stopword collection is the module-level ``stop``; it is looked up each
    time the tokenizer is called, so rebinding ``stop`` later changes what is
    filtered.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmas of the non-stopword tokens of ``doc``, in order.

        Args:
            doc: raw text of one document.

        Returns:
            List of lemmatized tokens.
        """
        return [
            self.wnl.lemmatize(token, pos)
            for token, pos in get_wordnet_pos(word_tokenize(doc))
            # The stopword entries are lowercase, so compare case-insensitively;
            # otherwise capitalized stopwords such as "The" slip through.
            if token.lower() not in stop
        ]
    
# Shared tokenizer instance; call it as lt(document_text) to get a list of lemmas.
lt = LemmaTokenizer()

In [3]:
# Stopwords for preprocessing the JSON source file with MyPapers (below).
# This rebinds the module-level name `stop`.

stop = set(stopwords.words('english')) | {
    '?', '!', '.', ',', ':', ';', '[', ']', '[]', '“',
    '(', ')', "'", '"', "''", '""', "``", '”', '’',
    'et', 'al.', 'study', "",
}

class MyPapers(object):
    """Re-iterable corpus that yields one list of tokens per sentence.

    Every iteration re-opens and re-parses the JSON file, so the object can be
    looped over several times (e.g. once to build a dictionary and once to
    vectorize) without keeping the tokenized sentences in memory.

    The file is expected to hold a list of records, each with a 'full_text'
    entry containing the text of one paper.
    """

    def __init__(self, dirname):
        # Despite its name, `dirname` is the path of a single JSON file.
        self.dirname = dirname

    def __iter__(self):
        """Yield, for each sentence of each paper, its lowercased non-stopword tokens."""
        # Be explicit about the encoding: the text contains non-ASCII
        # punctuation (e.g. the curly quote stripped below), and the platform
        # default is not UTF-8 everywhere.
        with open(self.dirname, encoding='utf-8') as data_file:
            data = json.load(data_file)

        for paper in data:
            for sentence in sent_tokenize(paper['full_text']):
                try:
                    # Replace punctuation with spaces, then split on whitespace.
                    line = re.sub(r'[?\.,!:;\(\)“\[\]]', ' ', sentence)
                    line = [word for word in line.lower().split() if word not in stop]
                except (TypeError, AttributeError):
                    # Best-effort: skip a sentence that is not usable text.
                    print("Empty line found")
                    continue
                # The yield sits outside the try block on purpose. A bare
                # `except:` around it would catch the GeneratorExit raised when
                # a consumer stops early, then try to yield again and fail with
                # "RuntimeError: generator ignored GeneratorExit".
                yield line
                

In [4]:
# Alternative input (disabled): if we are using 'tiny_corpus.txt', run each
# document through the tokenizer instance `lt`:
#corpus = [lt(c) for c in corpus]

# The word whose usage contexts we are searching for
target = u'state'

# Stream tokenized sentences from the JSON corpus file.
# NOTE(review): the earlier comment named 'abstract_scraper/abstracts.json', but
# the path below is '../data/train_data.json' — confirm which file is intended.
corpus = MyPapers('../data/train_data.json')

In [5]:
# First pass over the corpus: build the token <-> integer id mapping.
dictionary = gensim.corpora.dictionary.Dictionary(documents=corpus)
# Second pass: encode every item of the corpus as a bag-of-words vector.
text = list(map(dictionary.doc2bow, corpus))

In [6]:
# Reverse lookup: token string -> integer id.
id_lookup = {dictionary[token_id]: token_id for token_id in dictionary}

In [7]:
# Spot check: show the id stored for one token.
id_lookup['progressive']

2

In [23]:
# Fit a 150-topic LDA model on the bag-of-words corpus `text` (workers = 3).
# NOTE(review): no random_state is passed, so the topics printed further down
# are presumably not reproducible from run to run — confirm and consider seeding.
LDA = gensim.models.ldamulticore.LdaMulticore(corpus = text, id2word=dictionary, num_topics = 150,workers = 3)

In [32]:
# Topics associated with the target word, as (topic id, probability) pairs;
# 0.003 is the minimum_probability cutoff handed to get_term_topics.
target_topics = LDA.get_term_topics(word_id = id_lookup[target], minimum_probability=0.003)

In [33]:
# For every topic tied to the target word, print the word's probability for
# that topic followed by the topic's 15 top terms.
for topic_id, prob in target_topics:
    top_terms = LDA.get_topic_terms(topicid = topic_id, topn = 15)
    print(prob, [dictionary[term_id] for term_id, _ in top_terms])
        

0.00369623206532 ['practices', 'achieved', 'countries', 'political', 'contribute', 'quite', 'policy', 'air', 'developing', 'unemployment', 'educational', 'isolated', 'development', 'economic', 'economy']
0.00382743032762 ['us', 'variable', 'variables', 'definition', 'regression', 'mentioned', 'evaluate', 'steps', 'variations', 'let', 'next', 'interested', 'vehicle', 'binary', 'two']
0.00370139472187 ['reference', 'enhanced', 'extracts', 'welfare', 'papers', 'inputs', 'producers', 'topics', 'robustness', 'covered', 'obtaining', 'outputs', 'also', 'hierarchical', 'emerge']
0.00312431899667 ['estimate', 'decrease', 'complexity', 'equation', 'supply', 'decreased', 'unit', 'constraints', 'decisions', 'permanent', 'fruit', 'increase', 'actually', 'model', 'mathematical']
0.00467741048122 ['states', 'drugs', 'united', 'icu', 'sensing', 'long-term', 'maintaining', 'principles', 'examining', 'continuously', 'interference', 'nations', 'deployed', 'politics', 'union']


In [34]:
# Persist the fitted 150-topic model under the name 'LDA_150'.
LDA.save('LDA_150')

In [7]:
# target_corpus is the list of only those corpus items that contain the target
# word. Each item is the token list of one sentence, as yielded by MyPapers.
target_corpus = [tokens for tokens in corpus if target in tokens]

len(target_corpus)

2052

In [8]:
# Bag-of-words vectors for the target-only subset, encoded with the same dictionary.
text_mini = list(map(dictionary.doc2bow, target_corpus))

In [11]:
# Fit a separate 25-topic LDA model on `text_mini`, i.e. only on the items that
# contain the target word (workers = 3).
# NOTE(review): no random_state is passed, so the topics shown below are
# presumably not reproducible from run to run — confirm and consider seeding.
LDA_mini = gensim.models.ldamulticore.LdaMulticore(corpus = text_mini, id2word=dictionary, num_topics = 25,workers = 3)

In [12]:
# Display 15 of the 25 topics of the target-only model.
LDA_mini.show_topics(num_topics=15)

[(18,
  '0.017*"state" + 0.002*"states" + 0.001*"transition" + 0.001*"conditions" + 0.001*"also" + 0.001*"system" + 0.001*"one" + 0.001*"social" + 0.001*"different" + 0.001*"local"'),
 (16,
  '0.014*"state" + 0.001*"process" + 0.001*"university" + 0.001*"context" + 0.001*"system" + 0.001*"power" + 0.001*"institutional" + 0.001*"changes" + 0.001*"information" + 0.001*"2012"'),
 (1,
  '0.015*"state" + 0.002*"control" + 0.001*"one" + 0.001*"matrix" + 0.001*"first" + 0.001*"two" + 0.001*"new" + 0.001*"based" + 0.001*"time" + 0.001*"system"'),
 (17,
  '0.016*"state" + 0.002*"also" + 0.002*"system" + 0.002*"process" + 0.002*"used" + 0.002*"time" + 0.002*"e" + 0.001*"may" + 0.001*"described" + 0.001*"local"'),
 (19,
  '0.011*"state" + 0.001*"also" + 0.001*"results" + 0.001*"system" + 0.001*"one" + 0.001*"dialog" + 0.001*"law" + 0.001*"license" + 0.001*"material" + 0.001*"2"'),
 (5,
  '0.010*"state" + 0.001*"members" + 0.001*"%" + 0.001*"run" + 0.001*"5" + 0.001*"us" + 0.001*"results" + 0.001*

In [46]:
# Topics of the target-only model associated with the target word, as
# (topic id, probability) pairs; the cutoff here is 0.02 rather than 0.003.
target_topics_mini = LDA_mini.get_term_topics(word_id = id_lookup[target], minimum_probability=0.02)

In [47]:
# For every topic of the target-only model tied to the target word, print the
# word's probability for that topic followed by the topic's 15 top terms.
for topic_id, prob in target_topics_mini:
    top_terms = LDA_mini.get_topic_terms(topicid = topic_id, topn = 15)
    print(prob, [dictionary[term_id] for term_id, _ in top_terms])

0.0253577839995 ['state', 'also', 'one', 'network', 'system', 'steady', 'better', 'states', 'e', 'dynamics', 'changes', 'many', 'spectral', 'population', 'transitions']
0.0235171939871 ['state', 'also', 'matrix', 'system', 'group', 'welfare', 'result', 'way', 'regulations', 'asymptotic', '=0', 'vector', 'global', 'national', 'hence']
0.0200564424083 ['state', 'inhibitory', 'social', 'welfare', 'time', 'well', 'development', 'paper', 'literature', 'skills', 'economic', 'cost', 'data', 'important', 'divine']
0.0240323341216 ['state', 'result', 'system', 'migration', 'human', 'health', 'welfare', 'analysis', 'security', 'information', 'three', 'would', 'however', 'power', 'may']
0.0210780655487 ['state', 'time', 'point', 'however', 'data', 'also', 'one', 'research', 'may', 'system', 'step', 'example', 'states', 'risk', 'science']
0.0278143472402 ['state', 'important', 'social', 'states', 'using', 'model', 'could', 'problem', 'integers', 'actual', 'human', 'production', 'national', 'first'

In [39]:
# The function takes as arguments a list of tokenized documents and a window size
# and returns each word in the document along with its window context as a tuple

def generate_windows(documents, window_size):
    """Yield ``(word, context_words)`` for every token position of every document.

    Args:
        documents: iterable of tokenized documents; each document must support
            ``len()`` and integer indexing.
        window_size: how many neighbouring tokens to take on each side.

    Yields:
        Tuples ``(word, context_words)``. ``context_words`` is a list with the
        tokens found up to ``window_size`` positions before and after ``word``,
        in document order, excluding the word's own position. Windows are cut
        short at the edges of the document.
    """
    for tokens in documents:
        n_tokens = len(tokens)
        for position, word in enumerate(tokens):
            # Clamp the window to the document so only valid indices are visited.
            first = max(position - window_size, 0)
            last_excl = min(position + window_size + 1, n_tokens)
            neighbours = [tokens[i] for i in range(first, last_excl) if i != position]
            yield (word, neighbours)

In [44]:
def extract_contexts(documents, target, window_size = 6):
    """Collect the context window of every occurrence of ``target``.

    Args:
        documents: iterable of tokenized documents.
        target: the token whose contexts are wanted.
        window_size: number of neighbouring tokens taken on each side
            (passed through to ``generate_windows``).

    Returns:
        A list with one entry per occurrence of ``target``, in document order;
        each entry is the list of context tokens around that occurrence.
    """
    contexts = []

    for tokens in documents:
        # Skip documents without the target so no windows are built for them.
        if target not in tokens:
            continue
        contexts.extend(
            context
            for word, context in generate_windows([tokens], window_size)
            if word == target
        )

    return contexts
