# Kojak

Can we identify different meanings of the same word by what topic that word lies in?

We apply Latent Dirichlet Allocation to attempt to extract distinct topics in our corpus (taken from 4000 research papers) in order determine usage contexts contexts. 

In [1]:
import pdfminer
import numpy as np
import pandas as pd
import codecs
import gensim
import re
import json

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer


Using Theano backend.


In [60]:
#Opening our corpus

#>>>f = codecs.open("test", "r", "utf-8")
with codecs.open("tiny_corpus.txt","rb","utf-8") as f:
    corpus = f.readlines()

In [2]:
# Defining a custom lemmatizer/tokenizer with stopwords

stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"',"''",'""',"``",'”', '“', '?', '!', '’', 'et', 'al']
stop = set(stop)

def get_wordnet_pos_aux(word):
    
    treebank_tag = pos_tag([word])[0][1]
    
    if treebank_tag.startswith('J'):
        return word, wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return word, wordnet.VERB
    elif treebank_tag.startswith('N'):
        return word, wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return word, wordnet.ADV
    else:
        return word, 'n'
    
def get_wordnet_pos(words):
    return [get_wordnet_pos_aux(x) for x in words]

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t1,t2) for t1,t2 in get_wordnet_pos(word_tokenize(doc)) if t1 not in stop]
    
lt = LemmaTokenizer()

In [36]:
# Declare stopwords, preprocess the data from source file abstracts.json

stop = stopwords.words('english')
stop += ['?','!','.',',',':',';','[',']','[]','“' ]
stop += ['.', ',', '(', ')', "'", '"',"''",'""',"``",'”', '“', '?', '!', '’', 'et', 'al.', 'study', ""]
stop = set(stop)

class MyPapers(object):
    # a memory-friendly way to load a large corpora
     def __init__(self, dirname):
            self.dirname = dirname
 
     def __iter__(self):
        with open(self.dirname) as data_file:    
            data = json.load(data_file)
        # iterate through all file names in our directory
        for paper in data:
            #print(paper)
            #yield paper['full_text'].lower().split()
            line = [word for word in paper['full_text'].lower().split() if word not in stop]
            #print(line)
            line = [re.sub(r'[?\.,!:;\(\)“\[\]]','',l) for l in line]
            #print(line)
            yield line

In [37]:
#If we are using 'tiny_corpus.txt'
#corpus = [lt(c) for c in corpus]

# Declare what word we are searchig for
target = u'state'

#If we are using 'abstract_scraper/abstracts.json'
corpus = MyPapers('abstract_scraper/abstracts.json')

In [38]:
# target_corpus will be a list of ony those papers containing the target word
target_corpus = []

for paper in corpus:
    if target in paper:
        target_corpus.append(paper)
        
len(target_corpus)

29

In [39]:
# The function takes as arguments a list of tokenized documents and a window size
# and returns each word in the document along with its window context as a tuple

def generate_windows(documents, window_size):
    maxlen = window_size*2
    
    for document in documents:
        L = len(document)
        # Choose the target word
        for index, word in enumerate(document):
            # Create the window
            s = index-window_size
            e = index+window_size+1
                    
            in_words = []
            context_words = []
            # Create the input/outputs for skipgrams
            for i in range(s, e):
                if i != index and 0 <= i < L:
                    #in_words.append([word])
                    context_words.append(document[i])
            x = word
            y = context_words

            #x = np.array(in_words,dtype=np.int32)
            #y = np_utils.to_categorical(context_words, V)
            yield(x,y)

In [44]:
def extract_contexts(documents, target, window_size = 6):
    
    context_vectors = []

    for document in documents:
        text = document
        if target in text:
            #print(target)
            windows = generate_windows([text],window_size)
            #print windows[:2]
            for w in windows:
                if w[0] == target:
                    context_vectors.append((w[1]))
                    
    return context_vectors


In [45]:
dictionary = gensim.corpora.dictionary.Dictionary(corpus)
text = [dictionary.doc2bow(c) for c in extract_contexts(corpus, target,10)]

In [46]:
LDA = gensim.models.ldamodel.LdaModel(corpus = text, id2word=dictionary, num_topics = 6)

In [47]:
LDA.print_topics(35)

[(0,
  u'0.002*"" + 0.001*"pain" + 0.001*"appropriate" + 0.001*"cell" + 0.001*"highway" + 0.001*"syndrome" + 0.001*"primary" + 0.001*"cells" + 0.001*"health" + 0.001*"cancer"'),
 (1,
  u'0.001*"" + 0.001*"l" + 0.001*"slopes" + 0.001*"less" + 0.001*"observed" + 0.000*"consequently" + 0.000*"exercise" + 0.000*"overall" + 0.000*"increase" + 0.000*"height"'),
 (2,
  u'0.001*"" + 0.001*"bim" + 0.001*"patients" + 0.001*"ethics" + 0.001*"dmn" + 0.001*"\xa7" + 0.001*"complained" + 0.001*"migraine" + 0.000*"increase" + 0.000*"consent"'),
 (3,
  u'0.001*"" + 0.001*"network" + 0.001*"resting" + 0.001*"like" + 0.001*"networks" + 0.001*"failures" + 0.001*"important" + 0.000*"research" + 0.000*"university" + 0.000*"activity"'),
 (4,
  u'0.002*"cognitive" + 0.002*"" + 0.001*"function" + 0.001*"intake" + 0.001*"fluid" + 0.001*"stress" + 0.001*"functional" + 0.001*"pain" + 0.001*"studies" + 0.001*"may"'),
 (5,
  u'0.002*"" + 0.001*"network" + 0.001*"student" + 0.001*"mental" + 0.001*"young" + 0.001*"pe

## Below I just copied and pasted code for an RNN (Recurrent Neural Network)

In [14]:
# keras
np.random.seed(13)
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Activation, SimpleRNN, GRU, LSTM, Bidirectional, Convolution1D, MaxPooling1D, Merge, Dropout
from IPython.display import SVG
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot, plot_model
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop

Using Theano backend.


In [42]:
# For simplicity, one "sentence" per line & ensuring a count of two words min

ltzr = Lemmatizer()
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

corpus = [ltzr(c).encode('ascii', 'ignore') for c in corpus]
#print(corpus)

# Tokenize using Keras
my_filter='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(filters=my_filter)
tokenizer.fit_on_texts(corpus)

# Convert tokenized sentences to sequence format
sequences = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)

# Vocab size
V = len(tokenizer.word_index) + 1

# Dimension of our network
dim = 100
window_size = 2

# What is this output? 
#sequences

[[5,
  1,
  151,
  152,
  2,
  20,
  1,
  153,
  7,
  82,
  154,
  6,
  83,
  4,
  16,
  84,
  3,
  3,
  155,
  85,
  156,
  32,
  86,
  87,
  33,
  86,
  157,
  4,
  158,
  7,
  4,
  88,
  7,
  10,
  6,
  83,
  4,
  16,
  159,
  3,
  89,
  160,
  2,
  90,
  15,
  3,
  161,
  162,
  163,
  164,
  1,
  165,
  4,
  166,
  5,
  91,
  167,
  25,
  34,
  168,
  8,
  91,
  32,
  2,
  92,
  169,
  170,
  171,
  6,
  172,
  4,
  173,
  35,
  2,
  10,
  93,
  10,
  174,
  94,
  4,
  54,
  32,
  175,
  36,
  35,
  4,
  95,
  2,
  21,
  55,
  1,
  176,
  56,
  177,
  25,
  178,
  7,
  179,
  8,
  36,
  33,
  2,
  21,
  4,
  37,
  7,
  10,
  2,
  19,
  96,
  180,
  181,
  182,
  6,
  36,
  97,
  87,
  33,
  183,
  38,
  1,
  184,
  185,
  2,
  1,
  186,
  187,
  2,
  21,
  188,
  1,
  189,
  1,
  9,
  6,
  1,
  98,
  190,
  1,
  33,
  2,
  21,
  4,
  99,
  100,
  2,
  10,
  191,
  25,
  192,
  8,
  193,
  57,
  96,
  4,
  16,
  194,
  101,
  11,
  39,
  195,
  25,
  196,
  58,
  8,
  22,
  197,
  

In [54]:
def generate_data(sequences, window_size, V):
    maxlen = window_size*2
    
    # For each line (sentence)
    for line in sequences:
        L = len(line)
        # Choose the target word
        for index, word in enumerate(line):
            # Create the window
            s = index-window_size
            e = index+window_size+1
                    
            in_words = []
            context_words = []
            # Create the input/outputs for skipgrams
            for i in range(s, e):
                if i != index and 0 <= i < L:
                    #in_words.append([word])
                    context_words.append(line[i])
            x = word
            y = context_words

            #x = np.array(in_words,dtype=np.int32)
            #y = np_utils.to_categorical(context_words, V)
            yield(x,y)

In [70]:
def print_window(x,y):
    index = tokenizer.word_index
    print(index.keys()[index.values().index(x)])
    print(map(lambda n: index.keys()[index.values().index(n)],y))

In [97]:
win_size = 4
count = 0
for x,y in generate_data(sequences, win_size, V):
    if [x] in tokenizer.texts_to_sequences(["charge", "state"]):
        print(count)
        print_window(x, y)
        count += 1

0
charge
['due', 'to', 'motion', 'of', 'constituent', 'density', 'stiffness', 'colour']
1
charge
['of', 'mass', 'e', 'and', 'constrain', 'by', 'the', 'quantum']
2
charge
['investigation', 'arrest', 'filing', 'of', 'trial', 'and', 'appeal']
3
charge
['court', 'disallowed', 'a', 'murder', 'against', 'keeler', 'under', 'california']
4
charge
['constant', 'h', 'the', 'elementary', 'e', 'and', 'the', 'boltzmann']
5
charge
['a', 'polarizers', 'for', 'rotate', 'rotating', 'charge', 'be', 'present']
6
charge
['for', 'rotate', 'charge', 'rotating', 'be', 'present', 'in', 'every']
7
state
['classical', 'everyday', 'case', 'the', 'of', 'a', 'quantum', 'system']
8
state
['a', 'general', 'oven', 'particle', 'sometimes', 'give', 'up', 'say']
9
state
['atom', 'in', 'an', 'oven', 'have', 'no', 'intrinsic', 'orientation']
10
state
['either', 'in', 'an', 'up', 'or', 'in', 'a', 'down']
11
state
['or', 'in', 'a', 'down']
12
state
['violates', 'a', 'federal', 'or', 'criminal', 'statute', 'or', 'in']
13
sta

In [114]:
Y = np.array(2*[3]+2*[1]+3*[3]+5*[1]+6*[2]).reshape(-1,1)

In [111]:
maxlen = 2*win_size
Z = []
X = []
for x,y in generate_data(sequences, win_size, V):
    if [x] in tokenizer.texts_to_sequences(["charge", "state"]):
        X.append(x)
        Z.append(y)
        
Z = sequence.pad_sequences(Z, maxlen=maxlen)
X = np.array(X).reshape(-1,1)
X = np.concatenate([X,Z], axis = 1)
print(X)

[[ 19  37   7  10   2  96 180 181 182]
 [ 19   2 210 106   6 211  11   1  41]
 [ 19   0  90 234 235   2 236   6 237]
 [ 19  27 255   3 256 113  46 257  14]
 [ 19  73 345   1 346 106   6   1 347]
 [ 19   3 361  15 362 363  19   4 364]
 [ 19  15 362  19 363   4 364   5 365]
 [ 13  39  72 378   1   2   3  41  78]
 [ 13   3 145  77 144 146  43  70 147]
 [ 13   9   5  17  77  34  54 104 388]
 [ 13 389   5  17  70  48   5   3 148]
 [ 13   0   0   0   0  48   5   3 148]
 [ 13 391   3 392  48  18  30  48   5]
 [ 13   0   0   0   0  52  42  99  81]
 [ 13  52  42  99  81  30  51  12 131]
 [ 13  51  12 131   3  66 397 398  13]
 [ 13  13  66 397 398  30  52 399  56]
 [ 13  12  11 411   1  66   6  14  53]]


In [115]:
print(X.shape, Y.shape)

((18, 9), (18, 1))


## Building a RNN

In [92]:
from sklearn.model_selection import train_test_split

In [116]:
# Train, test, split
X_train,X_test, y_train,  y_test = train_test_split(X,Y)

nb_epoch = 20

In [108]:
y_train

array([2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 2, 2])

In [119]:
max_features = 2000
# cut texts after this number of words (among top max_features most common words)
batch_size = 32

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen+1))
# Bidirectional LSTM!!!
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [120]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=[X_test, y_test])

Train...
Train on 13 samples, validate on 5 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x11c01b750>

In [121]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=[X_test, y_test])

Train...
Train on 13 samples, validate on 5 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x11fc6a6d0>

In [122]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=[X_test, y_test])

Train...
Train on 13 samples, validate on 5 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x11c288f90>