In [15]:
import pickle
import sys
from collections import Counter, defaultdict

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation as lda

# Make the project-local `src` package importable from the notebook directory.
sys.path.append("..")

from src.seeds import Seeds
from src.dataset import Dataset
from src.vectorizers import TokenVectorizer
from src.lda_utils import get_word_relevance, get_words_relevance, print_topics


In [2]:
# Project helpers: corpus accessor, seed-term provider, and the previously
# saved count vectors together with the vectorizer that produced them.
d = Dataset()
seeds = Seeds()
vectors, vectorizer = TokenVectorizer.load_vectors_vectorizer(method="count")

# Load the previously fitted LDA model. The context manager guarantees the file
# handle is closed (the bare `open(...)` used before never closed it).
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
with open("../data/models/IAC_exp_seed_minf_10_max_50%.pk", "rb") as model_file:
    lda_model = pickle.load(model_file)



In [3]:
# Three seed-term collections; `total` is the union of all of them.
narcotics, weapons, investigation = seeds.get_final_filtered_seeds()
total = narcotics.union(weapons, investigation)

# Vocabulary of the loaded vectorizer and the reverse lookup word -> column index.
# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, `get_feature_names()`
# no longer exists in recent releases (replaced by `get_feature_names_out()`) --
# confirm the installed version before upgrading.
vocab = vectorizer.get_feature_names()
word2id = {word: position for position, word in enumerate(vocab)}

In [4]:
# Score every topic of the loaded LDA model against the full seed-term set.
# The result is displayed as a mapping of topic index -> score; the exact
# scoring and the effect of `normalize` are defined in src.lda_utils.
topic_relevance = get_words_relevance(
    total,
    word2id,
    vocab,
    lda_model,
    normalize=True,
)
topic_relevance

{0: 23.728,
 1: 2.574,
 2: 10.009,
 3: 2.878,
 4: 8.32,
 5: 7.289,
 6: 2.607,
 7: 4.442,
 8: 4.776,
 9: 2.579,
 10: 4.522,
 11: 2.394,
 12: 3.563,
 13: 20.318}

In [7]:
# Order the (topic, score) pairs from highest to lowest score and keep the top two.
# NOTE(review): the output stored under this cell lists three topics, so it was
# produced with a different cut-off than the one in the code -- re-run the cell
# or confirm which cut-off is intended.
ranked_topics = sorted(topic_relevance.items(), key=lambda item: -item[1])
relevant_topics = ranked_topics[:2]

# Topic indices only, for fast membership tests.
relevant_set = {topic_id for topic_id, _score in relevant_topics}
relevant_topics

[(0, 23.728), (13, 20.318), (2, 10.009)]

In [8]:
# Collect the token lists of every document whose highest-scoring topic
# (index of the largest entry of its "topic" field) is one of the relevant topics.
# NOTE(review): 1880 and 1780 are absent from this otherwise regular sequence of
# years -- confirm that is intentional.
final_tokens = []

for year in [2000, 1980, 1960, 1940, 1920, 1900, 1860, 1840, 1820, 1800, 1760]:
    tokens = d.load_dataset(year=year, fields={"tokens", "topic"})
    final_tokens.extend(
        entry["tokens"]
        for entry in tokens
        if np.argmax(entry["topic"]) in relevant_set
    )

    # Running total of selected documents after each year.
    print(len(final_tokens))

3992
15949
27928
29446
32530
33529
33723
33800
33817
33818
33819


In [13]:
# Document frequency of every word: `set(doc)` makes a word count at most once
# per document, no matter how often it occurs inside that document.
freq = Counter(w for doc in final_tokens for w in set(doc))

# Selection thresholds. The upper bound depends only on the corpus size, so it is
# computed once here instead of on every call to `sel_criterium`.
min_word_length = 3
min_doc_freq = 10                        # exclusive lower bound
max_doc_freq = 0.5 * len(final_tokens)   # exclusive upper bound: half of all documents


def sel_criterium(w):
    """Return True when word `w` should be kept in the corpus.

    A word is kept if it is one of the seed terms in `total`, or if it has at
    least `min_word_length` characters and its document frequency lies strictly
    between `min_doc_freq` and `max_doc_freq`.
    """
    return (w in total) or (
        len(w) >= min_word_length and min_doc_freq < freq[w] < max_doc_freq
    )


# Drop the rejected words from each document; the number of documents is unchanged.
final_tokens = [[w for w in doc if sel_criterium(w)] for doc in final_tokens]

In [14]:
# Vectorize the filtered documents with the project's count-based TokenVectorizer.
# Note that this rebinds `vectors`, which earlier held the vectors loaded from disk.
dv = TokenVectorizer(final_tokens, method="count")
vectors = dv.vectors()

vocabulary = dv.vectorizer.vocabulary_
print(f"Vocabulary length: {len(vocabulary)}")

Vocabulary length: 24308


In [23]:
# Fit a 10-topic LDA model on the filtered count vectors, using all CPU cores.
# LDA fitting is stochastic: without a fixed seed every run yields different
# topics, so the topic listing below could not be reproduced. Pin the seed.
lda_random_state = 0
model = lda(n_components=10, n_jobs=-1, random_state=lda_random_state)
model.fit(vectors)

LatentDirichletAllocation(n_jobs=-1)

In [25]:
# Print the ten top-weighted words of every topic of the newly fitted model.
# `only_interesting=False` together with `interesting_set=total` is passed through
# to the project helper; see src.lda_utils.print_topics for their exact effect.
print_topics(
    model,
    dv.vectorizer,
    n_top_words=10,
    only_interesting=False,
    interesting_set=total,
)


Topic: 0
40912.49*car + 21766.07*arrest + 20837.02*man + 19230.94*gun + 15370.24*burglary + 14982.06*doubt + 14606.74*door + 14345.15*store + 14184.14*prove + 12312.06*steal

Topic: 1
34749.64*prosecutor + 34004.88*juror + 28016.52*judge + 18362.78*comment + 17562.72*examination + 16307.33*instruction + 15871.13*objection + 15772.78*prejudice + 15449.59*fair + 15012.11*death

Topic: 2
25642.1*test + 18899.16*section + 13760.37*alcohol + 11216.99*drive + 10294.83*blood + 9474.21*statute + 8979.82*code + 8687.84*vehicle + 7806.35*ilcs + 7537.04*west

Topic: 3
75725.05*murder + 36874.05*victim + 31550.56*instruction + 29636.91*death + 22217.76*gun + 20077.75*armed + 20054.15*shoot + 19710.06*attempt + 18768.55*kill + 17897.51*degree

Topic: 4
44955.21*search + 28883.82*warrant + 21025.18*arrest + 19860.63*drug + 19056.78*substance + 17467.61*cocaine + 17246.13*possession + 15230.22*control + 12130.41*information + 12027.32*united

Topic: 5
60657.94*petition + 45097.35*file + 42030.06*att