In [1]:
import sys
sys.path.append("..")

import pickle
import numpy as np
from collections import defaultdict

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.decomposition import LatentDirichletAllocation as lda

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from src.seeds import Seeds
from src.dataset import Dataset
from src.vectorizers import TokenVectorizer
from src.lda_utils import get_word_relevance, get_words_relevance, print_topics

import warnings
warnings.filterwarnings('ignore')

### Loading the trained LDA model

In [2]:
d = Dataset()
seeds = Seeds()
vectors, vectorizer = TokenVectorizer.load_vectors_vectorizer(method="count")
lda_model = pickle.load(open("../data/models/IAC_exp_seed_minf_10_max_50%.pk", "rb"))

In [3]:
narcotics, weapons, investigation = seeds.get_final_filtered_seeds()
total = narcotics.union(weapons).union(investigation)

vocab = vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))

### Finding most relevant topics for our study

In [4]:
topic_relevance = get_words_relevance(total, word2id, vocab, lda_model, normalize=True)
topic_relevance

{0: 23.728,
 1: 2.574,
 2: 10.009,
 3: 2.878,
 4: 8.32,
 5: 7.289,
 6: 2.607,
 7: 4.442,
 8: 4.776,
 9: 2.579,
 10: 4.522,
 11: 2.394,
 12: 3.563,
 13: 20.318}

In [5]:
selected_topics = 3

relevant_topics = sorted(topic_relevance.items(), key=lambda x : -x[1])[:selected_topics]
relevant_set = set([e[0] for e in relevant_topics])
relevant_topics

[(0, 23.728), (13, 20.318), (2, 10.009)]

### Creating a subset of tokens

In [6]:
final_tokens = []

for year in [2000, 1980, 1960, 1940, 1920, 1900, 1860, 1840, 1820, 1800, 1760]:
    
    tokens = d.load_dataset(year=year, fields={"tokens", "topic"})    
    for t in tokens:
        if np.argmax(t["topic"]) in relevant_set:
            final_tokens.append(t["tokens"])
    
    print(len(final_tokens))

3992
15949
27928
29446
32530
33529
33723
33800
33817
33818
33819


In [7]:
freq = defaultdict(lambda:0)
for doc in final_tokens:
    # for w in doc:
    for w in set(doc):
        freq[w] += 1
        
def sel_criterium(w):
    return (w in total) or ((len(w) >= 3) and (10 < freq[w] < 0.5*len(final_tokens)))
    
final_tokens = [[w for w in doc if sel_criterium(w)] for doc in final_tokens]

In [8]:
dv = TokenVectorizer(final_tokens, method="count")
vectors = dv.vectors()
print(f"Vocabulary length: {len(dv.vectorizer.vocabulary_)}")

Vocabulary length: 24308


### Grid search to find the optimal number of subtopics

In [9]:
search_params = {
    'n_components'  : list(range(8, 24, 2))
}

model = lda()
search = HalvingGridSearchCV(model, 
                             param_grid=search_params, 
                             min_resources="exhaust", 
                             factor=3, n_jobs=-1, 
                             cv=3, verbose=2)

search.fit(vectors)

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 11273
max_resources_: 33819
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 8
n_resources: 11273
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 1
n_candidates: 3
n_resources: 33819
Fitting 3 folds for each of 3 candidates, totalling 9 fits


HalvingGridSearchCV(cv=3, estimator=LatentDirichletAllocation(), n_jobs=-1,
                    param_grid={'n_components': [8, 10, 12, 14, 16, 18, 20,
                                                 22]},
                    refit=<function _refit_callable at 0x7f7a1a124ee0>,
                    verbose=2)

In [10]:
best_lda_model = search.best_estimator_
print("Best Model's Params: ", search.best_params_)
print("Best Log Likelihood Score: ", search.best_score_)

pickle.dump(best_lda_model, open("../data/models/FULL_exp_seed_t_0_2_13_minf_10_max_50%.pk", "wb"))

Best Model's Params:  {'n_components': 14}
Best Log Likelihood Score:  -67705552.06797697


In [9]:
best_lda_model = pickle.load(open("../data/models/FULL_exp_seed_t_0_2_13_minf_10_max_50%.pk", "rb"))

In [10]:
print_topics(best_lda_model, 
             dv.vectorizer, 
             n_top_words=10, 
             only_interesting=False)


Topic: 0
41526.87*plaintiff + 15818.44*indictment + 11484.34*instruction + 10753.32*property + 10516.26*ned + 9333.87*verdict + 9282.93*prove + 9270.49*there + 8964.61*steal + 8613.18*money

Topic: 1
22093.78*confession + 14491.38*respondent + 14256.24*attorney + 13580.18*miranda + 12867.04*claim + 12564.13*detective + 11107.03*interview + 10451.69*admit + 9378.95*suppress + 8543.58*assistance

Topic: 2
64608.86*car + 48187.01*arrest + 26095.53*man + 18799.52*station + 15880.7*drive + 15570.67*robbery + 15096.8*stop + 13696.52*street + 13234.92*door + 12284.61*identify

Topic: 3
39471.65*instruction + 25974.51*gun + 15960.05*weapon + 12679.91*attempt + 11245.52*battery + 10549.09*prove + 10377.14*intent + 9812.06*force + 9548.0*murder + 9382.27*doubt

Topic: 4
10652.16*test + 9131.69*expert + 6641.15*blood + 6478.09*grand + 5519.98*dna + 5042.3*sample + 3774.0*fingerprint + 3685.46*testing + 3321.33*analysis + 3220.88*admit

Topic: 5
53657.41*plea + 38822.91*judge + 30967.77*attorney 

In [11]:
panel = pyLDAvis.sklearn.prepare(best_lda_model, vectors, dv.vectorizer, mds='tsne')
panel

In [12]:
panel = pyLDAvis.sklearn.prepare(best_lda_model, vectors, dv.vectorizer)
panel