In [1]:
import sys
sys.path.append("..")

import json
import pickle
import numpy as np
from pprint import pprint
from collections import defaultdict

import matplotlib.pyplot as plt 

from lda import guidedlda as glda

from src.seeds import Seeds
from src.vectorizers import TokenVectorizer
from src.lda_utils import get_word_relevance, get_words_relevance, print_topics

import warnings
warnings.filterwarnings('ignore')

In [2]:
seeds = Seeds()

narcotics, weapons, investigation = seeds.get_final_filtered_seeds()

interesting_set = narcotics.union(weapons).union(investigation)
print(f"First five interesting: {list(interesting_set)[:5]}")

First five interesting: ['holdup', 'sulfate', 'smuggling', 'runaway', 'holstered']


In [3]:
vectors, vectorizer = TokenVectorizer.load_vectors_vectorizer(method="count")

In [4]:
vocab = vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))

In [29]:
g_numTopics = 14

In [30]:
seed_topic_list = [narcotics, investigation, weapons]
seed_topics = {}

for index in range(g_numTopics):
    for i, st in enumerate(seed_topic_list):
        for word in st:
            if word in word2id:
                seed_topics[word2id[word]] = 0
            else:
                print(f"{word} not found in vocabulary")

In [31]:
g_alpha = 1/g_numTopics
g_beta = 1/g_numTopics
g_iter = 100

glda_model = glda.GuidedLDA(n_topics=g_numTopics, 
                            n_iter=g_iter, 
                            random_state=0, 
                            refresh=10, 
                            alpha=g_alpha, 
                            eta=g_beta)

glda_model.fit(vectors, 
               seed_topics=seed_topics, 
               seed_confidence=0.7)

INFO:lda:n_documents: 123915
INFO:lda:vocab_size: 48480
INFO:lda:n_words: 89949046
INFO:lda:n_topics: 14
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -965998649
INFO:lda:<10> log likelihood: -824040779
INFO:lda:<20> log likelihood: -760762122
INFO:lda:<30> log likelihood: -751793045
INFO:lda:<40> log likelihood: -749190151
INFO:lda:<50> log likelihood: -748239937
INFO:lda:<60> log likelihood: -747830939
INFO:lda:<70> log likelihood: -747609779
INFO:lda:<80> log likelihood: -747423694
INFO:lda:<90> log likelihood: -747272555
INFO:lda:<99> log likelihood: -747200636


<lda.guidedlda.GuidedLDA at 0x20e4adde670>

In [32]:
# pickle.dump(glda_model, open("../data/models/Guided_07_IAC_exp_seed_minf_10_max_50%.pk", "wb"))
# glda_model = pickle.load(open("../data/models/Guided_07_IAC_exp_seed_minf_10_max_50%.pk", "rb"))

In [35]:
print("Guided lda topics")
print_topics(glda_model, 
             vectorizer, 
             n_top_words=10, 
             only_interesting=False)

print("\nTopics with only interesting words")
print_topics(glda_model, 
             vectorizer, 
             n_top_words=10, 
             only_interesting=True, 
             interesting_set=interesting_set)

Guided lda topics

Topic: 0
0.01*people + 0.01*testify + 0.01*officer + 0.01*police + 0.01*testimony + 0.01*jury + 0.01*witness + 0.01*statement + 0.01*tell + 0.01*defense

Topic: 1
0.03*appellant + 0.02*appellee + 0.01*plaintiff + 0.01*error + 0.01*pay + 0.01*bill + 0.01*suit + 0.01*jury + 0.01*claim + 0.01*contract

Topic: 2
0.03*contract + 0.02*plaintiff + 0.02*agreement + 0.01*lease + 0.01*sale + 0.01*business + 0.01*purchase + 0.01*term + 0.01*work + 0.01*pay

Topic: 3
0.02*petitioner + 0.02*fee + 0.02*award + 0.01*pay + 0.01*support + 0.01*attorney + 0.01*claimant + 0.01*marriage + 0.01*child + 0.01*respondent

Topic: 4
0.02*board + 0.01*illinois + 0.01*section + 0.01*department + 0.01*employee + 0.01*decision + 0.01*review + 0.01*public + 0.01*commission + 0.01*school

Topic: 5
0.02*child + 0.02*respondent + 0.01*testify + 0.01*medical + 0.01*hospital + 0.01*parent + 0.01*mother + 0.01*care + 0.01*minor + 0.01*father

Topic: 6
0.03*city + 0.02*county + 0.02*section + 0.01*statut