In [1]:
import sys
sys.path.append("..")

import json
import pickle
import numpy as np
from pprint import pprint
from collections import defaultdict

import matplotlib.pyplot as plt 

from lda import guidedlda as glda

from src.seeds import Seeds
from src.vectorizers import TokenVectorizer
from src.lda_utils import get_word_relevance, get_words_relevance, print_topics

import warnings
warnings.filterwarnings('ignore')

In [2]:
seeds = Seeds()

narcotics, weapons, investigation = seeds.get_final_filtered_seeds()

interesting_set = narcotics.union(weapons).union(investigation)
print(f"First five interesting: {list(interesting_set)[:5]}")

First five interesting: ['crip', 'robberies', 'chara', 'amytal', 'beretta']


In [3]:
vectors, vectorizer = TokenVectorizer.load_vectors_vectorizer(method="count")

In [4]:
vocab = vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))

In [5]:
seed_topic_list = [narcotics, investigation, weapons]
seed_topics = {}

for i, st in enumerate(seed_topic_list):
    for word in st:
        if word in word2id:
            seed_topics[word2id[word]] = i
        else:
            print(f"{word} not found in vocabulary")

In [8]:
g_numTopics = 14
g_alpha = 1/g_numTopics
g_beta = 1/g_numTopics
g_iter = 10

glda_model = glda.GuidedLDA(n_topics=g_numTopics, 
                            n_iter=g_iter, 
                            random_state=0, 
                            refresh=10, 
                            alpha=g_alpha, 
                            eta=g_beta)

glda_model.fit(vectors, 
               seed_topics=seed_topics, 
               seed_confidence=0.7)

INFO:lda:n_documents: 123915
INFO:lda:vocab_size: 48480
INFO:lda:n_words: 89949046
INFO:lda:n_topics: 1
INFO:lda:n_iter: 1


KeyboardInterrupt: 

In [None]:
# pickle.dump(glda_model, open("../data/models/Guided_07_IAC_exp_seed_minf_10_max_50%.pk", "wb"))
# glda_model = pickle.load(open("../data/models/Guided_07_IAC_exp_seed_minf_10_max_50%.pk", "rb"))

In [9]:
print("Guided lda topics")
print_topics(glda_model, 
             vectorizer, 
             n_top_words=10, 
             only_interesting=False)

print("\nTopics with only interesting words")
print_topics(glda_model, 
             vectorizer, 
             n_top_words=10, 
             only_interesting=True, 
             interesting_set=interesting_set)

Guided lda topics


AttributeError: 'GuidedLDA' object has no attribute 'components_'