# Topic modelling

The goal is to find three topics in the collection: the first one regarding drugs, the second weapons, and the third investigation. 

We start with a classic model to then test the guided lda approach. We do not expect the first one to find the three topics we want, while the second should guide the topic modelling towards the required goal.

References:
https://medium.com/analytics-vidhya/how-i-tackled-a-real-world-problem-with-guidedlda-55ee803a6f0d


In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("..")

import json
import numpy as np
from collections import defaultdict

from sklearn.decomposition import LatentDirichletAllocation as lda
from lda import guidedlda as glda

# import pyLDAvis.sklearn
# pyLDAvis.enable_notebook()

from src.dataset import Dataset
from src.vectorizers import TokenVectorizer

In [None]:
dataset = Dataset()
# `year` selects which period of the collection is loaded. According to the
# original note, None loads everything; set a value to load only that period.
year = None

# Load the tokenized documents of the Illinois Appellate Court only.
# The call now forwards `year` instead of a hard-coded None, so editing the
# variable above actually changes what is loaded.
tokens = dataset.load_dataset(year=year,
                              tokens=True,
                              courts={"Illinois Appellate Court"})

### Filter the data, if you want

In [None]:
# Corpus-wide occurrence count of every token (a token repeated inside one
# document is counted each time). Iterating over `set(document)` instead
# would give the document frequency.
freq = defaultdict(int)
for document in tokens:
    for token in document:
        freq[token] += 1

In [None]:
# Seed vocabularies, one list per target topic. They are used both to protect
# these words from the frequency filter and to seed the guided LDA model.
narcotics = ['cannabis', 'cocaine', 'methamphetamine', 'drugs', 'drug', 'marijuana',
             'ecstasy', 'lsd', 'ketamine', 'heroin', 'fentanyl', 'overdose']

# 'carbine' replaces the misspelled 'carabine', which could never match an
# English token; the duplicated 'rifle' entry has been dropped.
weapons = ['gun', 'knife', 'weapon', 'firearm', 'rifle', 'carbine', 'shotgun', 'handgun',
           'revolver', 'musket', 'pistol', 'derringer', 'assault', 'sword', 'blunt']

investigation = ['gang', 'mafia', 'serial', 'killer', 'rape', 'theft', 'recidivism',
                 'arrest', 'robbery', 'cybercrime', 'cyber', 'crime']

# Union of all seed words, for fast membership tests.
interesting_set = set(narcotics + weapons + investigation)

In [None]:
def sel_criterium(w):
    """Tell whether token `w` survives the vocabulary filter.

    Seed words (members of `interesting_set`) are always kept. Any other
    token must be at least three characters long and its count in `freq`
    must be strictly between 10 and half the number of documents.
    """
    if w in interesting_set:
        return True
    if len(w) < 3:
        return False
    return 10 < freq[w] < 0.5 * len(tokens)

# Drop the rejected tokens from every document, keeping document order.
tokens = [list(filter(sel_criterium, doc)) for doc in tokens]

### Vectorize the documents
The vectorizer is a count-based one (`method="count"`); we use its output to fit the LDA model.

In [None]:
# Build the count-based vectorizer over the filtered tokens.
dv = TokenVectorizer(tokens, method="count")

# Compute the document vectors and hand them to the vectorizer's save routine.
vectors = dv.vectors()
dv.save_vectors_vectorizer(vectors)

vocabulary_size = len(dv.vectorizer.vocabulary_)
print(f"Vocabulary length: {vocabulary_size}")

### Loading precomputed vectors

In [None]:
# Load the count vectors together with their vectorizer
# (presumably the artifacts written by `save_vectors_vectorizer` — confirm).
vectors, vectorizer = TokenVectorizer.load_vectors_vectorizer(method="count")

## Classic LDA model

The number of topics is set to ten (`numTopics`), while alpha and beta have values proposed in the literature. 

Griffiths TL, Steyvers M (2004). “Finding Scientific Topics.” Proceedings of the National Academy of Sciences of the United States of America, 101, 5228–5235.

In [None]:
# Hyper-parameters of the classic LDA model.
numTopics = 10
alpha = 0.1   # document-topic prior (alternative heuristic: 50 / numTopics)
beta = 0.01   # topic-word prior

lda_settings = dict(
    n_components=numTopics,
    doc_topic_prior=alpha,
    topic_word_prior=beta,
    random_state=0,   # fixed seed for reproducible topics
    n_jobs=-1,
)
lda_model = lda(**lda_settings)

# Fit on the document vectors and keep the per-document topic distribution.
lda_output = lda_model.fit_transform(vectors)

In [None]:
def print_topics(model, vectorizer, n_top_words=10, only_interesting=False, interesting_set=None):
    """Print the highest-weighted words of every topic of a fitted model.

    Args:
        model: fitted topic model exposing ``components_``, iterated as one
            row of word weights per topic.
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``; position ``i`` of the returned
            vocabulary names column ``i`` of ``components_``.
        n_top_words: maximum number of words printed per topic.
        only_interesting: when true, only words contained in
            ``interesting_set`` are ranked and printed.
        interesting_set: container of words to keep when ``only_interesting``
            is true; ``None`` is treated as an empty set.

    Returns:
        None. The topics are written to standard output.
    """
    keep = frozenset() if interesting_set is None else interesting_set

    # Newer scikit-learn versions only provide get_feature_names_out();
    # fall back to the legacy accessor when it is not available.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    vocab = get_names()

    topic_words = {}
    for topic, comp in enumerate(model.components_):
        # Column indices sorted from the heaviest to the lightest word.
        ranked = np.argsort(comp)[::-1]
        if only_interesting:
            # Filter first, truncate afterwards, so that up to n_top_words
            # interesting words are shown even when they rank low.
            topic_words[topic] = [w for w in (vocab[i] for i in ranked)
                                  if w in keep][:n_top_words]
        else:
            topic_words[topic] = [vocab[i] for i in ranked[:n_top_words]]

    for topic, words in topic_words.items():
        print('\nTopic: %d' % topic)
        print('%s' % ', '.join(words))

### Topics relevant words

The next step is to check the words for each topic. The results are interesting and expected, but we can't see a distinction between the topics we want.

In [None]:
# Top ten words of every classic-LDA topic, over the whole vocabulary.
print_topics(lda_model, vectorizer, n_top_words=10, only_interesting=False)

### Consider only words of interest
We now print the word distribution, considering only interesting words

In [None]:
# Same report, restricted to the seed words collected in `interesting_set`.
print_topics(
    lda_model,
    vectorizer,
    n_top_words=10,
    only_interesting=True,
    interesting_set=interesting_set,
)

We can see that the topics blend together even when considering only the words of interest, so LDA must be guided. 

## Guided LDA approach
We now guide the lda process by setting some seeds, exploiting the model defined by the GuidedLDA package.

In [None]:
# `vocab` used to exist only as a local inside print_topics, so this cell
# raised NameError; build it here from the loaded vectorizer.
# Newer scikit-learn versions only provide get_feature_names_out().
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
vocab = list(_get_names())

# Map every vocabulary word to its position in `vocab`.
word2id = {word: idx for idx, word in enumerate(vocab)}

In [None]:
# Topic ids follow the order of this list:
# 0 = narcotics, 1 = investigation, 2 = weapons.
seed_topic_list = [narcotics, investigation, weapons]

# Map the vocabulary id of every known seed word to its topic id;
# seed words missing from the vocabulary are reported and skipped.
seed_topics = {}
for topic_id, seed_words in enumerate(seed_topic_list):
    for seed in seed_words:
        token_id = word2id.get(seed)
        if token_id is None:
            print(f"{seed} not found in vocabulary")
            continue
        seed_topics[token_id] = topic_id

In [None]:
# Guided LDA with the same number of topics and the same priors as the
# classic model. `numTopics` replaces a hard-coded 10, so the two models
# cannot drift apart when the topic count is changed.
glda_model = glda.GuidedLDA(n_topics=numTopics,
                            n_iter=250,
                            random_state=0,
                            refresh=10,
                            alpha=alpha,
                            eta=beta)

# `seed_topics` maps word ids to the topic they should be nudged towards;
# `seed_confidence` is the strength of that nudge.
glda_model.fit(vectors,
               seed_topics=seed_topics,
               seed_confidence=0.90)

In [None]:
# `n_top_words` and `vocab` were never defined at notebook level (they only
# existed inside print_topics), so this cell raised NameError.
n_top_words = 10  # same value passed to print_topics for the classic model
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
vocab = list(_get_names())
vocab_array = np.array(vocab)  # built once, reused for every topic

# For every guided topic print twice as many top words as `n_top_words`,
# followed by its top seed words.
topic_word = glda_model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = vocab_array[np.argsort(topic_dist)][::-1]
    interesting_topic_words = [w for w in topic_words if w in interesting_set][:n_top_words]
    print(f"Topic {i}:\n{' '.join(topic_words[:n_top_words * 2])}\n{' '.join(interesting_topic_words)}")

## LSI

In [None]:
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

In [None]:
# Invert the vectorizer's word -> id vocabulary into an id -> word mapping.
reverse_vocabulary = {idx: term for term, idx in dv.vectorizer.vocabulary_.items()}

In [None]:
# The vectors are transposed before being handed to gensim
# (presumably from documents x terms to terms x documents — confirm).
transposed_vectors = vectors.transpose()
model = LsiModel(transposed_vectors,
                 id2word=reverse_vocabulary,
                 num_topics=numTopics)
topics = model.get_topics()

In [None]:
# Print the ten strongest (word, weight) pairs of every LSI topic and collect
# the bare words per topic in `topWords`.
topWords = []
for topicno in range(numTopics):
    # Query the model once per topic; the result is reused for both the
    # printed report and the collected word list.
    top_terms = model.show_topic(topicno, topn=10)
    print(f'Topic {topicno}')
    print([(term, round(weight, 2)) for term, weight in top_terms], '\n')
    topWords.append([term for term, _ in top_terms])