# Topic Model
In this part we run a topic model analysis 

In [1]:
!python -m pip install json-lines



In [2]:
import json_lines

with open('Illinois-20200302-xml/data/data.jsonl/data.jsonl', 'rb') as f: # opening file in binary(rb) mode 
    df = list(json_lines.reader(f))
    for item in json_lines.reader(f):
        print(item) #or use print(item['X']) for printing specific data
        break

In [3]:
import json

tot = len(df)
json_list = []

for n in range(0,tot):
    json_list.append(json.dumps(df[n]['casebody']))


In [4]:
import re
import nltk
import json
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    # Convert words to lower case
    text = text.lower() 

    text = re.sub('<.*?>', ' ', text)
    text = re.sub('[a-zA-Z]+\\\\[a-zA-Z]+', ' ', text)
    text = re.sub('\\\\[a-zA-Z]+', ' ', text)
    text = re.sub('\\\\"', '"', text)
    text = re.sub('[0-9]+[a-zA-Z]+', '', text)
    text = re.sub('\*', '', text)
    text = re.sub('amp', '', text)
    text = re.sub('[0-9]+', '', text)
    
    #Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = " ".join([i for i in text.split() if i not in stop_words])
    
    #Remove punctuation
    words = nltk.word_tokenize(text)
    text = [word for word in words if word.isalnum()]

    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tompe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tompe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# for topic model

texts = []

for sentence in json_list:
    texts.append(clean_text(sentence))

In [6]:
# Necessary because pyLDAvis make some trouble

from IPython.display import HTML
css_str = '<style> \
.jp-icon-warn0 path {fill: var(--jp-warn-color0);} \
.bp3-button-text path { fill: var(--jp-inverse-layout-color3);} \
.jp-icon-brand0 path { fill: var(--jp-brand-color0);} \
text.terms { fill: #616161;} \
</style>'
display(HTML(css_str))

### Topic model preparation

In [24]:
# Set training parameters.
num_topics = 30
chunksize = 2000
passes = 35
iterations = 600
eval_every = None  # Don't evaluate model perplexity, takes too much time

  and should_run_async(code)


In [8]:
# Remove rare and common tokens.
import gensim
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
import gensim.corpora as corpora




In [9]:
# Create a dictionary representation of the documents.
id2word = corpora.Dictionary(texts)

corpus = [id2word.doc2bow(text) for text in texts]


## LDA

In [25]:

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=chunksize,
                                           passes=passes,
                                           alpha='auto',
                                           per_word_topics=True,
                                           eval_every=eval_every,
                                           iterations=iterations)

  and should_run_async(code)


In [26]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Print the Keyword in the 10 topics
#pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

lda_topics = lda_model.get_topics()

for topicno in range(num_topics):
    print('Topic {}'.format(topicno))
    print([(x, round(y, 2)) for x, y in lda_model.show_topic(topicno, topn=10)], '\n')

  and should_run_async(code)


Topic 0
[('contract', 0.04), ('agreement', 0.02), ('note', 0.01), ('would', 0.01), ('made', 0.01), ('time', 0.01), ('upon', 0.01), ('parties', 0.01), ('one', 0.01), ('money', 0.01)] 

Topic 1
[('bank', 0.06), ('trust', 0.04), ('stock', 0.03), ('corporation', 0.03), ('company', 0.03), ('lease', 0.03), ('trustee', 0.01), ('loan', 0.01), ('national', 0.01), ('business', 0.01)] 

Topic 2
[('said', 0.11), ('bill', 0.04), ('decree', 0.02), ('court', 0.02), ('upon', 0.02), ('complainant', 0.02), ('defendants', 0.01), ('made', 0.01), ('bond', 0.01), ('defendant', 0.01)] 

Topic 3
[('estate', 0.05), ('death', 0.03), ('deceased', 0.02), ('will', 0.01), ('testator', 0.01), ('probate', 0.01), ('life', 0.01), ('administrator', 0.01), ('died', 0.01), ('children', 0.01)] 

Topic 4
[('pension', 0.06), ('benefits', 0.05), ('contribution', 0.04), ('res', 0.04), ('estoppel', 0.03), ('liquor', 0.03), ('retirement', 0.03), ('judicata', 0.03), ('fund', 0.02), ('disability', 0.02)] 

Topic 5
[('respondent', 

In [27]:
tid = id2word.token2id['court']
lda_model.get_term_topics(tid)

[(2, 0.02158861),
 (5, 0.022726068),
 (6, 0.06882597),
 (9, 0.019694839),
 (11, 0.022138435),
 (12, 0.014439493),
 (16, 0.010350409),
 (20, 0.029297294),
 (22, 0.03261238),
 (23, 0.050701145),
 (24, 0.0158358),
 (28, 0.016544117),
 (29, 0.03536669)]

In [13]:
!pip install pyLDAvis



In [28]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import matplotlib.pyplot as plt
%matplotlib inline

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

In [29]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.545871456741377


## Latent Semantic Indexin

In [59]:
from gensim.models import LsiModel

model = LsiModel(corpus, id2word=id2word, num_topics=num_topics)

vectors = model[corpus]

type(vectors)

gensim.interfaces.TransformedCorpus

In [60]:
topics = model.get_topics()
topics.shape

(30, 386575)

In [61]:

for topicno in range(num_topics):
    print('Topic {}'.format(topicno))
    print([(x, round(y, 2)) for x, y in model.show_topic(topicno, topn=10)], '\n')

Topic 0
[('court', 0.44), ('defendant', 0.4), ('trial', 0.22), ('evidence', 0.16), ('case', 0.15), ('would', 0.13), ('plaintiff', 0.13), ('said', 0.12), ('state', 0.12), ('people', 0.11)] 

Topic 1
[('defendant', 0.66), ('said', -0.21), ('trial', 0.16), ('people', 0.15), ('court', -0.14), ('upon', -0.13), ('property', -0.13), ('act', -0.12), ('plaintiff', -0.11), ('company', -0.1)] 

Topic 2
[('said', -0.52), ('court', 0.37), ('defendant', -0.25), ('trial', 0.2), ('company', -0.15), ('upon', -0.14), ('app', 0.14), ('section', 0.14), ('made', -0.12), ('motion', 0.11)] 

Topic 3
[('plaintiff', -0.74), ('court', 0.19), ('state', 0.18), ('people', 0.15), ('evidence', -0.14), ('section', 0.11), ('said', 0.11), ('county', 0.11), ('act', 0.1), ('jury', -0.1)] 

Topic 4
[('court', -0.42), ('act', 0.31), ('section', 0.24), ('trial', -0.22), ('state', 0.2), ('said', -0.19), ('city', 0.17), ('board', 0.16), ('public', 0.15), ('illinois', 0.12)] 

Topic 5
[('defendant', 0.46), ('evidence', -0.38),

In [62]:
coherence = CoherenceModel(model=model,
                              corpus=corpus,
                              dictionary=id2word,
                              coherence='u_mass')
print(coherence.get_coherence())

-1.16028219230003


In [30]:
tid = id2word.token2id['court']
lda_model.get_term_topics(tid)

[(2, 0.02158861),
 (5, 0.022726068),
 (6, 0.06882597),
 (9, 0.019694839),
 (11, 0.022138435),
 (12, 0.014439493),
 (16, 0.010350409),
 (20, 0.029297294),
 (22, 0.03261238),
 (23, 0.050701145),
 (24, 0.0158358),
 (28, 0.016544117),
 (29, 0.03536669)]

In [55]:
narcotics = ['alcohol','narcotics','narco','narcotic','cannabis', 'cocaine', 'methamphetamine', 'smart drugs', 'marijuana', 'MDMA', 'LSD', 'KETAMINA', 'heroin', 'fentanyl','drugs']
weapons = ['weapons','weapon','gun', 'knife', 'weapon', 'firearm', 'rifle', 'carabine', 'shotgun', 'assaults rifle', 'sword', 'blunt objects','pistol'] 
investigation = ['investigation','police','detective','gang', 'crime','robber', 'black','mafia', 'serial kiler', 'rape', 'thefts', 'recidivism', 'arrest', 'ethnicity', 'caucasian', 'afroamerican', 'native american', 'hispanic', 'gender', 'male', 'female', 'man', 'woman', 'girl', 'boy', 'robbery', 'cybercrime']

In [56]:
for n in narcotics:
    try:
        tid = id2word.token2id[n]
        print(n + ":")
        print(lda_model.get_term_topics(n))
    except:
        print(n + " not find")

alcohol:
[(4, 0.019034302)]
narcotics:
[]
narco:
[]
narcotic:
[]
cannabis:
[]
cocaine:
[]
methamphetamine not find
smart drugs not find
marijuana:
[]
MDMA not find
LSD not find
KETAMINA not find
heroin:
[]
fentanyl:
[]
drugs:
[]


In [57]:
for w in weapons:
    try:
        tid = id2word.token2id[w]
        print(w + ":")
        print(lda_model.get_term_topics(tid))
    except:
        print(w + " not find")

weapons:
[]
weapon:
[]
gun:
[]
knife:
[]
weapon:
[]
firearm:
[]
rifle:
[]
carabine not find
shotgun:
[]
assaults rifle not find
sword:
[]
blunt objects not find
pistol:
[]


In [58]:
for i in investigation:
    try:
        tid = id2word.token2id[i]
        print(i + ":")
        print(lda_model.get_term_topics(tid))
    except:
        print(i + " not find")

investigation:
[]
police:
[(18, 0.017122451)]
detective:
[]
gang:
[]
crime:
[(16, 0.010470388)]
robber:
[]
black:
[]
mafia:
[]
serial kiler not find
rape:
[]
thefts:
[]
recidivism:
[]
arrest:
[]
ethnicity:
[]
caucasian:
[]
afroamerican not find
native american not find
hispanic:
[]
gender:
[]
male:
[]
female:
[]
man:
[]
woman:
[]
girl:
[]
boy:
[]
robbery:
[]
cybercrime not find
