In [1]:
# prepare the dataset we’ll be working with.

import os
import csv

def load_first_column(dirname):
    """Collect the first CSV field of every data row found under ``dirname``.

    Every regular file directly inside ``dirname`` is parsed with
    ``csv.reader``. The first row of each file is treated as a header and
    skipped; the first field of each remaining non-empty row is collected.

    Args:
        dirname: Path of the directory that holds the CSV files.

    Returns:
        list[str]: Collected values, ordered by file name and then by row.
        Empty when ``dirname`` does not exist.
    """
    values = []
    if not os.path.exists(dirname):
        # Best effort, as before: a missing directory simply yields no data.
        return values
    # os.listdir() returns entries in arbitrary order; sorting keeps document
    # indices (e.g. corpus[20] further down) stable between runs and machines.
    for entry in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, entry)
        if not os.path.isfile(path):
            continue  # sub-directories cannot be opened as CSV files
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(path, 'r', newline='') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the header row, if any
            # Blank lines are parsed as [] and would make row[0] raise.
            values.extend(row[0] for row in reader if row)
    return values


dirname = '../../../out'
data = load_first_column(dirname)

NUM_DOCUMENTS = len(data)
print(NUM_DOCUMENTS)
print(' \n'.join(data[:5]))

13956
Two Suspects Arrested in Home Invasion Robbery    NR17032ma 
Suspect Arrested for Chinatown Murders    NR17033ml 
Hit and Run Collision Leaves Pedestrian Dead    NR17033ne 
Fatal Stabbing of a 27-year-old Man   NR17035im 
Press Conference   NA17014ma


In [2]:
# Gensim doesn’t have an implementation for NMF so we’re only going to play with LDA and 
# LSI (Latent Semantic Indexing AKA Latent Semantic Analysis) models.

import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
# Number of latent topics each model below is asked to infer.
NUM_TOPICS = 20
# Kept as a frozenset: clean_text() tests every token for membership, and a
# set lookup is O(1) whereas the list returned by NLTK is scanned linearly.
STOPWORDS = frozenset(stopwords.words('english'))
 
# A token is kept when it *starts* with at least three letters/hyphens.
# Written as a raw string: in a normal string literal "\-" is an invalid
# escape sequence, which newer Python versions warn about. Compiled once
# here instead of once per token.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Lower-case and tokenize ``text``, dropping stop words and noise tokens.

    Args:
        text: Raw document string.

    Returns:
        list[str]: Tokens, in their original order, that are not in
        ``STOPWORDS`` and whose first three or more characters are ASCII
        letters or hyphens. The pattern is anchored only at the start of the
        token, so e.g. ``"abc123"`` is kept while ``"5th"`` and ``"st"`` are
        dropped.
    """
    tokens = word_tokenize(text.lower())
    return [
        token for token in tokens
        if token not in STOPWORDS and _TOKEN_PATTERN.match(token)
    ]
 
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = [clean_text(document) for document in data]

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form (bag of words)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

# Have a look at the document at index 20 (the 21st): [(word_id, count), ...]
print(corpus[20])

# LDA training is randomised. Fixing the seed makes the inferred topics, and
# every topic number quoted further down, reproducible on a fresh kernel.
LDA_RANDOM_STATE = 42

# Build the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=LDA_RANDOM_STATE,
)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

[(27, 1), (77, 1), (78, 1)]


In [4]:
# Let's now display the topics the two models have inferred:
def print_model_topics(model, label, num_topics=NUM_TOPICS, num_words=5):
    """Print the highest-weighted words of each topic of ``model``.

    Args:
        model: Trained topic model exposing ``print_topic(topic_id, topn)``.
        label: Short model name used in the heading, e.g. ``"LDA"``.
        num_topics: Number of topics to list, starting at topic 0.
        num_words: How many of the most representative words to show per topic.
    """
    print("%s Model:" % label)
    for idx in range(num_topics):
        # Show the `num_words` most representative words of this topic
        print("Topic #%s:" % idx, model.print_topic(idx, num_words))
    print("=" * 20)


print_model_topics(lda_model, "LDA")
print_model_topics(lsi_model, "LSI")

LDA Model:
Topic #0: 0.104*"death" + 0.072*"man" + 0.063*"found" + 0.033*"stabbed" + 0.029*"dead"
Topic #1: 0.101*"lapd" + 0.052*"graduation" + 0.034*"academy" + 0.029*"arrested" + 0.024*"san"
Topic #2: 0.068*"child" + 0.059*"dies" + 0.021*"street" + 0.020*"pedestrian" + 0.020*"arrested"
Topic #3: 0.081*"chief" + 0.046*"media" + 0.045*"conference" + 0.037*"lapd" + 0.035*"beck"
Topic #4: 0.084*"lapd" + 0.045*"police" + 0.029*"los" + 0.029*"angeles" + 0.027*"officers"
Topic #5: 0.087*"los" + 0.086*"angeles" + 0.058*"reward" + 0.040*"offered" + 0.035*"homicide"
Topic #6: 0.035*"holiday" + 0.033*"lapd" + 0.030*"armed" + 0.022*"body" + 0.022*"public"
Topic #7: 0.149*"man" + 0.121*"shot" + 0.120*"killed" + 0.020*"collision" + 0.019*"house"
Topic #8: 0.221*"missing" + 0.082*"man" + 0.066*"woman" + 0.064*"checkpoint" + 0.058*"sobriety"
Topic #9: 0.063*"detectives" + 0.058*"suspect" + 0.048*"murder" + 0.047*"suspects" + 0.046*"robbery"
Topic #10: 0.063*"hit" + 0.063*"run" + 0.054*"driver" + 0.0

In [17]:
# Let's now put the models to work and transform an unseen document into its
# topic distribution.
text = "A men found killed in the park."
bow = dictionary.doc2bow(clean_text(text))


def _topic_score(pair):
    """Return the score part of a ``(topic_id, score)`` pair."""
    return pair[1]


# model[bow] yields (topic_id, score) pairs; show the three highest-scoring
# topics for each model.
# NOTE(review): LSI scores can be negative and the ranking below uses the
# signed value, so strongly negative loadings never surface - confirm that
# this is the intended notion of "top" topic for LSI.
for label, model in (('lda:', lda_model), ('lsi:', lsi_model)):
    ranked = sorted(model[bow], key=_topic_score, reverse=True)
    print(label, ranked[:3])

lda: [(7, 0.22323854), (0, 0.22195585), (11, 0.21271174)]
lsi: [(11, 0.3321595519680068), (19, 0.16672444441247902), (18, 0.12997326773064355)]


# Scikit-learn

In [18]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts for the documents: English stop words are removed,
# terms seen in fewer than 5 documents or in more than 90% of them are dropped.
vectorizer = CountVectorizer(
    min_df=5, max_df=0.9,
    stop_words='english', lowercase=True,
    # Raw string: "\-" is an invalid escape sequence in a normal string
    # literal and triggers a warning on newer Python versions.
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Plotting words and documents in 2D with SVD

In [19]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Ask Bokeh to render the output of later show() calls inline in the notebook
# (see the bokeh.io.output_notebook documentation).
output_notebook()


In [20]:
# plot documents in 2D
# TruncatedSVD uses a randomised solver by default; pin the seed so the 2D
# layout is identical on every run.
SVD_RANDOM_STATE = 42
svd = TruncatedSVD(n_components=2, random_state=SVD_RANDOM_STATE)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its two SVD coordinates plus its index in `data`,
# which is used as the point label.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(len(data)),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# NOTE(review): newer Bokeh releases renamed plot_width/plot_height to
# width/height - confirm against the installed Bokeh version before upgrading.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)