In [6]:
import pandas as pd
import gensim

In [2]:
df = pd.read_csv("data/debate.csv")

In [63]:
trump = df[df["Speaker"] == "Trump"]["Text"]
clinton = df[df["Speaker"] == "Clinton"]["Text"]

# Exploration of comments

In [4]:
print(trump.str.len().mean())
print(trump.str.len().max())

341.78591549295777
2703


In [5]:
print(clinton.str.len().mean())
print(clinton.str.len().max())

440.2551440329218
2376


In [134]:
().sum()

175

# Basic gensim topic modeling

## Create a 2d array with the words in each document

In [161]:
documents_clean = trump.str.replace("\.|\?|-|!|'|,", "")
documents_clean = documents_clean[documents_clean.str.len() > 50]
stoplist = list(gensim.parsing.preprocessing.STOPWORDS) + \
            ["hes", "think", "weve", "thats", "said", "want", "look", "youve", "youre"]
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents_clean]

## Remove words that only appear once

In [162]:
from collections import defaultdict
frequency =  defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

## Generate a dictionary (word to id mapping)

In [163]:
dictionary = gensim.corpora.Dictionary(texts)
print(dictionary)

Dictionary(1059 unique tokens: ['thank', 'lester', 'jobs', 'country', 'theyre']...)


In [164]:
dictionary.filter_n_most_frequent(20)

## Generate the bag of words corpus

In [165]:
corpus = [dictionary.doc2bow(text) for text in texts]

## LDA Model

In [166]:
lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=10)

In [167]:
top_words = [[word for word, _ in lda.show_topic(topicno, topn=50)] for topicno in range(lda.num_topics)]

In [168]:
for topics in top_words:
    print(" ".join(topics[:10]))

doesnt millions donors took time let taxes companies percent hundreds
russia good deal happened world energy china jobs talks hundreds
war nato endorsed believe sean secretary hannity laws russia ill
tremendous bad companies bring chicago jobs mosul need use world
states disaster jobs happened lot believe happen whats didnt emails
obamacare wants tax russia big bad percent regulations president new
need cities law inner ive women care mosul got happen
nafta wall bad trade signed heard border deal percent better
isis good ive secretary world let talk politicians problem fed
didnt bad talk time experience got iran companies effective campaign


In [160]:
doc = 20
print(lda.get_document_topics(corpus[doc]))
[print(dictionary[id]) for id, _ in corpus[doc]];

[(0, 0.014288354240350678), (1, 0.014289257614619936), (2, 0.014287534104115342), (3, 0.014286256067873048), (4, 0.014286761888813731), (5, 0.014290424627816264), (6, 0.8714051778973243), (7, 0.014287957221450008), (8, 0.01429167503625736), (9, 0.014286601301379379)]
debate
saying
things
let
absolutely
case
