In [6]:
import pandas as pd
import gensim

In [2]:
# Load the debate transcript; later cells read its "Speaker" and "Text" columns.
debate_csv_path = "data/debate.csv"
df = pd.read_csv(debate_csv_path)

In [63]:
# One Series of utterance text per candidate, selected by the "Speaker" column.
trump = df.loc[df["Speaker"] == "Trump", "Text"]
clinton = df.loc[df["Speaker"] == "Clinton", "Text"]

# Exploration of comments

In [4]:
# Character length of each Trump utterance: average first, then the longest.
trump_lengths = trump.str.len()
print(trump_lengths.mean())
print(trump_lengths.max())

341.78591549295777
2703


In [5]:
# Character length of each Clinton utterance: average first, then the longest.
clinton_lengths = clinton.str.len()
print(clinton_lengths.mean())
print(clinton_lengths.max())

440.2551440329218
2376


# Basic gensim topic modeling

## Tokenize each document into a list of words (punctuation and stopwords removed)

In [108]:
# Strip sentence punctuation before tokenizing. The pattern is a regular
# expression, so regex=True is passed explicitly: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the text untouched.
# A raw string avoids the invalid "\." / "\?" escape sequences.
documents_clean = clinton.str.replace(r"[.?!',-]", "", regex=True)

# gensim's built-in English stopwords plus a few debate-specific filler words.
# The extras are spelled without apostrophes because punctuation is removed above.
stoplist = list(gensim.parsing.preprocessing.STOPWORDS) + \
            ["hes", "think", "weve", "thats", "said", "want", "look", "youve"]
# Set lookup instead of scanning the whole list for every word.
stopword_set = frozenset(stoplist)

# One list of lower-cased, whitespace-split tokens per document.
texts = [
    [word for word in document.lower().split() if word not in stopword_set]
    for document in documents_clean
]

## Remove words that only appear once

In [109]:
from collections import defaultdict

# Count how many times each token occurs across all documents.
frequency = defaultdict(int)
for token in (tok for doc in texts for tok in doc):
    frequency[token] += 1

# Keep only the tokens that occur more than once in the whole corpus.
texts = [[tok for tok in doc if frequency[tok] > 1] for doc in texts]

## Generate a dictionary (word to id mapping)

In [110]:
# Build the token <-> integer id mapping from the tokenized documents and
# print its summary line.
dictionary = gensim.corpora.Dictionary(documents=texts)
print(dictionary)

Dictionary(1100 unique tokens: ['donald', 'thank', 'lester', 'thanks', 'hosting']...)


In [111]:
# Drop the 20 most frequent tokens from the vocabulary.
# filter_n_most_frequent mutates the dictionary in place, so re-running this
# cell on its own would strip a further 20 tokens each time. Rebuilding the
# dictionary from `texts` first makes the cell idempotent.
dictionary = gensim.corpora.Dictionary(texts)
dictionary.filter_n_most_frequent(20)

## Generate the bag of words corpus

In [112]:
# Convert every tokenized document to its bag-of-words representation
# using the dictionary built above.
corpus = list(map(dictionary.doc2bow, texts))

## LDA Model

In [119]:
# Fit a 5-topic LDA model on the bag-of-words corpus.
# LDA training is randomized; pinning random_state makes the topics printed
# further down reproducible across kernel restarts and re-runs.
lda = gensim.models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=5,
    random_state=42,
)

In [120]:
# For every topic, collect its 50 top words (weights discarded), in the
# order returned by show_topic.
top_words = []
for topic_id in range(lda.num_topics):
    ranked_terms = lda.show_topic(topic_id, topn=50)
    top_words.append([term for term, _weight in ranked_terms])

In [121]:
# Show the first ten words of each topic, space-separated, one topic per line.
for topic_terms in top_words:
    print(*topic_terms[:10])

tax america dont lets million money campaign isis insurance zero
jobs new nuclear let national man america debt things economy
women economy right dont come worked new jobs energy government
court supreme tax able rights nuclear isis dont plan second
health women united support talk fact hope business states help
