## Read data

In [10]:
import os
import re
import operator
import warnings
import gensim
import numpy as np

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

In [2]:
import csv

doc_complete = []
with open('petitions_complete.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        doc_complete.append(row) 

In [3]:
def to_unicode(text):
    if type(text) == str:
    # Ignore errors even if the string is not proper UTF-8 or has
    # broken marker bytes.
    # Python built-in function unicode() can do this.
        return unicode(text, "utf-8", errors="ignore")
    else:
        # Assume the value object has proper __unicode__() method
        return unicode(text)

In [4]:
from nltk.corpus import stopwords 
from gensim.utils import lemmatize
import string

# clean 1 without train_texts
stops = set(stopwords.words('english'))
exclude = set(string.punctuation) 
ignore = [] # ["http_www","com","html","year_old"]
# manualy created ignore word list
def clean1(texts):
    texts = [[word for word in line if word not in stops] for line in texts]
    texts = [[word.split('/')[0] for word in lemmatize(' '.join(line), allowed_tags=re.compile('(NN)'), min_length=3)] for line in texts]
    texts = [[word for word in line if word not in exclude] for line in texts]
    texts = [[word for word in line if word not in ignore] for line in texts]
    return texts

In [5]:
%%time
train_texts = []
for doc in doc_complete:
    text = to_unicode(doc['title']) + " " + to_unicode(doc['body'])
    text = gensim.utils.simple_preprocess(text, deacc=True, min_len=3)
    train_texts.append(text)
    
train_texts = clean1(train_texts)

CPU times: user 39.6 s, sys: 244 ms, total: 39.9 s
Wall time: 39.9 s


In [6]:
bigram = gensim.models.Phrases(train_texts)
train_texts = [bigram[line] for line in train_texts]

2017-12-06 15:46:43,139 : INFO : collecting all words and their counts
2017-12-06 15:46:43,140 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-12-06 15:46:43,801 : INFO : collected 111051 word types from a corpus of 148088 words (unigram + bigrams) and 4095 sentences
2017-12-06 15:46:43,802 : INFO : using 111051 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [7]:
# get top n (=20) common words that are going to be ignored.
from collections import Counter
flat_texts = [j for i in train_texts for j in i]
top_words = Counter(flat_texts).most_common(1000)
common_words = []
for word, frequency in top_words:
    if (len(common_words) < 20):
        common_words.append(word)

def clean2(texts):
    texts = [[word for word in line if word not in common_words] for line in texts]
    return texts

In [8]:
train_texts = clean2(train_texts)

In [9]:
len(train_texts)

4095

## Build Word Vector Model

In [12]:
from gensim.models import Word2Vec
wv_model = Word2Vec(train_texts, size=1000, window=5, min_count=1, workers=4)

2017-12-06 15:47:01,556 : DEBUG : Fast version of gensim.models.word2vec is being used
2017-12-06 15:47:01,558 : INFO : collecting all words and their counts
2017-12-06 15:47:01,559 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-06 15:47:01,602 : INFO : collected 11250 word types from a corpus of 119952 raw words and 4095 sentences
2017-12-06 15:47:01,603 : INFO : Loading a fresh vocabulary
2017-12-06 15:47:01,643 : INFO : min_count=1 retains 11250 unique words (100% of original 11250, drops 0)
2017-12-06 15:47:01,644 : INFO : min_count=1 leaves 119952 word corpus (100% of original 119952, drops 0)
2017-12-06 15:47:01,703 : INFO : deleting the raw counts dictionary of 11250 items
2017-12-06 15:47:01,704 : INFO : sample=0.001 downsamples 13 most-common words
2017-12-06 15:47:01,705 : INFO : downsampling leaves estimated 119388 word corpus (99.5% of prior 119952)
2017-12-06 15:47:01,706 : INFO : estimated required memory for 11250 words and 1000 dimens

In [13]:
wv_model.wv.similarity('gun', 'firearm')

0.99998262544756988

## Preprocessing for CorEx

In [14]:
import numpy as np
import vis_topic as vt
import corex_topic as ct
import scipy.sparse as ss

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

raw_docs = [' '.join(clean_doc) for clean_doc in train_texts]

# Transform cleaned petition texts into a sparse matrix
vectorizer = CountVectorizer(stop_words='english', max_features = 20000, binary = True)
doc_word = vectorizer.fit_transform(raw_docs)
doc_word = ss.csr_matrix(doc_word)

doc_word.shape # n_docs x m_words

(4095, 11189)

In [80]:
# Get words that label the columns (needed to extract readable topics and make anchoring easier)
words = list(np.asarray(vectorizer.get_feature_names()))

In [81]:
# Remove all integers
not_digit_inds = [ind for ind,word in enumerate(words) if not word.isdigit()]
doc_word = doc_word[:,not_digit_inds]
words = [word for ind,word in enumerate(words) if not word.isdigit()]

doc_word.shape # n_docs x m_words

(4095, 11189)

## Original model

Train the original model

In [29]:
%%time
# Train the CorEx topic model with 20 topics (Takes about 3.74s)
seed = 1989
num_topics = 20
anchor_words = []

ct_model = ct.Corex(n_hidden=num_topics, words=words, max_iter=20, verbose=False, seed=seed)
ct_model.fit(doc_word, words=words);
# Print all topics from the CorEx topic model
topics = ct_model.get_topics()
for n in range(num_topics):
    topic_words,_ = zip(*ct_model.get_topics(topic=n, n_words=20))
    print '{}: '.format(n) + ', '.join(topic_words[:12])
    anchor_words.append(list(topic_words))

0: disease, patient, cancer, treatment, doctor, cure, disorder, medication, pain, awareness, symptom, illness
1: man_power, consent_government, code_word, states_declaration, racist_racist, station_law, independence_course, nature_nature, racist_code, god_respect, power_earth, opinion_mankind
2: god_trust, robbery_kidnapping, icon_max, rapper_conviction, year_september, wingate_rap, retrial_notice, christie_president, wingate_involvement, max_appeal, robbery_man, murder_feel
3: ocean, nasa, april, bus, balance, dolphin, proclaim, explosion, cabinet, abuse_power, shame, hunt
4: health, economy, tax, cost, benefit, increase, company, money, market, pay, healthcare, fund
5: election, investigation, vote, voter, candidate, hillary_clinton, voting, campaign, department_justice, fbi, ballot, office
6: internet, consumer, energy, information, technology, provider, service, device, car, access, fuel, safety
7: democracy, war, freedom, russia, ukraine, party, liberty, sanction, respect, regime,

## Calculate tsne cordniates

In [115]:
doc_vecs = []
for i in range(len(doc_complete)):
    doc_vec = list(ct_model.p_y_given_x[i])
    doc_vecs.append(doc_vec)

from scipy.stats import threshold
thresholded_doc_vecs = threshold(doc_vecs, 0.1)

topic_id = 4
doc_ids = []
for i in range(len(doc_complete)):
    if (thresholded_doc_vecs[i][topic_id] > 0.5):
        doc_ids.append(i)

print(len(doc_ids))

1412


stats.threshold is deprecated in scipy 0.17.0
  import sys


## PCA is fast, but only capture linear relationship

In [77]:
%%time
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
    
pca = PCA(n_components=2)
fiftyDimVecs = pca.fit_transform([doc_vec for doc_vec in doc_vecs])
# tsne = TSNE(n_components=2)
# twoDimVecs = tsne.fit_transform(fiftyDimVecs)

CPU times: user 17.3 ms, sys: 4.43 ms, total: 21.8 ms
Wall time: 17.8 ms


In [78]:
twoDimVecs[0]

array([-25.02761078,  58.449543  ], dtype=float32)

## t-SNE is too slow

In [72]:
%%time
import numpy as np
from sklearn.manifold import TSNE
X = np.array(thresholded_doc_vecs)
X_embedded = TSNE(n_components=2, random_state=1989).fit_transform(X)

CPU times: user 1min 38s, sys: 6.29 s, total: 1min 44s
Wall time: 1min 44s


In [73]:
X_embedded[0]

array([ 43.06148911, -41.58058167], dtype=float32)

## Write petition clustering to json

In [79]:
import csv

size = len(twoDimVecs)
import json
with open('tsne.json', 'a') as outfile:
    lines = []
    for i in range(size):
        line = {'cord_x': str(twoDimVecs[i][0]),
                'cord_y': str(twoDimVecs[i][1]),
                'title': str(doc_complete[i]['title']),
                'body': str(doc_complete[i]['body'])}
        color = 0;
        for j in range(num_topics):
            line['topic_' + str(j)] = doc_vecs[i][j]
            if (doc_vecs[i][j] > doc_vecs[i][color]):
                color = j
        line['topic_id'] = color
        lines.append(line)
    json.dump(lines, outfile, indent=2)

## Evaluate Topic Coherence

In [25]:
dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]

cttopics = []
for i in range(num_topics):
    _words = ct_model.get_topics(topic=i)
    cttopics.append((i, _words))
_cttopics = [[word for word, prob in topic] for topicid, topic in cttopics]
ct_coherence = CoherenceModel(topics=_cttopics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()
print(ct_coherence)

2017-12-06 15:54:55,033 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-12-06 15:54:55,259 : INFO : built Dictionary(11250 unique tokens: [u'sinjar', u'deferment', u'yellow', u'narcotic', u'vani']...) from 4095 documents (total 119952 corpus positions)
2017-12-06 15:54:55,844 : INFO : using ParallelWordOccurrenceAccumulator(processes=7, batch_size=64) to estimate probabilities from sliding windows
2017-12-06 15:54:55,930 : INFO : 1 batches submitted to accumulate stats from 64 documents (1483 virtual)
2017-12-06 15:54:55,936 : INFO : 2 batches submitted to accumulate stats from 128 documents (3073 virtual)
2017-12-06 15:54:55,940 : INFO : 3 batches submitted to accumulate stats from 192 documents (4525 virtual)
2017-12-06 15:54:55,945 : INFO : 4 batches submitted to accumulate stats from 256 documents (6204 virtual)
2017-12-06 15:54:55,951 : INFO : 5 batches submitted to accumulate stats from 320 documents (7664 virtual)
2017-12-06 15:54:55,958 : INFO : 6 batches su

2017-12-06 15:54:56,475 : DEBUG : completed batch 4; 320 documents processed (7475 virtual)
2017-12-06 15:54:56,482 : INFO : 47 batches submitted to accumulate stats from 3008 documents (66862 virtual)
2017-12-06 15:54:56,480 : DEBUG : completed batch 4; 320 documents processed (7316 virtual)
2017-12-06 15:54:56,488 : INFO : 48 batches submitted to accumulate stats from 3072 documents (67350 virtual)
2017-12-06 15:54:56,479 : DEBUG : completed batch 4; 320 documents processed (7528 virtual)
2017-12-06 15:54:56,492 : DEBUG : completed batch 5; 384 documents processed (8507 virtual)
2017-12-06 15:54:56,516 : DEBUG : completed batch 5; 384 documents processed (8445 virtual)
2017-12-06 15:54:56,521 : DEBUG : completed batch 5; 384 documents processed (8619 virtual)
2017-12-06 15:54:56,537 : DEBUG : completed batch 5; 384 documents processed (7801 virtual)
2017-12-06 15:54:56,564 : DEBUG : completed batch 5; 384 documents processed (8722 virtual)
2017-12-06 15:54:56,567 : DEBUG : completed 

0.470615583471


In [44]:
str(anchor_words[0])

"[u'disease', u'patient', u'cancer', u'treatment', u'doctor', u'cure', u'disorder', u'medication', u'pain', u'awareness', u'symptom', u'illness', u'medicine', u'diagnosis', u'disability', u'brain', u'diabetes', u'condition', u'physician', u'organ']"

In [45]:
str(anchor_words[4])

"[u'health', u'economy', u'tax', u'cost', u'benefit', u'increase', u'company', u'money', u'market', u'pay', u'healthcare', u'fund', u'research', u'dollar', u'debt', u'industry', u'budget', u'study', u'business', u'product']"

In [46]:
move_anchor_words = []
for words in anchor_words:
    move_anchor_words.append(list(words))
move_anchor_words[0].append("health")
print str(move_anchor_words[0])
move_anchor_words[4].remove("health")
print str(move_anchor_words[4])

[u'disease', u'patient', u'cancer', u'treatment', u'doctor', u'cure', u'disorder', u'medication', u'pain', u'awareness', u'symptom', u'illness', u'medicine', u'diagnosis', u'disability', u'brain', u'diabetes', u'condition', u'physician', u'organ', 'health']
[u'economy', u'tax', u'cost', u'benefit', u'increase', u'company', u'money', u'market', u'pay', u'healthcare', u'fund', u'research', u'dollar', u'debt', u'industry', u'budget', u'study', u'business', u'product']


In [49]:
ct_model_move = ct.Corex(n_hidden=(num_topics), words=words, max_iter=20, verbose=False, seed=seed)
ct_model_move.fit(doc_word, words=words, anchors=move_anchor_words, anchor_strength=6);
for n in range(num_topics):
    topic_words,_ = zip(*ct_model_move.get_topics(topic=n, n_words=12))
    print '{}: '.format(n) + ', '.join(topic_words[:12])

0: health, treatment, disease, condition, patient, doctor, cancer, awareness, pain, illness, medicine, disability
1: man_power, consent_government, code_word, father_united, states_declaration, station_law, event_person, racist_racist, nature_nature, independence_course, band_assume, opinion_mankind
2: god_trust, rapper_conviction, wingate_involvement, murder_feel, christie_president, wingate_rap, state_robbery, icon_max, year_september, max_appeal, retrial_notice, robbery_man
3: april, hunt, ocean, nasa, explosion, cabinet, bus, balance, abuse_power, proclaim, dolphin, transit
4: money, benefit, company, pay, economy, business, cost, fund, tax, industry, dollar, budget
5: office, investigation, vote, election, campaign, candidate, corruption, voter, fraud, hillary_clinton, voting, attorney
6: service, safety, information, com, access, standard, technology, internet, consumer, vehicle, energy, car
7: power, war, freedom, force, party, democracy, leader, respect, ukraine, russia, libert

## Remove a word

In [238]:
## Remove a word in topic 3
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]

words_to_remove = ['http', "html"]
remove_train_texts = []
for train_text in train_texts:
    x = remove_values_from_list(train_text, "")
    for word in words_to_remove:
        x = remove_values_from_list(x, word)
    remove_train_texts.append(x)
        
remove_raw_docs = [' '.join(clean_doc) for clean_doc in remove_train_texts]
remove_vectorizer = CountVectorizer(stop_words='english', max_features = 20000, binary = True)
remove_doc_word = remove_vectorizer.fit_transform(remove_raw_docs)
remove_doc_word = ss.csr_matrix(remove_doc_word)
remove_words = list(np.asarray(remove_vectorizer.get_feature_names()))

print len(remove_words)
print remove_doc_word.shape

10855
(4095, 10855)


In [204]:
remove_anchor_words = []
for words in anchor_words:
    x = remove_values_from_list(words, "")
    for word in words_to_remove:
        x = remove_values_from_list(x, word)
    remove_anchor_words.append(x)

In [301]:
ct_model_remove = ct.Corex(n_hidden=(num_topics), words=remove_words, max_iter=5, verbose=False, seed=seed)
ct_model_remove.fit(remove_doc_word, words=remove_words, anchors=remove_anchor_words, anchor_strength=6);
for n in range(num_topics):
    topic_words,_ = zip(*ct_model_remove.get_topics(topic=n, n_words=20))
    print '{}: '.format(n) + ', '.join(topic_words)

0: country, word, plan, destruction, article, africa, diversity, everybody, african, assimilation, species, network, asian, conservation, http_goo, diversity_code, whole_part, jackson, trail, hunter
1: release, band, separation, conviction, equal_station, course_human, independence, retrial, egypt, governor_chris, parliament, christie, max_appeal, icon_max, feel_rapper, range, michigan, railroad, moment, communist_party
2: disease, awareness, cancer, pain, suffering, medicine, disability, disorder, cure, medication, treatment, physician, symptom, brain, diabetes, diagnosis, syndrome, aviation, flight, condition
3: administration, party, animal, owner, sanction, regime, news, cruelty, human, opposition, chairman, consumption, channel, brutality, reporter, libya, fox, pet, cat, puppy
4: prison, situation, defense, army, award, sentence, afghanistan, shot, zone, general, direction, compensation, vietnam, film, deployment, brother, afghan, sgt, deal, lincoln
5: com, genocide, father, conse

## Split a topic

In [55]:
# Split topic 6
toSplitId = 6

In [56]:
def distance(word1, word2):
    return wv_model.wv.similarity(word1, word2)
 
def buildSimilarityMatrix(samples):
    numOfSamples = len(samples)
    matrix = np.zeros(shape=(numOfSamples, numOfSamples))
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            matrix[i,j] = distance(samples[i], samples[j])
    return matrix

samples = anchor_words[toSplitId]
sim_mat = buildSimilarityMatrix(samples)

In [57]:
import numpy as np
from sklearn.cluster import SpectralClustering
num_cluster = 2 # categorize the words into 2 clusters
mat = np.matrix(sim_mat)
res = SpectralClustering(num_cluster).fit_predict(mat)
ll = [[] for _ in range(num_cluster)]
for i in range(len(samples)):
    idx = res[i]
    word = samples[i]
    ll[idx].append(word)

print ll

[[u'internet', u'consumer', u'energy', u'information', u'technology', u'service', u'device', u'car', u'access', u'safety', u'standard', u'road', u'vehicle', u'com'], [u'provider', u'fuel', u'accident', u'content', u'forest', u'aviation']]


In [58]:
# create new anchor words
anchor_words_split = list(anchor_words)
del anchor_words_split[toSplitId]
for i in range(num_cluster):
    anchor_words_split.insert(toSplitId, ll[i])

In [59]:
# Anchor 'education' and 'weapon' to first topic, 'drug' and 'student' to second topic, so on...
# Apparently different words are selected for one topic to demonstrate whether the anchoring works
ct_model_split = ct.Corex(n_hidden=(num_topics + num_cluster - 1), words=words, max_iter=5, verbose=False, seed=seed)
ct_model_split.fit(doc_word, words=words, anchors=anchor_words_split, anchor_strength=6);
for n in range(num_topics + num_cluster - 1):
    topic_words,_ = zip(*ct_model_split.get_topics(topic=n, n_words=20))
    print '{}: '.format(n) + ', '.join(topic_words)

0: treatment, condition, disease, patient, doctor, cancer, awareness, pain, illness, medicine, disability, disorder, cure, medication, symptom, physician, brain, diagnosis, diabetes, organ
1: man_power, consent_government, code_word, father_united, states_declaration, racist_racist, event_person, station_law, god_respect, racist_code, power_earth, band_assume, independence_course, opinion_mankind, nature_nature, separation_government, government_founding, africa_african, immigration_assimilation, genocide_racist
2: god_trust, robbery_man, year_september, retrial_notice, state_robbery, wingate_rap, wingate_involvement, christie_president, murder_feel, robbery_kidnapping, icon_max, rapper_conviction, max_appeal, vote_bill, pluribus_unum, house_floor, veteran_exposure, health_registry, deed, veteran_mcclellan
3: april, ocean, nasa, explosion, balance, proclaim, bus, shame, abuse_power, cabinet, transit, museum, fiction, specialist, nasa_budget, sentence, year_sentence, crime, dolphin, dau

## Merge Topics by Joining

In [108]:
def merge_topics(idx1, idx2, topic_words):
    idx1, idx2 = min(idx1, idx2), max(idx2, idx1)
    res = []
    for idx, words in enumerate(topic_words):
        res.append(list(words))
        if(idx == idx2):
            res[idx1] = res[idx2] + res[idx1]
    del res[idx2]
    return res

topic_words = [['a','b','c'], ['d','e','f'], ['g','h','i'], ['j','k','l']]
res = merge_topics(0, 1, topic_words)
print(res)

[['d', 'e', 'f', 'a', 'b', 'c'], ['g', 'h', 'i'], ['j', 'k', 'l']]


In [109]:
# merge topic x and topic y
anchor_words_merge = merge_topics(5, 16, anchor_words)

In [110]:
ct_model_merge = ct.Corex(n_hidden=(num_topics - 1), words=words, max_iter=20, verbose=False, seed=seed)
ct_model_merge.fit(doc_word, words=words, anchors=anchor_words_merge, anchor_strength=6);
for n in range(num_topics - 1):
    topic_words,_ = zip(*ct_model_merge.get_topics(topic=n, n_words=20))
    print '{}: '.format(n) + ', '.join(topic_words)

0: treatment, condition, disease, patient, doctor, cancer, awareness, pain, illness, medicine, disability, disorder, cure, medication, symptom, physician, brain, diagnosis, diabetes, organ
1: man_power, consent_government, code_word, father_united, states_declaration, station_law, event_person, racist_racist, opinion_mankind, god_respect, nature_nature, band_assume, power_earth, separation_government, racist_code, independence_course, government_founding, africa_african, immigration_assimilation, genocide_racist
2: god_trust, robbery_man, robbery_kidnapping, murder_feel, retrial_notice, year_september, christie_president, rapper_conviction, wingate_involvement, state_robbery, max_appeal, wingate_rap, icon_max, pluribus_unum, vote_bill, house_floor, health_registry, veteran_exposure, deed, veteran_mcclellan
3: april, hunt, ocean, nasa, abuse_power, proclaim, explosion, dolphin, bus, cabinet, zimbabwe, transit, fiction, torture_murder, museum, nasa_budget, news, boston, marathon, york
4: