In [1]:
import pandas as pd
import nltk

In [2]:
# Load data frame
df = pd.read_csv('/media/zainkhan/USB30FD/suspended-clinton-tweets.txt', encoding='latin1', error_bad_lines=False, warn_bad_lines=False, header=None)
df = df[0]
df.head()

0    RT @KenRenar96: Open your eyes you vote for Hi...
1    SEE VIDEO! Hillary Clinton Plays Who'd You Rat...
2    RT @RealFKNNews: #US #media to #prematurely #P...
3    SEE VIDEO! Who Is Hillary Clinton?... https://...
4    Hillary Clinton #LoveTrumpsHate Large Black Cu...
Name: 0, dtype: object

In [3]:
# Remove stop words
df = df.str.lower().str.split()
stop = nltk.corpus.stopwords.words('english')
df = df.apply(lambda x: [item for item in x if item not in stop])
df.head()

0    [rt, @kenrenar96:, open, eyes, vote, hillary, ...
1    [see, video!, hillary, clinton, plays, who'd, ...
2    [rt, @realfknnews:, #us, #media, #prematurely,...
3    [see, video!, hillary, clinton?..., https://t....
4    [hillary, clinton, #lovetrumpshate, large, bla...
Name: 0, dtype: object

In [4]:
'''
# Stem dataframe
stemmer = nltk.stem.PorterStemmer()
def stem_sentences(tokens):
    return [stemmer.stem(token) for token in tokens]
df = df.apply(stem_sentences)
df.head()
'''

'\n# Stem dataframe\nstemmer = nltk.stem.PorterStemmer()\ndef stem_sentences(tokens):\n    return [stemmer.stem(token) for token in tokens]\ndf = df.apply(stem_sentences)\ndf.head()\n'

In [5]:
# Separate into sentences
sentences = df.tolist()
print(sentences[:2])

[['rt', '@kenrenar96:', 'open', 'eyes', 'vote', 'hillary', 'clinton.', 'salvation.', 'believes', 'ideals', 'equality!!!', '#hillaryforpr'], ['see', 'video!', 'hillary', 'clinton', 'plays', "who'd", 'rather?...', 'https://t.co/aho7bx9vwp', 'https://t.co/6egopos352']]


In [6]:
# Clean and remove illegal characters
import re
def clean(s):
    for i, w in enumerate(s):
        if contains_illegal(w):
            s[i] = ''
        else:
            if 'cli' in w:
                s[i] = 'clinton'
            elif 'hrc' in w or 'hil' in w:
                s[i] = 'hillary'
            elif 'tru' in w or 'trmp' in w:
                s[i] = 'trump'
            elif 'bern' in w:
                s[i] = 'bernie'
            elif 'sander' in w:
                s[i] = 'sanders'
            extras = '0123456789~_=*^,.-`%""+#&"!?<>:;/\\\'()[]{}$|\x91\x92\x93\x94\x96\x97'
            s[i] = s[i].translate({ord(c):'' for c in extras})
            if len(s[i]) < 4:
                s[i] = ''
    return s
                    
def contains_illegal(w):
    illegal = ['@', '#', 'htt', 'via', 'desd']
    if any(x in w for x in illegal):
        return True
    if re.match("^\d+?\.\d+?$", w) is not None:
        return True
    return False 

sentences = [clean(s) for s in sentences if len(s) > 0]
sentences = [list(filter(None, sentence)) for sentence in sentences]
print(sentences[:2])

[['open', 'eyes', 'vote', 'hillary', 'clinton', 'salvation', 'believes', 'ideals', 'equality'], ['video', 'hillary', 'clinton', 'plays', 'whod', 'rather']]


In [12]:
# Word Embedding model
from gensim.models import Word2Vec

model = Word2Vec(sentences, size=100, window=5, min_count=3, workers=4)
model.train(sentences, total_examples=len(sentences), epochs=50)

(266956612, 364113550)

In [13]:
# List of terms, indices, term counts from model 
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]

# Sort by term counts so common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])

# Unzip terms, indices, and counts
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

In [14]:
word_vectors = pd.DataFrame(model.wv.vectors[term_indices,:], index=ordered_terms)
word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
hillary,-0.497178,-1.723486,1.390769,0.001571,-0.221227,0.865141,-2.301404,-0.152843,0.483029,-2.640149,...,3.551202,-2.554028,2.360421,1.158633,1.035413,0.697721,1.701641,3.071995,-2.073334,0.674705
clinton,-0.600327,-0.074233,-0.296227,-0.125324,-1.075259,-1.366859,-1.097159,1.235372,1.062597,-0.815505,...,-3.004345,-0.512308,1.763045,0.226244,0.967690,0.452778,4.386120,3.813322,-0.783843,0.819526
trump,-1.403885,1.131211,-1.340645,0.551755,0.497083,0.171025,-2.263395,0.807959,1.075643,-3.459819,...,1.249166,-1.329785,3.481671,-0.703479,1.343903,1.677248,2.711553,2.743246,-1.650513,0.655152
donald,0.409104,1.414159,-3.821531,1.351775,-0.571937,-0.349408,-1.531504,-0.931816,-3.401134,-2.807292,...,0.535273,-0.454047,3.520427,0.117905,0.262099,1.849746,0.861835,1.543391,0.222242,0.556680
bernie,-0.255816,-0.807398,-0.048157,-2.234178,-1.587328,-0.815730,-3.036820,-2.529340,2.015233,-2.182841,...,-2.604358,0.962744,4.678576,0.742975,-2.607815,-0.606551,1.965902,3.003138,1.554911,0.473669
sanders,0.524028,-0.420572,-4.734846,-2.030780,1.757803,-1.511096,-3.265988,-1.307921,1.031652,0.027208,...,2.026377,-0.482749,5.710019,2.948722,-1.471121,-2.199971,2.394731,0.354576,-1.600788,-1.075996
vote,-1.432333,2.106539,-0.929155,-1.810061,0.656314,-1.328411,-0.571204,0.398147,-1.341315,-1.436740,...,-2.087525,1.017490,0.922236,3.046229,0.727765,-3.004274,2.218409,1.084557,1.781185,2.661366
bill,-2.526485,1.006227,-2.282074,-3.029749,-1.180200,0.265671,-2.763615,0.319053,3.353892,-3.741083,...,1.939577,-2.890322,3.404358,0.972578,3.204614,-0.749543,0.824890,1.835012,-0.971409,0.371062
president,-2.030789,1.537255,1.304332,-1.791765,-0.998611,-0.675154,0.942856,2.427041,-0.278920,-1.196924,...,2.637551,-0.789671,1.550813,0.522477,-2.178504,-0.212253,2.127444,4.485534,0.890232,-0.292042
campaign,-2.156035,1.300407,1.156600,-0.710910,-2.347738,1.916863,-1.568924,-2.452862,-1.099004,1.136775,...,1.824621,-1.603502,2.190997,1.480048,-1.245635,-1.716193,2.351245,-1.695131,2.188122,2.165874


In [15]:
def related_terms(model, token, topn=10):
    out = []
    for word, similarity in model.wv.most_similar(positive=[token], topn=topn):
        out.append([word, round(similarity, 3)])
    return out

for elem in related_terms(model, 'hillary'):
    print(elem)
print('\n')
for elem in related_terms(model, 'clinton'):
    print(elem)

['trump', 0.536]
['them', 0.347]
['camouflageknows', 0.334]
['bernie', 0.332]
['touchmenot', 0.329]
['fkwho', 0.321]
['breitbart', 0.321]
['america', 0.313]
['thats', 0.31]
['nation', 0.309]


['trump', 0.547]
['them', 0.453]
['without', 0.391]
['breitbart', 0.383]
['roge', 0.372]
['bernie', 0.368]
['already', 0.362]
['thats', 0.362]
['even', 0.352]
['much', 0.351]


In [17]:
def network(model, token, topn=10):
    base = related_terms(model, token, topn)
    network = []
    network.append([[i] for i in base])
    for i in range(topn, 0, -1):
        new_related = related_terms(model, base[topn - i][0], i)
        network.append([[base[topn-i][0]], new_related, [0], [topn - i + 1]])
        
        for layer in range(i//2, 0, -1):
            layered_related = related_terms(model, new_related[i//2 - layer][0], layer)
            network.append([[new_related[i//2 - layer][0]], layered_related, [i//2 - layer + 1], [topn - i + 1]])
    return network

center_word = 'clinton'
word_network = network(model, center_word)
print('Model centered around word ' + center_word)
for words in word_network:
    for items in words: 
        for word in items:
            print(word)
    print('\n')

Model centered around word clinton
['trump', 0.547]
['them', 0.453]
['without', 0.391]
['breitbart', 0.383]
['roge', 0.372]
['bernie', 0.368]
['already', 0.362]
['thats', 0.362]
['even', 0.352]
['much', 0.351]
['wait', 0.344]
['shes', 0.342]
['time', 0.339]
['break', 0.335]
['following', 0.33]
['felonious', 0.328]
['transformación', 0.327]
['part', 0.318]
['totally', 0.318]
['video', 0.317]


trump
['clinton', 0.547]
['hillary', 0.536]
['bernie', 0.426]
['them', 0.395]
['nytimes', 0.385]
['totally', 0.358]
['sanders', 0.347]
['republican', 0.342]
['businessinsider', 0.341]
['shes', 0.338]
['break', 0.337]
['media', 0.334]
['nation', 0.333]
['debate', 0.325]
['factcheck', 0.32]
['good', 0.319]
['elizabeth', 0.319]
['america', 0.316]
['reading', 0.309]
['cruz', 0.306]
0
1


clinton
['trump', 0.547]
['them', 0.453]
['without', 0.391]
['breitbart', 0.383]
['roge', 0.372]
['bernie', 0.368]
['already', 0.362]
['thats', 0.362]
['even', 0.352]
['much', 0.351]
1
1


hillary
['trump', 0.536]
['t