In [2]:
import pandas as pd
import nltk

In [3]:
# Load the raw tweet dump and keep only the first CSV column (the tweet text).
# NOTE(review): this is an absolute path on a local USB mount; the notebook only runs
# on a machine where the file exists at exactly this location.
TWEETS_PATH = '/media/zainkhan/USB30FD/suspended-clinton-tweets.txt'
# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in pandas 1.3 in favour
# of on_bad_lines='skip' -- confirm against the pandas version in use before upgrading.
df = pd.read_csv(
    TWEETS_PATH,
    header=None,
    encoding='latin1',
    error_bad_lines=False,
    warn_bad_lines=False,
)[0]
df.head()

0    RT @KenRenar96: Open your eyes you vote for Hi...
1    SEE VIDEO! Hillary Clinton Plays Who'd You Rat...
2    RT @RealFKNNews: #US #media to #prematurely #P...
3    SEE VIDEO! Who Is Hillary Clinton?... https://...
4    Hillary Clinton #LoveTrumpsHate Large Black Cu...
Name: 0, dtype: object

In [4]:
# Remove stop words
# Lower-case every tweet and split it on whitespace into a list of tokens.
df = df.str.lower().str.split()
stop = nltk.corpus.stopwords.words('english')
# `stop` is a list, so `token not in stop` would scan it linearly for every token of
# every tweet; a frozenset gives the same answers with constant-time lookups.
stop_lookup = frozenset(stop)
df = df.apply(lambda tokens: [token for token in tokens if token not in stop_lookup])
df.head()

0    [rt, @kenrenar96:, open, eyes, vote, hillary, ...
1    [see, video!, hillary, clinton, plays, who'd, ...
2    [rt, @realfknnews:, #us, #media, #prematurely,...
3    [see, video!, hillary, clinton?..., https://t....
4    [hillary, clinton, #lovetrumpshate, large, bla...
Name: 0, dtype: object

In [5]:
# Porter-stemming step, disabled by wrapping the code in a triple-quoted string literal.
# NOTE(review): the literal is the cell's last expression, so the notebook echoes its
# repr as the cell output; delete the cell or restore the code rather than keeping it quoted.
'''
# Stem dataframe
stemmer = nltk.stem.PorterStemmer()
def stem_sentences(tokens):
    return [stemmer.stem(token) for token in tokens]
df = df.apply(stem_sentences)
df.head()
'''

'\n# Stem dataframe\nstemmer = nltk.stem.PorterStemmer()\ndef stem_sentences(tokens):\n    return [stemmer.stem(token) for token in tokens]\ndf = df.apply(stem_sentences)\ndf.head()\n'

In [6]:
# Separate into sentences
# One token list per tweet, in the same order as the rows of `df`.
sentences = list(df)
print(sentences[:2])

[['rt', '@kenrenar96:', 'open', 'eyes', 'vote', 'hillary', 'clinton.', 'salvation.', 'believes', 'ideals', 'equality!!!', '#hillaryforpr'], ['see', 'video!', 'hillary', 'clinton', 'plays', "who'd", 'rather?...', 'https://t.co/aho7bx9vwp', 'https://t.co/6egopos352']]


In [7]:
# Clean and remove illegal characters
import re
# Characters deleted from every token that is kept: digits, ASCII punctuation and six
# high bytes (presumably Windows-1252 curly quotes/dashes left by the latin1 read -- confirm).
_STRIP_CHARS = '0123456789~_=*^,.-`%""+#&"!?<>:;/\\\'()[]{}$|\x91\x92\x93\x94\x96\x97'
# Built once here instead of once per token inside the loop.
_STRIP_TABLE = {ord(c): '' for c in _STRIP_CHARS}

# (substrings, canonical spelling), checked in order; the first entry with a matching
# substring wins.
_NAME_ALIASES = (
    (('cli',), 'clinton'),
    (('hrc', 'hil'), 'hillary'),
    (('tru', 'trmp'), 'trump'),
    (('bern',), 'bernie'),
    (('sander',), 'sanders'),
)


def clean(s):
    """Return a cleaned copy of the token list `s`, position for position.

    Each token is handled independently:
      * tokens rejected by `contains_illegal` become '';
      * tokens containing one of the name fragments in `_NAME_ALIASES` are replaced
        by the canonical name;
      * every character in `_STRIP_CHARS` is removed;
      * anything shorter than four characters after stripping becomes ''.

    The result has the same length as `s`; callers are expected to drop the ''
    placeholders afterwards. `s` itself is NOT modified: the previous version wrote
    into the caller's list, which also overwrote the token lists still held by the
    DataFrame the sentences were taken from.

    NOTE(review): the alias match is a plain substring test, so any token containing
    'cli' (e.g. 'click') is rewritten to 'clinton', and likewise for the other
    fragments. Kept as-is to preserve the existing results.
    """
    cleaned = []
    for w in s:
        if contains_illegal(w):
            cleaned.append('')
            continue
        for fragments, canonical in _NAME_ALIASES:
            if any(fragment in w for fragment in fragments):
                w = canonical
                break
        w = w.translate(_STRIP_TABLE)
        # Very short leftovers carry little signal for the embedding; blank them out.
        cleaned.append(w if len(w) >= 4 else '')
    return cleaned
                    
# Matches tokens that consist solely of a decimal number such as "3.14".
# Raw string: the original non-raw "\d" is an invalid escape sequence and triggers a
# warning on current Python versions.
_DECIMAL_RE = re.compile(r"^\d+?\.\d+?$")
# A token containing any of these substrings is discarded outright.
_ILLEGAL_FRAGMENTS = ('@', '#', 'htt', 'via', 'desd')


def contains_illegal(w):
    """Return True when token `w` should be dropped entirely.

    A token is dropped if it contains any substring in `_ILLEGAL_FRAGMENTS`
    or if the whole token is a decimal number (digits, a dot, digits).

    NOTE(review): the fragment test is a plain substring match, so e.g. 'trivial'
    is rejected because it contains 'via'.
    """
    if any(fragment in w for fragment in _ILLEGAL_FRAGMENTS):
        return True
    return _DECIMAL_RE.match(w) is not None

# Clean every non-empty sentence, then discard the '' placeholders clean() leaves behind.
sentences = [
    [token for token in clean(sentence) if token]
    for sentence in sentences
    if len(sentence) > 0
]
print(sentences[:2])

[['open', 'eyes', 'vote', 'hillary', 'clinton', 'salvation', 'believes', 'ideals', 'equality'], ['video', 'hillary', 'clinton', 'plays', 'whod', 'rather']]


In [8]:
# Word Embedding model
from gensim.models import Word2Vec

# NOTE(review): no `seed` is passed and training uses 4 worker threads, so the learned
# vectors (and every similarity printed further down) may differ from run to run.
# NOTE(review): `size=` is the gensim 3.x spelling; gensim 4 names it `vector_size`.
w2v_options = {'size': 100, 'window': 5, 'min_count': 3, 'workers': 4}
model = Word2Vec(sentences, **w2v_options)

In [9]:
# One (term, row index into the vector matrix, term count) triple per vocabulary entry
ordered_vocab = [
    (term, entry.index, entry.count)
    for term, entry in model.wv.vocab.items()
]

# Most frequent terms first; equal counts keep their original relative order
ordered_vocab.sort(key=lambda row: row[2], reverse=True)

# Transpose the triples into three parallel tuples
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

In [10]:
# Embedding matrix as a frame: one row per term, rows in the frequency order built above.
word_vectors = pd.DataFrame(
    data=model.wv.vectors[term_indices, :],
    index=ordered_terms,
)
word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
hillary,-0.744495,0.198544,0.294436,0.541798,-2.073801,-1.305456,0.037008,0.051667,1.929500,-0.945881,...,0.926419,-0.064897,0.395202,0.995419,2.036580,0.517449,1.586023,-1.257627,0.774469,-1.831798
clinton,0.564356,1.165670,0.286212,-0.882391,0.619434,1.019110,-0.999391,-1.431036,0.485701,-0.779933,...,1.515417,0.313176,-0.751629,1.072428,0.526850,0.128892,0.022774,-0.111158,-0.665859,-0.547021
trump,0.440929,3.305319,0.151263,-0.251962,-0.052791,0.887238,-0.843216,-1.125610,1.619667,-0.459012,...,0.937963,0.791035,0.134200,0.755907,2.099725,0.262812,-0.630056,0.077961,0.209606,-1.610929
donald,1.796127,0.979277,-0.354990,1.133052,0.944817,-1.285856,0.125052,1.614978,2.781754,0.986878,...,0.048482,-1.634655,2.528589,-0.839654,1.020327,-0.719196,0.120122,-0.116653,-2.185925,-0.499000
bernie,-0.332492,3.826576,-0.101019,-0.517806,-1.568709,-0.483485,-1.247467,1.421254,1.076326,-2.479583,...,0.777314,-0.289824,-1.604616,-0.555680,0.736251,-1.348314,-0.142360,-1.905085,-0.005539,1.215814
sanders,-0.000289,2.328243,-1.681569,-0.201246,0.264732,-1.808154,-0.838537,-0.040165,2.052878,-1.918013,...,2.610362,-0.317272,-0.505354,0.321454,-1.744209,0.330520,-0.595840,-1.595468,0.092064,1.041520
vote,1.116564,1.762039,2.010917,-0.139877,0.983306,0.073076,1.406590,0.630758,-0.332707,0.305328,...,1.942875,-1.830349,-1.411612,-3.805000,1.560972,-1.053757,-0.153963,0.019518,2.438488,-2.604918
bill,-0.822366,1.942518,-1.008153,-0.245276,0.631868,-1.381268,1.237042,3.217082,0.127497,0.360126,...,-1.284108,2.445804,-0.755773,0.882540,2.576370,3.677175,-0.958118,0.594568,-0.484034,-1.827137
president,-1.587924,3.641011,0.219153,-0.776654,0.697552,0.611732,0.171341,2.158532,-0.071729,0.300150,...,-2.027057,0.259791,0.910133,-0.704375,0.811632,0.490356,-0.445838,0.550869,1.320791,-3.296235
campaign,-0.596075,0.283505,1.430297,-1.708702,2.085750,-0.434753,-3.067282,1.619366,3.108739,0.817062,...,1.155170,-1.188518,0.560601,1.630078,-0.510225,1.140200,-0.517700,-3.017252,-1.178992,-0.532207


In [11]:
def related_terms(model, token, topn=10):
    """Return the neighbours of `token` as a list of [word, similarity] pairs.

    The pairs come from `model.wv.most_similar(positive=[token], topn=topn)`, in the
    order that call yields them, with each similarity rounded to three decimals.
    """
    neighbours = model.wv.most_similar(positive=[token], topn=topn)
    return [[word, round(score, 3)] for word, score in neighbours]

# Print the neighbour lists for both names, separated by a blank block.
for position, query in enumerate(('hillary', 'clinton')):
    if position:
        print('\n')
    for elem in related_terms(model, query):
        print(elem)

['trump', 0.335]
['girlass', 0.313]
['them', 0.279]
['intlship', 0.272]
['alongwhen', 0.272]
['delegat', 0.27]
['breitbart', 0.27]
['zerohedge', 0.269]
['transgressions', 0.268]
['rworldnews', 0.268]


['breitbart', 0.368]
['trump', 0.346]
['them', 0.345]
['goodt', 0.337]
['audience', 0.332]
['shes', 0.309]
['claiming', 0.307]
['fudged', 0.306]
['antihate', 0.301]
['orgasm', 0.287]


  if np.issubdtype(vec.dtype, np.int):


In [19]:
def network(model, token, topn=10):
    """Build a flat, three-level neighbourhood listing around `token`.

    The returned list contains, in order:
      * one entry holding the top-level neighbours of `token`, each [word, similarity]
        pair wrapped in its own list;
      * for the k-th top-level neighbour (k counted from 0), an entry
        [[word], neighbours, [0], [k + 1]] where `neighbours` are up to `topn - k`
        related terms of that word;
      * directly after it, for the j-th of that word's first `(topn - k) // 2`
        neighbours, an entry [[word], neighbours, [j + 1], [k + 1]] holding up to
        `(topn - k) // 2 - j` related terms.

    Every lookup goes through `related_terms`. The loops walk over the neighbours that
    were actually returned, so a lookup that yields fewer results than requested
    shortens the listing instead of raising IndexError as the index-based version did.
    """
    base = related_terms(model, token, topn)
    # Named `entries` so the accumulator no longer shadows the function's own name.
    entries = [[[pair] for pair in base]]
    for rank, (word, _) in enumerate(base[:max(topn, 0)]):
        breadth = topn - rank  # later neighbours get progressively fewer children
        related = related_terms(model, word, breadth)
        entries.append([[word], related, [0], [rank + 1]])

        depth = breadth // 2
        for position, (child, _) in enumerate(related[:depth]):
            grandchildren = related_terms(model, child, depth - position)
            entries.append([[child], grandchildren, [position + 1], [rank + 1]])
    return entries

center_word = 'clinton'
word_network = network(model, center_word)
print('Model centered around word ' + center_word)
# Each entry is a list of lists; print every inner item on its own line and close the
# entry with a blank block.
for entry in word_network:
    for item in (value for part in entry for value in part):
        print(item)
    print('\n')

Model centered around word clinton
['breitbart', 0.368]
['trump', 0.346]
['them', 0.345]
['goodt', 0.337]
['audience', 0.332]
['shes', 0.309]
['claiming', 0.307]
['fudged', 0.306]
['antihate', 0.301]
['orgasm', 0.287]


breitbart
['amid', 0.459]
['fresh', 0.443]
['push', 0.432]
['largest', 0.423]
['flowed', 0.421]
['claimed', 0.421]
['coverage', 0.42]
['suddenly', 0.419]
['raise', 0.418]
['boost', 0.415]
0
1


amid
['protests', 0.576]
['bonnycombs', 0.551]
['rollcall', 0.541]
['highlights', 0.503]
['matchups', 0.502]
1
1


fresh
['twtr', 0.492]
['update', 0.486]
['suddenly', 0.453]
['breitbart', 0.443]
2
1


push
['teaming', 0.48]
['militariza', 0.469]
['define', 0.466]
3
1


largest
['worlds', 0.773]
['profits', 0.737]
4
1


flowed
['output', 0.703]
5
1


trump
['bernie', 0.372]
['cruz', 0.348]
['clinton', 0.346]
['hillary', 0.335]
['nytimes', 0.334]
['baselessly', 0.321]
['cnnpolitics', 0.316]
['scathing', 0.306]
['prepares', 0.301]
0
2


bernie
['sanders', 0.488]
['underway', 0.447]