In [116]:
import pandas as pd
import nltk

In [117]:
# Load the tweet file. It has no header row, so pandas labels the columns 0, 1, ...
# and only column 0 is kept. Rows that fail to parse are dropped without any
# warning (error_bad_lines=False, warn_bad_lines=False).
# NOTE(review): both flags were deprecated in pandas 1.3 and removed in 2.0
# (successor: on_bad_lines='skip'); the absolute path is machine-specific.
df = pd.read_csv(
    '/media/zainkhan/USB30FD/suspended-clinton-tweets.txt',
    header=None,
    encoding='latin1',
    error_bad_lines=False,
    warn_bad_lines=False,
)[0]
df.head()

0    RT @KenRenar96: Open your eyes you vote for Hi...
1    SEE VIDEO! Hillary Clinton Plays Who'd You Rat...
2    RT @RealFKNNews: #US #media to #prematurely #P...
3    SEE VIDEO! Who Is Hillary Clinton?... https://...
4    Hillary Clinton #LoveTrumpsHate Large Black Cu...
Name: 0, dtype: object

In [118]:
# Remove stop words
# Lower-case every tweet and split it on whitespace into a list of tokens.
df = df.str.lower().str.split()
stop = nltk.corpus.stopwords.words('english')
# `stop` is a list, so `item not in stop` would scan it for every token.
# A frozenset gives constant-time membership tests with identical results.
stop_lookup = frozenset(stop)
df = df.apply(lambda x: [item for item in x if item not in stop_lookup])
df.head()

0    [rt, @kenrenar96:, open, eyes, vote, hillary, ...
1    [see, video!, hillary, clinton, plays, who'd, ...
2    [rt, @realfknnews:, #us, #media, #prematurely,...
3    [see, video!, hillary, clinton?..., https://t....
4    [hillary, clinton, #lovetrumpshate, large, bla...
Name: 0, dtype: object

In [119]:
# Porter-stemming step, currently disabled: the code below sits inside a
# triple-quoted string, so none of it runs and `df` is left unstemmed.
# The only effect of this cell is to echo that string as its output.
'''
# Stem dataframe
stemmer = nltk.stem.PorterStemmer()
def stem_sentences(tokens):
    return [stemmer.stem(token) for token in tokens]
df = df.apply(stem_sentences)
df.head()
'''

'\n# Stem dataframe\nstemmer = nltk.stem.PorterStemmer()\ndef stem_sentences(tokens):\n    return [stemmer.stem(token) for token in tokens]\ndf = df.apply(stem_sentences)\ndf.head()\n'

In [120]:
# Turn the Series of token lists into a plain Python list (one token list per tweet)
sentences = list(df)
# Peek at the first two tweets
print(sentences[:2])

[['rt', '@kenrenar96:', 'open', 'eyes', 'vote', 'hillary', 'clinton.', 'salvation.', 'believes', 'ideals', 'equality!!!', '#hillaryforpr'], ['see', 'video!', 'hillary', 'clinton', 'plays', "who'd", 'rather?...', 'https://t.co/aho7bx9vwp', 'https://t.co/6egopos352']]


In [121]:
# Clean and remove illegal characters
import re
def clean(s):
    """Strip noise from one tokenised tweet.

    A token that is exactly ``'rt'`` or that ``contains_illegal`` flags is
    replaced by ``''``. Every other token has the digits and punctuation
    listed in ``extras`` deleted. Positions are preserved, so the result has
    the same length as ``s``; the caller drops the empty strings afterwards.

    Args:
        s: list of string tokens for a single tweet.

    Returns:
        A new list of cleaned tokens. ``s`` itself is not modified, so token
        lists that are shared with other objects stay intact.
    """
    extras = '0123456789~_=*^,.-`%""+#&"!?<>:;/\\\'()[]{}$|\x91\x92\x93\x94\x96'
    # Build the deletion table once per call instead of once per token.
    strip_table = str.maketrans('', '', extras)
    return [
        '' if (contains_illegal(w) or w == 'rt') else w.translate(strip_table)
        for w in s
    ]
                    
def contains_illegal(w):
    """Tell whether token ``w`` should be discarded as a whole.

    Args:
        w: a single string token.

    Returns:
        True when ``w`` contains '@', '#' or 'http' anywhere, or when it is a
        decimal number of the form digits-dot-digits (e.g. '3.14');
        False otherwise.
    """
    illegal = ('@', '#', 'http')
    if any(marker in w for marker in illegal):
        return True
    # Raw string: in a normal string literal "\d" and "\." are invalid escape
    # sequences, which Python reports with a Deprecation/SyntaxWarning.
    return re.match(r"^\d+?\.\d+?$", w) is not None

# Clean every non-empty tweet and drop the '' placeholders that clean() puts
# in place of rejected tokens.
sentences = [
    [token for token in clean(tweet) if token]
    for tweet in sentences
    if len(tweet) > 0
]
print(sentences[:2])

[['open', 'eyes', 'vote', 'hillary', 'clinton', 'salvation', 'believes', 'ideals', 'equality'], ['see', 'video', 'hillary', 'clinton', 'plays', 'whod', 'rather']]


In [122]:
# Word Embedding model
from gensim.models import Word2Vec

# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`). No seed is set, so vectors differ between runs.
model = Word2Vec(
    sentences,
    size=100,     # length of each word vector
    window=5,     # context window passed to gensim
    min_count=3,  # minimum frequency threshold passed to gensim
    workers=4,    # number of worker threads
)

In [123]:
# One (term, index, count) triple per vocabulary entry, ordered by descending
# count so the most common terms come first.
ordered_vocab = sorted(
    ((term, voc.index, voc.count) for term, voc in model.wv.vocab.items()),
    key=lambda entry: -entry[2],
)

# Transpose the triples into three parallel tuples
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

In [124]:
# One row per term (same order as ordered_terms), one column per embedding dimension
word_vectors = pd.DataFrame(
    data=model.wv.vectors[term_indices, :],
    index=ordered_terms,
)
word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
hillary,2.266802,-0.835703,-2.211888,1.210517,-1.629337,1.683752,0.292930,0.086757,-0.448574,-2.450287,...,0.543922,-0.217607,2.354306,-0.756709,0.963785,-1.513436,1.439472,-0.975534,-0.292315,0.451264
clinton,-0.110492,1.003615,0.423989,-0.531578,0.202484,0.361452,-0.026296,-0.503144,0.430680,-0.098683,...,-0.410798,0.387433,0.924131,-0.561925,-0.334797,2.126797,0.022549,-2.834901,0.282542,-0.193930
trump,0.862238,1.023195,0.800909,1.008625,-1.415664,-0.996452,-0.550722,-1.020919,-1.814890,-0.349568,...,2.218308,0.271850,-0.150819,-0.055247,1.787276,0.383357,1.600330,-0.937208,0.324585,1.175155
donald,-3.261839,-0.444208,-0.294289,0.433288,-1.366282,0.394920,-0.685915,-0.663529,0.918082,-1.789662,...,0.144202,1.785617,0.940766,-0.546153,1.945149,1.720082,1.274138,-1.726923,2.791526,0.889206
clintons,0.354605,2.693308,2.425469,0.221197,1.063317,0.842651,-1.167726,0.404382,-0.792877,-0.208898,...,0.151957,0.871656,1.463926,-0.001764,1.204325,1.064652,-1.423470,-1.541806,0.139217,1.080338
amp,-0.526172,-1.505209,0.184271,0.973208,1.504410,0.100418,0.015159,1.144144,-2.690995,0.137914,...,0.004607,1.015455,1.754405,-1.877618,-0.278353,-1.546722,-0.896384,-1.926708,-0.685667,-1.354437
bernie,-1.347858,0.991170,0.998375,0.547960,-1.002647,1.222768,0.676367,1.791156,0.328424,-1.375344,...,2.012342,1.705940,2.407101,-0.059927,-0.337366,2.229227,3.120187,-2.419306,1.772858,2.117069
sanders,2.629327,0.421656,1.231516,-5.404619,2.707169,-1.490014,-2.851798,-0.158847,0.980408,1.371244,...,4.375166,3.274045,0.985184,1.326479,-2.092365,3.785090,3.138370,0.676897,0.773596,0.751321
vote,-0.674324,-0.840902,2.401483,1.439385,1.773409,-0.696903,-0.757736,2.035098,-1.765086,1.098912,...,-1.488661,3.209519,0.302314,-0.111422,0.143841,0.779966,-1.127854,0.805179,-0.006622,-1.358189
bill,0.410111,-2.133702,2.266984,2.761466,-1.955151,-2.822045,-3.169248,-0.940609,2.189733,1.065167,...,2.683581,2.143640,-2.530527,-1.225476,5.123259,-0.061614,0.618315,-2.671910,0.323741,1.869883


In [125]:
def related_terms(model, token, topn=10):
    """Return the nearest neighbours of ``token`` in the embedding.

    Args:
        model: object exposing ``wv.most_similar(positive=..., topn=...)``.
        token: the query word.
        topn: how many neighbours to request.

    Returns:
        A list of ``[word, similarity]`` pairs in the order returned by
        ``most_similar``, with the similarity rounded to 3 decimals.
    """
    neighbours = model.wv.most_similar(positive=[token], topn=topn)
    return [[word, round(score, 3)] for word, score in neighbours]

# Print the neighbours of each query word, with a blank-line gap between the lists
for position, query in enumerate(('hillary', 'trump')):
    if position > 0:
        print('\n')
    for elem in related_terms(model, query):
        print(elem)

['hillarys', 0.482]
['hrc', 0.366]
['htt', 0.349]
['acecharla', 0.317]
['trump', 0.305]
['mrs', 0.303]
['via', 0.303]
['comentando', 0.272]
['libertad', 0.269]
['intentó', 0.267]


['trumps', 0.697]
['trum', 0.548]
['tr', 0.527]
['tru', 0.41]
['asemejan', 0.376]
['trump\x94', 0.343]
['t', 0.34]
['sharpest', 0.329]
['lastsecond', 0.328]
['newswatch', 0.321]


In [127]:
def network(model, token, topn=10):
    """Build a shrinking fan-out of related terms around ``token``.

    The first entry of the result wraps each of the ``topn`` neighbours of
    ``token`` in its own one-element list. Then, for the neighbour at rank
    ``r`` (0-based), ``topn - r`` of its own neighbours are fetched and stored
    as ``[neighbours, [0], [r + 1]]``. For the first ``(topn - r) // 2`` of
    those, a further, progressively shorter neighbour list is stored as
    ``[neighbours, [p + 1], [r + 1]]``, where ``p`` is the 0-based position
    inside the second-level list.

    Args:
        model: embedding model handed on to ``related_terms``.
        token: the centre word.
        topn: number of first-level neighbours.

    Returns:
        The list of entries described above, in generation order.
    """
    base = related_terms(model, token, topn)
    layers = [[[neighbour] for neighbour in base]]

    for rank in range(topn):
        # Lower-ranked neighbours get fewer related terms of their own
        width = topn - rank
        second_hop = related_terms(model, base[rank][0], width)
        layers.append([second_hop, [0], [rank + 1]])

        half = width // 2
        for pos in range(half):
            third_hop = related_terms(model, second_hop[pos][0], half - pos)
            layers.append([third_hop, [pos + 1], [rank + 1]])
    return layers

center_word = 'donald'
word_network = network(model, center_word)
print('Model centered around word ' + center_word)
# Each network entry is a list of lists: print every inner item on its own
# line, then leave a gap before the next entry.
for group in word_network:
    for entry in (member for part in group for member in part):
        print(entry)
    print('\n')

Model centered around word donald
['ltbgtdonaldltbgt', 0.71]
['expectativas', 0.432]
['ivanka', 0.391]
['melania', 0.372]
['acosarla', 0.358]
['mr', 0.346]
['cole', 0.343]
['highstakes', 0.34]
['rohff', 0.335]
['googlefriendly', 0.329]


['donald', 0.71]
['backtoback', 0.544]
['spprters', 0.523]
['dangerously', 0.518]
['politicsdonald', 0.507]
['rohff', 0.507]
['contrast', 0.503]
['showma', 0.499]
['darker', 0.498]
['unscrupulous', 0.497]
0
1


['ltbgtdonaldltbgt', 0.71]
['expectativas', 0.432]
['ivanka', 0.391]
['melania', 0.372]
['acosarla', 0.358]
1
1


['columbus', 0.62]
['dispatch', 0.619]
['businessinsider', 0.57]
['scathing', 0.565]
2
1


['madecalled', 0.964]
['slayer', 0.717]
['dsouzas', 0.713]
3
1


['incoherent', 0.89]
['toughest', 0.593]
4
1


['tweetclash', 0.746]
5
1


['tres', 0.673]
['puntos', 0.643]
['aplicación', 0.639]
['últimos', 0.629]
['siete', 0.626]
['nicolás', 0.61]
['hispanos', 0.61]
['noticias', 0.609]
['enfrentar', 0.604]
0
2


['siete', 0.934]
['puntos', 0.