In [1]:
import pandas as pd
import nltk

In [25]:
# Load data frame
df = pd.read_csv('/media/zainkhan/USB30FD/suspended-clinton-tweets.txt', encoding='latin1', error_bad_lines=False, warn_bad_lines=False, header=None)
df = df[0]
df.head()

0    RT @KenRenar96: Open your eyes you vote for Hi...
1    SEE VIDEO! Hillary Clinton Plays Who'd You Rat...
2    RT @RealFKNNews: #US #media to #prematurely #P...
3    SEE VIDEO! Who Is Hillary Clinton?... https://...
4    Hillary Clinton #LoveTrumpsHate Large Black Cu...
Name: 0, dtype: object

In [26]:
# Remove stop words
df = df.str.lower().str.split()
stop = nltk.corpus.stopwords.words('english')
df = df.apply(lambda x: [item for item in x if item not in stop])
df.head()

0    [rt, @kenrenar96:, open, eyes, vote, hillary, ...
1    [see, video!, hillary, clinton, plays, who'd, ...
2    [rt, @realfknnews:, #us, #media, #prematurely,...
3    [see, video!, hillary, clinton?..., https://t....
4    [hillary, clinton, #lovetrumpshate, large, bla...
Name: 0, dtype: object

In [27]:
# Stem dataframe 
stemmer = nltk.stem.PorterStemmer()
def stem_sentences(tokens):
    return [stemmer.stem(token) for token in tokens]
df = df.apply(stem_sentences)
df.head()

0    [rt, @kenrenar96:, open, eye, vote, hillari, c...
1    [see, video!, hillari, clinton, play, who'd, r...
2    [rt, @realfknnews:, #u, #media, #prematur, #pe...
3    [see, video!, hillari, clinton?..., https://t....
4    [hillari, clinton, #lovetrumpsh, larg, black, ...
Name: 0, dtype: object

In [28]:
# Separate into sentences and remove URLs
sentences = df.tolist()
print(sentences[:2])

[['rt', '@kenrenar96:', 'open', 'eye', 'vote', 'hillari', 'clinton.', 'salvation.', 'believ', 'ideal', 'equality!!!', '#hillaryforpr'], ['see', 'video!', 'hillari', 'clinton', 'play', "who'd", 'rather?...', 'https://t.co/aho7bx9vwp', 'https://t.co/6egopos352']]


In [29]:
def clean(s):
    return [w.strip(',.#"!?:;()\'') for w in s if 'http' not in w and '@' not in w and '#' not in w and '9' not in w]
sentences = [clean(s) for s in sentences if len(s) > 0]
print(sentences[:2])

[['rt', 'open', 'eye', 'vote', 'hillari', 'clinton', 'salvation', 'believ', 'ideal', 'equality'], ['see', 'video', 'hillari', 'clinton', 'play', "who'd", 'rather']]


In [30]:
# Word Embedding model
from gensim.models import Word2Vec

model = Word2Vec(sentences, size=100, window=5, min_count=3, workers=4)

In [14]:
# List of terms, indices, term counts from model 
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]

# Sort by term counts so common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])

# Unzip terms, indices, and counts
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

In [15]:
word_vectors = pd.DataFrame(model.wv.vectors[term_indices,:], index=ordered_terms)
word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
trump,0.910848,-0.410991,-1.300394,-1.786939,0.352859,0.346310,-0.135978,-0.030257,1.994620,1.330752,...,-0.096224,-0.350665,2.176208,3.436026,1.272468,0.238499,0.865564,3.090446,-1.570582,0.134266
donald,-0.721222,2.120931,1.559470,0.132883,1.089941,-0.559276,-0.018047,-0.810538,-0.685297,-0.805474,...,3.098275,0.288755,0.503695,0.452570,0.229112,-0.324159,-2.656030,1.966518,0.395246,-0.673875
rt,0.306768,0.590558,0.848485,-0.107809,0.558972,-0.873156,0.523891,-0.046795,0.354524,0.469104,...,0.979121,1.371631,0.104328,1.095127,1.821567,1.074387,-0.858346,0.964615,-1.280907,-0.334979
-,-0.100734,0.154378,1.146559,-0.098892,-0.123408,-1.731241,0.197319,0.047802,1.075346,0.547097,...,0.059068,0.435597,0.104823,0.994002,0.588292,0.877692,-0.947975,-0.160624,-0.001701,-0.832322
presid,-0.897980,0.230365,0.516805,-0.364607,-0.544775,-1.058298,0.340094,-1.465686,0.135937,1.586883,...,-0.938562,0.240010,-1.406729,-1.261261,-0.153476,1.818510,-0.637183,-0.210542,0.242731,-1.493574
rick,0.001461,-0.903236,0.537633,2.723791,-2.196663,-1.463818,-0.067577,2.087411,0.079200,-1.692905,...,-0.291142,0.520145,-0.835370,-0.933048,3.684911,0.735709,-0.885907,-2.523912,0.049516,-1.207811
president,-0.172558,0.170722,0.192296,0.516258,-1.825060,1.457329,-1.216013,0.467192,-0.581230,-1.382495,...,-1.885316,-0.987521,-0.286611,-1.801727,0.536207,1.041876,0.076116,-1.492806,-0.419686,0.064538
popp,-1.478466,0.112749,0.281616,2.163009,-2.125946,-0.343585,1.798992,0.603050,0.120004,-0.147151,...,-1.244306,0.864934,-0.106210,-1.052992,1.676566,0.263101,-0.779431,-3.249786,-0.448874,-0.835199
check,-1.061683,-0.102820,0.706331,1.655484,-1.307136,-1.032417,0.414832,3.203683,0.681678,0.604953,...,-0.295465,1.735544,0.297373,-1.407671,1.478031,-1.332097,-1.071383,-2.234506,-1.254648,-1.127024
heard,-1.307075,0.834937,-1.738430,-0.387169,-1.596483,-0.538724,-1.795624,2.567145,-1.168151,-1.255403,...,-3.019113,1.843954,-2.114285,-0.827182,-0.062029,-0.487536,0.730682,-2.470814,-0.171811,-1.630439


In [16]:
def related_terms(model, token, topn=10):
    out = []
    for word, similarity in model.wv.most_similar(positive=[token], topn=topn):
        out.append([word, round(similarity, 3)])
    return out

for elem in related_terms(model, 'hillary'):
    print(elem)
print('\n')
for elem in related_terms(model, 'trump'):
    print(elem)

['hillari', 0.582]
['jail.\x94', 0.479]
['also', 0.397]
['hrc', 0.381]
['ghostwriteri', 0.367]
['0.733', 0.366]
['nonsense', 0.354]
['complic', 0.354]
['overh', 0.354]
['125', 0.353]


['trump\x92', 0.745]
['trumpæ', 0.596]
['rt', 0.482]
['t', 0.475]
['tru', 0.445]
['th', 0.437]
['', 0.422]
['tr', 0.415]
['public', 0.395]
['o', 0.387]


In [18]:
def network(model, token, topn=10):
    base = related_terms(model, token, topn)
    network = []
    network.append([[i] for i in base])
    for i in range(topn, 0, -1):
        new_related = related_terms(model, base[topn - i][0], i)
        network.append([new_related, [0], [topn - i + 1]])
        
        for layer in range(i//2, 0, -1):
            layered_related = related_terms(model, new_related[i//2 - layer][0], layer)
            network.append([layered_related, [i//2 - layer + 1], [topn - i + 1]])
    return network

center_word = 'donald'
word_network = network(model, center_word)
print('Model centered around word ' + center_word)
for words in word_network:
    for items in words: 
        for word in items:
            print(word)
    print('\n')

Model centered around word donald
['&lt;b&gt;donald&lt;/b&gt', 0.741]
['rt', 0.454]
['', 0.44]
['mr', 0.386]
['via', 0.377]
['th', 0.371]
['jetlog', 0.369]
['him', 0.358]
['public', 0.358]
['-', 0.356]


['donald', 0.741]
['\x97', 0.543]
['nytimes', 0.491]
['fortunemagazine', 0.49]
['businessinsider', 0.469]
['a', 0.465]
['mr', 0.461]
['washingtonpost', 0.456]
['total', 0.448]
['recent', 0.446]
0
1


['&lt;b&gt;donald&lt;/b&gt', 0.741]
['rt', 0.454]
['', 0.44]
['mr', 0.386]
['via', 0.377]
1
1


['ù', 0.719]
['&lt;b&gt;donald&lt;/b&gt', 0.543]
['via', 0.483]
['t', 0.477]
2
1


['wsj', 0.623]
['approach', 0.525]
['in', 0.501]
3
1


['washingtonpost', 0.569]
['marketwatch', 0.53]
4
1


['washingtonpost', 0.518]
5
1


['-', 0.602]
['trump', 0.482]
['donald', 0.454]
['him', 0.45]
['presid', 0.441]
['whether', 0.44]
['', 0.43]
['th', 0.387]
['&lt;b&gt;donald&lt;/b&gt', 0.386]
0
2


['rt', 0.602]
['check', 0.558]
['presid', 0.545]
['popp', 0.532]
1
2


['trump\x92', 0.745]
['trumpæ', 0.596]
[