In [3]:
import pandas as pd
import nltk

In [4]:
# Load data frame
df = pd.read_csv('/media/zainkhan/USB30FD/suspended-clinton-tweets.txt', encoding='latin1', error_bad_lines=False, warn_bad_lines=False, header=None)
df = df[0]
df.head()

0    RT @KenRenar96: Open your eyes you vote for Hi...
1    SEE VIDEO! Hillary Clinton Plays Who'd You Rat...
2    RT @RealFKNNews: #US #media to #prematurely #P...
3    SEE VIDEO! Who Is Hillary Clinton?... https://...
4    Hillary Clinton #LoveTrumpsHate Large Black Cu...
Name: 0, dtype: object

In [5]:
# Remove stop words
df = df.str.lower().str.split()
stop = nltk.corpus.stopwords.words('english')
df = df.apply(lambda x: [item for item in x if item not in stop])
df.head()

0    [rt, @kenrenar96:, open, eyes, vote, hillary, ...
1    [see, video!, hillary, clinton, plays, who'd, ...
2    [rt, @realfknnews:, #us, #media, #prematurely,...
3    [see, video!, hillary, clinton?..., https://t....
4    [hillary, clinton, #lovetrumpshate, large, bla...
Name: 0, dtype: object

In [7]:
# Separate into sentences
sentences = df.tolist()
print(sentences[:2])

[['rt', '@kenrenar96:', 'open', 'eyes', 'vote', 'hillary', 'clinton.', 'salvation.', 'believes', 'ideals', 'equality!!!', '#hillaryforpr'], ['see', 'video!', 'hillary', 'clinton', 'plays', "who'd", 'rather?...', 'https://t.co/aho7bx9vwp', 'https://t.co/6egopos352']]


In [8]:
# Clean and remove illegal characters
import re
def clean(s):
    for i, w in enumerate(s):
        if contains_illegal(w):
            s[i] = ''
        else:
            if 'cli' in w:
                s[i] = 'clinton'
            elif 'hrc' in w or 'hil' in w:
                s[i] = 'hillary'
            elif 'tru' in w or 'trmp' in w:
                s[i] = 'trump'
            elif 'bern' in w:
                s[i] = 'bernie'
            elif 'sander' in w:
                s[i] = 'sanders'
            extras = '0123456789~_=*^,.-`%""+#&"!?<>:;/\\\'()[]{}$|\x91\x92\x93\x94\x96\x97'
            s[i] = s[i].translate({ord(c):'' for c in extras})
            if len(s[i]) < 4:
                s[i] = ''
    return s
                    
def contains_illegal(w):
    illegal = ['@', '#', 'htt', 'via', 'desd']
    if any(x in w for x in illegal):
        return True
    if re.match("^\d+?\.\d+?$", w) is not None:
        return True
    return False 

sentences = [clean(s) for s in sentences if len(s) > 0]
sentences = [list(filter(None, sentence)) for sentence in sentences]
print(sentences[:2])

[['open', 'eyes', 'vote', 'hillary', 'clinton', 'salvation', 'believes', 'ideals', 'equality'], ['video', 'hillary', 'clinton', 'plays', 'whod', 'rather']]


In [16]:
# Get adjective list to iterate over
adj = pd.read_csv('adjectives-character-richards.csv')
adj = adj.values.tolist()
adj = [word[0] for word in adj]
adj[:5]

['accessible', 'active', 'adaptable', 'admirable', 'adventurous']

In [24]:
# Make sentences unique
unique_sentences = [list(x) for x in set(tuple(x) for x in sentences)]
print(unique_sentences[:2])

[['trump', 'says', 'hillary', 'clinton', 'getting', 'pumped', 'debate', 'says', 'take', 'drug', 'test'], ['difference', 'hillary', 'sanders', 'attacked', 'clinton', 'record', 'clinton', 'attacked', 'bernie', 'fans']]


In [46]:
# Maps adjective occurences per sentence relative to a word
def adj_to_word(sentences, adj_list, target):
    adj_to_word = {}
    sentences_with_target = [s for s in sentences if target in s]

    for adj in adj_list:
        for sentence in sentences_with_target:
            if adj in sentence:
                if adj in adj_to_word:
                    adj_to_word[adj] += 1
                else:
                    adj_to_word[adj] = 1
    return adj_to_word

target_word = 'trump'
d = adj_to_word(sentences, adj, target_word)

In [47]:
dlist = []
for key, value in d.items():
    temp = [key,value]
    dlist.append(temp)
dlist.sort(key=lambda x: x[1], reverse=True)
print(dlist[:3])

[['intuitive', 994], ['honest', 822], ['criminal', 616]]


In [48]:
print(target_word)
for i in dlist:
    print(i)

trump
['intuitive', 994]
['honest', 822]
['criminal', 616]
['brutal', 347]
['false', 338]
['political', 317]
['kind', 305]
['winning', 279]
['open', 240]
['private', 218]
['liberal', 207]
['sane', 197]
['strong', 197]
['dirty', 160]
['stupid', 141]
['critical', 131]
['weak', 122]
['cold', 119]
['physical', 115]
['obvious', 106]
['dull', 102]
['tough', 101]
['angry', 100]
['responsible', 93]
['attractive', 92]
['clean', 87]
['conservative', 87]
['greedy', 84]
['fraudulent', 82]
['simple', 79]
['disturbing', 76]
['serious', 75]
['brilliant', 74]
['desperate', 71]
['proud', 69]
['insulting', 69]
['loyal', 68]
['crazy', 66]
['confused', 64]
['deep', 59]
['intense', 58]
['extreme', 54]
['dishonest', 53]
['independent', 51]
['wise', 51]
['hostile', 50]
['formal', 48]
['grand', 48]
['clever', 47]
['fiery', 47]
['focused', 43]
['secure', 41]
['original', 40]
['modern', 40]
['deceitful', 40]
['alert', 39]
['decisive', 39]
['impressive', 38]
['crude', 33]
['narrow', 33]
['solid', 32]
['fair', 31