# Trump Tweets

In [22]:
# The usual suspects ...
import logging
import pandas as pd

# And their accomplices ...
from gensim import corpora
from gensim import models
from gensim import similarities
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from collections import defaultdict
from pprint import pprint

# Settings
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [23]:
tweets = pd.read_csv('realDonaldTrump_poll_tweets.csv')

In [24]:
tweets.shape

(448, 3)

In [25]:
tweets.head()

Unnamed: 0,id,created_at,text
0,7.656299e+17,8/16/2016 19:22:57,"It's just a 2-point race, Clinton 38%, Trump 3..."
1,7.587319e+17,7/28/2016 18:32:31,"""@LallyRay: Poll: Donald Trump Sees 17-Point P..."
2,7.583505e+17,7/27/2016 17:16:56,Great new poll - thank you!\n#MakeAmericaGreat...
3,7.575775e+17,7/25/2016 14:05:27,Great POLL numbers are coming out all over. Pe...
4,7.536034e+17,7/14/2016 14:53:46,Another new poll. Thank you for your support! ...


In [26]:
# Text corpus
document = [i for i in tweets['text']]

In [27]:
# Removing common words and tokenize
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
for doc in document:
    list_of_words = [i.lower() for i in wordpunct_tokenize(doc) if i.lower() not in stop_words]
stop_words.update(list_of_words)

In [28]:
# Removing common words
texts = [[word for word in doc.lower().split() if word not in stop_words] for doc in document]

# Removing words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

pprint(texts)

[['clinton', 'trump'],
 ['poll:',
  'donald',
  'trump',
  'two',
  '-',
  'breitbart',
  '@realdonaldtrump"',
  'great!'],
 ['great', 'new', 'poll', '-', 'thank', 'you!', '#makeamericagreatagain'],
 ['great',
  'poll',
  'numbers',
  'coming',
  'people',
  'want',
  'another',
  'four',
  'years',
  'crooked',
  'hillary',
  'even'],
 ['another', 'new', 'poll.', 'thank', 'support!', '#imwithyou'],
 ['great', 'new', 'poll-', 'thank', 'america!', '#trump2016', '#imwithyou'],
 ['despite', 'spending', 'day', 'ads', 'nationwide', 'zero'],
 ['great', 'poll-', 'thank', 'you!'],
 ['new', 'poll', '-', 'thank', 'you!', '#trump2016'],
 ['new',
  'q',
  'poll',
  'going',
  'win',
  'make',
  'america',
  'great',
  'again!',
  '#trump2016'],
 ['poll',
  'done',
  '@abc',
  '@washingtonpost',
  'even',
  'many',
  'democrats',
  'good.'],
 ['hillary', 'clinton', 'change', 'old', 'spending', 'spending', 'polls!'],
 ['@abc', 'poll', 'dishonest', '-', 'good!'],
 ['many',
  'great',
  'things',
  '-

 ['good', 'new', 'quinnipiac', 'poll', 'came', '#1', 'iowa.'],
 ['people', 'register', '1%', 'polls.', 'never', 'thought'],
 ['great',
  'honor',
  'polling',
  'numbers',
  'like',
  'american',
  '&amp;',
  'numbers'],
 ['poll:', 'trump', 'beats', 'clinton', 'thank', 'you!'],
 ['presidential',
  'election',
  'today,',
  'according',
  '@surveyusa',
  'poll,',
  'donald',
  'trump'],
 ['trump', '40', 'percent', 'poll'],
 ['first',
  'place',
  'lot',
  'polls,',
  'tied',
  'first',
  'place',
  'ben',
  'carson',
  'one',
  'iowa',
  'poll.',
  'thought'],
 ['tracking', 'pollster', '#gop'],
 ['2016', 'nomination', '#1'],
 ['interviewed', '@gma', 'morning', '7:00.', 'thanks', 'great', 'poll'],
 ['"@roniseale:', 'big', 'leads', '#trump2016', 'great!'],
 ['every',
  'poll',
  'winning',
  'listen',
  'dopey',
  'trump',
  'think',
  "i'm",
  '@foxnews'],
 ['yet',
  'another',
  'weak',
  'hit',
  'candidate',
  'failing',
  'jeb',
  'low',
  'others',
  'gone'],
 ['released',
  'public

In [29]:
# Create dictionary of document
bag = corpora.Dictionary(texts)
bag.save('trump.dict')

# Converting document to a vector (bag-of-words)
corpus = [bag.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('trump.mm', corpus)

2018-06-25 12:24:56,703 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-25 12:24:56,713 : INFO : built Dictionary(661 unique tokens: ['clinton', 'trump', '-', '@realdonaldtrump"', 'breitbart']...) from 448 documents (total 3791 corpus positions)
2018-06-25 12:24:56,715 : INFO : saving Dictionary object under trump.dict, separately None
2018-06-25 12:24:56,717 : INFO : saved trump.dict
2018-06-25 12:24:56,722 : INFO : storing corpus in Matrix Market format to trump.mm
2018-06-25 12:24:56,723 : INFO : saving sparse matrix to trump.mm
2018-06-25 12:24:56,723 : INFO : PROGRESS: saving document #0
2018-06-25 12:24:56,732 : INFO : saved 448x661 matrix, density=1.253% (3710/296128)
2018-06-25 12:24:56,735 : INFO : saving MmCorpus index to trump.mm.index


We have assigned a unique integer id to all words appearing in the corpus by:
   
   1. sweeping across the texts
   2. collecting word counts and relevant statistics
   
Our corpus is a 448 x 661 matrix.

***

### Transformation: _tf-idf_

#### Step 1:

In [30]:
# Initialization
tfidf = models.TfidfModel(corpus)

2018-06-25 12:25:42,894 : INFO : collecting document frequencies
2018-06-25 12:25:42,895 : INFO : PROGRESS: processing document #0
2018-06-25 12:25:42,897 : INFO : calculating IDF weights for 448 documents and 660 features (3710 matrix non-zeros)


We have initialized (trained) a transaformation model. Different transformation may require different initialization parameters; however, in our case, ___tf-idf___, the "training" consists simply of going through the supplied corpus once and computing document frequencies of all its features. This is in comparison to ___Latent Semantic Analysis___ & ___Latent Dirichlet Allocation___ which are more involved and take more time.

|Note:|
|---|
|**A note on transaformations**<br>Transformations always convert between two specific vector spaces. The same vector space (= the same set of feature ids) must be used for training as well as for subsequent vector transformations. Failure to use the same input feature space, such as applying a different string preprocessing, using different feature ids, or using bag-of-words input vectors where ___tf-idf___ vectors are expceted, will result in feature mismatch during transformation calls and consequently in either garbage output and/or runtime exceptions.|

#### Step 2:
From now on, ___tf-idf___ is treated as a read-only object that can be used to convert any vector from the old representation (___bag-of-words___ integer counts) to the new representation (___tf-idf___ real-valued weights).

In [33]:
# Applying the transformation to the whole corpus
corpus_tfidf = tfidf[corpus]

We have transformed our corpus (the one we used for training) into a weighted vector. We can do this for any vector (provided they come from the same vector space), even if they are not used in the corpus at all. This can be achived by _folding-in_ for ___LSA___ and by _topic inference_ for ___LDA___.

#### Step 3:
We will transform our ___tf-idf___ corpus via [Latent Semantic Indexing](https://en.wikipedia.org/wiki/Latent_semantic_indexing) into a latent 2-D space (... num_topics = 2).

In [40]:
# Initializing an LSI transformation
lsi = models.LsiModel(corpus_tfidf, id2word=bag, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]

2018-06-25 12:59:16,657 : INFO : using serial LSI version on this node
2018-06-25 12:59:16,659 : INFO : updating model with new documents
2018-06-25 12:59:16,684 : INFO : preparing a new chunk of documents
2018-06-25 12:59:16,689 : INFO : using 100 extra samples and 2 power iterations
2018-06-25 12:59:16,691 : INFO : 1st phase: constructing (661, 102) action matrix
2018-06-25 12:59:16,694 : INFO : orthonormalizing (661, 102) action matrix
2018-06-25 12:59:16,707 : INFO : 2nd phase: running dense svd on (102, 448) matrix
2018-06-25 12:59:16,713 : INFO : computing the final decomposition
2018-06-25 12:59:16,715 : INFO : keeping 2 factors (discarding 90.109% of energy spectrum)
2018-06-25 12:59:16,717 : INFO : processed documents up to #448
2018-06-25 12:59:16,720 : INFO : topic #0(3.923): 0.397*"thank" + 0.390*"you!" + 0.350*"#makeamericagreatagain" + 0.333*"#trump2016" + 0.305*"great" + 0.287*"new" + 0.197*"poll" + 0.158*"-" + 0.141*"poll-" + 0.115*"trump"
2018-06-25 12:59:16,721 : INFO

In [41]:
lsi.print_topics()

2018-06-25 12:59:18,785 : INFO : topic #0(3.923): 0.397*"thank" + 0.390*"you!" + 0.350*"#makeamericagreatagain" + 0.333*"#trump2016" + 0.305*"great" + 0.287*"new" + 0.197*"poll" + 0.158*"-" + 0.141*"poll-" + 0.115*"trump"
2018-06-25 12:59:18,787 : INFO : topic #1(2.977): 0.395*"trump" + -0.252*"you!" + 0.225*"carson" + 0.224*"rubio" + -0.222*"#makeamericagreatagain" + -0.221*"thank" + 0.189*"donald" + 0.181*"cruz" + 0.180*"leads" + -0.171*"#trump2016"


[(0,
  '0.397*"thank" + 0.390*"you!" + 0.350*"#makeamericagreatagain" + 0.333*"#trump2016" + 0.305*"great" + 0.287*"new" + 0.197*"poll" + 0.158*"-" + 0.141*"poll-" + 0.115*"trump"'),
 (1,
  '0.395*"trump" + -0.252*"you!" + 0.225*"carson" + 0.224*"rubio" + -0.222*"#makeamericagreatagain" + -0.221*"thank" + 0.189*"donald" + 0.181*"cruz" + 0.180*"leads" + -0.171*"#trump2016"')]

In [42]:
# Executing: bow->tfidf and tfidf->lsi
for doc in corpus_lsi:
    print(doc)

[(0, 0.078228657691018624), (1, 0.20441856013733928)]
[(0, 0.10399650442590408), (1, 0.22113317538330826)]
[(0, 0.78184555903613062), (1, -0.23477329139145126)]
[(0, 0.13205131474465895), (1, 0.084383665640546737)]
[(0, 0.23495335900996084), (1, -0.072539491674543627)]
[(0, 0.3974676632358502), (1, -0.16024415035280648)]
[(0, 0.0076834182013354301), (1, 0.01549544062069653)]
[(0, 0.53752700987304913), (1, -0.27054673114524025)]
[(0, 0.71406744540841161), (1, -0.21942684627541553)]
[(0, 0.25916878767844204), (1, 0.083987713234705955)]
[(0, 0.042599181539093436), (1, 0.062925457636498552)]
[(0, 0.025787739986490137), (1, 0.055911733886331361)]
[(0, 0.079048343057186193), (1, 0.069056347270417373)]
[(0, 0.18519949169128769), (1, 0.15045430580762129)]
[(0, 0.047529607153804158), (1, 0.066016686177484157)]
[(0, 0.075621671897237216), (1, 0.19366183286788022)]
[(0, 0.75789003015937495), (1, -0.36003551884615853)]
[(0, 0.59113227625988096), (1, -0.24793460781086224)]
[(0, 0.032065597568624774

In [45]:
# Model persistence: save(), load()
lsi.save('trump.lsi')
lsi = models.LsiModel.load('trump.lsi')

2018-06-25 13:06:28,068 : INFO : saving Projection object under trump.lsi.projection, separately None
2018-06-25 13:06:28,070 : INFO : saved trump.lsi.projection
2018-06-25 13:06:28,070 : INFO : saving LsiModel object under trump.lsi, separately None
2018-06-25 13:06:28,071 : INFO : not storing attribute projection
2018-06-25 13:06:28,072 : INFO : not storing attribute dispatcher
2018-06-25 13:06:28,073 : INFO : saved trump.lsi
2018-06-25 13:06:28,074 : INFO : loading LsiModel object from trump.lsi
2018-06-25 13:06:28,078 : INFO : loading id2word recursively from trump.lsi.id2word.* with mmap=None
2018-06-25 13:06:28,078 : INFO : setting ignored attribute projection to None
2018-06-25 13:06:28,079 : INFO : setting ignored attribute dispatcher to None
2018-06-25 13:06:28,079 : INFO : loaded trump.lsi
2018-06-25 13:06:28,080 : INFO : loading LsiModel object from trump.lsi.projection
2018-06-25 13:06:28,081 : INFO : loaded trump.lsi.projection


***
### Similarity

#### Step 1:

In [46]:
# Initializing the query structure: transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])

2018-06-25 13:06:30,818 : INFO : creating matrix with 448 documents and 2 features


In [47]:
# Index persistence
index.save('trump.index')
index = similarities.MatrixSimilarity.load('trump.index')

2018-06-25 13:07:04,167 : INFO : saving MatrixSimilarity object under trump.index, separately None
2018-06-25 13:07:04,169 : INFO : saved trump.index
2018-06-25 13:07:04,170 : INFO : loading MatrixSimilarity object from trump.index
2018-06-25 13:07:04,173 : INFO : loaded trump.index


#### Step 2:

In [48]:
# Performing queries
doc = "Let's do great things together."
vec_bow = bag.doc2bow(doc.lower().split())

# Convert the query to LSI space
vec_lsi = lsi[vec_bow]

# Perform a similarity query against the corpus
sims = index[vec_lsi]
print(list(enumerate(sims)))

[(0, 0.36203462), (1, 0.45993647), (2, 0.97156531), (3, 0.90126747), (4, 0.95793784), (5, 0.93852115), (6, 0.46977273), (7, 0.88627368), (8, 0.96947181), (9, 0.98368949), (10, 0.73574883), (11, 0.46924236), (12, 0.833372), (13, 0.82696438), (14, 0.73015976), (15, 0.38457021), (16, 0.90610689), (17, 0.93163973), (18, 0.72137231), (19, 0.41462761), (20, 0.40338758), (21, 0.44694966), (22, 0.79741168), (23, 0.9736439), (24, 0.99507999), (25, 0.87230557), (26, 0.61850029), (27, 0.57778525), (28, 0.98431766), (29, 0.49416998), (30, 0.77613974), (31, 0.3893846), (32, 0.37724021), (33, 0.57327664), (34, 0.42013738), (35, 0.86085087), (36, 0.42788711), (37, 0.34621596), (38, 0.82707715), (39, 0.94854945), (40, 0.67233217), (41, 0.35608494), (42, 0.69666326), (43, 0.94924194), (44, 0.64765865), (45, 0.76504034), (46, 0.74760824), (47, 0.5340693), (48, 0.81252903), (49, 0.84984893), (50, 0.80576658), (51, 0.53680313), (52, 0.81335539), (53, 0.3652688), (54, 0.55538213), (55, 0.64989877), (56, 0.

In [49]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)

[(389, 0.99998546), (205, 0.99996829), (387, 0.99992692), (268, 0.99971181), (139, 0.99969018), (303, 0.9995811), (228, 0.99841982), (334, 0.99823272), (295, 0.99810803), (302, 0.99810803), (213, 0.99778992), (241, 0.99708736), (396, 0.99563485), (24, 0.99507999), (375, 0.99501681), (192, 0.99399424), (108, 0.99337113), (87, 0.99249667), (81, 0.99084198), (416, 0.99056619), (97, 0.98838675), (80, 0.98835403), (180, 0.98833865), (190, 0.98801303), (437, 0.98766118), (435, 0.98648793), (158, 0.98618644), (231, 0.98536271), (28, 0.98431766), (9, 0.98368949), (247, 0.98340809), (57, 0.98279589), (189, 0.98148721), (70, 0.98145628), (59, 0.98122585), (219, 0.98120666), (200, 0.98091835), (101, 0.97823185), (410, 0.97548807), (102, 0.97508711), (187, 0.9746052), (264, 0.97427976), (23, 0.9736439), (196, 0.97268677), (91, 0.97208494), (2, 0.97156531), (8, 0.96947181), (56, 0.96944261), (314, 0.96781671), (274, 0.96662158), (399, 0.96618998), (358, 0.96588153), (238, 0.96349555), (382, 0.96307