# Trend detection on Donald Trump's tweets

This is using a very quick and dirty trend formula which calculates the tf*idf score taking the tf from the most recent 20 tweets at any point in time, and the idf score from the previous 80 tweets.

In [1]:
import csv, re
import numpy as np
import operator

In [2]:
stopwords = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"])

In [3]:
tweets = []

Trump's tweets taken from

https://github.com/sashaperigo/Trump-Tweets/blob/master/data.csv

In [4]:
with open('trump_tweets.csv', 'r', encoding='UTF-8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for idx, row in enumerate(spamreader):
        if len(row) > 2 and idx > 0:
            tweets.append(row)

In [5]:
tweets[0]

['Nielson Media Research final numbers on ACCEPTANCE SPEECH: TRUMP  32.2 MILLION.  CLINTON 27.8 MILLION.  Thank you!',
 '2016-07-30 23:32:40',
 '13850',
 '4130',
 '759592590106849280']

In [6]:
recent_window = []
for tweet in tweets:
    # rudimentary tokenisation method!
    tokens = set()
    
    for t in tweet[0].lower().split(" "):
        if t not in stopwords and len(t) > 1 and re.search('[a-zA-Z]', t):
            tokens.add(t)
    
    
    recent_window.append(tokens)
    if len(recent_window) > 100:
        recent_window.pop(0)
    
    if len(recent_window) >= 100:
        # Calculate tf-idf for 40 tweets
        dfs = dict()
        for recent_tweet in recent_window[:-20]:
            for w in recent_tweet:
                if w not in dfs: dfs[w] = 0
                dfs[w] = dfs[w] + 1
        
        tfs = dict()
        for recent_tweet in recent_window[-20:]:
            for w in recent_tweet:
                if w not in tfs: tfs[w] = 0
                tfs[w] = tfs[w] + 1
        
        tf_idfs = dict()
        for t in tfs:
            if t in dfs:
                df = dfs[t]
            else:
                df = 0
            df += 1
            idf = np.log(40.0/df)
            tf = tfs[t]
            tf_idfs[t] = tf * idf
        
        sorted_x = sorted(tf_idfs.items(), key=lambda x: x[1])
        print (sorted_x[0], tweet[1])

('thank', 1.2909841813155658) 2016-07-26 08:34:19
('thank', 1.3862943611198906) 2016-07-25 23:19:38
('thank', 1.4916548767777169) 2016-07-25 23:12:57
('thank', 1.4916548767777169) 2016-07-25 23:04:56
('hillary', 1.2887140327810265) 2016-07-25 22:14:29
('hillary', 1.2887140327810265) 2016-07-25 17:57:14
('hillary', 1.2887140327810265) 2016-07-25 17:45:54
('hillary', 1.2887140327810265) 2016-07-25 17:33:49
('clinton', 1.2039728043259361) 2016-07-25 17:32:59
('clinton', 1.2039728043259361) 2016-07-25 17:25:50
('clinton', 1.2039728043259361) 2016-07-25 17:24:28
('clinton', 1.2039728043259361) 2016-07-25 14:57:06
('clinton', 1.2039728043259361) 2016-07-25 12:46:33
('clinton', 1.2909841813155658) 2016-07-25 10:05:27
('people', 1.6094379124341003) 2016-07-25 10:01:25
('people', 1.6094379124341003) 2016-07-25 09:47:45
('people', 1.6094379124341003) 2016-07-25 09:42:55
('people', 1.6094379124341003) 2016-07-25 09:19:18
('people', 1.6094379124341003) 2016-07-25 08:27:35
('people', 1.609437912434