In [1]:
# to load and check model:
from keras import models
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re


In [2]:
# Preprocess tweets before feeding them to the model:
# read the raw export, keep the three columns used downstream, lowercase the
# text and strip every character outside the allowed class.
data = pd.read_csv("../data/input/realdonaldtrump_20170120-20191231.csv")
# .copy() gives an independent frame, so the column assignment below does not
# write onto a slice of the frame returned by read_csv.
data = data[['id', 'content', 'date']].copy()
# The .str accessor propagates missing text as NaN instead of raising on
# x.lower(). The pattern is a raw string so that \s is not an invalid string escape.
# NOTE(review): the range `A-z` also admits the ASCII characters between 'Z' and
# 'a' ([ \ ] ^ _ `). It is left unchanged because the preprocessing used when the
# models were trained is not visible here - confirm before tightening to `A-Z`.
data['content'] = (
    data['content']
    .str.lower()
    .str.replace(r'[^a-zA-z0-9\s]', '', regex=True)
)

In [3]:
# Show the first rows of the cleaned tweet frame.
data.head()

Unnamed: 0,id,content,date
0,8.22421e+17,it all begins today i will see you at 1100 am ...,1/20/2017 6:31
1,8.22502e+17,today we are not merely transferring power fro...,1/20/2017 11:51
2,8.22502e+17,power from washington dc and giving it back to...,1/20/2017 11:51
3,8.22502e+17,what truly matters is not which party controls...,1/20/2017 11:52
4,8.22502e+17,january 20th 2017 will be remembered as the da...,1/20/2017 11:53


In [4]:
# Show the last rows of the cleaned tweet frame.
data.tail()

Unnamed: 0,id,content,date
9685,1.21218e+18,thank you to the dcexaminer washington examin...,12/31/2019 19:03
9686,1.21218e+18,thank you steve the greatest witch hunt in us ...,12/31/2019 19:16
9687,1.21218e+18,our fantastic first ladyhttpstwittercomflotuss...,12/31/2019 19:22
9688,1.21218e+18,happy new year,12/31/2019 19:30
9689,1.21221e+18,pictwittercomevaeyd1agv,12/31/2019 21:12


In [5]:
# Select the tweets whose text contains 'china' or any keyword in
# china_related_list. Rows are ordered keyword by keyword ('china' matches
# first, then each list entry), with 'content' as the first column and a fresh
# 0..n-1 index.
data["content"] = data["content"].str.lower()
china_related_list = ['chinese', 'huawei', 'xi jinping', 'beijing', 'trade', 'tariff','tax']
matched_positions = []
for word in ['china'] + china_related_list:
    # Plain substring test (regex=False); missing text never matches (na=False).
    is_match = data['content'].str.contains(word, regex=False, na=False).values
    matched_positions.extend(np.flatnonzero(is_match).tolist())
# A tweet matching several keywords must appear once. Deduplicate by row
# position (first occurrence wins) rather than with DataFrame.drop_duplicates():
# with 'content' moved into the index, drop_duplicates() compares only the
# remaining columns (id, date) and would discard different tweets that happen
# to share both values.
unique_positions = list(dict.fromkeys(matched_positions))
content_first = ['content'] + [col for col in data.columns if col != 'content']
train_china_related = data.iloc[unique_positions][content_first].reset_index(drop=True)

In [6]:
# Split the raw 'date' string on its first space into a 'Date' part and a
# 'Time' part. n=1 guarantees at most two pieces, so the two-column assignment
# cannot fail because of an extra space inside the time part.
train_china_related[['Date','Time']] = train_china_related['date'].str.split(" ", n=1, expand=True)
# Column labels as they stand before the reorder (kept for backward compatibility).
col_list = train_china_related.columns
# Reorder by name instead of by position: a positional swap gives a different
# result every time the cell is re-run and depends on the incoming column order.
# .copy() makes the result an independent frame that later cells can modify.
train_china_related = train_china_related[['id', 'content', 'date', 'Date', 'Time']].copy()

In [7]:
# Show the first keyword-matched tweets with the new Date / Time columns.
train_china_related.head()

Unnamed: 0,id,content,date,Date,Time
0,8.30048e+17,the failing nytimes does major fake news chin...,2/10/2017 7:35,2/10/2017,7:35
1,8.42724e+17,north korea is behaving very badly they have b...,3/17/2017 8:07,3/17/2017,8:07
2,8.47573e+17,the meeting next week with china will be a ver...,3/30/2017 17:16,3/30/2017,17:16
3,8.50723e+17,it was a great honor to have president xi jinp...,4/8/2017 9:50,4/8/2017,9:50
4,8.51767e+17,i explained to the president of china that a t...,4/11/2017 6:59,4/11/2017,6:59


In [8]:
# Parse the 'Date' strings into timestamps and keep only tweets dated after
# 1 January 2018.
train_china_related['Date'] = pd.to_datetime(train_china_related['Date'])
# NOTE(review): the comparison is strict, so tweets dated exactly 2018-01-01 are
# excluded - confirm this is intended.
# .copy() detaches the filtered rows so that the prediction columns added in
# later cells are written to a real frame rather than a slice.
train_china_related = train_china_related[train_china_related['Date'] > pd.to_datetime('1/1/2018')].copy()

In [9]:
# Rebind `data` to the filtered China-related tweets. This is an alias, not a
# copy, and from here on `data` no longer refers to the full tweet frame.
data = train_china_related

In [10]:
# Show the first rows that remain after the date filter.
data.head()

Unnamed: 0,id,content,date,Date,Time
43,9.61672e+17,i will be meeting with henry kissinger at 145p...,2/8/2018 12:44,2018-02-08,12:44
44,9.65203e+17,i never said russia did not meddle in the elec...,2/18/2018 6:33,2018-02-18,6:33
45,9.71403e+17,china has been asked to develop a plan for the...,3/7/2018 9:10,2018-03-07,9:10
46,9.72506e+17,chinese president xi jinping and i spoke at le...,3/10/2018 10:15,2018-03-10,10:15
47,9.78939e+17,received message last night from xi jinping of...,3/28/2018 5:16,2018-03-28,5:16


In [11]:
# Convert the tweet text into padded integer sequences for the models.
# NOTE(review): the tokenizer is fitted on the tweets in this notebook; the
# loaded models were presumably trained with their own tokenizer fit - confirm
# that the word indices line up.
# NOTE(review): pad_sequences is called without maxlen and the same X is fed to
# both models, whose summaries report sequence lengths 29 and 28 - confirm the
# padded width is what each model expects.
max_fatures = 2000
tweet_texts = data['content'].values
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(tweet_texts)
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [12]:
# Load the saved "balanced" LSTM model from disk and print its layer summary.
# The "_10" suffix in the path refers to 10 training epochs, per the original note.
LSTM_bal = models.load_model('../output/models/LSTM_balanced_10')
LSTM_bal.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 29, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Run the balanced model on the padded sequences in X.
# The next cell reads column 0 of the result as the negative probability and
# column 1 as the positive probability.
y_pred = LSTM_bal.predict(X)



In [14]:
# Attach the balanced model's two output columns to the tweets and derive a
# sentiment label: -1 when the negative probability is strictly larger,
# otherwise 1 (ties count as positive).
# assign() builds a new frame, so nothing is written into the filtered frame
# that `data` aliases, and np.where yields an integer column directly instead
# of filling a float column through two .loc writes and casting afterwards.
data = data.assign(
    negative_prob=y_pred[:, 0],
    positive_prob=y_pred[:, 1],
    outcome=np.where(y_pred[:, 0] > y_pred[:, 1], -1, 1),
)

In [15]:
# Show the first rows with the balanced model's probabilities and labels.
data.head()

Unnamed: 0,id,content,date,Date,Time,negative_prob,positive_prob,outcome
43,9.61672e+17,i will be meeting with henry kissinger at 145p...,2/8/2018 12:44,2018-02-08,12:44,0.870762,0.129238,-1
44,9.65203e+17,i never said russia did not meddle in the elec...,2/18/2018 6:33,2018-02-18,6:33,0.000704,0.999296,1
45,9.71403e+17,china has been asked to develop a plan for the...,3/7/2018 9:10,2018-03-07,9:10,7.8e-05,0.999922,1
46,9.72506e+17,chinese president xi jinping and i spoke at le...,3/10/2018 10:15,2018-03-10,10:15,0.652814,0.347186,-1
47,9.78939e+17,received message last night from xi jinping of...,3/28/2018 5:16,2018-03-28,5:16,0.000747,0.999253,1


In [16]:
# Write the balanced-model results to CSV (the row index is written as the first column).
data.to_csv('../output/results/LSTM_balanced_full_results.csv')

In [17]:
# Results of the unbalanced model: load it, print its layer summary, score the
# same padded sequences X, and overwrite the two probability columns.
# The 'outcome' column still holds the balanced model's labels until the next
# cell recomputes it.
LSTM_unbal = models.load_model('../output/models/LSTM_unbalanced')
LSTM_unbal.summary()
y_pred = LSTM_unbal.predict(X)
# Column 0 -> negative probability, column 1 -> positive probability.
for class_position, prob_column in enumerate(('negative_prob', 'positive_prob')):
    data[prob_column] = y_pred[:, class_position]

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [18]:
# -1 == Negative, 1 == Positive Sentiment
# A tweet is labelled negative when its negative probability exceeds 0.5,
# otherwise positive. This is a fixed threshold, whereas the balanced-model cell
# compares the two probabilities directly.
data['outcome'] = np.where(data['negative_prob']> 0.500, -1 ,1)

# Write the unbalanced-model results to CSV (the row index is written as the first column).
data.to_csv('../output/results/LSTM_unbalanced_full_results.csv')

In [19]:
# Show the first rows with the unbalanced model's probabilities and labels.
data.head()

Unnamed: 0,id,content,date,Date,Time,negative_prob,positive_prob,outcome
43,9.61672e+17,i will be meeting with henry kissinger at 145p...,2/8/2018 12:44,2018-02-08,12:44,0.998678,0.001322,-1
44,9.65203e+17,i never said russia did not meddle in the elec...,2/18/2018 6:33,2018-02-18,6:33,0.999855,0.000145,-1
45,9.71403e+17,china has been asked to develop a plan for the...,3/7/2018 9:10,2018-03-07,9:10,0.944316,0.055684,-1
46,9.72506e+17,chinese president xi jinping and i spoke at le...,3/10/2018 10:15,2018-03-10,10:15,0.998923,0.001077,-1
47,9.78939e+17,received message last night from xi jinping of...,3/28/2018 5:16,2018-03-28,5:16,0.991059,0.00894,-1
