In [47]:
# to load and check model:
from keras import models
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re


In [60]:
# Preprocess Tweets Before Feeding to Model
data = pd.read_csv("../data/input/realdonaldtrump.csv")
data = data[['id','content', 'date']]
data['content'] = data['content'].apply(lambda x: x.lower())
data['content'] = data['content'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [52]:
data.head

<bound method NDFrame.head of                         id                                            content  \
0               1698308935  be sure to tune in and watch donald trump on l...   
1               1701461182  donald trump will be appearing on the view tom...   
2               1737479987  donald trump reads top ten financial tips on l...   
3               1741160716  new blog post celebrity apprentice finale and ...   
4               1773561338  my persona will never be that of a wallflower ...   
...                    ...                                                ...   
43347  1273405198698975232  joe biden was a total failure in government he...   
43348  1273408026968457216  will be interviewed on  seanhannity tonight at...   
43349  1273442195161387008                            pictwittercom3lm1spbu8x   
43350  1273442469066276864                            pictwittercomvpce5maduz   
43351  1273442528411385858                            pictwittercomvllc0bhw41  

In [61]:
date_time = data["date"].str.split(' ', expand=True)
data['date'] = date_time[0]
data['time'] = date_time[1]
#



In [66]:
data['date'] = pd.to_datetime(data['date'])
data[data['date'] > '2017-01-16']
data.head

<bound method NDFrame.head of                         id                                            content  \
0               1698308935  be sure to tune in and watch donald trump on l...   
1               1701461182  donald trump will be appearing on the view tom...   
2               1737479987  donald trump reads top ten financial tips on l...   
3               1741160716  new blog post celebrity apprentice finale and ...   
4               1773561338  my persona will never be that of a wallflower ...   
...                    ...                                                ...   
43347  1273405198698975232  joe biden was a total failure in government he...   
43348  1273408026968457216  will be interviewed on  seanhannity tonight at...   
43349  1273442195161387008                            pictwittercom3lm1spbu8x   
43350  1273442469066276864                            pictwittercomvpce5maduz   
43351  1273442528411385858                            pictwittercomvllc0bhw41  

In [41]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['content'].values)
X = tokenizer.texts_to_sequences(data['content'].values)
X = pad_sequences(X)

In [42]:
# Load in all models to be used 
# Called LSTM_10 because 10 epoch
LSTM_bal = models.load_model('../output/models/LSTM_balanced_10')
LSTM_bal.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 29, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Generate Predictions for each model 
y_pred = LSTM_bal.predict(X)
data['negative_prob'] = pd.Series(y_pred[:,0])
data['positive_prob'] = pd.Series(y_pred[:,1])
data.to_csv('../output/results/LSTM_balanced_results.csv')



In [50]:
# Get Results of Unbalanced Model
LSTM_unbal = models.load_model('../output/models/LSTM_unbalanced')
LSTM_unbal.summary()
# Generate Predictions for each model 
y_pred = LSTM_unbal.predict(X)
data['negative_prob'] = pd.Series(y_pred[:,0])
data['positive_prob'] = pd.Series(y_pred[:,1])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 28, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


KeyboardInterrupt: 

In [48]:
# 0 == Negative, 1 == Positive Sentiment
data['outcome'] = np.where(data['negative_prob']> 0.500, 0 ,1)

data.to_csv('../output/results/LSTM_unbalanced_full_results.csv')

In [49]:
data.head

<bound method NDFrame.head of                         id                                            content  \
0               1698308935  be sure to tune in and watch donald trump on l...   
1               1701461182  donald trump will be appearing on the view tom...   
2               1737479987  donald trump reads top ten financial tips on l...   
3               1741160716  new blog post celebrity apprentice finale and ...   
4               1773561338  my persona will never be that of a wallflower ...   
...                    ...                                                ...   
43347  1273405198698975232  joe biden was a total failure in government he...   
43348  1273408026968457216  will be interviewed on  seanhannity tonight at...   
43349  1273442195161387008                            pictwittercom3lm1spbu8x   
43350  1273442469066276864                            pictwittercomvpce5maduz   
43351  1273442528411385858                            pictwittercomvllc0bhw41  