In [1]:
# to load and check model:
from keras import models
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
from datetime import timedelta

In [2]:
# Preprocess Tweets Before Feeding to Model
data = pd.read_csv("../data/input/realdonaldtrump_20170120-20191231.csv")
data = data[['id','content', 'date']]
data['content'] = data['content'].apply(lambda x: x.lower())
data['content'] = data['content'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [3]:
data.head()

Unnamed: 0,id,content,date
0,8.22421e+17,it all begins today i will see you at 1100 am ...,1/20/2017 6:31
1,8.22502e+17,today we are not merely transferring power fro...,1/20/2017 11:51
2,8.22502e+17,power from washington dc and giving it back to...,1/20/2017 11:51
3,8.22502e+17,what truly matters is not which party controls...,1/20/2017 11:52
4,8.22502e+17,january 20th 2017 will be remembered as the da...,1/20/2017 11:53


In [4]:
data.tail()

Unnamed: 0,id,content,date
9685,1.21218e+18,thank you to the dcexaminer washington examin...,12/31/2019 19:03
9686,1.21218e+18,thank you steve the greatest witch hunt in us ...,12/31/2019 19:16
9687,1.21218e+18,our fantastic first ladyhttpstwittercomflotuss...,12/31/2019 19:22
9688,1.21218e+18,happy new year,12/31/2019 19:30
9689,1.21221e+18,pictwittercomevaeyd1agv,12/31/2019 21:12


In [5]:
data["content"] = data["content"].str.lower()
china_related_list = ['chinese', 'huawei', 'xi jinping', 'beijing', 'trade', 'tariff','tax', 'xi']
train_china_related = data.set_index('content').filter(like='china', axis=0)
for word in china_related_list:
    train_china_related = pd.concat([train_china_related, data.set_index('content').filter(like=word, axis=0)]).drop_duplicates()
train_china_related.reset_index(inplace=True, drop=False)

In [6]:
train_china_related.head()

Unnamed: 0,content,id,date
0,the failing nytimes does major fake news chin...,8.30048e+17,2/10/2017 7:35
1,north korea is behaving very badly they have b...,8.42724e+17,3/17/2017 8:07
2,the meeting next week with china will be a ver...,8.47573e+17,3/30/2017 17:16
3,it was a great honor to have president xi jinp...,8.50723e+17,4/8/2017 9:50
4,i explained to the president of china that a t...,8.51767e+17,4/11/2017 6:59


In [7]:
# convert time zone to GMT-5 (New York Time) for SPX
train_china_related_spm = train_china_related
train_china_related_spm['date'] = pd.to_datetime(train_china_related_spm['date']) - timedelta(hours=5)
train_china_related_spm = train_china_related_spm[['id','content','date']]

In [8]:
train_china_related_spm['Time'],train_china_related_spm['Date']= train_china_related_spm['date'].apply(lambda x:x.time()), train_china_related_spm['date'].apply(lambda x:x.date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_china_related_spm['Time'],train_china_related_spm['Date']= train_china_related_spm['date'].apply(lambda x:x.time()), train_china_related_spm['date'].apply(lambda x:x.date())


In [9]:
train_spm = train_china_related_spm
train_spm = train_spm[train_spm['Date']> pd.to_datetime('1/1/2018')]

In [10]:
# convert time zone to GMT+8 (Shanghai Time) for SSE
train_china_related_sse = train_china_related
train_china_related_sse['date'] = pd.to_datetime(train_china_related['date']) + timedelta(hours=8)
train_china_related_sse = train_china_related_sse[['id','content','date']]

In [11]:
train_china_related_sse['Time'],train_china_related_sse['Date']= train_china_related_sse['date'].apply(lambda x:x.time()), train_china_related_sse['date'].apply(lambda x:x.date())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_china_related_sse['Time'],train_china_related_sse['Date']= train_china_related_sse['date'].apply(lambda x:x.time()), train_china_related_sse['date'].apply(lambda x:x.date())


In [12]:
train_sse = train_china_related_sse
train_sse = train_sse[train_sse['Date']> pd.to_datetime('1/1/2018')]

**Get the prediction from the Model** -- spm data from balanced model

In [34]:
data = train_spm

In [35]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['content'].values)
X = tokenizer.texts_to_sequences(data['content'].values)
X = pad_sequences(X)

In [36]:
# Load in all models to be used 
# Called LSTM_10 because 10 epoch
LSTM_bal = models.load_model('../output/models/LSTM_balanced_10')
LSTM_bal.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 29, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [37]:
# Generate Predictions for each model 
y_pred = LSTM_bal.predict(X)



In [38]:
data['negative_prob'] = y_pred[:,0]
data['positive_prob'] = y_pred[:,1]
data.loc[data['negative_prob'] <= data['positive_prob'], 'outcome'] = 1
data.loc[data['negative_prob'] > data['positive_prob'], 'outcome'] = -1
data = data.astype({"outcome": int})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_prob'] = y_pred[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positive_prob'] = y_pred[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [39]:
data.head()

Unnamed: 0,id,content,date,Time,Date,negative_prob,positive_prob,outcome
43,9.61672e+17,i will be meeting with henry kissinger at 145p...,2018-02-08 07:44:00,07:44:00,2018-02-08,0.030826,0.969174,1
44,9.65203e+17,i never said russia did not meddle in the elec...,2018-02-18 01:33:00,01:33:00,2018-02-18,0.961403,0.038597,-1
45,9.71403e+17,china has been asked to develop a plan for the...,2018-03-07 04:10:00,04:10:00,2018-03-07,0.000141,0.999859,1
46,9.72506e+17,chinese president xi jinping and i spoke at le...,2018-03-10 05:15:00,05:15:00,2018-03-10,0.991466,0.008534,-1
47,9.78939e+17,received message last night from xi jinping of...,2018-03-28 00:16:00,00:16:00,2018-03-28,0.000669,0.999331,1


In [40]:
data1 = data[['date','Date','Time', 'outcome']]
data1['hour'] = pd.to_datetime(data1['date'], format='%Y%m%d %H:%M').dt.hour
data1["label"] = np.where(((data1['hour'] <= 16) & (data1['hour'] >= 9)),
                          'A','B') # A for during market B for before the market
data1["label"] = np.where(((data1['hour'] > 16) & (data1["label"] == 'B')),
                          'B1',data1["label"]) # B1 for after the market
data1["real_Date"] = np.where(data1['label'] == 'B1'  ,
                              pd.to_datetime(data1.Date.apply(str)) + pd.DateOffset(days=1), 
                              pd.to_datetime(data1.Date.apply(str)))
data1["label"] = np.where(data1['label'] == 'B1','B', data1['label']) # change back to B

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['hour'] = pd.to_datetime(data1['date'], format='%Y%m%d %H:%M').dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1["label"] = np.where(((data1['hour'] <= 16) & (data1['hour'] >= 9)),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1["label"] = np.where(((data1['hour'] > 16) & (dat

In [41]:
sum_g = data1.groupby(['real_Date','label']).agg({'outcome': 'sum'})

conditions  = [ sum_g['outcome'] > 0, sum_g['outcome'] == 0, sum_g['outcome'] < 0 ]
choices     = [ 1, 0, -1 ]
    
sum_g["real_outcome"] = np.select(conditions, choices, default=np.nan)
sum_g = sum_g.astype({"real_outcome": int})
sum_g.reset_index(inplace=True, drop=False)
sum_g["A"] = np.where(sum_g['label'] == 'A', sum_g["real_outcome"], None)
sum_g["B"] = np.where(sum_g['label'] == 'B', sum_g["real_outcome"], None)
sum_g=sum_g.drop(['outcome','real_outcome','label'], axis=1)

sum_g.head()

Unnamed: 0,real_Date,A,B
0,2018-01-02,,-1.0
1,2018-01-03,0.0,
2,2018-01-04,1.0,
3,2018-01-07,,-1.0
4,2018-01-08,1.0,


In [42]:
sum_g.to_csv('../output/results/LSTM_balanced_spm_results.csv')

**Get the prediction from the Model** -- spm data from unbalanced model

In [43]:
# Get Results of Unbalanced Model
LSTM_unbal = models.load_model('../output/models/LSTM_unbalanced')
LSTM_unbal.summary()
# Generate Predictions for each model 
y_pred = LSTM_unbal.predict(X)
data['negative_prob'] = y_pred[:,0]
data['positive_prob'] = y_pred[:,1]

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [44]:
# 1 == Negative, 1 == Positive Sentiment
data['outcome'] = np.where(data['negative_prob']> 0.500, -1 ,1)

In [45]:
data.head()

Unnamed: 0,id,content,date,Time,Date,negative_prob,positive_prob,outcome
43,9.61672e+17,i will be meeting with henry kissinger at 145p...,2018-02-08 07:44:00,07:44:00,2018-02-08,0.999612,0.000388,-1
44,9.65203e+17,i never said russia did not meddle in the elec...,2018-02-18 01:33:00,01:33:00,2018-02-18,0.982615,0.017385,-1
45,9.71403e+17,china has been asked to develop a plan for the...,2018-03-07 04:10:00,04:10:00,2018-03-07,0.964826,0.035174,-1
46,9.72506e+17,chinese president xi jinping and i spoke at le...,2018-03-10 05:15:00,05:15:00,2018-03-10,0.998237,0.001763,-1
47,9.78939e+17,received message last night from xi jinping of...,2018-03-28 00:16:00,00:16:00,2018-03-28,0.999651,0.000349,-1


In [46]:
data1 = data[['date','Date','Time', 'outcome']]
data1['hour'] = pd.to_datetime(data1['date'], format='%Y%m%d %H:%M').dt.hour
data1["label"] = np.where(((data1['hour'] <= 16) & (data1['hour'] >= 9)),
                          'A','B') # A for during market B for before the market
data1["label"] = np.where(((data1['hour'] > 16) & (data1["label"] == 'B')),
                          'B1',data1["label"]) # B1 for after the market
data1["real_Date"] = np.where(data1['label'] == 'B1'  ,
                              pd.to_datetime(data1.Date.apply(str)) + pd.DateOffset(days=1), 
                              pd.to_datetime(data1.Date.apply(str)))
data1["label"] = np.where(data1['label'] == 'B1','B', data1['label']) # change back to B

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['hour'] = pd.to_datetime(data1['date'], format='%Y%m%d %H:%M').dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1["label"] = np.where(((data1['hour'] <= 16) & (data1['hour'] >= 9)),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1["label"] = np.where(((data1['hour'] > 16) & (dat

In [47]:
sum_g = data1.groupby(['real_Date','label']).agg({'outcome': 'sum'})

conditions  = [ sum_g['outcome'] > 0, sum_g['outcome'] == 0, sum_g['outcome'] < 0 ]
choices     = [ 1, 0, -1 ]
    
sum_g["real_outcome"] = np.select(conditions, choices, default=np.nan)
sum_g = sum_g.astype({"real_outcome": int})
sum_g.reset_index(inplace=True, drop=False)
sum_g["A"] = np.where(sum_g['label'] == 'A', sum_g["real_outcome"], None)
sum_g["B"] = np.where(sum_g['label'] == 'B', sum_g["real_outcome"], None)
sum_g=sum_g.drop(['outcome','real_outcome','label'], axis=1)

sum_g.head()

Unnamed: 0,real_Date,A,B
0,2018-01-02,,-1.0
1,2018-01-03,-1.0,
2,2018-01-04,-1.0,
3,2018-01-07,,1.0
4,2018-01-08,-1.0,


In [48]:
sum_g.to_csv('../output/results/LSTM_unbalanced_spm_results.csv')

**Get the prediction from the Model** -- sse data from balanced model

In [49]:
data = train_sse
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['content'].values)
X = tokenizer.texts_to_sequences(data['content'].values)
X = pad_sequences(X)
# Load in all models to be used 
# Called LSTM_10 because 10 epoch
LSTM_bal = models.load_model('../output/models/LSTM_balanced_10')
LSTM_bal.summary()
# Generate Predictions for each model 
y_pred = LSTM_bal.predict(X)
data['negative_prob'] = y_pred[:,0]
data['positive_prob'] = y_pred[:,1]
data.loc[data['negative_prob'] <= data['positive_prob'], 'outcome'] = 1
data.loc[data['negative_prob'] > data['positive_prob'], 'outcome'] = -1
data = data.astype({"outcome": int})

data1 = data[['date','Date','Time', 'outcome']]
data1['hour'] = pd.to_datetime(data1['date'], format='%Y%m%d %H:%M').dt.hour
data1["label"] = np.where(((data1['hour'] <= 16) & (data1['hour'] >= 9)),
                          'A','B') # A for during market B for before the market
data1["label"] = np.where(((data1['hour'] > 16) & (data1["label"] == 'B')),
                          'B1',data1["label"]) # B1 for after the market
data1["real_Date"] = np.where(data1['label'] == 'B1'  ,
                              pd.to_datetime(data1.Date.apply(str)) + pd.DateOffset(days=1), 
                              pd.to_datetime(data1.Date.apply(str)))
data1["label"] = np.where(data1['label'] == 'B1','B', data1['label']) # change back to B


sum_g = data1.groupby(['real_Date','label']).agg({'outcome': 'sum'})

conditions  = [ sum_g['outcome'] > 0, sum_g['outcome'] == 0, sum_g['outcome'] < 0 ]
choices     = [ 1, 0, -1 ]
    
sum_g["real_outcome"] = np.select(conditions, choices, default=np.nan)
sum_g = sum_g.astype({"real_outcome": int})
sum_g.reset_index(inplace=True, drop=False)
sum_g["A"] = np.where(sum_g['label'] == 'A', sum_g["real_outcome"], None)
sum_g["B"] = np.where(sum_g['label'] == 'B', sum_g["real_outcome"], None)
sum_g=sum_g.drop(['outcome','real_outcome','label'], axis=1)

sum_g.to_csv('../output/results/LSTM_balanced_sse_results.csv')


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 29, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_prob'] = y_pred[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positive_prob'] = y_pred[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexe

In [50]:
data = train_sse
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['content'].values)
X = tokenizer.texts_to_sequences(data['content'].values)
X = pad_sequences(X)
# Load in all models to be used 
# Get Results of Unbalanced Model
LSTM_unbal = models.load_model('../output/models/LSTM_unbalanced')
LSTM_unbal.summary()
# Generate Predictions for each model 
y_pred = LSTM_unbal.predict(X)
data['negative_prob'] = y_pred[:,0]
data['positive_prob'] = y_pred[:,1]

# 1 == Negative, 1 == Positive Sentiment
data['outcome'] = np.where(data['negative_prob']> 0.500, -1 ,1)
data = data.astype({"outcome": int})

data1 = data[['date','Date','Time', 'outcome']]
data1['hour'] = pd.to_datetime(data1['date'], format='%Y%m%d %H:%M').dt.hour
data1["label"] = np.where(((data1['hour'] <= 16) & (data1['hour'] >= 9)),
                          'A','B') # A for during market B for before the market
data1["label"] = np.where(((data1['hour'] > 16) & (data1["label"] == 'B')),
                          'B1',data1["label"]) # B1 for after the market
data1["real_Date"] = np.where(data1['label'] == 'B1'  ,
                              pd.to_datetime(data1.Date.apply(str)) + pd.DateOffset(days=1), 
                              pd.to_datetime(data1.Date.apply(str)))
data1["label"] = np.where(data1['label'] == 'B1','B', data1['label']) # change back to B


sum_g = data1.groupby(['real_Date','label']).agg({'outcome': 'sum'})

conditions  = [ sum_g['outcome'] > 0, sum_g['outcome'] == 0, sum_g['outcome'] < 0 ]
choices     = [ 1, 0, -1 ]
    
sum_g["real_outcome"] = np.select(conditions, choices, default=np.nan)
sum_g = sum_g.astype({"real_outcome": int})
sum_g.reset_index(inplace=True, drop=False)
sum_g["A"] = np.where(sum_g['label'] == 'A', sum_g["real_outcome"], None)
sum_g["B"] = np.where(sum_g['label'] == 'B', sum_g["real_outcome"], None)
sum_g=sum_g.drop(['outcome','real_outcome','label'], axis=1)

sum_g.to_csv('../output/results/LSTM_unbalanced_sse_results.csv')


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_prob'] = y_pred[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positive_prob'] = y_pred[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outcome'] = np.where(data['negative_prob']> 0.500, -1 ,1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr