<a href="https://colab.research.google.com/github/undefinedzack/stock-market-prediction-using-sentiment-analysis/blob/master/Final_Merged_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data Manipulation

import numpy as np
import pandas as pd
import re

# Preprocessing the input data

import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Creating ngrams and vectorizing the data

from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser

# Tools for building a model

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Fetch the NLTK resources used by the preprocessing functions in this notebook:
# the stop-word list (stopwords.words), the 'punkt' data used by word_tokenize,
# and the WordNet corpus used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive so the CSV files and the saved model are reachable.
# force_remount=True asks for a fresh mount even if one is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
%cd drive/MyDrive/Colab_Data/
%ls -l

/content/drive/.shortcut-targets-by-id/1mnM1b6TaU1SW1anY35VRUPG3xIImlWjS/Colab_Data
total 465898
-rw------- 1 root root   2586020 Feb 20 13:52  200features_10minwords
-rw------- 1 root root 407958406 Feb 19 10:33  causeSheDidItThisWay.csv
-rw------- 1 root root    965379 Mar 31 16:44  df_final.csv
-rw------- 1 root root  34251814 Mar 31 11:47 'df_stocktwits_prepared_final (1).csv'
-rw------- 1 root root  15257600 Mar 31 11:34  df_stocktwits_prepared_final.csv
-rw------- 1 root root   5617630 Mar 31 05:47  News.csv
drwx------ 2 root root      4096 Mar 19 13:58  [0m[01;34msaved_model[0m/
-rw------- 1 root root    479968 Feb 20 13:09  stock_data.csv
-rw------- 1 root root   7076794 Feb 16 13:29  stockerbot-export1.csv
-rw------- 1 root root    167083 Mar 31 05:46  stocks.csv
-rw------- 1 root root   1752624 Feb 16 13:31  tweet_sentiment.csv
-rw------- 1 root root    959890 Feb 20 13:28  tweets_labelled.csv


In [None]:
# Interactive upload from the local machine into the current directory
# (the recorded run uploaded df_final.csv).
# NOTE(review): this blocks "Run all" until a file is chosen, and the directory listing
# above already contains a df_final.csv — confirm whether the upload is still needed.
from google.colab import files 
uploaded = files.upload()

Saving df_final.csv to df_final.csv


In [None]:
# df1: prepared StockTwits messages (the text that gets scored for sentiment).
# df2: price/indicator table with a 'Time' column and Open/Close values.
df1 = pd.read_csv('df_stocktwits_prepared_final.csv')
df2 = pd.read_csv('df_final.csv')

In [None]:
# Keep only the first three columns of the StockTwits frame.
# NOTE(review): positional slicing depends on the CSV column order; the following cell
# expects 'created_date', 'time' and 'clean_text' to be among them — confirm.
df1=df1[df1.columns[0:3]]

In [None]:
# Drop the intraday timestamp. errors='ignore' keeps this cell re-runnable: on a second
# execution 'time' is already gone and a plain drop would raise KeyError.
df1 = df1.drop(columns=['time'], errors='ignore')

# Replace each row's text with the space-joined text of every message sharing its date.
df1['clean_text'] = df1.groupby(['created_date'])['clean_text'].transform(lambda x : ' '.join(x))

# Rows of the same date now carry the same joined text, so de-duplicating
# collapses them to one row per date (the first row's index label is kept).
df1 = df1.drop_duplicates()

In [None]:
# Inspect the per-day aggregated messages.
df1

Unnamed: 0,created_date,clean_text
0,01-01-2019,thinking about buying a lot and hold it for lo...
10,02-01-2019,trying for the 20 MA Will we see any dips ? Bo...
69,03-01-2019,$$$ _ Raytheon UK to secure hundreds of aerosp...
155,04-01-2019,_url Boeing remains top pick for 2019 by Baird...
215,05-01-2019,The Top Stocks On The Dow Jones Also _url The ...
...,...,...
35557,02-06-2019,I am seeing on that list what is more unreliab...
35633,03-06-2019,over the wekend states planes have bad parts D...
36001,04-06-2019,same here Still waiting for $ba to buy below 3...
36176,05-06-2019,$BA: Press Release published article on June 0...


In [None]:
###### CLEANING EACH STRING
def clean(tweet :str) -> str:
  """Reduce a raw tweet to lowercase letters separated by spaces.

  The tweet is first passed through BeautifulSoup (lxml parser) to obtain its
  text content. Every @mention, http(s) link and any remaining non-letter
  character is then replaced by a single space, and the result is lowercased.

  Args:
    tweet: raw tweet text, possibly containing HTML markup.

  Returns:
    The cleaned, lowercased string. Runs of removed characters leave runs of
    spaces; they are not collapsed here.
  """
  # One alternation, tried left to right: mentions, links, then any single
  # character that is not an ASCII letter.
  noise_pattern = r'|'.join((
      r'@[A-Za-z0-9]+',
      r'https?://[A-Za-z0-9./]+',
      r'[^a-zA-Z]',
  ))

  # Markup is stripped before the regex pass so tag names are not kept as words.
  plain_text = BeautifulSoup(tweet, "lxml").get_text()

  return re.sub(noise_pattern, " ", plain_text).lower()


##### LEMMATIZATION
def lemmatize(tokens :list) -> list:
  """Lemmatize tokens, drop English stop words, then stem what is left.

  Args:
    tokens: list of word strings.

  Returns:
    A new list holding the Porter stem of every WordNet-lemmatized token that
    is not in NLTK's English stop-word list. Stop words are checked against the
    lemmatized form, not the raw token.
  """
  english_stop_words = set(stopwords.words("english"))
  wordnet = WordNetLemmatizer()
  porter = PorterStemmer()

  lemmas = [wordnet.lemmatize(token) for token in tokens]
  kept = [lemma for lemma in lemmas if lemma not in english_stop_words]
  return [porter.stem(lemma) for lemma in kept]


###### ALL TOGETHER
def preprocess(tweet :str) -> list:
  """Run one tweet through the full text pipeline.

  Steps, in order: `clean` (strip markup/mentions/links/non-letters and
  lowercase), `word_tokenize`, then `lemmatize` (lemmatize, remove stop words,
  stem).

  Args:
    tweet: raw tweet text.

  Returns:
    List of processed tokens for the tweet.
  """
  return lemmatize(word_tokenize(clean(tweet)))


###### CLEANING WHOLE DATA BY PROCESSING EACH TWEET ONE BY ONE
def get_clean_data(tweets):
  """Preprocess every tweet and return the token lists as a 1-D object array.

  Args:
    tweets: iterable of raw tweet strings.

  Returns:
    np.ndarray of dtype object with one element per tweet; each element is the
    list of tokens produced by `preprocess`.
  """
  token_lists = [preprocess(tweet) for tweet in tweets]

  # Token lists normally differ in length. np.array(token_lists) would try to
  # build a rectangular array from them, which recent NumPy rejects with a
  # ValueError (and which would yield a 2-D string array if all lengths happened
  # to match). Filling a pre-allocated object array always stores one list per slot.
  cleaned = np.empty(len(token_lists), dtype=object)
  for position, token_list in enumerate(token_lists):
    cleaned[position] = token_list
  return cleaned

  
###### BUILDING TRIGRAMS MODEL
def build_trigrams_model(cleaned_data):
  """Train a Word2Vec model on the n-gram-merged version of the corpus.

  A `Phrases` model is fitted on the token lists, a second `Phrases` model is
  fitted on the output of the first, and Word2Vec is trained on the corpus
  transformed by both.

  Args:
    cleaned_data: iterable of token lists (one per document).

  Returns:
    The trained gensim Word2Vec model (256-dimensional vectors, min_count=3,
    window=5, 4 workers).
  """
  # Stacking two phrase detectors lets already-merged pairs be merged again.
  bigram_phrases = Phrases(sentences=cleaned_data)
  trigram_phrases = Phrases(sentences=bigram_phrases[cleaned_data])
  phrased_sentences = trigram_phrases[bigram_phrases[cleaned_data]]

  word2vec_options = dict(size=256, min_count=3, window=5, workers=4)
  return Word2Vec(sentences=phrased_sentences, **word2vec_options)


###### VECTORIZING DATA
def vectorize_data(data, vocab: dict) -> list:
    """Encode each tokenized sentence as a list of vocabulary positions.

    A word's code is its position in `vocab`'s key iteration order. Words that
    are missing from `vocab`, or whose value in `vocab` is None, are skipped.

    Args:
        data: iterable of sentences, each an iterable of words.
        vocab: mapping whose keys are the known words.

    Returns:
        A list with one list of integer positions per sentence, in input order.
    """
    print('Vectorize sentences...')
    # Precompute word -> position once. Looking each token up with list.index
    # scans the whole vocabulary per token; the dict gives the same positions
    # in constant time.
    position_of = {word: position for position, word in enumerate(vocab)}
    vectorized = [
        [position_of[word] for word in tweet if vocab.get(word, None) is not None]
        for tweet in data
    ]
    print('Vectorize sentences... (done)')
    return vectorized


###### FINAL DATA WITH PADDING
def vectorised_padded_data(cleaned_data):
  """Turn token lists into a fixed-width integer matrix for the model.

  The corpus is merged into n-grams with two stacked `Phrases` models, each
  word is encoded with `vectorize_data` using the vocabulary of a Word2Vec
  model trained by `build_trigrams_model`, and every sequence is post-padded
  or truncated to 150 entries.

  Args:
    cleaned_data: iterable of token lists (one per document).

  Returns:
    The padded 2-D array produced by `pad_sequences` (150 columns).
  """
  bigram_phrases = Phrases(sentences=cleaned_data)
  trigram_phrases = Phrases(sentences=bigram_phrases[cleaned_data])
  phrased_sentences = trigram_phrases[bigram_phrases[cleaned_data]]
  print('Convert sentences to sentences with ngrams... (done)')

  # NOTE(review): build_trigrams_model fits its own Phrases models on the same
  # data, so the phrase detection above is effectively done twice.
  # NOTE(review): the vocabulary comes from a Word2Vec model trained here on the
  # data being scored; presumably a previously trained classifier expects the
  # word indices of its own training vocabulary — confirm these line up.
  vocabulary = build_trigrams_model(cleaned_data).wv.vocab
  encoded = vectorize_data(phrased_sentences, vocab=vocabulary)

  return pad_sequences(sequences=encoded, maxlen=150, padding='post')


###### COMBINING CLEANING, VECTORIZATION AND PADDING FUNCTIONS
def suitable_data(tweets):
  """Convert raw tweets straight into the padded integer matrix.

  Chains `get_clean_data` (per-tweet preprocessing) and
  `vectorised_padded_data` (n-grams, index encoding, padding).

  Args:
    tweets: iterable of raw tweet strings.

  Returns:
    The padded array returned by `vectorised_padded_data`.
  """
  return vectorised_padded_data(get_clean_data(tweets))
  



In [None]:
# Clean, tokenize, n-gram, index-encode and pad the per-day texts into the model input matrix.
X_pad = suitable_data(df1['clean_text'])



Convert sentences to sentences with ngrams... (done)
Vectorize sentences...
Vectorize sentences... (done)


In [None]:
# Load the previously trained Keras model from the 'saved_model' directory
# (path is relative to the current working directory).
my_model = tf.keras.models.load_model('saved_model')

In [None]:
# Score every padded sequence (one per day) with the loaded model.
outputs=my_model.predict(x=X_pad)

In [None]:

# Attach the model scores as a 'sentiments' column, aligned by row position.
# NOTE(review): presumably `outputs` holds one value per row (shape (n,) or (n, 1)) —
# confirm; flatten explicitly if pandas rejects the array.
df1=df1.assign(sentiments = outputs)
df1

Unnamed: 0,created_date,clean_text,sentiments
0,01-01-2019,thinking about buying a lot and hold it for lo...,0.994443
10,02-01-2019,trying for the 20 MA Will we see any dips ? Bo...,0.001840
69,03-01-2019,$$$ _ Raytheon UK to secure hundreds of aerosp...,0.999811
155,04-01-2019,_url Boeing remains top pick for 2019 by Baird...,0.002085
215,05-01-2019,The Top Stocks On The Dow Jones Also _url The ...,0.999998
...,...,...,...
35557,02-06-2019,I am seeing on that list what is more unreliab...,1.000000
35633,03-06-2019,over the wekend states planes have bad parts D...,0.999999
36001,04-06-2019,same here Still waiting for $ba to buy below 3...,0.999999
36176,05-06-2019,$BA: Press Release published article on June 0...,0.999961


In [None]:
# Preview the first rows of the price/indicator table.
df2.head(10)

Unnamed: 0,Time,Open,High,Low,Close,Adj Close,Volume,bullish,volume_before_1,volume_diff_last_hour,volume_before_2,volume_diff_last_2_hour,volume_before_3,volume_diff_last_3_hour,volume_before_4,volume_diff_last_4_hour,volume_before_5,volume_diff_last_5_hour,close_before_1,close_diff_last_hour,close_before_2,close_diff_last_2_hour,close_before_3,close_diff_last_3_hour,close_before_4,close_diff_last_4_hour,close_before_5,close_diff_last_5_hour,close_before_6,close_diff_last_6_hour,close_before_7,close_diff_last_7_hour,close_before_8,close_diff_last_8_hour,close_before_9,close_diff_last_9_hour,SMA_15,SMA_30,SMA_indicator,SMA_indicator_before_1,SMA_indicator_before_2,SMA_indicator_before_3,SMA_indicator_before_4,Upper_Bollinger,Lower_Bollinger,Middle_Bollinger,close_diff_Upper_Bollinger,close_diff_Lower_Bollinger,Bollinger_indicator,Bollinger_indicator_before_1,Bollinger_indicator_before_2,Bollinger_indicator_before_3,Bollinger_indicator_before_4,true_range,holiday_day_diff_before,date,weekday,time,mean_sentiment_score,median_sentiment_score,std_sentiment_score,max_sentiment_score,min_sentiment_score
0,02-01-2019 09:30,315.5,319.709992,313.899994,318.440002,318.440002,724286,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,318.440002,318.440002,0,,,,,,,318.440002,,,,,,,,,34,2,2,1,0.132733,0.174651,0.190971,0.471599,-0.323868
1,02-01-2019 10:30,318.439301,322.307007,317.350006,320.520294,320.520294,394681,0.0,724286.0,,,,,,,,,,318.440002,,,,,,,,,,,,,,,,,,319.480148,319.480148,0,0.0,,,,322.422125,316.538171,319.480148,-1.901831,3.982123,2.0,,,,,4.957001,34,2,2,2,0.227726,0.227726,0.041528,0.257091,0.198361
2,02-01-2019 11:30,320.567505,321.658691,319.093506,319.54779,319.54779,250973,0.0,394681.0,-329605.0,724286.0,,,,,,,,320.520294,2.080292,318.440002,,,,,,,,,,,,,,,,319.502696,319.502696,0,0.0,0.0,,,321.584453,317.420938,319.502696,-2.036663,2.126852,2.0,2.0,,,,2.565186,34,2,2,3,,,,,
3,02-01-2019 12:30,319.798492,323.690002,319.25,323.34079,323.34079,274696,0.0,250973.0,-143708.0,394681.0,-329605.0,724286.0,,,,,,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,,,,,,,,,,,320.462219,320.462219,0,0.0,0.0,0.0,,324.659851,316.264588,320.462219,-1.319061,7.076202,2.0,2.0,2.0,,,4.440002,34,2,2,4,,,,,
4,02-01-2019 13:30,323.443207,323.790008,320.679993,321.76001,321.76001,265741,0.0,274696.0,23723.0,250973.0,-143708.0,394681.0,-329605.0,724286.0,,,,323.34079,3.792999,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,,,,,,,,,320.721777,320.721777,0,0.0,0.0,0.0,0.0,324.53786,316.905694,320.721777,-2.777851,4.854315,2.0,2.0,2.0,2.0,,3.110016,34,2,2,5,-0.176063,-0.176063,,-0.176063,-0.176063
5,02-01-2019 14:30,322.0,322.7323,319.529999,320.820007,320.820007,365267,0.0,265741.0,-8955.0,274696.0,23723.0,250973.0,-143708.0,394681.0,-329605.0,724286.0,,321.76001,-1.58078,323.34079,3.792999,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,,,,,,,320.738149,320.738149,0,0.0,0.0,0.0,0.0,324.1523,317.323998,320.738149,-3.332292,3.496009,2.0,2.0,2.0,2.0,2.0,3.202301,34,2,2,6,0.008445,0.008445,0.173952,0.131448,-0.114558
6,02-01-2019 15:30,320.670013,323.850006,320.244385,323.839996,323.839996,360878,0.0,365267.0,99526.0,265741.0,-8955.0,274696.0,23723.0,250973.0,-143708.0,394681.0,-329605.0,320.820007,-0.940002,321.76001,-1.58078,323.34079,3.792999,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,,,,,321.18127,321.18127,0,0.0,0.0,0.0,0.0,325.081483,317.281057,321.18127,-1.241487,6.558939,2.0,2.0,2.0,2.0,2.0,3.605621,34,2,2,7,0.167821,0.13443,0.23036,0.467212,-0.184281
7,03-01-2019 09:30,318.950012,319.5784,312.339996,312.899994,312.899994,1238390,1.0,360878.0,-4389.0,365267.0,99526.0,265741.0,-8955.0,274696.0,23723.0,250973.0,-143708.0,323.839996,3.019989,320.820007,-0.940002,321.76001,-1.58078,323.34079,3.792999,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,,,320.146111,320.146111,0,0.0,0.0,0.0,0.0,327.02567,313.266551,320.146111,-14.125676,-0.366557,4.0,2.0,2.0,2.0,2.0,11.5,33,3,3,1,0.122598,0.108125,0.137926,0.377099,-0.220602
8,03-01-2019 10:30,312.76001,313.744385,309.399994,311.519989,311.519989,841506,1.0,1238390.0,877512.0,360878.0,-4389.0,365267.0,99526.0,265741.0,-8955.0,274696.0,23723.0,312.899994,-10.940002,323.839996,3.019989,320.820007,-0.940002,321.76001,-1.58078,323.34079,3.792999,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,319.187653,319.187653,0,0.0,0.0,0.0,0.0,327.818029,310.557276,319.187653,-16.29804,0.962713,3.0,4.0,2.0,2.0,2.0,4.344391,33,3,3,2,,,,,
9,03-01-2019 11:30,312.024994,316.649994,311.380005,314.100006,314.100006,734290,1.0,841506.0,-396884.0,1238390.0,877512.0,360878.0,-4389.0,365267.0,99526.0,265741.0,-8955.0,311.519989,-1.380005,312.899994,-10.940002,323.839996,3.019989,320.820007,-0.940002,321.76001,-1.58078,323.34079,3.792999,319.54779,-0.972504,320.520294,2.080292,318.440002,,318.678888,318.678888,0,0.0,0.0,0.0,0.0,327.428809,309.928967,318.678888,-13.328803,4.171039,3.0,3.0,4.0,2.0,2.0,5.269989,33,3,3,3,,,,,


In [None]:
# Derive the date part of 'Time' (the text before the first space) as 'created_date',
# the key later used to join prices with the per-day sentiment.
# The vectorized .str accessor replaces a per-row Python lambda and passes missing
# values through as NaN instead of raising AttributeError.
df2['created_date'] = df2['Time'].str.split(' ', n=1).str[0]
df2.head()


Unnamed: 0,Time,Open,High,Low,Close,Adj Close,Volume,bullish,volume_before_1,volume_diff_last_hour,volume_before_2,volume_diff_last_2_hour,volume_before_3,volume_diff_last_3_hour,volume_before_4,volume_diff_last_4_hour,volume_before_5,volume_diff_last_5_hour,close_before_1,close_diff_last_hour,close_before_2,close_diff_last_2_hour,close_before_3,close_diff_last_3_hour,close_before_4,close_diff_last_4_hour,close_before_5,close_diff_last_5_hour,close_before_6,close_diff_last_6_hour,close_before_7,close_diff_last_7_hour,close_before_8,close_diff_last_8_hour,close_before_9,close_diff_last_9_hour,SMA_15,SMA_30,SMA_indicator,SMA_indicator_before_1,SMA_indicator_before_2,SMA_indicator_before_3,SMA_indicator_before_4,Upper_Bollinger,Lower_Bollinger,Middle_Bollinger,close_diff_Upper_Bollinger,close_diff_Lower_Bollinger,Bollinger_indicator,Bollinger_indicator_before_1,Bollinger_indicator_before_2,Bollinger_indicator_before_3,Bollinger_indicator_before_4,true_range,holiday_day_diff_before,date,weekday,time,mean_sentiment_score,median_sentiment_score,std_sentiment_score,max_sentiment_score,min_sentiment_score,created_date
0,02-01-2019 09:30,315.5,319.709992,313.899994,318.440002,318.440002,724286,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,318.440002,318.440002,0,,,,,,,318.440002,,,,,,,,,34,2,2,1,0.132733,0.174651,0.190971,0.471599,-0.323868,02-01-2019
1,02-01-2019 10:30,318.439301,322.307007,317.350006,320.520294,320.520294,394681,0.0,724286.0,,,,,,,,,,318.440002,,,,,,,,,,,,,,,,,,319.480148,319.480148,0,0.0,,,,322.422125,316.538171,319.480148,-1.901831,3.982123,2.0,,,,,4.957001,34,2,2,2,0.227726,0.227726,0.041528,0.257091,0.198361,02-01-2019
2,02-01-2019 11:30,320.567505,321.658691,319.093506,319.54779,319.54779,250973,0.0,394681.0,-329605.0,724286.0,,,,,,,,320.520294,2.080292,318.440002,,,,,,,,,,,,,,,,319.502696,319.502696,0,0.0,0.0,,,321.584453,317.420938,319.502696,-2.036663,2.126852,2.0,2.0,,,,2.565186,34,2,2,3,,,,,,02-01-2019
3,02-01-2019 12:30,319.798492,323.690002,319.25,323.34079,323.34079,274696,0.0,250973.0,-143708.0,394681.0,-329605.0,724286.0,,,,,,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,,,,,,,,,,,320.462219,320.462219,0,0.0,0.0,0.0,,324.659851,316.264588,320.462219,-1.319061,7.076202,2.0,2.0,2.0,,,4.440002,34,2,2,4,,,,,,02-01-2019
4,02-01-2019 13:30,323.443207,323.790008,320.679993,321.76001,321.76001,265741,0.0,274696.0,23723.0,250973.0,-143708.0,394681.0,-329605.0,724286.0,,,,323.34079,3.792999,319.54779,-0.972504,320.520294,2.080292,318.440002,,,,,,,,,,,,320.721777,320.721777,0,0.0,0.0,0.0,0.0,324.53786,316.905694,320.721777,-2.777851,4.854315,2.0,2.0,2.0,2.0,,3.110016,34,2,2,5,-0.176063,-0.176063,,-0.176063,-0.176063,02-01-2019


In [None]:
# Separate two-column views (date + Open, date + Close) so each can be
# aggregated per date independently.
df2_open=df2[['created_date', 'Open']]
df2_close=df2[['created_date', 'Close']]
df2_open

Unnamed: 0,created_date,Open
0,02-01-2019,315.500000
1,02-01-2019,318.439301
2,02-01-2019,320.567505
3,02-01-2019,319.798492
4,02-01-2019,323.443207
...,...,...
1745,30-12-2019,327.744995
1746,30-12-2019,327.049988
1747,30-12-2019,327.269989
1748,31-12-2019,325.410004


In [None]:
# Inspect the Close values alongside their dates (still one row per 'Time' entry).
df2_close

Unnamed: 0,created_date,Close
0,02-01-2019,318.440002
1,02-01-2019,320.520294
2,02-01-2019,319.547790
3,02-01-2019,323.340790
4,02-01-2019,321.760010
...,...,...
1745,30-12-2019,327.059998
1746,30-12-2019,327.289612
1747,30-12-2019,326.540008
1748,31-12-2019,324.709992


In [None]:
#Taking 1st entry of open value and last entry of close value for the date
df2_open = df2_open.groupby(['created_date']).first()
df2_close = df2_close.groupby(['created_date']).last()
df2_open

Unnamed: 0_level_0,Open
created_date,Unnamed: 1_level_1
01-02-2019,386.829987
01-03-2019,445.709992
01-04-2019,386.359985
01-05-2019,378.529999
01-07-2019,364.880005
...,...
31-01-2019,386.500000
31-05-2019,345.850006
31-07-2019,348.489990
31-10-2019,344.700012


In [None]:
# Inspect the per-date closing values.
df2_close

Unnamed: 0_level_0,Close
created_date,Unnamed: 1_level_1
01-02-2019,387.399994
01-03-2019,440.910004
01-04-2019,391.540008
01-05-2019,376.700012
01-07-2019,356.029999
...,...
31-01-2019,385.420013
31-05-2019,341.630005
31-07-2019,341.200012
31-10-2019,339.940002


In [None]:
# Combine daily Open and Close into one frame. Both inputs carry 'created_date'
# as their index, which merge matches on by name.
final=pd.merge(df2_open,df2_close,on='created_date')

In [None]:
# Inspect the daily Open/Close table.
final

Unnamed: 0_level_0,Open,Close
created_date,Unnamed: 1_level_1,Unnamed: 2_level_1
01-02-2019,386.829987,387.399994
01-03-2019,445.709992,440.910004
01-04-2019,386.359985,391.540008
01-05-2019,378.529999,376.700012
01-07-2019,364.880005,356.029999
...,...,...
31-01-2019,386.500000,385.420013
31-05-2019,345.850006,341.630005
31-07-2019,348.489990,341.200012
31-10-2019,344.700012,339.940002


In [None]:
#Merging Sentiment and open close
full_final=pd.merge(df1,final,on='created_date',how='inner')
full_final

Unnamed: 0,created_date,clean_text,sentiments,Open,Close
0,02-01-2019,trying for the 20 MA Will we see any dips ? Bo...,0.001840,315.500000,323.839996
1,03-01-2019,$$$ _ Raytheon UK to secure hundreds of aerosp...,0.999811,318.950012,310.809998
2,04-01-2019,_url Boeing remains top pick for 2019 by Baird...,0.002085,316.404999,326.880005
3,07-01-2019,Dow futures are up this thing's going to rip !...,0.999939,329.790008,328.059998
4,08-01-2019,trade ideas: _url In the last six months 42 se...,0.999989,334.250000,340.484985
...,...,...,...,...,...
103,31-05-2019,Many would agree with you Oscar United CEO say...,1.000000,345.850006,341.630005
104,03-06-2019,over the wekend states planes have bad parts D...,0.999999,338.200012,338.549988
105,04-06-2019,same here Still waiting for $ba to buy below 3...,0.999999,342.570007,344.619995
106,05-06-2019,$BA: Press Release published article on June 0...,0.999961,346.220001,348.750000


In [None]:
# Daily direction label from np.sign(Close - Open):
# 1.0 when the close is above the open, -1.0 when below, 0.0 when equal.
full_final['status'] = np.sign(full_final['Close'] - full_final['Open'])
full_final

Unnamed: 0,created_date,clean_text,sentiments,Open,Close,status
0,02-01-2019,trying for the 20 MA Will we see any dips ? Bo...,0.001840,315.500000,323.839996,1.0
1,03-01-2019,$$$ _ Raytheon UK to secure hundreds of aerosp...,0.999811,318.950012,310.809998,-1.0
2,04-01-2019,_url Boeing remains top pick for 2019 by Baird...,0.002085,316.404999,326.880005,1.0
3,07-01-2019,Dow futures are up this thing's going to rip !...,0.999939,329.790008,328.059998,-1.0
4,08-01-2019,trade ideas: _url In the last six months 42 se...,0.999989,334.250000,340.484985,1.0
...,...,...,...,...,...,...
103,31-05-2019,Many would agree with you Oscar United CEO say...,1.000000,345.850006,341.630005,-1.0
104,03-06-2019,over the wekend states planes have bad parts D...,0.999999,338.200012,338.549988,1.0
105,04-06-2019,same here Still waiting for $ba to buy below 3...,0.999999,342.570007,344.619995,1.0
106,05-06-2019,$BA: Press Release published article on June 0...,0.999961,346.220001,348.750000,1.0
