<a href="https://colab.research.google.com/github/undefinedzack/stock-market-prediction-using-sentiment-analysis/blob/master/resurrection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Data Manipulation

import numpy as np
import pandas as pd
import re

# Preprocessing the input data

import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Creating ngrams and vectorizing the data

from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser

# Tools for building a model

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences

In [23]:
%cd drive/MyDrive/Colab_Data/

%ls -l

[Errno 2] No such file or directory: 'drive/MyDrive/Colab_Data/'
/content/drive/MyDrive/Colab_Data
total 410952
-rw------- 1 root root   2586020 Feb 20 13:52 200features_10minwords
-rw------- 1 root root 407958406 Feb 19 10:33 causeSheDidItThisWay.csv
-rw------- 1 root root    479968 Feb 20 13:09 stock_data.csv
-rw------- 1 root root   7076794 Feb 16 13:29 stockerbot-export1.csv
-rw------- 1 root root   1752624 Feb 16 13:31 tweet_sentiment.csv
-rw------- 1 root root    959890 Feb 20 13:28 tweets_labelled.csv


In [24]:
# creating dataframe

# df = pd.read_csv('stock_data.csv')
# df = pd.read_csv('stockerbot-export1.csv')

df = pd.read_csv('tweet_sentiment.csv')

In [27]:
df.head()


Unnamed: 0,cleaned_tweets,sentiment
0,video offic mind busi david solomon tell gs in...,0
1,price lumber lb f sinc hit ytd high maci turna...,0
2,say american dream dead,-1
3,barri silbert extrem optimist bitcoin predict ...,1
4,satellit avoid attack space junk circl earth paid,-1


In [26]:
sentiments = []
for i in df['Sentiment']:
  if i==-1:
    sentiments.append(0)
  else:
    sentiments.append(1)

KeyError: ignored

# Setup for Cleaning

In [28]:
non_letters = '[^A-Za-z\s]'

In [29]:
def clean(tweet :str) -> str:
  
  # removing HTML
  text = BeautifulSoup(tweet, "lxml").get_text()

  # remove non-letters
  letters_only = re.sub(non_letters, " ", text)

  # converting to lower-case
  lowercase_letters = letters_only.lower()

  return lowercase_letters

In [30]:
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words("english")) 

def lemmatize(tokens :list) -> list:
  
  # lemmatize
  lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))

  # remove stop words
  # meaningful_words = list(filter(lambda x : x not in stop_words, lemmatized_tokens))

  return lemmatized_tokens


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
def preprocess(tweet :str) -> list:

  # clean tweet
  clean_tweet = clean(tweet)

  # tokenize
  tokens = word_tokenize(clean_tweet)

  # lemmatize
  lemmaz = lemmatize(tokens)

  return lemmaz

# Data Cleaning



In [35]:
df['cleaned_tweets'].head()

0    video offic mind busi david solomon tell gs in...
1    price lumber lb f sinc hit ytd high maci turna...
2                              say american dream dead
3    barri silbert extrem optimist bitcoin predict ...
4    satellit avoid attack space junk circl earth paid
Name: cleaned_tweets, dtype: object

In [37]:
tweets = np.array(df['cleaned_tweets'].astype(str))

cleaned_data = np.array(list(map(lambda x : preprocess(x), tweets )))

  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
# bigrams

%%time
bigrams = Phrases(sentences=cleaned_data)

CPU times: user 387 ms, sys: 2.89 ms, total: 390 ms
Wall time: 391 ms


In [39]:
# trigrams

%%time
trigrams = Phrases(sentences=bigrams[cleaned_data])



CPU times: user 1.27 s, sys: 6.17 ms, total: 1.28 s
Wall time: 1.29 s


In [40]:
# creating trigram model

%%time
embedding_vector_size = 256
trigrams_model = Word2Vec(
    sentences = trigrams[bigrams[cleaned_data]],
    size = embedding_vector_size,
    min_count=3, window=5, workers=4)





CPU times: user 15.8 s, sys: 14.6 ms, total: 15.9 s
Wall time: 13.8 s


In [41]:
print("Vocabulary size:", len(trigrams_model.wv.vocab))

Vocabulary size: 10679


In [44]:
trigrams_model.wv.most_similar('crypto')

[('first', 0.9999532103538513),
 ('two', 0.9999401569366455),
 ('complet', 0.9999380111694336),
 ('start', 0.9999365210533142),
 ('plan', 0.9999344348907471),
 ('want', 0.9999339580535889),
 ('cbio', 0.999933123588562),
 ('fdx', 0.9999327063560486),
 ('order', 0.9999290704727173),
 ('run', 0.9999281167984009)]

In [45]:
def vectorize_data(data, vocab: dict) -> list:
    print('Vectorize sentences...', end='\r')
    keys = list(vocab.keys())
    filter_unknown = lambda word: vocab.get(word, None) is not None
    encode = lambda review: list(map(keys.index, filter(filter_unknown, review)))
    vectorized = list(map(encode, data))
    print('Vectorize sentences... (done)')
    return vectorized

print('Convert sentences to sentences with ngrams...', end='\r')
X_data = trigrams[bigrams[cleaned_data]]
print('Convert sentences to sentences with ngrams... (done)')
input_length = 150
X_pad = pad_sequences(
    sequences=vectorize_data(X_data, vocab=trigrams_model.wv.vocab),
    maxlen=input_length,
    padding='post')
print('Transform sentences to sequences... (done)')

Convert sentences to sentences with ngrams...Convert sentences to sentences with ngrams... (done)
Vectorize sentences...



Vectorize sentences... (done)
Transform sentences to sequences... (done)


In [47]:
df.head()

Unnamed: 0,cleaned_tweets,sentiment
0,video offic mind busi david solomon tell gs in...,0
1,price lumber lb f sinc hit ytd high maci turna...,0
2,say american dream dead,-1
3,barri silbert extrem optimist bitcoin predict ...,1
4,satellit avoid attack space junk circl earth paid,-1


In [48]:
sentiments = df['sentiment']
# sentiments = np.array(sentiments)

X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    sentiments,
    test_size=0.05,
    shuffle=True,
    random_state=42)

In [49]:
def build_model(embedding_matrix: np.ndarray, input_length: int):
    model = Sequential()
    model.add(Embedding(
        input_dim = embedding_matrix.shape[0],
        output_dim = embedding_matrix.shape[1], 
        input_length = input_length,
        weights = [embedding_matrix],
        trainable=True))
    model.add(Bidirectional(LSTM(128, recurrent_dropout=0.1)))
    model.add(Dropout(0.25))
    model.add(Dense(64))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    return model

model = build_model(
    embedding_matrix=trigrams_model.wv.vectors,
    input_length=input_length)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 256)          2733824   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 3,144,577
Trainable params: 3,144,577
Non-trainable params: 0
____________________________________________

In [None]:
model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=['accuracy'])

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=100,
    epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5