<a href="https://colab.research.google.com/github/undefinedzack/stock-market-prediction-using-sentiment-analysis/blob/master/All_in_one_for_datewiseFile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Data Manipulation

import numpy as np
import pandas as pd
import re

# Preprocessing the input data

import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Creating ngrams and vectorizing the data

from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser

# Tools for building a model

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Fetch the NLTK resources this notebook relies on: the stop-word lists, the
# 'punkt' tokenizer data and the WordNet corpus (presumably for
# stopwords.words, word_tokenize and WordNetLemmatizer respectively).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/Colab_Data/
%ls -l

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive/Colab_Data/'
/content/drive/.shortcut-targets-by-id/1mnM1b6TaU1SW1anY35VRUPG3xIImlWjS/Colab_Data
total 464955
-rw------- 1 root root   2586020 Feb 20 13:52  200features_10minwords
-rw------- 1 root root 407958406 Feb 19 10:33  causeSheDidItThisWay.csv
-rw------- 1 root root  34251814 Mar 31 11:47 'df_stocktwits_prepared_final (1).csv'
-rw------- 1 root root  15257600 Mar 31 11:34  df_stocktwits_prepared_final.csv
-rw------- 1 root root   5617630 Mar 31 05:47  News.csv
drwx------ 2 root root      4096 Mar 19 13:58  [0m[01;34msaved_model[0m/
-rw------- 1 root root    479968 Feb 20 13:09  stock_data.csv
-rw------- 1 root root   7076794 Feb 16 13:29  stockerbot-export1.csv
-rw------- 1 root root    167083 Mar 31 05:46  stocks.csv
-rw------- 1 root root   1752624 Feb 16 13:31  tweet_sentiment.csv
-rw------- 1 

In [None]:
# Alternative to the Drive mount: upload the CSV from the local machine.
# NOTE(review): when a file with the same name already exists Colab stores the
# upload under a " (1)" suffix, so the read_csv cell below may not pick up the
# freshly uploaded copy -- confirm which file is intended.
from google.colab import files 
uploaded = files.upload()

Saving df_stocktwits_prepared_final.csv to df_stocktwits_prepared_final (1).csv


In [22]:
# Load the prepared StockTwits CSV from the current working directory.
df1 = pd.read_csv('df_stocktwits_prepared_final.csv')

In [23]:
# Keep only the first three columns, selected by position.
# NOTE(review): presumably created_date, time and clean_text -- confirm
# against the CSV header.
df1=df1[df1.columns[0:3]]

In [24]:
# Collapse the frame to one row per created_date: every row first receives the
# space-joined text of all rows sharing its date, then the now-identical rows
# are removed (the surviving rows keep their original index labels).
df1 = df1.drop(columns='time')
daily_text = df1.groupby('created_date')['clean_text'].transform(' '.join)
df1['clean_text'] = daily_text
df1 = df1.drop_duplicates()

In [25]:
# Preview the first rows of the per-date frame.
df1.head()

Unnamed: 0,created_date,clean_text
0,01-01-2019,thinking about buying a lot and hold it for lo...
10,02-01-2019,trying for the 20 MA Will we see any dips ? Bo...
69,03-01-2019,$$$ _ Raytheon UK to secure hundreds of aerosp...
155,04-01-2019,_url Boeing remains top pick for 2019 by Baird...
215,05-01-2019,The Top Stocks On The Dow Jones Also _url The ...


In [27]:
###### CLEANING EACH STRING
def clean(tweet :str) -> str:
  """Normalise one raw tweet to lowercase text made of letters and spaces only.

  HTML markup is stripped first. Afterwards every @mention, every http(s) URL
  and every remaining non-letter character is replaced by a single space.
  """
  mention = r'@[A-Za-z0-9]+'
  url = r'https?://[A-Za-z0-9./]+'
  non_letter = r'[^a-zA-Z]'
  # Alternation order matters: mentions and URLs are tried before the
  # catch-all, so they are blanked as a whole instead of leaving their
  # letters behind.
  noise = '|'.join((mention, url, non_letter))

  plain_text = BeautifulSoup(tweet, "lxml").get_text()
  return re.sub(noise, " ", plain_text).lower()


##### LEMMATIZATION
def lemmatize(tokens :list) -> list:
  """Lemmatize tokens, drop English stop words, then Porter-stem what is left.

  The stop-word check is applied to the lemmatized form, before stemming.
  """
  wordnet = WordNetLemmatizer()
  porter = PorterStemmer()
  english_stop_words = set(stopwords.words("english"))

  lemmas = [wordnet.lemmatize(token) for token in tokens]
  return [porter.stem(lemma) for lemma in lemmas if lemma not in english_stop_words]


###### ALL TOGETHER
def preprocess(tweet :str) -> list:
  """Turn one raw tweet into its list of cleaned, lemmatized and stemmed tokens."""
  # clean -> tokenize -> lemmatize/stem, applied inside-out
  return lemmatize(word_tokenize(clean(tweet)))


###### CLEANING WHOLE DATA BY PROCESSING EACH TWEET ONE BY ONE
def get_clean_data(tweets):
  """Preprocess every tweet and return the token lists as a 1-D object array.

  Args:
    tweets: iterable of raw tweet strings.

  Returns:
    numpy.ndarray of dtype object and shape (number of tweets,); element i is
    the token list produced by preprocess() for the i-th tweet.
  """
  token_lists = [preprocess(tweet) for tweet in tweets]

  # Fill an object array element by element. Passing the nested lists straight
  # to np.array() raises ValueError on NumPy >= 1.24 when the lists differ in
  # length, and yields a 2-D string array when they happen to be equally long.
  cleaned = np.empty(len(token_lists), dtype=object)
  for position, tokens in enumerate(token_lists):
    cleaned[position] = tokens
  return cleaned

  
###### BUILDING TRIGRAMS MODEL
def build_trigrams_model(cleaned_data):
  """Train a Word2Vec model on the bigram/trigram-merged form of cleaned_data.

  Two Phrases detectors are fitted (bigrams on the raw token lists, trigrams
  on the bigram-transformed lists) and the doubly transformed sentences are
  used as the Word2Vec training corpus.

  NOTE(review): `size` is the gensim 3.x keyword for the vector dimension
  (gensim 4 calls it `vector_size`) -- confirm the installed gensim version.
  """
  bigram_phrases = Phrases(sentences=cleaned_data)
  trigram_phrases = Phrases(sentences=bigram_phrases[cleaned_data])
  ngram_sentences = trigram_phrases[bigram_phrases[cleaned_data]]

  # 256-dimensional vectors; words seen fewer than 3 times are ignored.
  word2vec_options = dict(size=256, min_count=3, window=5, workers=4)
  return Word2Vec(sentences=ngram_sentences, **word2vec_options)


###### VECTORIZING DATA
def vectorize_data(data, vocab: dict) -> list:
    """Encode tokenised sentences as lists of vocabulary indices.

    A word's index is its position in the key order of ``vocab``. Words that
    are missing from ``vocab`` (or mapped to ``None``) are dropped, so an
    encoded sentence can be shorter than its source sentence.

    Args:
        data: iterable of sentences, each an iterable of word tokens.
        vocab: mapping whose keys are the known words.

    Returns:
        One list of integer indices per sentence, in input order.
    """
    print('Vectorize sentences...')
    # Precompute word -> position once so that each token lookup is O(1)
    # rather than a linear scan over the whole key list.
    position_of = {word: position for position, word in enumerate(vocab.keys())}
    vectorized = [
        [position_of[word] for word in sentence if vocab.get(word, None) is not None]
        for sentence in data
    ]
    print('Vectorize sentences... (done)')
    return vectorized


###### FINAL DATA WITH PADDING
def vectorised_padded_data(cleaned_data):
  """Convert tokenised tweets into a fixed-width matrix of vocabulary indices.

  The token lists are merged into bigrams/trigrams, each n-gram is replaced by
  its index in the vocabulary of a freshly trained Word2Vec model, and every
  sequence is padded at the end (padding='post') to a length of 150.
  """
  bigram_phrases = Phrases(sentences=cleaned_data)
  trigram_phrases = Phrases(sentences=bigram_phrases[cleaned_data])
  ngram_sentences = trigram_phrases[bigram_phrases[cleaned_data]]
  print('Convert sentences to sentences with ngrams... (done)')

  # NOTE(review): build_trigrams_model() fits its own Phrases detectors on the
  # same data, so phrase detection runs twice. The vocabulary is also fitted on
  # the very data being encoded -- confirm that these indices match the mapping
  # the downstream model was trained with.
  vocabulary = build_trigrams_model(cleaned_data).wv.vocab
  encoded = vectorize_data(ngram_sentences, vocab=vocabulary)
  return pad_sequences(sequences=encoded, maxlen=150, padding='post')


###### CLUBBING VECTORIZATION AND PADDING FUNCTION
def suitable_data(tweets):
  """Run the whole pipeline: preprocess the raw tweets, then vectorise and pad them."""
  return vectorised_padded_data(get_clean_data(tweets))
  



In [28]:
# Encode the per-date text column as padded index sequences for the model.
X_pad = suitable_data(df1['clean_text'])



Convert sentences to sentences with ngrams... (done)
Vectorize sentences...
Vectorize sentences... (done)


In [29]:
# Load the Keras model stored in the 'saved_model' directory of the working folder.
my_model = tf.keras.models.load_model('saved_model')

In [30]:
# Score every padded sequence with the loaded model.
outputs=my_model.predict(x=X_pad)

In [31]:
# Display the raw predictions (one value per input row).
# NOTE(review): this prints the entire array; consider showing a slice such as
# outputs[:10] to keep the saved notebook small.
outputs

array([[0.9944427 ],
       [0.00184023],
       [0.9998113 ],
       [0.00208467],
       [0.9999976 ],
       [0.9999583 ],
       [0.99993914],
       [0.99998945],
       [0.99999857],
       [0.99999964],
       [0.99999326],
       [0.9993007 ],
       [0.99999774],
       [0.99892956],
       [0.99993366],
       [1.        ],
       [0.9999989 ],
       [0.9999995 ],
       [0.9999999 ],
       [0.9998357 ],
       [0.9999808 ],
       [0.9998783 ],
       [0.9999711 ],
       [0.9999788 ],
       [0.9999948 ],
       [0.9999965 ],
       [0.9999716 ],
       [1.        ],
       [0.99998844],
       [0.9999974 ],
       [0.99930656],
       [0.9999995 ],
       [0.99996704],
       [0.99999785],
       [0.99999845],
       [0.9968208 ],
       [0.9976361 ],
       [0.9999038 ],
       [1.        ],
       [0.99999774],
       [0.9998039 ],
       [0.9999853 ],
       [0.99999803],
       [0.99966025],
       [0.08130741],
       [0.9999988 ],
       [0.99995065],
       [0.999