In [1]:
# import important modules
import numpy as np
import pandas as pd

In [2]:
conda install -c anaconda nltk


Note: you may need to restart the kernel to use updated packages.


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 
from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix
    
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# text preprocessing modules
from string import punctuation 
# text preprocessing modules
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression

In [4]:
# seeding
np.random.seed(123)

In [5]:
# load data
data = pd.read_csv("stock_data.csv")

In [6]:
# show dataset sample 
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [7]:
# check the shape of the data: dataset contains 5791 lines of text 
data.shape

(5791, 2)

In [8]:
# check missing values in data: dataset has no missing values 
data.isnull().sum()

Text         0
Sentiment    0
dtype: int64

In [9]:
# evalute sentiment distribution
# about 1.7 times more positive than negative reviews 
data.Sentiment.value_counts()

 1    3685
-1    2106
Name: Sentiment, dtype: int64

In [10]:
# download package for stopwords 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# preprocessing data into the right format 
# clean data by removing stopwords, numbers and punctuation 
# define a function text_cleaning to clean the text 
stop_words =  stopwords.words('english')
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, remove stop words and convert words to base form using lemmatisation
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    # remove URLs
    text =  re.sub(r'http\S+',' link ', text)
    # remove numbers 
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) 
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # return a list of words
    return(text)

In [12]:
# clean the review
data["cleaned_text"] = data["Text"].apply(text_cleaning)

In [13]:
# Check the cleaned text 
data.head()

Unnamed: 0,Text,Sentiment,cleaned_text
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,Kickers watchlist XIDE TIT SOQ PNK CPW BPZ AJ ...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user AAP MOVIE return FEA GEED indicator trade...
2,user I'd be afraid to short AMZN - they are lo...,1,user I afraid short AMZN looking like near mon...
3,MNTA Over 12.00,1,MNTA Over
4,OI Over 21.37,1,OI Over


In [14]:
# split data into feature and target variables 
# cleaned text will be the variable for feature training 
# Sentiment is the target variables 
X = data["cleaned_text"]
y = data.Sentiment.values

In [15]:
# split dataset into train and test data 
# test data is 15% of the dataset 
# split data into train and validate
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [16]:
# Transform the cleaned text into numerical values using vectorisation
# to conduct preprocessing and training, use pipelines class to sequentially apply a list of transforms and final estimator
# Create a classifier in pipeline
sentiment_classifier = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('naive_bayes',MultinomialNB())
                                 ])

In [17]:
# train the sentiment classifier 
sentiment_classifier.fit(X_train,y_train)

In [18]:
# create a prediction from the validation dataset 
# test model performance on valid data 
y_preds = sentiment_classifier.predict(X_valid)

In [19]:
# evaluate model's performance using the f1_score evaluation metric 
# since there is more positive sentiment than negative sentiment, use F1 score to evaluate 
# the F1 score is more balanced as it takes into account the harmonic mean of precision and recall
from sklearn.metrics import f1_score

In [20]:
f1_score(y_valid,y_preds)

0.8298039215686275

In [24]:
# From F1 score, conclude that accuracy of model is about 83.0%