In [48]:
import numpy as np
import pandas as pd
import nltk as nl

## **Dataset processing**
This dataset was retrieved from: https://www.kaggle.com/datasets/yash612/stockmarket-sentiment-dataset/discussion?sort=hotness 

In [49]:
# Read the CSV into a DataFrame and show it as the cell output.
dataset_path = r'../Sentiment Analysis/stock_data.csv'
df = pd.read_csv(dataset_path)
df


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
...,...,...
5786,Industry body CII said #discoms are likely to ...,-1
5787,"#Gold prices slip below Rs 46,000 as #investor...",-1
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1


### **Remove unwanted observations**

(Duplicates, variables that won't be used, irrelevant values, missing values, etc.)

In [50]:
# Report the data-quality problems before cleaning anything:
# labels other than 1 / -1, missing values per column, and fully duplicated rows.
print('The amount of not allowed values are: ',len([i for i in df.Sentiment if not ((i==1) or (i==-1))]))
print('The amount of empty spaces is:\n', df.isna().sum())
print('The amount of duplicated rows is:', df.duplicated().sum())

# Remove rows whose Sentiment label is missing, then exact duplicate rows.
# Both methods return a new DataFrame instead of modifying `df` in place, so
# the result has to be assigned back for the cleaning to take effect.
# dropna(subset=...) also works regardless of how the index is labelled.
df = df.dropna(subset=['Sentiment']).drop_duplicates()


The amount of not allowed values are:  0
The amount of empty spaces is:
 Text         0
Sentiment    0
dtype: int64
The amount of duplicated rows is: 0


### **NLP using NLTK**
*Natural language processing using Natural Language Toolkit*

In [None]:
import nltk.stem as stm
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
# Download the 'popular' NLTK collection and the 'stopwords' corpus,
# then load the English stopword list.
for nltk_resource in ('popular', 'stopwords'):
    nl.download(nltk_resource)
stpw = stopwords.words('english')


### **Tokenization**
Split each text into word tokens so that individual words can be used as input for the model

In [52]:
def tokenize(text):
    """Split ``text`` into tokens with ``nl.word_tokenize`` and return the result."""
    return nl.word_tokenize(text)
# Replace every raw text with its token list, then preview the first rows.
tokenized_text = df['Text'].apply(tokenize)
df['Text'] = tokenized_text
df.head()

Unnamed: 0,Text,Sentiment
0,"[Kickers, on, my, watchlist, XIDE, TIT, SOQ, P...",1
1,"[user, :, AAP, MOVIE, ., 55, %, return, for, t...",1
2,"[user, I, 'd, be, afraid, to, short, AMZN, -, ...",1
3,"[MNTA, Over, 12.00]",1
4,"[OI, Over, 21.37]",1


### **Lemmatization**
Lemmatize the words in the input texts to reduce the model dimensionality

In [53]:
def get_wordnet_pos(tag):
    """Map a POS tag to the corresponding WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected:
    'J' -> ``wordnet.ADJ``, 'V' -> ``wordnet.VERB``,
    'N' -> ``wordnet.NOUN``, 'R' -> ``wordnet.ADV``.
    Any other tag (including an empty string) falls back to ``wordnet.NOUN``.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which is not a key, so the default applies.
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def lemm(tokens):
    """Lemmatize a list of tokens using each token's part-of-speech tag.

    ``tokens`` is tagged with ``nl.pos_tag``; every (word, tag) pair is then
    passed to a ``WordNetLemmatizer``, with the tag converted by
    ``get_wordnet_pos``. Returns the lemmatized words as a list, in the same
    order as the input.
    """
    wn_lemmatizer = stm.WordNetLemmatizer()
    result = []
    for word, tag in nl.pos_tag(tokens):
        result.append(wn_lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return result

# Swap every token list for its lemmatized version, then preview the first rows.
lemmatized_text = df['Text'].apply(lemm)
df['Text'] = lemmatized_text
df.head()

### **Remove unnecessary tokens**
Remove stopwords and punctuation marks

In [57]:
def filter_tokens(tokens,ref_list=None):
    """Return the tokens that are not members of ``ref_list``.

    Args:
        tokens: iterable of tokens to filter.
        ref_list: iterable of values to exclude; ``None`` excludes nothing.
            It is converted to a set, so a string is treated as the set of
            its individual characters.

    Returns:
        A new list with the surviving tokens in their original order.
        Matching is exact (case-sensitive, whole token).
    """
    excluded = set() if ref_list is None else set(ref_list)
    kept = []
    for tok in tokens:
        if tok in excluded:
            continue
        kept.append(tok)
    return kept

# Filter the token lists twice: first against the stopword list, then against
# the characters of string.punctuation (which removes tokens that are exactly
# one of those characters).
# NOTE(review): matching is case-sensitive and the tokens are never lowercased,
# so capitalised tokens such as "I" or "Over" are kept — confirm whether that
# is intended.
for unwanted in (stpw, string.punctuation):
    df['Text'] = df['Text'].apply(filter_tokens,ref_list=unwanted)
df

Unnamed: 0,Text,Sentiment
0,"[Kickers, watchlist, XIDE, TIT, SOQ, PNK, CPW,...",1
1,"[user, AAP, MOVIE, 55, return, FEA/GEED, indic...",1
2,"[user, I, 'd, afraid, short, AMZN, look, like,...",1
3,"[MNTA, Over, 12.00]",1
4,"[OI, Over, 21.37]",1
...,...,...
5786,"[Industry, body, CII, say, discoms, likely, su...",-1
5787,"[Gold, price, slip, Rs, 46,000, investor, book...",-1
5788,"[Workers, Bajaj, Auto, agree, 10, wage, cut, p...",1
5789,"[Sharemarket, LIVE, Sensex, day, ’, high, 600,...",1


## **Model preparation**

In [60]:
import sklearn as skl
from sklearn.naive_bayes import GaussianNB


# NOTE(review): index 1 of the shape is the number of columns; if the number
# of rows (samples) was intended, this should be index 0 — confirm.
data_count = df.shape[1]
data_count

ModuleNotFoundError: No module named 'sklearn'