In [496]:
import numpy as np
import pandas as pd
import nltk as nl

## **Dataset processing**
This dataset was retrieved from: https://www.kaggle.com/datasets/yash612/stockmarket-sentiment-dataset/discussion?sort=hotness 

In [497]:
# Load the stock-sentiment CSV; the path is relative, so it depends on where the notebook is run from.
df = pd.read_csv(r'../Sentiment Analysis/stock_data.csv')
# Bare last expression so the notebook renders the loaded frame.
df


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
...,...,...
5786,Industry body CII said #discoms are likely to ...,-1
5787,"#Gold prices slip below Rs 46,000 as #investor...",-1
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1


### **Remove unwanted observations**

(Duplicates, variables that won't be used, irrelevant values, missing values, etc.)

In [498]:
# Report data-quality problems before cleaning: labels outside {1, -1},
# missing values per column, and fully duplicated rows.
print('The amount of not allowed values are: ',len([i for i in df.Sentiment if not ((i==1) or (i==-1))]))
print('The amount of empty spaces is:\n', df.isna().sum())
print('The amount of duplicated rows is:', df.duplicated().sum())
# Boolean mask of rows whose label is missing (kept as a named variable for inspection).
na_list = df.isna()['Sentiment']
# Drop rows without a label. The result must be assigned back: DataFrame.drop /
# dropna return a new frame, so calling them without assignment removes nothing.
df = df.dropna(subset=['Sentiment'])
df = df.drop_duplicates()
# Replace the numeric labels with readable class names.
df['Sentiment'] = df['Sentiment'].replace({1: 'Pos', -1: 'Neg'})


The amount of not allowed values are:  0
The amount of empty spaces is:
 Text         0
Sentiment    0
dtype: int64
The amount of duplicated rows is: 0


### **NLP using NLTK**
*Natural language processing using Natural Language Toolkit*

In [499]:
import nltk.stem as stm
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
# Download the NLTK data used by the cells below; per the recorded log,
# packages that are already present are only reported as up-to-date.
nl.download('popular')
nl.download('stopwords')
# English stopword list, passed later to filter_tokens as the reference list.
stpw = stopwords.words('english')


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\tom_p\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\tom_p\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\tom_p\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\tom_p\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\tom_p\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

### **Tokenization**
Split each text into word tokens, which serve as the input units for the model.

In [500]:
def tokenize(text):
    """Split a text string into word tokens with NLTK's word tokenizer."""
    return nl.word_tokenize(text)
# Overwrite the 'Text' column with the tokenized form of each entry.
df['Text'] = df['Text'].apply(tokenize)
# Preview the first rows after tokenization.
df.head()

Unnamed: 0,Text,Sentiment
0,"[Kickers, on, my, watchlist, XIDE, TIT, SOQ, P...",Pos
1,"[user, :, AAP, MOVIE, ., 55, %, return, for, t...",Pos
2,"[user, I, 'd, be, afraid, to, short, AMZN, -, ...",Pos
3,"[MNTA, Over, 12.00]",Pos
4,"[OI, Over, 21.37]",Pos


### **Lemmatization**
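Reduce the words in the input texts to a common base form to lower the model dimensionality. A lemmatizer is defined below, but the active code applies Porter stemming instead (the lemmatization call is commented out).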

In [501]:
def junction(tokens):
    """Join an iterable of string tokens into one space-separated string."""
    return ' '.join(tokens)

def stemmer(text):
    """Porter-stem every whitespace-separated word of `text`.

    Returns the stemmed words joined back together with single spaces.
    """
    porter = nl.porter.PorterStemmer()
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(porter.stem(word))
    return ' '.join(stemmed_words)

def get_wordnet_pos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    The decision uses the tag's leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemm(tokens):
    """Lemmatize a list of tokens, using each token's POS tag as a hint.

    Tokens are tagged with nl.pos_tag; each tag is converted through
    get_wordnet_pos before being handed to the WordNet lemmatizer.
    Returns the list of lemmatized tokens in the original order.
    """
    wn_lemmatizer = stm.WordNetLemmatizer()
    result = []
    for token, pos_tag in nl.pos_tag(tokens):
        result.append(wn_lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)))
    return result



# Lemmatization is disabled here; the active path below uses Porter stemming instead.
# df['Text'] = df['Text'].apply(lemm)
# Join each token list back into a single string, because stemmer() expects text.
df['Text'] = df['Text'].apply(junction)
# Stem every whitespace-separated word (the recorded output shows lower-cased stems).
df['Text'] = df['Text'].apply(stemmer)
# Tokenize again so the next cell can filter token lists.
df['Text'] = df['Text'].apply(tokenize)
# Preview the first rows after stemming.
df.head()

Unnamed: 0,Text,Sentiment
0,"[kicker, on, my, watchlist, xide, tit, soq, pn...",Pos
1,"[user, :, aap, movi, ., 55, %, return, for, th...",Pos
2,"[user, i, 'd, be, afraid, to, short, amzn, -, ...",Pos
3,"[mnta, over, 12.00]",Pos
4,"[oi, over, 21.37]",Pos


### **Remove unnecessary tokens**
Remove stopwords. A punctuation filter is also written below, but it is currently commented out.

In [502]:
def filter_tokens(tokens,ref_list=None):
    """Lower-case the tokens and drop those found in `ref_list`.

    Parameters:
        tokens: iterable of string tokens.
        ref_list: optional iterable of values to exclude; membership is
            checked against the lower-cased token. None excludes nothing.

    Returns:
        List of lower-cased tokens that are not in `ref_list`, in order.
    """
    excluded = set(ref_list) if ref_list is not None else set()
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered not in excluded:
            kept.append(lowered)
    return kept





# Drop English stopwords from every token list (filter_tokens also lower-cases the tokens).
df['Text'] = df['Text'].apply(filter_tokens,ref_list=stpw)
# Punctuation filtering is disabled; the recorded output still contains ':' and '.' tokens.
# df['Text'] = df['Text'].apply(filter_tokens,ref_list=string.punctuation)
# Join the remaining tokens back into one string per row for the vectorizer.
df['Text'] = df['Text'].apply(junction)
# Preview the first rows after filtering.
df.head()

Unnamed: 0,Text,Sentiment
0,kicker watchlist xide tit soq pnk cpw bpz aj t...,Pos
1,user : aap movi . 55 % return fea/ge indic 15 ...,Pos
2,user 'd afraid short amzn - look like near-mon...,Pos
3,mnta 12.00,Pos
4,oi 21.37,Pos


## **Model preparation**

In [503]:
import sklearn as skl
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

# (rows, columns) of the cleaned frame; the row count is the number of texts.
shape = df.shape
data_count = shape[0]
print('The number of reviews are ',data_count)

The number of reviews are  5791


#### Encode the dataset using TF-IDF vectorization

In [504]:
# Build TF-IDF features over unigrams and bigrams of the cleaned texts.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test texts influence the vocabulary and IDF weights.
corpus = df['Text'].tolist()
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(corpus)
keys = vectorizer.get_feature_names_out()
# Dense copy of the feature matrix, plus a labelled DataFrame view of it.
Xarr = X.toarray()
encoded_df = pd.DataFrame(Xarr, columns=keys)

In [505]:
# Encode the 'Pos'/'Neg' labels numerically.
# Note: y_encoded is not consumed by the visible cells below (the line that would use it is commented out).
lb = LabelBinarizer()
y_encoded = lb.fit_transform(df['Sentiment'])

In [506]:
# Fixed seed so the split and the model fit are reproducible.
seed = 25
# Fraction of the rows held out for testing.
test_size = 0.2
# Rebind X to the dense TF-IDF array produced by the encoding cell.
X = Xarr
# y = y_encoded.ravel()
# Use the string labels ('Pos'/'Neg') directly as the target.
y = df['Sentiment']
# train_test_split is reached through the sklearn package alias rather than a direct import.
X_train, X_test, y_train, y_test = skl.model_selection.train_test_split(X, y, test_size=test_size, random_state=seed)

#### Split the dataset into training and testing data

In [507]:
# Single-step pipeline: logistic regression only (no scaler is included).
pipe = make_pipeline(LogisticRegression(random_state=seed,max_iter=500),verbose=True)
pipe.fit(X_train, y_train,)  # fit on the training split (the pipeline has no scaling step)
# Mean accuracy on the held-out test split.
accuracy=pipe.score(X_test,y_test)
# c_val = cross_validate(pipe,X,y)['test_score']
# Test-set predictions, used for the per-class report below.
y_predicted = pipe.predict(X_test)
print('Accuracy',accuracy)
# print('The cross validation is: ',c_val)
print('Report: ',classification_report(y_test,y_predicted))

[Pipeline]  (step 1 of 1) Processing logisticregression, total=  14.1s
Accuracy 0.7575496117342536
Report:                precision    recall  f1-score   support

         Neg       0.78      0.43      0.55       408
         Pos       0.75      0.94      0.83       751

    accuracy                           0.76      1159
   macro avg       0.77      0.68      0.69      1159
weighted avg       0.76      0.76      0.74      1159



In [511]:
# Raw example text to classify.
X_display = ["The market plunges into the red books"]
# Apply the same preprocessing chain used on the training corpus
# (tokenize -> join -> stem -> tokenize -> stopword filter -> join).
# The vectorizer was fitted on stemmed, stopword-filtered text, so raw words
# such as "plunges" would otherwise not match the fitted vocabulary.
X_clean = [
    junction(filter_tokens(tokenize(stemmer(junction(tokenize(text)))), ref_list=stpw))
    for text in X_display
]
# TF-IDF features for the cleaned text, using the already-fitted vectorizer.
X_bow = vectorizer.transform(X_clean)
y_pred = pipe.predict(X_bow)
print('Predicted Sentiment: ', y_pred[0])

Predicted Sentiment:  Neg
