# Text Sentiment Analysis - Stock Data

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 

# Gettitng the Data

In [2]:
data = pd.read_csv('stock_data.csv')
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
data.tail()

Unnamed: 0,Text,Sentiment
5786,Industry body CII said #discoms are likely to ...,-1
5787,"#Gold prices slip below Rs 46,000 as #investor...",-1
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1
5790,"#Sensex, #Nifty climb off day's highs, still u...",1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB


In [5]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

# Cleaning the Text

# Using Stemming technique
Stemming follows an algorithm with steps to perform on the words which makes it faster. Whereas, in lemmatization, you used WordNet corpus and a corpus for stop words as well to produce lemma which makes it slower than stemming.

In [6]:
corpus = []
for i in range(0,len(data['Text'])):
    #sentence = nltk.sent_tokenize(data['Text'][i]) 
    review = re.sub('[^a-zA-z]' , ' ',data['Text'][i])
    review = review.lower()
    review = review.split()
    review =[ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)

# Vectorization of Text
Using CountVectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus)
y= data['Sentiment']

# Train test split

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.3,random_state=101)

# Model Training and Evaluation

In [11]:
from sklearn.naive_bayes import MultinomialNB

In [12]:
nb = MultinomialNB()

In [13]:
nb.fit(X_train,y_train)

MultinomialNB()

In [14]:
pred = nb.predict(X_test)

In [15]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [16]:
print(confusion_matrix(y_test,pred))

[[376 259]
 [137 966]]


In [17]:
print(accuracy_score(y_test,pred))

0.7721518987341772


In [18]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

          -1       0.73      0.59      0.66       635
           1       0.79      0.88      0.83      1103

    accuracy                           0.77      1738
   macro avg       0.76      0.73      0.74      1738
weighted avg       0.77      0.77      0.77      1738



# Vetorization of Text
Using TfidfVectorizer and it is supposed to perform better than CountVectorizer. Let's compare.

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()

In [20]:
X = tf.fit_transform(corpus)
y = data['Sentiment']

# Train Test Split

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.3,random_state=101)

# Model Trainig and Evaluation

In [22]:
nb.fit(X_train,y_train)

MultinomialNB()

In [23]:
pred1 = nb.predict(X_test)

In [24]:
print(confusion_matrix(y_test,pred1))

[[ 218  417]
 [  43 1060]]


In [25]:
print(accuracy_score(y_test,pred1))

0.7353279631760644


In [26]:
print(classification_report(y_test,pred1))

              precision    recall  f1-score   support

          -1       0.84      0.34      0.49       635
           1       0.72      0.96      0.82      1103

    accuracy                           0.74      1738
   macro avg       0.78      0.65      0.65      1738
weighted avg       0.76      0.74      0.70      1738



Here in this case we found that CountVectorizer performs slightly better than TfidfVectorizer.

Now let us try Lemmatization technique instead of stemming

# Cleaning Text and using Lemmatization

In [27]:
corpus1 = []
for i in range(0,len(data['Text'])):
    #sentence = nltk.sent_tokenize(data['Text'][i]) 
    review = re.sub('[^a-zA-z]' , ' ',data['Text'][i])
    review = review.lower()
    review = review.split()
    review =[wordnet.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = " ".join(review)
    corpus1.append(review)

In [34]:
X = cv.fit_transform(corpus1)# Vectorization using CountVetorizer
y = data['Sentiment']

# Train Test Split

In [35]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.3,random_state=101)

# Model Training and Evaluation

In [36]:
nb.fit(X_train,y_train)

MultinomialNB()

In [37]:
pred2 = nb.predict(X_test)

In [38]:
print(confusion_matrix(y_test,pred2))

[[370 265]
 [150 953]]


In [39]:
print(accuracy_score(y_test,pred2))

0.7612197928653625


In [40]:
print(classification_report(y_test,pred2))

              precision    recall  f1-score   support

          -1       0.71      0.58      0.64       635
           1       0.78      0.86      0.82      1103

    accuracy                           0.76      1738
   macro avg       0.75      0.72      0.73      1738
weighted avg       0.76      0.76      0.76      1738



Now Vectorization of text using Tfidfvectorizer

In [42]:
X =tf.fit_transform(corpus1)
y = data['Sentiment']

# Train Test Split

In [43]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.3,random_state=101)

# Model Training and Evaluation

In [45]:
nb.fit(X_train,y_train)

MultinomialNB()

In [46]:
y_pred=nb.predict(X_test)

In [47]:
accuracy_score(y_test,y_pred)

0.7209436133486766

In [48]:
confusion_matrix(y_test,y_pred)

array([[ 195,  440],
       [  45, 1058]], dtype=int64)

In [49]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.81      0.31      0.45       635
           1       0.71      0.96      0.81      1103

    accuracy                           0.72      1738
   macro avg       0.76      0.63      0.63      1738
weighted avg       0.75      0.72      0.68      1738



Here stemming worked better than lemmatization technique.