## Stock Market Prediction using Sentiment Analysis

In [2]:
import pandas as pd
import numpy as np
import nltk

In [3]:
df=pd.read_csv("headlines.csv")

### Sorting headlines that only contain "Business" 

In [4]:
headlinecat=[cat for cat in df["headline_category"].value_counts().index if 'business' in cat]

In [5]:
df1=df.loc[df["headline_category"].str.find("business")!=-1].reset_index(drop=True)

In [6]:
df1.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010104,business.india-business,Car dealers caught in Bihar sales tax ruling
1,20010522,business.india-business,Re-negotiation best: Deshmukh; lenders' SOS to...
2,20010522,business.india-business,Samsung says hello to cellular unit in India
3,20010522,business.india-business,Govt lifts port-linked curbs on imports
4,20010522,business.india-business,RIL plans to mop up to Rs 1;000cr


### Sorting through headlines that contain "bse" or "sensex" 

In [10]:
df1["headline_text"]=df1["headline_text"].str.lower()
df2=df1.loc[df1["headline_text"].str.find("sensex" or "bse")!=-1].reset_index(drop=True)

In [11]:
df2.shape

(5706, 3)

In [7]:
import matplotlib.pyplot as plt
import random
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [12]:
stopwords_english= stopwords.words('english')
stemmer= PorterStemmer()

In [13]:
tokenizer=TweetTokenizer(preserve_case=False, reduce_len=True)

### Preprocessing 

In [14]:
def process(headline):
    tokenized =tokenizer.tokenize(headline)
    cleaned=[]
    for word in tokenized:
        if (word not in stopwords_english and word not in string.punctuation):
            cleaned.append(word)
    
    stemmed=[]
    for word in cleaned:
        stem_word= stemmer.stem(word)
        stemmed.append(stem_word)            
    return stemmed

### Defining polarity/subjectivity and sentiment 

In [15]:
import textblob

In [16]:
from textblob import TextBlob

In [17]:
def subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def polarity(text):
    return TextBlob(text).sentiment.polarity

In [19]:
df2["Subjectivity"]=df2["headline_text"].apply(subjectivity)
df2["Polarity"]=df2["headline_text"].apply(polarity)

In [20]:
df2["sentiment"]=df2["Polarity"].apply(lambda x: '1' if x>=0 else '0')

In [21]:
df2.head()

Unnamed: 0,publish_date,headline_category,headline_text,Subjectivity,Polarity,sentiment
0,20010529,business.india-business,sensex stays in the bull zone; gains 60 points,0.0,0.0,1
1,20010715,business.india-business,money wise brsensex faces a roller coaster,0.9,0.7,1
2,20010727,business.india-business,for sensex; worst is yet to come,1.0,-1.0,0
3,20010804,business.india-business,sensex recovers by 2.27% over last week,0.066667,0.0,1
4,20010807,business.india-business,sensex slides by 11 points,0.0,0.0,1


### Making modifications to obtain training and testing set 

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
import sklearn

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
vectorizer = TfidfVectorizer(ngram_range=(1,3),lowercase=False)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
score=df2["sentiment"]

In [28]:
df2["headlines cleaned"]=df2["headline_text"].apply(process)

In [29]:
cleaned=df2["headlines cleaned"].tolist()    

In [30]:
cl=[]
for head in cleaned:
    a=" ".join([str(elem) for elem in head])
    cl.append(a)

In [31]:
Xtrain,Xtest,ytrain,ytest = train_test_split(cl[int(len(cl)/2):],score[int(len(cl)/2):],test_size=0.25,random_state=21)

In [32]:
Xtrain = vectorizer.fit_transform(Xtrain)
Xtest = vectorizer.transform(Xtest)

### Naive Bayes Classifier

In [33]:
from sklearn.naive_bayes import MultinomialNB

In [34]:
naive= MultinomialNB()
naive.fit(Xtrain,ytrain)

MultinomialNB()

In [35]:
prediciton = naive.predict(Xtest)

In [36]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

matrix = confusion_matrix(ytest,prediciton)
print(matrix)
print('The model accuracy is {}'.format(round(accuracy_score(ytest,prediciton),3)))

[[ 42 153]
 [  0 519]]
The model accuracy is 0.786


### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression 

In [38]:
lr = LogisticRegression()

In [39]:
lr.fit(Xtrain,ytrain)

LogisticRegression()

In [40]:
y_pred= lr.predict(Xtest)

In [41]:
matrix1 = confusion_matrix(ytest,y_pred)
print(matrix1)
print('The model accuracy is {}'.format(round(accuracy_score(ytest,y_pred),3)))

[[ 65 130]
 [  6 513]]
The model accuracy is 0.81


### Decision Tree Classifier 

In [42]:
from sklearn.tree import DecisionTreeClassifier

In [43]:
clf=DecisionTreeClassifier()

In [44]:
clf.fit(Xtrain,ytrain)

DecisionTreeClassifier()

In [45]:
y_pred1=clf.predict(Xtest)

In [46]:
matrix2 = confusion_matrix(ytest,y_pred1)
print(matrix2)
print('The model accuracy is {}'.format(round(accuracy_score(ytest,y_pred1),3)))

[[154  41]
 [114 405]]
The model accuracy is 0.783


### SVM Classifier 

In [47]:
from sklearn import svm

In [48]:
model = svm.SVC(C=100,kernel='rbf',gamma=0.01)

In [49]:
model.fit(Xtrain, ytrain)

SVC(C=100, gamma=0.01)

In [50]:
y_pred2=model.predict(Xtest)

In [51]:
matrix3 = confusion_matrix(ytest,y_pred2)
print(matrix3)
print('The model accuracy is {}'.format(round(accuracy_score(ytest,y_pred2),3)))

[[141  54]
 [ 22 497]]
The model accuracy is 0.894


### Random Forest Classifier 

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
cf = RandomForestClassifier(n_estimators=100,criterion='entropy')

In [57]:
cf.fit(Xtrain, ytrain)

RandomForestClassifier(criterion='entropy')

In [58]:
y_pred3=cf.predict(Xtest)

In [59]:
matrix4 = confusion_matrix(ytest,y_pred3)
print(matrix4)
print('The model accuracy is {}'.format(round(accuracy_score(ytest,y_pred3),3)))

[[134  61]
 [ 35 484]]
The model accuracy is 0.866


### SVM Model gives the best accuracy.