In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
df=pd.read_csv("headlines.csv")

In [3]:
headlinecat=[cat for cat in df["headline_category"].value_counts().index if 'business' in cat]

In [4]:
headlinecat

['business.india-business',
 'business.international-business',
 'business.personal-finance',
 'business',
 'india-business-news-wire',
 'business.faqs.income-tax-faqs',
 'business.mf-simplified.mf-news',
 'business.mf-simplified.jargon-busters.debt',
 'business.faqs.aadhar-faqs',
 'business.mf-simplified.jargon-busters.equity',
 'business.faqs.gst-faqs',
 'business.mf-simplified.faq']

In [4]:
df1=df.loc[df["headline_category"].str.find("business")!=-1].reset_index(drop=True)

In [6]:
df1.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010104,business.india-business,Car dealers caught in Bihar sales tax ruling
1,20010522,business.india-business,Re-negotiation best: Deshmukh; lenders' SOS to...
2,20010522,business.india-business,Samsung says hello to cellular unit in India
3,20010522,business.india-business,Govt lifts port-linked curbs on imports
4,20010522,business.india-business,RIL plans to mop up to Rs 1;000cr


In [27]:
import matplotlib.pyplot as plt
import random
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [5]:
df1.shape

(155587, 3)

In [18]:
df1["headline_text"]=df1["headline_text"].str.lower()

In [19]:
df2=df1.loc[df1["headline_text"].str.find("sensex" or "bse")!=-1].reset_index(drop=True)

In [20]:
df2.shape


(5706, 3)

In [21]:
df2.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010529,business.india-business,sensex stays in the bull zone; gains 60 points
1,20010715,business.india-business,money wise brsensex faces a roller coaster
2,20010727,business.india-business,for sensex; worst is yet to come
3,20010804,business.india-business,sensex recovers by 2.27% over last week
4,20010807,business.india-business,sensex slides by 11 points


### Tokenizing  (making a list of words)

In [28]:
tokenizer=TweetTokenizer(preserve_case=False, reduce_len=True)

In [29]:
tokenized=[]
for headline in df1["headline_text"]:
    tokenized.append(tokenizer.tokenize(headline))

In [30]:
tokenized

[['car', 'dealers', 'caught', 'in', 'bihar', 'sales', 'tax', 'ruling'],
 ['re-negotiation',
  'best',
  ':',
  'deshmukh',
  ';',
  'lenders',
  "'",
  'sos',
  'to',
  'centre'],
 ['samsung', 'says', 'hello', 'to', 'cellular', 'unit', 'in', 'india'],
 ['govt', 'lifts', 'port-linked', 'curbs', 'on', 'imports'],
 ['ril', 'plans', 'to', 'mop', 'up', 'to', 'rs', '1', ';', '000cr'],
 ['eicher', 'crashes', 'into', 'the', 'red', ';', 'to', 'cut', '200', 'jobs'],
 ['indo', 'rama', 'posts', 'rs18cr', 'net'],
 ['jet', ';', 'sahara', 'to', 'wait', '&', 'watch'],
 ['auto', 'body', 'drives', 'in', 'new', 'classification', 'norms'],
 ['are', 'us', '64', "'", 's', 'sunny', 'days', 'over', '?'],
 ['dpc', 'may', 'contest', 'merc', 'jurisdiction'],
 ['brave', 'new', 'biotech'],
 ['the', 'beginning', 'of', 'salary', 'fall', '?'],
 ['dgca', 'proposal', 'on', 'airfares', 'upsets', 'airlines'],
 ['zee', 'and', 'the', 'art', 'of', 'programming', 'a', 'turnaround'],
 ['learn', 'to', 'check', 'a', 'right', 'i

In [36]:
tokenized[1]

[]

In [13]:
len(tokenized)

155587

### Removing stopwords and punctuation 

In [31]:
stopwords_english= stopwords.words('english')

In [32]:
cleaned=[[] for _ in range(len(tokenized))]
for i in range(0,len(tokenized)):
    for word in tokenized[i]:
        if (word not in stopwords_english and word not in string.punctuation):
            cleaned[i].append(word)

In [33]:
cleaned

[['car', 'dealers', 'caught', 'bihar', 'sales', 'tax', 'ruling'],
 ['re-negotiation', 'best', 'deshmukh', 'lenders', 'sos', 'centre'],
 ['samsung', 'says', 'hello', 'cellular', 'unit', 'india'],
 ['govt', 'lifts', 'port-linked', 'curbs', 'imports'],
 ['ril', 'plans', 'mop', 'rs', '1', '000cr'],
 ['eicher', 'crashes', 'red', 'cut', '200', 'jobs'],
 ['indo', 'rama', 'posts', 'rs18cr', 'net'],
 ['jet', 'sahara', 'wait', 'watch'],
 ['auto', 'body', 'drives', 'new', 'classification', 'norms'],
 ['us', '64', 'sunny', 'days'],
 ['dpc', 'may', 'contest', 'merc', 'jurisdiction'],
 ['brave', 'new', 'biotech'],
 ['beginning', 'salary', 'fall'],
 ['dgca', 'proposal', 'airfares', 'upsets', 'airlines'],
 ['zee', 'art', 'programming', 'turnaround'],
 ['learn', 'check', 'right', 'isi', 'mark'],
 ['mseb', 'plans', 'acid', 'test', 'dpc', 'phase', 'ii'],
 ['ashok', 'leyland', 'cut', 'jobs', 'divisions'],
 ['fixing',
  'former',
  'dse',
  'ed',
  "sodhi's",
  'pay',
  'package',
  'lacks',
  'transparenc

### Stemming

In [34]:
stemmer= PorterStemmer()
stemmed=[[] for _ in range(len(cleaned))]
for i in range(0,len(cleaned)):
    for word in cleaned[i]:
        stem_word= stemmer.stem(word)
        stemmed[i].append(stem_word)

In [35]:
stemmed

[['car', 'dealer', 'caught', 'bihar', 'sale', 'tax', 'rule'],
 ['re-negoti', 'best', 'deshmukh', 'lender', 'so', 'centr'],
 ['samsung', 'say', 'hello', 'cellular', 'unit', 'india'],
 ['govt', 'lift', 'port-link', 'curb', 'import'],
 ['ril', 'plan', 'mop', 'rs', '1', '000cr'],
 ['eicher', 'crash', 'red', 'cut', '200', 'job'],
 ['indo', 'rama', 'post', 'rs18cr', 'net'],
 ['jet', 'sahara', 'wait', 'watch'],
 ['auto', 'bodi', 'drive', 'new', 'classif', 'norm'],
 ['us', '64', 'sunni', 'day'],
 ['dpc', 'may', 'contest', 'merc', 'jurisdict'],
 ['brave', 'new', 'biotech'],
 ['begin', 'salari', 'fall'],
 ['dgca', 'propos', 'airfar', 'upset', 'airlin'],
 ['zee', 'art', 'program', 'turnaround'],
 ['learn', 'check', 'right', 'isi', 'mark'],
 ['mseb', 'plan', 'acid', 'test', 'dpc', 'phase', 'ii'],
 ['ashok', 'leyland', 'cut', 'job', 'divis'],
 ['fix', 'former', 'dse', 'ed', "sodhi'", 'pay', 'packag', 'lack', 'transpar'],
 ['dd', 'metro', 'grab'],
 ['dpc', 'signal', 'climb', 'see', 'scope', 'cut', '

### Dividing into training and testing set

In [36]:
import textblob

In [37]:
from textblob import TextBlob

In [38]:
def subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def polarity(text):
    return TextBlob(text).sentiment.polarity

In [39]:
df2["Subjectivity"]=df2["headline_text"].apply(subjectivity)
df2["Polarity"]=df2["headline_text"].apply(polarity)

In [40]:
df2.head()

Unnamed: 0,publish_date,headline_category,headline_text,Subjectivity,Polarity
0,20010529,business.india-business,sensex stays in the bull zone; gains 60 points,0.0,0.0
1,20010715,business.india-business,money wise brsensex faces a roller coaster,0.9,0.7
2,20010727,business.india-business,for sensex; worst is yet to come,1.0,-1.0
3,20010804,business.india-business,sensex recovers by 2.27% over last week,0.066667,0.0
4,20010807,business.india-business,sensex slides by 11 points,0.0,0.0


In [41]:
df2["sentiment"]=df2["Polarity"].apply(lambda x: '1' if x>=0 else '0')

In [42]:
df2

Unnamed: 0,publish_date,headline_category,headline_text,Subjectivity,Polarity,sentiment
0,20010529,business.india-business,sensex stays in the bull zone; gains 60 points,0.000000,0.0,1
1,20010715,business.india-business,money wise brsensex faces a roller coaster,0.900000,0.7,1
2,20010727,business.india-business,for sensex; worst is yet to come,1.000000,-1.0,0
3,20010804,business.india-business,sensex recovers by 2.27% over last week,0.066667,0.0,1
4,20010807,business.india-business,sensex slides by 11 points,0.000000,0.0,1
...,...,...,...,...,...,...
5701,20201219,business.india-business,sensex hits 47k for 1st time as eco hobbles to...,0.000000,0.0,1
5702,20201222,business.india-business,virus variant: sensex tanks 1;407 points,0.000000,0.0,1
5703,20201223,business.india-business,sensex regains 46k in pullback rally,0.000000,0.0,1
5704,20201229,business.india-business,sensex ends above 47k pts for 1st time,0.100000,0.0,1


In [43]:
df2["sentiment"].value_counts()

1    4212
0    1494
Name: sentiment, dtype: int64

In [47]:
all_pos_head=[]
for headline, sentiment in zip(df2['headline_text'],df2["sentiment"]):
    if sentiment=='1':
        all_pos_head.append(headline)

In [48]:
len(all_pos_head)

4212

In [49]:
all_neg_head=[]
for headline, sentiment in zip(df2['headline_text'],df2["sentiment"]):
    if sentiment=='0':
        all_neg_head.append(headline)

In [50]:
len(all_neg_head)

1494

In [51]:
0.75 * len(all_pos_head)

3159.0

In [52]:
0.25 * len(all_pos_head)

1053.0

In [53]:
0.75 * len(all_neg_head)

1120.5

In [54]:
0.25 * len(all_neg_head)

373.5

In [55]:
train_pos= all_pos_head[:3159]
test_pos=all_pos_head[3159:]
train_neg= all_neg_head[:1120]
test_neg=all_neg_head[1120:]
train_x=train_pos+ train_neg
test_x=test_pos+ test_neg

In [56]:
len(train_x)

4279

In [57]:
train_y=np.append(np.ones((len(train_pos),1)),np.zeros((len(train_neg),1)),axis=0)

In [58]:
test_y=np.append(np.ones((len(test_pos),1)),np.zeros((len(test_neg),1)),axis=0)

In [59]:
def process(headline):
    tokenized =tokenizer.tokenize(headline)
    cleaned=[]
    for word in tokenized:
        if (word not in stopwords_english and word not in string.punctuation):
            cleaned.append(word)
    
    stemmed=[]
    for word in cleaned:
        stem_word= stemmer.stem(word)
        stemmed.append(stem_word)            
    return stemmed

In [60]:
def count_tweets(result, tweets, ys):
    yslist=np.squeeze(ys).tolist()
    for y, tweet in zip(yslist, tweets):
        for word in process(tweet):
            pair = (word,y)
            if pair in result:
                result[pair] += 1
            else: 
                result[pair] = 1
    return result

In [61]:
freqs = count_tweets({}, train_x, train_y)

In [62]:
process("I love America")

['love', 'america']

In [63]:
def lookup(freqs, word, label):
    
    n = 0

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [64]:
def train_naive_bayes(freqs, train_x, train_y):
  
    loglikelihood = {}
    logprior = 0

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1] > 0:
            N_pos += freqs[pair]
        else:
            N_neg += freqs[pair]

    D = len(train_y)
    
    D_pos = (len(list(filter(lambda x: x > 0, train_y))))

    D_neg = (len(list(filter(lambda x: x <= 0, train_y))))

    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = lookup(freqs,word,1)
        freq_neg = lookup(freqs,word,0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood

In [65]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

1.0369268365129454
2308


In [66]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = process(tweet)

    p = 0

    p += logprior

    for word in word_l:

        if word in loglikelihood:
          
            p += loglikelihood[word]


    return p

In [67]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    accuracy = 0
    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0

        y_hats.append(y_hat_i)

    error = np.mean(np.absolute(y_hats-test_y))

    accuracy = 1-error


    return accuracy

In [68]:
print("Naive Bayes accuracy =%0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy =0.6379


In [69]:
from sklearn.model_selection import GridSearchCV

In [70]:
import sklearn

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
vectorizer = TfidfVectorizer(ngram_range=(1,3),lowercase=False)

In [73]:
from sklearn.model_selection import train_test_split

In [76]:
score=df2["sentiment"]

In [77]:
df2["headlines cleaned"]=df2["headline_text"].apply(process)

In [86]:
Xtrain,Xtest,ytrain,ytest = train_test_split(cl[int(len(cl)/2):],score[int(len(cl)/2):],test_size=0.25,random_state=21)

In [87]:
Xtrain = vectorizer.fit_transform(Xtrain)
Xtest = vectorizer.transform(Xtest)

In [81]:

from sklearn.naive_bayes import MultinomialNB



In [88]:
naive = MultinomialNB()
naive.fit(Xtrain,ytrain)

MultinomialNB()

In [89]:

prediciton = naive.predict(Xtest)

In [90]:
cleaned=df2["headlines cleaned"].tolist()    

In [91]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

matrix = confusion_matrix(ytest,prediciton)
print(matrix)
print('The model accuracy is {}'.format(round(accuracy_score(ytest,prediciton),3)))

[[ 42 153]
 [  0 519]]
The model accuracy is 0.786


In [85]:
cl=[]
for head in cleaned:
    a=" ".join([str(elem) for elem in head])
    cl.append(a)

In [67]:
cl

['car dealer caught bihar sale tax rule',
 're-negoti best deshmukh lender so centr',
 'samsung say hello cellular unit india',
 'govt lift port-link curb import',
 'ril plan mop rs 1 000cr',
 'eicher crash red cut 200 job',
 'indo rama post rs18cr net',
 'jet sahara wait watch',
 'auto bodi drive new classif norm',
 'us 64 sunni day',
 'dpc may contest merc jurisdict',
 'brave new biotech',
 'begin salari fall',
 'dgca propos airfar upset airlin',
 'zee art program turnaround',
 'learn check right isi mark',
 'mseb plan acid test dpc phase ii',
 'ashok leyland cut job divis',
 "fix former dse ed sodhi' pay packag lack transpar",
 'dd metro grab',
 'dpc signal climb see scope cut power tariff',
 'satyam say may face manpow problem',
 'net dip 54 keep payout 55',
 'better late never nid take new design',
 'sensex stay bull zone gain 60 point',
 'sail cut loss half',
 'drive low gear',
 'bewar snoopwar watch',
 'ia divest may roadblock new player domest sky',
 'car sale drop 15 april',
 