# Analyzing daily stock news to understand market sentiment (mainly positive or negative)

In [1]:
import nltk
import pandas as pd
import selenium
import numpy as np
import bs4
from bs4 import BeautifulSoup as bs
import requests
import urllib

In [2]:
# Stock-news listing page on economictimes.indiatimes.com; this is the page
# that gets loaded in the browser and scraped.
url = 'https://economictimes.indiatimes.com/markets/stocks/news'

### Scraping using Selenium

In [3]:
import selenium
from selenium import webdriver
import time

In [4]:
driver = webdriver.Chrome('Drivers/chromedriver.exe')
driver.get(url)
time.sleep(5)
for i in range(5):
    driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")
driver.close()

### Using beautifulSoup to parse the data

In [5]:
page_html = bs(driver.page_source,"html.parser")

### Extracting headlines and summary of each headline

In [6]:
# Latest 48 headlines scraped from Economic Times.
# Each selector is queried once; re-running the search on every loop
# iteration rescans the whole document each time.
# Headlines: text of every <h3> except the "BULL'S EYE" section title; the
# trailing [:-1] drops the last remaining <h3>
# (presumably a non-headline element — verify against the live page).
headlines = [tag.text for tag in page_html.find_all('h3') if tag.text != "BULL'S EYE"][:-1]
# Summaries: text of the first <p> inside each div with class "eachStory".
summary = [story.find_all('p')[0].text for story in page_html.find_all('div', {'class': "eachStory"})]

### Creating a raw dataframe

In [66]:
# Column names for the raw frame, paired positionally with the scraped lists.
features = ['Headlines', 'Summary']
raw_data = pd.DataFrame({features[0]: headlines, features[1]: summary})

In [67]:
raw_data

Unnamed: 0,Headlines,Summary
0,Sebi extends CKYCR to legal entities,Regulated entities (REs) have already been upl...
1,New dividend policy to ensure consistent rewar...,The dividend policy has been further fine-tune...
2,RBI removes PCA restrictions on IDBI Bank,IDBI Bank was placed under the so-called PCA f...
3,UBS signals risk of India stocks trailing bond...,Continuing their rally from pandemic-driven lo...
4,F&O: Nifty makes higher lows for a third sessi...,India VIX fell 7.79% from 22.49 to 20.74 level...
5,"Tech View: Nifty wins a mini-battle, sets sigh...",After narrow trading for the last five session...
6,Anupam Rasayan IPO looks fully priced: Should ...,After annualising the earnings for the nine-mo...
7,"Meet Zhong Shanshan, a reclusive entrepreneur ...",He became the first Chinese entrepreneur to en...
8,Is the stock market closed tomorrow?,Markets will be closed for trading on Thursday...
9,States budgets can help improve India's GDP pr...,The consolidated fiscal deficit of the Centre ...


# Text preprocessing using nltk

In [68]:
import nltk
import string
import re
import inflect

In [69]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages: the stop-word corpus (read by `stopwords`)
# and the Punkt tokenizer models (presumably what `word_tokenize` loads).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [70]:
def lowercase(x):
    """Return a lower-cased copy of the string *x*."""
    lowered = x.lower()
    return lowered

In [71]:
def replace_percent(x):
    """Return *x* with every '%' character replaced by ' percent'.

    Spelling the sign out keeps the information when punctuation is
    stripped in a later step. An empty string is returned unchanged.
    """
    return x.replace('%', ' percent')

In [72]:
def remove_stop_words(x):
    """Tokenize the string *x* and return its tokens minus English stop words.

    Each kept token is whitespace-stripped; a token equal to a single space
    after stripping is discarded.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(x):
        if token in english_stops:
            continue
        cleaned = token.strip()
        if cleaned != ' ':
            kept.append(cleaned)
    return kept

In [73]:
def remove_punctuations(x):
    """Return the string *x* with all ``string.punctuation`` characters deleted."""
    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return x.translate(deletions)

In [74]:
def remove_numbers(x):
    """Return a list like *x* with all-digit items replaced by ''.

    The result has the same length as *x*; purely numeric tokens become
    empty-string placeholders rather than being dropped.
    """
    return ['' if token.isdigit() else token for token in x]

In [75]:
# Clean both text columns with the same chain of steps. '%' is spelled out
# before punctuation is stripped (otherwise it would simply be deleted), and
# the string-level steps run before tokenization in remove_stop_words.
raw_data['Headlines'] = (
    raw_data['Headlines']
    .apply(lowercase)
    .apply(replace_percent)
    .apply(remove_punctuations)
    .apply(remove_stop_words)
    .apply(remove_numbers)
)


raw_data['Summary'] = (
    raw_data['Summary']
    .apply(lowercase)
    .apply(replace_percent)
    .apply(remove_punctuations)
    .apply(remove_stop_words)
    .apply(remove_numbers)
)

In [76]:
# Next, apply lemmatization so that inflected forms of a word map to a common base form, which helps the analysis generalize.
# Lemmatization works on individual tokens, so the sentences must already be in tokenized form.

#### Lemmatization

In [77]:
from nltk.stem import wordnet
# Shared lemmatizer instance, used by lemmatize_word.
lemma = wordnet.WordNetLemmatizer()
# Fetch the WordNet data package (presumably required by the lemmatizer
# at first use).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [78]:
def lemmatize_word(x):
    """Return the tokens of *x*, each lemmatized as a verb (``pos='v'``)."""
    lemmatized = []
    for token in x:
        lemmatized.append(lemma.lemmatize(token, pos='v'))
    return lemmatized

In [79]:
# Lemmatize the token lists in both text columns.
raw_data['Headlines'], raw_data['Summary'] = (
    raw_data['Headlines'].apply(lemmatize_word),
    raw_data['Summary'].apply(lemmatize_word),
)

In [80]:
# NOTE(review): redundant cell — nltk is already imported and 'punkt' is
# already downloaded in an earlier cell; kept only for re-runs in isolation.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [87]:
def drop_zero_len_string(x):
    """Return a new list with the items of *x* that are not the empty string."""
    return [item for item in x if item != '']

In [89]:
# Remove the empty-string placeholders left where numeric tokens were blanked.
raw_data['Summary'], raw_data['Headlines'] = (
    raw_data['Summary'].apply(drop_zero_len_string),
    raw_data['Headlines'].apply(drop_zero_len_string),
)

# Parts of Speech (POS) Tagging

In [90]:
from nltk import pos_tag
# Fetch the tagger model package (presumably the model pos_tag loads).
nltk.download('averaged_perceptron_tagger')
def pos_tagg(text):
    """Return the result of ``pos_tag`` for the token list *text*."""
    tagged_tokens = pos_tag(text)
    return tagged_tokens

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [91]:
# Replace each token list with its (token, POS tag) pairs.
raw_data['Headlines'], raw_data['Summary'] = (
    raw_data['Headlines'].apply(pos_tagg),
    raw_data['Summary'].apply(pos_tagg),
)

In [93]:
raw_data.head()

Unnamed: 0,Headlines,Summary
0,"[(sebi, JJ), (extend, VBP), (ckycr, JJ), (lega...","[(regulate, NN), (entities, NNS), (res, VBZ), ..."
1,"[(new, JJ), (dividend, NN), (policy, NN), (ens...","[(dividend, NN), (policy, NN), (finetuned, VBN..."
2,"[(rbi, NN), (remove, VB), (pca, NN), (restrict...","[(idbi, JJ), (bank, NN), (place, NN), (socalle..."
3,"[(ubs, JJ), (signal, NN), (risk, NN), (india, ...","[(continue, VB), (rally, RB), (pandemicdriven,..."
4,"[(fo, JJ), (nifty, NNS), (make, VBP), (higher,...","[(india, NN), (vix, NN), (fell, VBD), (percent..."


In [97]:
# Download the 'tagsets' data package, which the tag help lookup below uses.
nltk.download('tagsets') 
  
# Print the description and example words for the 'NN' tag.
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
