## Data Science Business Analytics Internship
### GRIP @ The Sparks Foundaton
<h2 style=color:green align="left"> BY: Anuganti Suresh </h2>

------------------

<h2 style="color:blue" align="left"> Task 7: Stock Market Prediction using Numerical and Textual Analysis </h2>


## Objective

- Create a hybrid model for stock price / performance prediction using numerical analysis of historical stock prices, and sentimental analysis of news headlines.


- Stock to analyze and predict - SENSEX (S&P BSE SENSEX)

----------------

<h2 style="color:blue" align="left"> 1. Import necessary Libraries </h2>

In [2]:
# Read Data
import numpy as np                     # Linear Algebra (calculate the mean and standard deviation)
import pandas as pd                    # manipulate data, data processing, load csv file I/O (e.g. pd.read_csv)

# Visualization
import seaborn as sns                  # Visualization using seaborn
import matplotlib.pyplot as plt        # Visualization using matplotlib
%matplotlib inline

# style
plt.style.use("fivethirtyeight")       # Set Graphs Background style using matplotlib
sns.set_style("darkgrid")              # Set Graphs Background style using seaborn

import warnings                        # Ignore Warnings
warnings.filterwarnings("ignore")

In [3]:
from textblob import TextBlob

<h2 style="color:blue" align="left"> 2. Load data </h2>

### a. india-news-headlines

In [4]:
news_data = pd.read_csv('india-news-headlines.csv')

In [5]:
# Import first 5 rows
news_data.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010101,sports.wwe,win over cena satisfying but defeating underta...
1,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
2,20010102,unknown,Fissures in Hurriyat over Pak visit
3,20010102,unknown,America's unwanted heading for India?
4,20010102,unknown,For bigwigs; it is destination Goa


In [6]:
# Import last 5 rows
news_data.tail()

Unnamed: 0,publish_date,headline_category,headline_text
3297167,20200630,gadgets-news,why tiktok removed 1 65 crore videos in india
3297168,20200630,entertainment.hindi.bollywood,apurva asrani calls alia bhatts mother soni ra...
3297169,20200630,entertainment.hindi.bollywood,kangana ranaut gets a doll version of herself ...
3297170,20200630,entertainment.hindi.bollywood,meezaan jaffrey reminisces his childhood days ...
3297171,20200630,entertainment.telugu.movies.news,prabhas20 titled as radhe shyam prabhas and po...


In [7]:
# checking dimension (num of rows and columns) of dataset
print("'india-news-headlines' shape (Rows, Columns):", news_data.shape)

'india-news-headlines' shape (Rows, Columns): (3297172, 3)


In [8]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3297172 entries, 0 to 3297171
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   publish_date       int64 
 1   headline_category  object
 2   headline_text      object
dtypes: int64(1), object(2)
memory usage: 75.5+ MB


In [9]:
# Finding all the column names inside the dataset
news_data.columns

Index(['publish_date', 'headline_category', 'headline_text'], dtype='object')

In [10]:
news_data.duplicated().sum()

21585

In [11]:
news_data['published_date'] = np.array([str(str(str(x)[:4]) + '/' + str(str(x)[4:6]) + '/' + str(str(x)[6:])) for x in news_data['publish_date']])

In [12]:
news_data.head()

Unnamed: 0,publish_date,headline_category,headline_text,published_date
0,20010101,sports.wwe,win over cena satisfying but defeating underta...,2001/01/01
1,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...,2001/01/02
2,20010102,unknown,Fissures in Hurriyat over Pak visit,2001/01/02
3,20010102,unknown,America's unwanted heading for India?,2001/01/02
4,20010102,unknown,For bigwigs; it is destination Goa,2001/01/02


In [13]:
news_data = news_data.drop('publish_date', axis=1)
news_data.head()

Unnamed: 0,headline_category,headline_text,published_date
0,sports.wwe,win over cena satisfying but defeating underta...,2001/01/01
1,unknown,Status quo will not be disturbed at Ayodhya; s...,2001/01/02
2,unknown,Fissures in Hurriyat over Pak visit,2001/01/02
3,unknown,America's unwanted heading for India?,2001/01/02
4,unknown,For bigwigs; it is destination Goa,2001/01/02


In [14]:
news_data['published_date'] = pd.to_datetime(news_data['published_date'])

In [15]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3297172 entries, 0 to 3297171
Data columns (total 3 columns):
 #   Column             Dtype         
---  ------             -----         
 0   headline_category  object        
 1   headline_text      object        
 2   published_date     datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 75.5+ MB


In [16]:
news_data = news_data[['published_date', 'headline_text']]
news_data.columns = ['published_date', 'headline']
news_data.head()

Unnamed: 0,published_date,headline
0,2001-01-01,win over cena satisfying but defeating underta...
1,2001-01-02,Status quo will not be disturbed at Ayodhya; s...
2,2001-01-02,Fissures in Hurriyat over Pak visit
3,2001-01-02,America's unwanted heading for India?
4,2001-01-02,For bigwigs; it is destination Goa


In [17]:
dict_news = {}

In [18]:
temp = news_data.loc[0, 'published_date']
temp2 = str(news_data.loc[0, 'headline'])
for x in range(1, len(news_data)):
    if news_data.loc[x, 'published_date']==temp:
        temp2 += '. ' + str(news_data.loc[x, 'headline'])
    else:
        dict_news[news_data.loc[x-1, 'published_date']] = temp2
        temp2 = ""
        temp = news_data.loc[x, 'published_date']

In [17]:
len(dict_news)

7079

In [18]:
indexes = np.arange(0, len(dict_news))

In [19]:
df_news = pd.DataFrame(indexes)
df_news.head()

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4


In [20]:
df_news['Published_Date'] = dict_news.keys()
df_news.head()

Unnamed: 0,0,Published_Date
0,0,2001-01-01
1,1,2001-01-02
2,2,2001-01-03
3,3,2001-01-04
4,4,2001-01-05


In [21]:
l = []
for i in dict_news.keys():
    l.append(dict_news[i])

In [22]:
l[0]

'win over cena satisfying but defeating undertaker bigger roman reigns'

In [24]:
df_news['Headline'] = np.array(l)
df_news.head()

Unnamed: 0,0,Published_Date,Headline
0,0,2001-01-01,win over cena satisfying but defeating underta...
1,1,2001-01-02,. Fissures in Hurriyat over Pak visit. America...
2,2,2001-01-03,. Think again; Laxman tells Sangh Parivar. Hur...
3,3,2001-01-04,. Vajpayee gives big push to tech education; R...
4,4,2001-01-05,. Hawkings' day out. Light combat craft takes ...


In [25]:
df_news = df_news.drop(0, axis=1)

In [29]:
polarity = []
subjectivity = []
tuples = []
for i in df_news['Headline'].values:
    my_valence = TextBlob(i)
    tuples.append(my_valence.sentiment)

KeyboardInterrupt: 

In [None]:
for i in tuples:
    polarity.append(i[0])
    subjectivity.append(i[1])

In [None]:
df_news['Polarity'] = np.array(polarity)
df_news['Subjectivity'] = np.array(subjectivity)

In [None]:
df_news.head()

In [None]:
temp = ['Positive', 'Negative', 'Neutral']
temp1 = ['Factual', 'Public']
polarity = []
subjectivity = []
for i in range(len(df_news)):
    pol = df_news.iloc[i]['Polarity']
    sub = df_news.iloc[i]['Subjectivity']
    if pol>=0:
        if pol>=0.2:
            polarity.append(temp[0])
        else:
            polarity.append(temp[2])
    else:
        if pol<=-0.2:
            polarity.append(temp[1])
        else:
            polarity.append(temp[2])
    
    if sub>=0.4:
        subjectivity.append(temp1[1])
    else:
        subjectivity.append(temp1[0])

In [None]:
df_news['Sentiment'] = polarity
df_news['Opinion'] = subjectivity

In [None]:
len(df_news)

In [None]:
plt.figure(figsize=(6,4))
df_news['Subjectivity'].hist()
plt.show()

In [None]:
plt.figure(figsize=(6,4))
df_news['Polarity'].hist()
plt.show()

In [None]:
sns.countplot(df_news['Opinion'])