In [1]:
#import necessary libraries
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('vader_lexicon')
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KNA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\KNA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Labels applied to the columns read from the CSV.
cols = ['Date', 'Category', 'News']
# header=None spells out what passing `names` alone already implies: no line
# of the file is consumed as a header, so the file's first line lands in row 0.
data = pd.read_csv('News Headline.csv', header=None, names=cols)

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Date,Category,News
0,publish_date,headline_category,headline_text
1,20010720,city.delhi,Adulterated oil seized; three held
2,20010720,city.mumbai,Chicken lickin' delicious
3,20010720,city.ahmedabad,Silver corridor faces hazy future
4,20010720,city.hyderabad,Centre okays six power projects for state


In [5]:
# Remove the row labelled 0 (the CSV's own header line, read in as data)
# and the Category column in a single call, then summarise what is left.
data = data.drop(index=0, columns='Category')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035650 entries, 1 to 1035650
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   Date    1035650 non-null  object
 1   News    1035650 non-null  object
dtypes: object(2)
memory usage: 15.8+ MB


In [6]:
 # Parse the Date column with the %Y%m%d format (e.g. 20010720) into datetimes
 parsed_dates = pd.to_datetime(data['Date'], format='%Y%m%d')
 data['Date'] = parsed_dates
 data

Unnamed: 0,Date,News
1,2001-07-20,Adulterated oil seized; three held
2,2001-07-20,Chicken lickin' delicious
3,2001-07-20,Silver corridor faces hazy future
4,2001-07-20,Centre okays six power projects for state
5,2001-07-20,Undertrials benefit from court-in-jail experiment
...,...,...
1035646,2011-01-23,Police recover 100 'improper' SIMs
1035647,2011-01-23,5 new KGBVs for Lucknow
1035648,2011-01-23,70% can't afford sanitary napkins; reveals study
1035649,2011-01-23,Cops ask internet cafe; PCO owners to be more ...


In [7]:
#Grouping the headlines for each day
# Aggregate straight to one row per date instead of broadcasting the joined
# text back onto every original row and de-duplicating afterwards.
# sort=False keeps the dates in order of first appearance, which is the order
# drop_duplicates() used to leave behind; reset_index() turns Date back into a
# column and yields a fresh 0..n-1 index.
data = (
    data.groupby('Date', sort=False)['News']
        .agg(' '.join)
        .reset_index()
)
data

Unnamed: 0,Date,News
0,2001-07-20,Adulterated oil seized; three held Chicken lic...
1,2001-07-21,Indian batsmen look for redemption Stop scorin...
2,2001-07-22,People's Front braces to slam Govt Lodge FIR a...
3,2001-07-23,BSP focuses on Punjab; UP issues for polls Con...
4,2001-07-24,Wrong timing: Hike in fee for medical; dental ...
...,...,...
3464,2011-01-18,Top sex mistakes men make No alcohol for 6-pac...
3465,2011-01-19,My ex doesn't want me: Pratik Saali is not a g...
3466,2011-01-20,Neetu Chandra denies going panty-less! Aishwar...
3467,2011-01-22,Paddy procurement stops for want of empty sack...


In [8]:
# Display the News column on its own.
data.loc[:, 'News']

0       Adulterated oil seized; three held Chicken lic...
1       Indian batsmen look for redemption Stop scorin...
2       People's Front braces to slam Govt Lodge FIR a...
3       BSP focuses on Punjab; UP issues for polls Con...
4       Wrong timing: Hike in fee for medical; dental ...
                              ...                        
3464    Top sex mistakes men make No alcohol for 6-pac...
3465    My ex doesn't want me: Pratik Saali is not a g...
3466    Neetu Chandra denies going panty-less! Aishwar...
3467    Paddy procurement stops for want of empty sack...
3468    Delhi Daredevils Mumbai Indians Chennai Super ...
Name: News, Length: 3469, dtype: object

In [None]:
#Cleaning headlines
# Build the stop-word set and the stemmer once, outside the loop: looking up
# stopwords.words('english') for every single token is loop-invariant work
# that would otherwise dominate the runtime of this cell.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# c collects one cleaned string per row of data['News'], in row order.
c = []
for headline in data['News']:
    # Replace every non-letter with a space, lowercase, split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', headline).lower().split()
    # Drop stop words, stem what remains, and glue the tokens back together.
    c.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [None]:
# Assign the cleaned strings by position. Wrapping the list in pd.Series would
# align on index labels and silently insert NaN wherever data's index is not
# exactly 0..n-1; a plain list raises instead if the lengths disagree.
data['News'] = c
data

In [None]:
# WordCloud
from wordcloud import WordCloud

# One long string containing every entry of the News column, space-separated.
allWords = ' '.join(data['News'])
# random_state is fixed so the generated layout is repeatable between runs.
wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=119).generate(allWords)

plt.imshow(wordCloud, interpolation='bilinear')
plt.axis('off')
# plt.show must be *called*; a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
#Functions to get the subjectivity and polarity
def getSubjectivity(text):
  """Return the subjectivity component of TextBlob's sentiment for `text`."""
  return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
  """Return the polarity component of TextBlob's sentiment for `text`."""
  return TextBlob(text).sentiment.polarity

#Adding subjectivity and polarity columns
# Analyse each text once and read both scores from the same result; applying
# getSubjectivity and getPolarity separately would build and analyse a
# TextBlob twice per row.
blob_sentiments = [TextBlob(text).sentiment for text in data['News']]
data['Subjectivity'] = [s.subjectivity for s in blob_sentiments]
data['Polarity'] = [s.polarity for s in blob_sentiments]
data

In [None]:
# Histogram of the Polarity column on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
data['Polarity'].hist(color='blue', ax=ax)

In [None]:
# Histogram of the Subjectivity column on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
data['Subjectivity'].hist(color='blue', ax=ax)

In [None]:
#Adding sentiment score to dataset
sentiment = SentimentIntensityAnalyzer()

# Score every text exactly once and fan the resulting dict out into the four
# columns, rather than re-running polarity_scores separately for each column.
vader_scores = [sentiment.polarity_scores(v) for v in data['News']]
data['Compound'] = [s['compound'] for s in vader_scores]
data['Negative'] = [s['neg'] for s in vader_scores]
data['Neutral'] = [s['neu'] for s in vader_scores]
data['Positive'] = [s['pos'] for s in vader_scores]
data


In [None]:
# Write the frame to the file 'news'. to_csv is a DataFrame method (pandas has
# no module-level pd.to_csv) and returns None when given a path, so its result
# must not be assigned back to `data`.
data.to_csv('news')