# News Analyst
cc - Ujjwal Rajput
ujjwalrajputofficial.in@gmail.com

### Step 1 - Web Scraping

##### Importing Libraries

In [2]:
from bs4 import BeautifulSoup
from datetime import datetime
import time
import requests
import pandas as pd
import os

#### Web Scraping function 
Collecting news from www.news.google.com

In [25]:
count = 1
num_pages = 5  # Number Of Pages -- 1 = 100 per cp
base_url = "https://news.google.com/search?q="
cps = ['HDFC']  # Replace with your desired search query

headlines_list = []
sources_list = []
dates_list = []
cps_list = []

for cp in cps:
    for page_num in range(num_pages):
    
        url = base_url+cp
        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            
            headlines = soup.find_all("h3", class_="ipQwMb ekueJc RD0gLb")
            dates = soup.find_all("time", class_="WW6dff uQIVzc Sksgp slhocf")
            sources = soup.find_all("div", class_="N0NI1d AVN2gc WfKKme")
            for headline, sources, dates in zip(headlines, sources , dates):

                date = dates["datetime"]
                source = sources.a.text
                headline_text = headline.a.text

                parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
                formatted_date = parsed_date.strftime("%d-%m-%y")
                
                headlines_list.append(headline_text)
                sources_list.append(source)
                dates_list.append(formatted_date)
                cps_list.append(cp)
                
                count += 1

            time.sleep(2)  # Add a delay to avoid overloading the server
        else:
            print("Unable to establish connection ")
            print(response)
            break
data = {
    "Company":cps_list,
    "Headline": headlines_list,
    "Source": sources_list,
    "Date": dates_list

}
print("Scraping Sucessfull \nTotal Article Scraped : "+str(count) )

Scraping Sucessfull 
Total Article Scraped : 511


Storing the Scrapped data in form of DataFrame and Excel  

In [26]:
df = pd.DataFrame(data)

save_directory = r"d:\github\News-Analyst"  # Replacable directory path
excel_filename = os.path.join(save_directory, "google_news_scraped_data.xlsx")

df.to_excel(excel_filename, index=False)

print("Saved successful")

Saved successful


In [27]:
df.shape

(510, 4)

In [28]:
df.head()

Unnamed: 0,Company,Headline,Source,Date
0,HDFC,Markets log 4th day of rally on buying in HDFC...,Times of India,06-09-23
1,HDFC,Markets log 4th day of rally on buying in HDFC...,The Hindu,06-09-23
2,HDFC,"HDFC Bank, ICICI Bank, Bank of Baroda shares: ...",CNBCTV18,07-09-23
3,HDFC,Buy Sobha; target of Rs 1024: HDFC Securities,Moneycontrol,07-09-23
4,HDFC,UPI and digital trends shaping the future of c...,BusinessLine,04-09-23


In [29]:
df.tail()

Unnamed: 0,Company,Headline,Source,Date
505,HDFC,Brokers alert clients to close out HDFC F&O Co...,Mint,12-07-23
506,HDFC,'It is time to hang my boots': HDFC chairman D...,Business Today,30-06-23
507,HDFC,"MFs may have to offload ₹4,500 cr of HDFC Bank...",BusinessLine,15-06-23
508,HDFC,"HDFC Bank stares at NIM, cost concerns after d...",Mint,17-04-23
509,HDFC,HDFC Bank’s $173 billion merger with its paren...,The Economic Times,27-06-23


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Company   510 non-null    object
 1   Headline  510 non-null    object
 2   Source    510 non-null    object
 3   Date      510 non-null    object
dtypes: object(4)
memory usage: 16.1+ KB


### Step 2 - VADER Sentiment Scoring

In [19]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

from tqdm.notebook import tqdm

sia = SentimentIntensityAnalyzer()

In [43]:
df['Headline'][6]

'HDFC Bank: ग्राहकों को बड़ा झटका, इस चीज के चुकाना होंगे ज्यादा पैसे, लिया ये बड़ा फैसला'

In [44]:
sia.polarity_scores(df['Headline'][6])

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [70]:
sc = pd.DataFrame()
sc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [71]:
for headline in df['Headline']:
    scores = sia.polarity_scores(headline)
    sc = pd.concat([sc, pd.DataFrame(scores, index=[0])], ignore_index=True)

sc.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   neg       510 non-null    float64
 1   neu       510 non-null    float64
 2   pos       510 non-null    float64
 3   compound  510 non-null    float64
dtypes: float64(4)
memory usage: 16.1 KB


In [69]:
sc.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,0.864,0.136,0.296
3,0.0,0.761,0.239,0.296
4,0.0,0.86,0.14,0.3818


In [82]:
df = pd.concat([df, sc], axis=1)
df = df.loc[:,~df.columns.duplicated()]
df.head()

Unnamed: 0,Company,Headline,Source,Date,neg,neu,pos,compound
0,HDFC,Markets log 4th day of rally on buying in HDFC...,Times of India,06-09-23,0.0,1.0,0.0,0.0
1,HDFC,Markets log 4th day of rally on buying in HDFC...,The Hindu,06-09-23,0.0,1.0,0.0,0.0
2,HDFC,"HDFC Bank, ICICI Bank, Bank of Baroda shares: ...",CNBCTV18,07-09-23,0.0,0.864,0.136,0.296
3,HDFC,Buy Sobha; target of Rs 1024: HDFC Securities,Moneycontrol,07-09-23,0.0,0.761,0.239,0.296
4,HDFC,UPI and digital trends shaping the future of c...,BusinessLine,04-09-23,0.0,0.86,0.14,0.3818


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Company   510 non-null    object 
 1   Headline  510 non-null    object 
 2   Source    510 non-null    object 
 3   Date      510 non-null    object 
 4   neg       510 non-null    float64
 5   neu       510 non-null    float64
 6   pos       510 non-null    float64
 7   compound  510 non-null    float64
dtypes: float64(4), object(4)
memory usage: 32.0+ KB


In [85]:
# Assuming you have a DataFrame named 'df'
average_neg = df['neg'].mean()
average_neu = df['neu'].mean()
average_pos = df['pos'].mean()
average_compound = df['compound'].mean()

# Print the averages
print("Average 'neg':", average_neg)
print("Average 'neu':", average_neu)
print("Average 'pos':", average_pos)
print("Average 'compound':", average_compound)


Average 'neg': 0.027450980392156862
Average 'neu': 0.8639901960784313
Average 'pos': 0.10855882352941176
Average 'compound': 0.1699509803921569
