In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd
from datetime import datetime

# create class to scrape stock data
class StockScraper:
    """Find Economic Times archive articles that mention a company or its sector.

    The scraper opens the archive listing page for a given calendar day,
    downloads every linked article and keeps those whose body text contains
    the company name or the sector name.

    Attributes:
        stock: ticker symbol supplied by the caller (stored, not otherwise used here).
        c_name: company name searched for in article text (case-sensitive).
        c_sector: sector name searched for in article text (case-sensitive).
    """

    # The archive URL addresses a day through a running day number:
    # base_day is the number used for base_date, and every later day adds one.
    base_day = 40909
    base_date = (2012, 1, 1)
    d0 = datetime(*base_date)

    # Root of the site; article links found on the archive page are appended to it.
    base_url = "https://economictimes.indiatimes.com"

    def __init__(self, stock, c_name, c_sector):
        self.stock = stock
        self.c_name = c_name
        self.c_sector = c_sector

    def getDate(self, year, month, date):
        """Return the archive day number for the given calendar date.

        Args:
            year, month, date: components of the calendar day (``date`` is the
                day of the month).

        Returns:
            ``base_day`` plus the number of days between ``base_date`` and the
            given day (negative offsets are possible for earlier dates).
        """
        delta = datetime(year, month, date) - self.d0
        return delta.days + self.base_day

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed with ``html.parser``.

        The HTTP response is closed as soon as its body has been read.
        Any error raised by the request or the parser propagates to the caller.
        """
        req = Request(url=url, headers={'user-agent': 'news_scraper'})
        with urlopen(req) as response:
            return BeautifulSoup(response.read(), 'html.parser')

    def checkContent(self, url):
        """Return the article text at ``url`` if it mentions the company or sector.

        Args:
            url: address of an article page, or ``None``.

        Returns:
            The text of the first element with class ``artText`` when it
            contains ``c_name`` or ``c_sector``; ``None`` when ``url`` is
            ``None``, the page cannot be fetched, no such element exists, or
            neither keyword occurs in the text.
        """
        if url is None:
            return None

        try:
            soup = self._fetch_soup(url)
        except Exception as err:
            # Best effort: one unreachable article must not abort the whole day.
            # Only Exception is caught so Ctrl-C / kernel interrupt still works.
            print("Could not fetch the url: ", url, "-", err)
            return None

        # The article body lives in the element(s) carrying the 'artText' class.
        content = soup.find_all(class_='artText')
        if not content:
            print("No content found at the url: ", url)
            return None

        content_txt = content[0].text
        if self.c_name in content_txt or self.c_sector in content_txt:
            return content_txt

        return None

    def getRelatedNews(self, year, month, day):
        """Return the day's articles that mention the company, else the sector.

        Args:
            year, month, day: calendar day whose archive page is scanned.

        Returns:
            A list of ``(title, url, article_text)`` tuples. If at least one
            article mentions ``c_name`` only those are returned; otherwise the
            articles that matched on ``c_sector`` are returned. The list is
            empty when nothing matched or the archive page has no listing.

        Raises:
            Whatever ``urlopen`` raises if the archive page itself cannot be
            fetched (failures of individual articles are skipped instead).
        """
        url = (f'{self.base_url}/archivelist/year-{year},month-{month},'
               f'starttime-{self.getDate(year, month, day)}.cms')
        print("url: ", url)

        html = self._fetch_soup(url)
        news_table = html.find_all(class_='content')
        if not news_table:
            # Page layout differs from what is expected or the day has no archive.
            print("No archive listing found at the url: ", url)
            return []

        news_list = news_table[0].find_all('li')
        print("len(news_list): ", len(news_list))

        news_with_cname = []
        news_with_csector = []
        for news_item in news_list:
            link = news_item.find('a')
            # Skip list items that carry no usable link instead of crashing.
            if link is None or not link.get('href'):
                continue

            title = link.text
            content_url = self.base_url + link['href']

            ret_content = self.checkContent(content_url)
            if ret_content is None:
                continue

            # checkContent only returns text that matched one of the two
            # keywords, so anything without the company name matched the sector.
            if self.c_name in ret_content:
                news_with_cname.append((title, content_url, ret_content))
            else:
                news_with_csector.append((title, content_url, ret_content))

        # Company-specific hits take precedence over sector-wide ones.
        return news_with_cname or news_with_csector

In [2]:
# Ticker plus the two keywords (company, sector) used to filter articles.
stock, c_name, c_sector = 'NMDC.NS', 'NMDC', 'Mining'

scraper = StockScraper(stock=stock, c_name=c_name, c_sector=c_sector)

In [3]:
# open recover_points.csv and read the data
import pandas as pd
# Load the drop/recover dates and append two placeholder columns, both
# initialised to the string "None", to be filled with headlines later.
df = pd.read_csv('recover_points.csv').assign(drop_day="None", recover_day="None")

print(df.head())

   Unnamed: 0            drop_date         recover_date drop_day recover_day
0           0  2021-09-27 00:00:00  2021-10-04 00:00:00     None        None
1           5  2021-12-03 00:00:00  2021-12-09 00:00:00     None        None
2           6  2021-12-06 00:00:00  2021-12-08 00:00:00     None        None
3           7  2021-12-20 00:00:00  2021-12-21 00:00:00     None        None


In [4]:
# For each row, look up news related to the drop date and the recover date
# and store the first matching headline in the dataframe.
# (Writing the result to disk happens in a separate cell.)

# Each row triggers a download of every article in two daily archives, which
# is slow. MAX_ROWS caps how many rows are processed; set it to None to
# process the whole dataframe.
MAX_ROWS = 1


def _first_headline(date_str):
    """Return the title of the first related article for a 'YYYY-MM-DD...' string, or None."""
    year, month, day = (int(part) for part in date_str[:10].split('-'))
    related = scraper.getRelatedNews(year, month, day)
    return related[0][0] if len(related) > 0 else None


for position, (index, row) in enumerate(df.iterrows()):
    if MAX_ROWS is not None and position >= MAX_ROWS:
        break

    for date_col, headline_col in (('drop_date', 'drop_day'),
                                   ('recover_date', 'recover_day')):
        headline = _first_headline(row[date_col])
        # Leave the placeholder untouched when no related article was found.
        if headline is not None:
            df.at[index, headline_col] = headline

url:  https://economictimes.indiatimes.com/archivelist/year-2021,month-9,starttime-44466.cms
len(news_list):  191
No content found at the url:  https://economictimes.indiatimes.com/tech/newsletters/morning-dispatch/esta-2021-and-the-winners-are-/articleshow/86540716.cms
url:  https://economictimes.indiatimes.com/archivelist/year-2021,month-10,starttime-44473.cms
len(news_list):  217
No content found at the url:  https://economictimes.indiatimes.com/tech/newsletters/morning-dispatch/chinese-investor-may-exit-delhivery-thrasio-eyes-india-foray/articleshow/86737606.cms


In [7]:
# Preview the first five rows of the dataframe.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,drop_date,recover_date,drop_day,recover_day
0,0,2021-09-27 00:00:00,2021-10-04 00:00:00,JSPL to start mining at Kasia mine within a mo...,
1,5,2021-12-03 00:00:00,2021-12-09 00:00:00,,
2,6,2021-12-06 00:00:00,2021-12-08 00:00:00,,
3,7,2021-12-20 00:00:00,2021-12-21 00:00:00,,


In [6]:
# Write the dataframe back to recover_points.csv (same file it was read from),
# omitting the index column.
df.to_csv(path_or_buf='recover_points.csv', index=False)