# In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json
# requests downloads the HTML of each page; BeautifulSoup then parses that HTML and lets us
# access its elements by identifying them with their tags and attributes.

##Class Definition
class InshortsNewsScraping:
    """Scrape headline, author, date and category of Inshorts news cards.

    Results accumulate across calls in four parallel lists (``headlineList``,
    ``authorList``, ``dateList``, ``categoryList``): index ``i`` of every
    list describes the same news card.
    """

    # Categories scraped when the caller does not pass its own list.
    DEFAULT_CATEGORIES = (
        'national', 'business', 'sports', 'world', 'politics', 'technology',
        'startup', 'entertainment', 'miscellaneous', 'hatke', 'science',
        'automobile',
    )

    # Endpoint queried for each "Load more" page.
    MORE_NEWS_URL = "https://inshorts.com/en/ajax/more_news"

    # Seconds to wait for the server; without a timeout requests can block forever.
    REQUEST_TIMEOUT = 30

    # Raw strings so that ``\s`` reaches the regex engine instead of being an
    # invalid string escape. Compiled once rather than on every call.
    _MIN_NEWS_ID_DECL = re.compile(r"var\s+min_news_id")
    _MIN_NEWS_ID_VALUE = re.compile(r'var min_news_id\s+=\s+"(.*?)"')

    def __init__(self):
        self.headlineList = []
        self.authorList = []
        self.dateList = []
        self.categoryList = []

    @staticmethod
    def _spanText(div, **criteria):
        """Return the ``.string`` of the first matching ``<span>`` in *div*.

        The value is converted to a plain ``str`` so it does not keep the
        whole parse tree alive. Returns ``None`` when no span matches or
        when the span has no single string child.
        """
        span = div.find('span', **criteria)
        if span is None or span.string is None:
            return None
        return str(span.string)

    def extractNewsDetails(self, html, newsCategory):
        """Parse *html* and append every news card found to the result lists.

        Args:
            html: HTML markup (``str`` or ``bytes``) containing news cards.
            newsCategory: Label stored in ``categoryList`` for each card found.

        A field that cannot be located on a card is recorded as ``None`` so
        that the four lists always stay the same length.
        """
        # 'lxml' is the parser; html5lib and html.parser are the alternatives.
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all('div', {'class': "news-card-title news-right-box"})

        newsHeadlines = []
        authors = []
        newsDates = []
        for div in divs:
            newsHeadlines.append(self._spanText(div, attrs={'itemprop': 'headline'}))
            authors.append(self._spanText(div, class_="author"))
            # `clas` (sic) filters on an attribute literally named "clas".
            # NOTE(review): presumably this mirrors a typo in the site's own
            # markup -- confirm against the live page before changing it.
            newsDates.append(self._spanText(div, clas="date"))

        # Extend all lists together, only after every card parsed, so a
        # failure above can never leave them misaligned.
        self.headlineList.extend(newsHeadlines)
        self.authorList.extend(authors)
        self.dateList.extend(newsDates)
        self.categoryList.extend([newsCategory] * len(divs))

    def extractMinNewsId(self, response_text):
        """Return the ``min_news_id`` value embedded in a page's inline script.

        Args:
            response_text: HTML of a page (``str`` or ``bytes``).

        Returns:
            The quoted value assigned to ``var min_news_id`` as a ``str``, or
            ``None`` when no such script or assignment is present.
        """
        soup = BeautifulSoup(response_text, 'lxml')
        # `string=` is the current name of the deprecated `text=` filter.
        script = soup.find("script", string=self._MIN_NEWS_ID_DECL)
        if script is None:
            return None
        match = self._MIN_NEWS_ID_VALUE.search(script.text)
        return match.group(1) if match else None

    def ScrapInshortsNews(self, baseURL, newsCategoryList=None, maxLoadMore=2):
        """Scrape each category page plus a few "Load more" pages.

        Args:
            baseURL: URL prefix; the category name is appended to it.
            newsCategoryList: Iterable of category names to scrape. Defaults
                to ``DEFAULT_CATEGORIES``.
            maxLoadMore: Maximum number of "Load more" requests per category.

        Returns:
            A ``pandas.DataFrame`` with the columns ``Category``,
            ``Headline``, ``Author`` and ``Date`` built from everything
            accumulated on this instance so far.
        """
        if newsCategoryList is None:
            newsCategoryList = self.DEFAULT_CATEGORIES

        for newsCategory in newsCategoryList:
            r = requests.get(baseURL + newsCategory, timeout=self.REQUEST_TIMEOUT)
            if r.status_code != 200:
                # Do not parse an error page; report it and move on.
                print(r.status_code)
                continue
            html = r.content

            self.extractNewsDetails(html, newsCategory)

            # Offset token that the "Load more" endpoint expects.
            min_news_id = self.extractMinNewsId(html)

            for _ in range(maxLoadMore):
                if min_news_id is None:
                    break  # no offset available, nothing more to request
                # NOTE(review): "category" is sent empty although the rows are
                # labelled with newsCategory -- confirm the endpoint returns
                # news for the intended category.
                response = requests.post(
                    self.MORE_NEWS_URL,
                    data={"category": '', "news_offset": min_news_id},
                    timeout=self.REQUEST_TIMEOUT,
                )
                if response.status_code != 200:
                    print(response.status_code)
                    break

                response_json = json.loads(response.text)
                self.extractNewsDetails(response_json["html"], newsCategory)
                min_news_id = response_json.get("min_news_id")

        # One named Series per column, joined side by side into a DataFrame.
        columns = [
            pd.Series(self.categoryList, name="Category"),
            pd.Series(self.headlineList, name="Headline"),
            pd.Series(self.authorList, name="Author"),
            pd.Series(self.dateList, name="Date"),
        ]
        return pd.concat(columns, axis=1)

    
## Build the scraper, collect the news, save it as a spreadsheet and show it
newsObj = InshortsNewsScraping()
df = newsObj.ScrapInshortsNews(baseURL='https://inshorts.com/en/read/')
# index=False leaves the DataFrame's row index out of the sheet.
df.to_excel(excel_writer="InshortsNews.xlsx", index=False)
# Bare expression: displays the frame when it is the last line of a notebook cell.
df