### Installing and importing baseline dependencies

In [1]:
!pip install sentencepiece



In [2]:
!pip install transformers



In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

In [4]:
from bs4 import BeautifulSoup
import requests

### Setting up summarization model

In [5]:
# Checkpoint identifier handed to both from_pretrained calls below.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Tokenizer and model are loaded from the same checkpoint; later cells use the
# module-level names `tokenizer` and `model` directly.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

### Summarizing single article


In [6]:
url = 'https://www.moneycontrol.com/news/business/cryptocurrency/uncertainty-looms-over-bitcoin-as-mt-gox-prepares-to-eject-one-final-time-7740091.html'
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
# Collect every <p> inside the element whose class is 'page_left_wrapper'.
paragraphs = soup.find(class_ = 'page_left_wrapper').find_all('p')

In [7]:
# Inspect the raw <p> tags collected by the previous cell.
print(paragraphs)

[<p><i><span style="font-weight: 400;">February 2014. The formidable, Tokyo-based crypto exchange of Mt. Gox, founded in 2010 and arguably the world’s biggest cryptocurrency platform at that point, handling more than 70 percent of all bitcoin transactions across the globe, was hacked. More than 8,50,000 BTC were stolen by hackers, of which around 7,50,000 or 80% of bitcoins belonged to customers. </span></i></p>, <p><span style="font-weight: 400;">And after 7 years of the exchange folding up, only some 2,00,000 BTC have been traced, and to an old wallet months after the company filed for bankruptcy. </span></p>, <p><span style="font-weight: 400;">Despite a Japanese court tasked with the mammoth job of compensating the aggrieved investors, the chance of any substantial monetary recovery being made by these early bitcoin investors and creditors looked slim. But that looks set to change. </span></p>, <p></p>]


In [8]:
# Extract the plain text of each paragraph tag.
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:300] # keep only the first 300 words because the summarization model has an input-length limit
# Re-join the retained words into the single string passed to the tokenizer.
ARTICLE = ' '.join(words)

In [9]:
# Display the truncated article text.
ARTICLE

'February 2014. The formidable, Tokyo-based crypto exchange of Mt. Gox, founded in 2010 and arguably the world’s biggest cryptocurrency platform at that point, handling more than 70 percent of all bitcoin transactions across the globe, was hacked. More than 8,50,000 BTC were stolen by hackers, of which around 7,50,000 or 80% of bitcoins belonged to customers.\xa0 And after 7 years of the exchange folding up, only some 2,00,000 BTC have been traced, and to\xa0an old wallet months after the company filed for bankruptcy.\xa0 Despite a Japanese court tasked with the mammoth job of compensating the aggrieved investors, the chance of\xa0any substantial monetary recovery being made by these early bitcoin investors and creditors looked slim. But that\xa0looks\xa0set to change.\xa0 '

In [10]:
# Truncate at tokenization time so an article longer than the model's input
# limit is cut off instead of being passed through whole; the 300-word cap
# applied earlier counts words, not tokens.
input_ids = tokenizer.encode(
    ARTICLE,
    return_tensors='pt',
    truncation=True,
    max_length=model.config.max_position_embeddings,
)
# Beam search with 5 beams; the generated summary is capped at 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Only the first returned sequence is decoded.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [11]:
# Display the generated summary string.
summary

'Investors who lost money in the Mt. Gox hack are set to get compensation.'

### Building a News and Sentiment Pipeline



In [12]:
# Tickers that the search, scrape, summarize and sentiment cells iterate over.
monitored_tickers = ['ZOMATO', 'NYKAA']


#### Searching for Stock news using Google and Moneycontrol




In [13]:
def search_for_stock_news_urls(ticker):
    """Return the href of every link on a Google News search page for ``ticker``.

    The search query is "money control <ticker>", restricted to news results
    (``tbm=nws``).

    Args:
        ticker: Ticker symbol (string) appended to the search query.

    Returns:
        A list of href strings in page order. The list is unfiltered: it
        contains Google's own navigation links as well as result links.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    search_url = "https://www.google.com/search"
    # Passing the query through `params` lets requests URL-encode it, so a
    # ticker containing '&', '#', spaces, etc. cannot corrupt the query string.
    query = {'q': 'money control {}'.format(ticker), 'tbm': 'nws'}
    # A timeout keeps the call from hanging indefinitely.
    r = requests.get(search_url, params=query, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that carry no href attribute; indexing such a
    # tag with link['href'] would raise KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [14]:
# Map each monitored ticker to the unfiltered hrefs from its news search.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'NYKAA': ['/?sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQPAgE',
  '/search?q=money+control+NYKAA&tbm=nws&ie=UTF-8&gbv=1&sei=b9CYYfn3Co3atQbwzIKQDg',
  '/search?q=money+control+NYKAA&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQ_AUIBygA',
  '/search?q=money+control+NYKAA&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQ_AUICSgC',
  '/search?q=money+control+NYKAA&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQ_AUICigD',
  'https://maps.google.com/maps?q=money+control+NYKAA&um=1&ie=UTF-8&sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQ_AUICygE',
  '/search?q=money+control+NYKAA&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQ_AUIDCgF',
  '/search?q=money+control+NYKAA&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwi569GT4Kb0AhUNbc0KHXCmAOIQ_AUIDSgG',
  '/advanced_search',
  '/search?q=money+control+NYKAA&ie=UTF-8&tbm=nws&sourc

In [15]:
# Inspect the unfiltered hrefs for a single ticker.
raw_urls['ZOMATO']

['/?sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQPAgE',
 '/search?q=money+control+ZOMATO&tbm=nws&ie=UTF-8&gbv=1&sei=btCYYfbxNIKDtQba4qzYAQ',
 '/search?q=money+control+ZOMATO&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQ_AUIBygA',
 '/search?q=money+control+ZOMATO&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQ_AUICSgC',
 '/search?q=money+control+ZOMATO&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQ_AUICigD',
 'https://maps.google.com/maps?q=money+control+ZOMATO&um=1&ie=UTF-8&sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQ_AUICygE',
 '/search?q=money+control+ZOMATO&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQ_AUIDCgF',
 '/search?q=money+control+ZOMATO&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwj24L6T4Kb0AhWCQc0KHVoxCxsQ_AUIDSgG',
 '/advanced_search',
 '/search?q=money+control+ZOMATO&ie=UTF-8&tbm=nws&source=lnt&tbs=qd

#### Stripping out unwanted URLs

In [16]:
import re

In [17]:
# Substrings that mark a URL as unwanted; passed to strip_unwanted_urls.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [18]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract the unique article URLs from a list of raw hrefs.

    An href is kept only if it contains ``'https://'`` and none of the
    substrings in ``exclude_list``. From each kept href, the text starting at
    the first ``http(s)://`` and ending before the first ``'&'`` is taken as
    the URL.

    Args:
        urls: Iterable of href strings.
        exclude_list: Substrings; an href containing any of them is dropped.

    Returns:
        A list of unique URLs in first-seen order. The order is deterministic
        across runs, so results indexed by position can be reproduced.
    """
    val = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # e.g. a bare 'https://' with nothing after it: no URL to extract.
            continue
        # Drop everything from the first '&' on (trailing query parameters
        # that follow the embedded URL).
        val.append(match.group(0).split('&')[0])
    # dict.fromkeys de-duplicates while preserving insertion order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    return list(dict.fromkeys(val))


In [19]:
# Map each monitored ticker to its filtered, de-duplicated article URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'NYKAA': ['https://www.moneycontrol.com/news/business/ipo/nykaa-ipo-opens-issue-subscribed-10-so-far-on-first-day-of-bidding-7639861.html',
  'https://www.moneycontrol.com/news/business/ipo/nykaa-ipo-share-allotment-to-be-finalised-today-also-check-listing-date-grey-market-premium-here-7689721.html',
  'https://www.moneycontrol.com/news/business/ipo/nykaa-ipo-to-open-on-october-28-price-band-at-rs-1085-1125-per-share-7613081.html',
  'https://www.moneycontrol.com/news/business/ipo/nykaa-makes-a-stellar-debut-stock-lists-at-rs-2018-with-79-premium-7698501.html',
  'https://www.moneycontrol.com/news/business/companies/nykaa-paytm-and-long-term-investing-7719421.html',
  'https://www.moneycontrol.com/news/business/ipo/nykaa-ipo-opens-on-october-28-10-key-things-to-know-before-subscribing-to-the-issue-7622901.html',
  'https://www.moneycontrol.com/news/business/earnings/fsn-e-conykaa-standalone-september-2021-net-sales-at-rs-49-39-crore-up-21-6-y-o-y-7738801.html',
  'https://www.moneycon

#### Searching and scraping cleaned URLs

In [20]:
def scrape_and_process(URLs, max_words=350):
    """Download each URL and return its leading article text.

    For every page, the text of all ``<p>`` tags inside the element with class
    ``'page_left_wrapper'`` is joined with spaces and cut to the first
    ``max_words`` space-separated words.

    Args:
        URLs: Iterable of page URLs.
        max_words: Maximum number of space-separated words kept per article.

    Returns:
        A list with exactly one string per input URL, in input order. A page
        that has no ``'page_left_wrapper'`` element contributes an empty
        string, so the result stays index-aligned with ``URLs``.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    ARTICLES = []
    for url in URLs:
        # A timeout keeps one unresponsive page from stalling the whole batch.
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        wrapper = soup.find(class_='page_left_wrapper')
        if wrapper is None:
            # find() returns None when the page has no such element; keep a
            # placeholder rather than raising AttributeError and losing the
            # articles already scraped.
            ARTICLES.append('')
            continue
        text = [paragraph.text for paragraph in wrapper.find_all('p')]
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [21]:
# Map each monitored ticker to the scraped text of its cleaned URLs.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'NYKAA': ['The public issue of\xa0FSN E-Commerce Ventures, backed by Falguni Nayar and private equity firm TPG, has subscribed\xa01.55 times as it received bids for\xa04.09 crore\xa0equity shares against an IPO size of 2.64 crore shares on October 28, the first day of bidding. The offer size has been reduced to 2.64 crore equity shares from more than 4.75 crore equity shares after the company\xa0raised Rs 2,396 crore from anchor investors on October 27. Retail investors have put in bids for\xa03.50 times of their reserved portion and a part set aside for non-institutional investors was subscribed\xa060 percent. Employees have subscribed for\xa01,69,140 equity shares against their reserved portion of 2.5 lakh shares, while qualified institutional buyers\xa0have bought\xa01.39\xa0times\xa0the\xa0portion set aside for them. This is the first public issue to hit the street in the last one month after Aditya Birla Sun Life AMC. The offer will close on November 1. Also read\xa0-\xa0Nykaa IP

#### Summarising the articles

In [22]:
def summarize(articles):
    """Generate one summary string per article.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        articles: Iterable of article strings.

    Returns:
        A list of decoded summaries, in the same order as ``articles``.
    """
    summaries = []
    # Input limit taken from the model configuration, computed once for all
    # articles.
    max_input_tokens = model.config.max_position_embeddings
    for article in articles:
        # Truncate at tokenization time: callers cap articles by word count,
        # which does not bound the number of tokens.
        input_ids = tokenizer.encode(
            article,
            return_tensors='pt',
            truncation=True,
            max_length=max_input_tokens,
        )
        # Beam search with 5 beams; the generated summary is capped at 55 tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [23]:
# Map each monitored ticker to the summaries of its scraped articles.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'NYKAA': ['Retail investors put in bids for 3.50 times of reserved portion. Employees have subscribed for 1,69,140 equity shares',
  "Investors can check share allotment status on BSE website. Nykaa operator FSN E-Commerce Ventures' public offering was subscribed 81.78 times",
  'Retail investors can invest a minimum of Rs 13,500 and a maximum of Rs 1,89,000.',
  "FSN E-commerce Ventures' IPO was subscribed 81.78 times in October 28-November 1.",
  'DMart’s business model was spot-on, Nykaa CEO Nayar says.',
  'Bidders will start bidding for the third-largest IPO of the year on October 28.',
  'FSN E-Commerce Ventures (Nykaa) has posted 94.65% jump in net profit.',
  'The issue has received bids for 216.59 crore shares against an IPO size of 2.64 crore shares.',
  '‘Nykaa is rightly placed to tap the high growth digital/fashion market,’ says Motilal Oswal.',
  'Online retailer Nykaa has acquired skincare brand, Dot & Key.'],
 'ZOMATO': ['Food delivery platform Zomato leads $60 million

In [24]:
# Inspect the summaries for a single ticker.
summaries['ZOMATO']

['Food delivery platform Zomato leads $60 million Series D round of investment. Magicpin has 6 million active users with 25 sessions per month',
 'Food delivery platform BigBasket is among the first to wish Paytm good luck.',
 'All photographs subject to copyright.',
 'Angel Broking is bullish on Zomato and has recommended buy rating on the stock',
 "Info Edge founder and vice-chairman Bikhchandani comments. He was responding to analysts who asked him about Zomato's strategy of investing",
 'Food delivery giant Zomato will be added to MSCI India Index from December 1, 2021.',
 "Mobile wallet plunges 27.25 percent on debut day. Only two issues among this year's IPOs have crossed the Rs 1-lakh-crore mark",
 'Online food delivery firm Zomato is said to invest $75 million in Shiprocket. Move comes as India’s quick commerce market is expected to grow',
 'Online food delivery platform reported an increase of 87 percent in loss.',
 'Food delivery platform Zomato recently raised over $1.25 bil

#### Adding Sentiment analysis

In [25]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the library default, so
# a change of default in a future transformers release cannot silently alter
# the scores. This is the checkpoint the unparameterized call resolved to.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [26]:
# Run the sentiment pipeline on one ticker's summaries.
sentiment(summaries['ZOMATO'])

  cpuset_checked))


[{'label': 'POSITIVE', 'score': 0.7904181480407715},
 {'label': 'NEGATIVE', 'score': 0.6091110706329346},
 {'label': 'NEGATIVE', 'score': 0.9676677584648132},
 {'label': 'NEGATIVE', 'score': 0.9668949842453003},
 {'label': 'POSITIVE', 'score': 0.7293859720230103},
 {'label': 'POSITIVE', 'score': 0.8657234311103821},
 {'label': 'NEGATIVE', 'score': 0.9931377172470093},
 {'label': 'POSITIVE', 'score': 0.995121419429779},
 {'label': 'NEGATIVE', 'score': 0.9994181394577026},
 {'label': 'NEGATIVE', 'score': 0.9987953901290894}]

In [27]:
# Map each monitored ticker to the sentiment results for its summaries.
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

  cpuset_checked))


{'NYKAA': [{'label': 'NEGATIVE', 'score': 0.960218608379364},
  {'label': 'NEGATIVE', 'score': 0.9751524925231934},
  {'label': 'NEGATIVE', 'score': 0.659213125705719},
  {'label': 'NEGATIVE', 'score': 0.9873227477073669},
  {'label': 'POSITIVE', 'score': 0.9455984234809875},
  {'label': 'POSITIVE', 'score': 0.9611591696739197},
  {'label': 'POSITIVE', 'score': 0.9607256650924683},
  {'label': 'NEGATIVE', 'score': 0.9782252311706543},
  {'label': 'POSITIVE', 'score': 0.9993256330490112},
  {'label': 'POSITIVE', 'score': 0.8124743103981018}],
 'ZOMATO': [{'label': 'POSITIVE', 'score': 0.7904181480407715},
  {'label': 'NEGATIVE', 'score': 0.6091110706329346},
  {'label': 'NEGATIVE', 'score': 0.9676677584648132},
  {'label': 'NEGATIVE', 'score': 0.9668949842453003},
  {'label': 'POSITIVE', 'score': 0.7293859720230103},
  {'label': 'POSITIVE', 'score': 0.8657234311103821},
  {'label': 'NEGATIVE', 'score': 0.9931377172470093},
  {'label': 'POSITIVE', 'score': 0.995121419429779},
  {'label':

#### Exporting Results to CSV

In [28]:
# Scratch check: the index range over one ticker's summaries.
range(len(summaries['NYKAA']))

range(0, 10)

In [29]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into rows.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys, index-aligned with ``summaries[ticker]``.
        urls: Dict mapping ticker -> list of URLs, index-aligned with
            ``summaries[ticker]``.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows, grouped by
        ticker in the iteration order of ``summaries``.

    Raises:
        KeyError: if a ticker in ``summaries`` is missing from ``scores`` or
            ``urls``.
        IndexError: if ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    # Iterate the tickers of the argument, not a notebook-level global, so the
    # function depends only on what it is passed.
    for ticker, ticker_summaries in summaries.items():
        for counter, summary in enumerate(ticker_summaries):
            result = scores[ticker][counter]
            output.append([
                ticker,
                summary,
                result['label'],
                result['score'],
                urls[ticker][counter],
            ])
    return output

In [30]:
# Build the flat list of result rows from the three per-ticker dicts.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['ZOMATO',
  'Food delivery platform Zomato leads $60 million Series D round of investment. Magicpin has 6 million active users with 25 sessions per month',
  'POSITIVE',
  0.7904181480407715,
  'https://www.moneycontrol.com/news/business/startup/magicpin-bags-60-million-in-its-series-d-round-led-by-zomato-7704121.html'],
 ['ZOMATO',
  'Food delivery platform BigBasket is among the first to wish Paytm good luck.',
  'NEGATIVE',
  0.6091110706329346,
  'https://www.moneycontrol.com/news/trends/paytm-ipo-on-paytms-big-ipo-day-zomato-bigbasket-send-dahi-shakkar-basket-full-of-luck-7690011.html'],
 ['ZOMATO',
  'All photographs subject to copyright.',
  'NEGATIVE',
  0.9676677584648132,
  'https://www.moneycontrol.com/news/photos/business/stocks/buzzing-stocks-paytm-one-97-communications-sapphire-foods-onelife-capital-advisors-and-other-stocks-in-news-today-7733161.html'],
 ['ZOMATO',
  'Angel Broking is bullish on Zomato and has recommended buy rating on the stock',
  'NEGATIVE',
  0.966

In [31]:
CSV_HEADER = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
# Guarded so that re-running this cell does not prepend a second header row.
if not final_output or final_output[0] != CSV_HEADER:
    final_output.insert(0, CSV_HEADER)

In [32]:
# Display the rows after the header insertion.
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['ZOMATO',
  'Food delivery platform Zomato leads $60 million Series D round of investment. Magicpin has 6 million active users with 25 sessions per month',
  'POSITIVE',
  0.7904181480407715,
  'https://www.moneycontrol.com/news/business/startup/magicpin-bags-60-million-in-its-series-d-round-led-by-zomato-7704121.html'],
 ['ZOMATO',
  'Food delivery platform BigBasket is among the first to wish Paytm good luck.',
  'NEGATIVE',
  0.6091110706329346,
  'https://www.moneycontrol.com/news/trends/paytm-ipo-on-paytms-big-ipo-day-zomato-bigbasket-send-dahi-shakkar-basket-full-of-luck-7690011.html'],
 ['ZOMATO',
  'All photographs subject to copyright.',
  'NEGATIVE',
  0.9676677584648132,
  'https://www.moneycontrol.com/news/photos/business/stocks/buzzing-stocks-paytm-one-97-communications-sapphire-foods-onelife-capital-advisors-and-other-stocks-in-news-today-7733161.html'],
 ['ZOMATO',
  'Angel Broking is bullish on Zomato and has recom

In [33]:
import csv
# An explicit UTF-8 encoding keeps the export independent of the platform
# default; the summaries contain non-ASCII characters such as curly quotes.
# newline='' is what the csv module expects for files it writes.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # The first row of final_output is the header row.
    csv_writer.writerows(final_output)