# Scraping and Summarizing News

# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers



In [2]:
!pip install sentencepiece



In [3]:
from transformers import PegasusForConditionalGeneration, AutoTokenizer
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [4]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. Summarize a Single Article

In [5]:
url = "https://nz.yahoo.com/news/no-donald-trump-isn-t-195944759.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [6]:
paragraphs[0].text

'An image depicting former president Donald Trump wading through floodwaters alongside a fellow disaster responder went viral on social media this week.'

In [7]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [8]:
ARTICLE



In [9]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [10]:
summary

'Machine-generated images are circulating in the wake of Hurricane Florence.'

# 4. Building a News and Sentiment Pipeline

In [36]:
#monitored_tickers = ['TSLA', 'BTC']
monitored_tickers = ['TSLA']

## 4.1. Search for Stock News using Google and Yahoo Finance

Google search: yahoo finance {}, and click News.

In [37]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [38]:
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'TSLA': ['/?sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQOwgC',
  '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&tbm=nws&gbv=1&sei=YRQJZ-SAHuC90PEPqfmwoAw',
  '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQ_AUIBSgA',
  '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQ_AUIBygC',
  '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQ_AUICCgD',
  '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BTSLA%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQiaAMCAkoBA&usg=AOvVaw3S7p1PojgyDuC-a6M2ESur',
  '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BTSLA%26sca_esv%3Da4f4a06d7248b912%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiktfLZpIaJAxXgHjQI

In [40]:
raw_urls['TSLA']

['/?sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQOwgC',
 '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&tbm=nws&gbv=1&sei=YRQJZ-SAHuC90PEPqfmwoAw',
 '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQ_AUIBSgA',
 '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQ_AUIBygC',
 '/search?q=yahoo+finance+TSLA&sca_esv=a4f4a06d7248b912&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQ_AUICCgD',
 '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BTSLA%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQiaAMCAkoBA&usg=AOvVaw3S7p1PojgyDuC-a6M2ESur',
 '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BTSLA%26sca_esv%3Da4f4a06d7248b912%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiktfLZpIaJAxXgHjQIHak8DMQQiaAMCAo

## 4.2. Strip out unwanted URLs

In [41]:
import re

In [42]:
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [43]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [44]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'TSLA': ['https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/musk-shows-tesla-cybercab-sees-085158169.html',
  'https://finance.yahoo.com/news/tesla-inc-tsla-stock-rich-063044616.html',
  'https://finance.yahoo.com/news/teslas-cybercab-robotaxi-is-finally-here-with-a-30k-price-tag--plus-a-surprise-robovan-071844079.html',
  'https://finance.yahoo.com/news/tesla-robotaxi-event-analysts-weigh-in-on-what-to-expect-from-ceo-elon-musks-big-moment-164748104.html',
  'https://finance.yahoo.com/news/tesla-stock-short-sellers-favorite-185545686.html',
  'https://finance.yahoo.com/video/pfizer-starboard-value-tesla-robotaxi-221957862.html',
  'https://finance.yahoo.com/news/tesla-robotaxi-event-long-promises-082325759.html',
  'https://finance.yahoo.com/quote/LCID/',
  'https://finance.yahoo.com/news/cramer-says-buy-tesla-inc-220900728.html',
  'https://finance.yahoo.com/video/tesla-robotaxi-day-know-ahead-233000325.html']}

## 4.3. Search and Scrape Cleaned URLs

In [45]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [46]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'TSLA': ['© 2024 - Privacy - Terms',
  '(Bloomberg) -- Elon Musk’s unveiling of self-driving taxi prototypes let down some Tesla Inc. investors, with many expecting more particulars on how the carmaker can possibly pull off his latest robocar predictions. Most Read from Bloomberg The Cablebus Transformed Commutes in Mexico City’s Populous Outskirts San Francisco to Shut 9% of Public Schools Amid Budget Woes Chicago’s $1 Billion Budget Hole Exacerbated by School Turmoil Urban Heat Stress Is Another Disparity in the World’s Most Unequal Nation Should Evictions Be Banned After Hurricanes and Climate Disasters? The Tesla chief executive officer unveiled a slick two-seat sedan called Cybercab late Thursday, saying production may start in 2026 and that the vehicle could cost less than $30,000. Musk hitched a ride in one of the cars on his way to the stage at the automaker’s event in Burbank, California. He also showcased a futuristic-looking Robovan concept that he said could transport as m

## 4.4. Summarise all Articles

In [47]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt', max_length=55, truncation=True)
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [48]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'TSLA': ['Your information may be shared with third parties.',
  'Tesla unveils self-driving taxi prototypes in California. Investors expect more details on how the technology will work',
  'Tesla is one of the 10 stocks that will make you rich in 5-10 years.',
  'Musk unveiled a larger, self-driving Robovan at Tesla event.',
  'Analysts expect Tesla to show off self-driving car at event. Most don’t expect a fully functional robotaxi right away',
  'Shares of electric car maker are down more than 20% this year.',
  'We are aware of the issue and are working to resolve it.',
  'Tesla unveils driverless robotaxis in Hollywood.',
  'Shares of the electric vehicle maker have more than doubled this year.',
  'Tesla, Inc. stands against other stocks on Jim Cramer’s radar.',
  'We are aware of the issue and are working to resolve it.']}

# 5. Adding Sentiment Analysis

In [49]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [50]:
sentiment(summaries['TSLA'])

[{'label': 'NEGATIVE', 'score': 0.9903545379638672},
 {'label': 'NEGATIVE', 'score': 0.9943777918815613},
 {'label': 'POSITIVE', 'score': 0.9995711445808411},
 {'label': 'POSITIVE', 'score': 0.990311861038208},
 {'label': 'NEGATIVE', 'score': 0.9942530989646912},
 {'label': 'NEGATIVE', 'score': 0.9996978044509888},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.9570705890655518},
 {'label': 'POSITIVE', 'score': 0.996726393699646},
 {'label': 'NEGATIVE', 'score': 0.9937511086463928},
 {'label': 'POSITIVE', 'score': 0.9979088306427002}]

In [51]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'TSLA': [{'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'NEGATIVE', 'score': 0.9943777918815613},
  {'label': 'POSITIVE', 'score': 0.9995711445808411},
  {'label': 'POSITIVE', 'score': 0.990311861038208},
  {'label': 'NEGATIVE', 'score': 0.9942530989646912},
  {'label': 'NEGATIVE', 'score': 0.9996978044509888},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9570705890655518},
  {'label': 'POSITIVE', 'score': 0.996726393699646},
  {'label': 'NEGATIVE', 'score': 0.9937511086463928},
  {'label': 'POSITIVE', 'score': 0.9979088306427002}]}

# 6. Exporting Results to CSV

In [52]:
summaries

{'TSLA': ['Your information may be shared with third parties.',
  'Tesla unveils self-driving taxi prototypes in California. Investors expect more details on how the technology will work',
  'Tesla is one of the 10 stocks that will make you rich in 5-10 years.',
  'Musk unveiled a larger, self-driving Robovan at Tesla event.',
  'Analysts expect Tesla to show off self-driving car at event. Most don’t expect a fully functional robotaxi right away',
  'Shares of electric car maker are down more than 20% this year.',
  'We are aware of the issue and are working to resolve it.',
  'Tesla unveils driverless robotaxis in Hollywood.',
  'Shares of the electric vehicle maker have more than doubled this year.',
  'Tesla, Inc. stands against other stocks on Jim Cramer’s radar.',
  'We are aware of the issue and are working to resolve it.']}

In [53]:
scores

{'TSLA': [{'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'NEGATIVE', 'score': 0.9943777918815613},
  {'label': 'POSITIVE', 'score': 0.9995711445808411},
  {'label': 'POSITIVE', 'score': 0.990311861038208},
  {'label': 'NEGATIVE', 'score': 0.9942530989646912},
  {'label': 'NEGATIVE', 'score': 0.9996978044509888},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9570705890655518},
  {'label': 'POSITIVE', 'score': 0.996726393699646},
  {'label': 'NEGATIVE', 'score': 0.9937511086463928},
  {'label': 'POSITIVE', 'score': 0.9979088306427002}]}

In [54]:
cleaned_urls

{'TSLA': ['https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/musk-shows-tesla-cybercab-sees-085158169.html',
  'https://finance.yahoo.com/news/tesla-inc-tsla-stock-rich-063044616.html',
  'https://finance.yahoo.com/news/teslas-cybercab-robotaxi-is-finally-here-with-a-30k-price-tag--plus-a-surprise-robovan-071844079.html',
  'https://finance.yahoo.com/news/tesla-robotaxi-event-analysts-weigh-in-on-what-to-expect-from-ceo-elon-musks-big-moment-164748104.html',
  'https://finance.yahoo.com/news/tesla-stock-short-sellers-favorite-185545686.html',
  'https://finance.yahoo.com/video/pfizer-starboard-value-tesla-robotaxi-221957862.html',
  'https://finance.yahoo.com/news/tesla-robotaxi-event-long-promises-082325759.html',
  'https://finance.yahoo.com/quote/LCID/',
  'https://finance.yahoo.com/news/cramer-says-buy-tesla-inc-220900728.html',
  'https://finance.yahoo.com/video/tesla-robotaxi-day-know-ahead-233000325.html']}

In [56]:
range(len(summaries['TSLA']))

range(0, 11)

In [57]:
summaries['TSLA'][3]

'Musk unveiled a larger, self-driving Robovan at Tesla event.'

In [31]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [58]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['TSLA',
  'Your information may be shared with third parties.',
  'NEGATIVE',
  0.9903545379638672,
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws%26pccc%3D1'],
 ['TSLA',
  'Tesla unveils self-driving taxi prototypes in California. Investors expect more details on how the technology will work',
  'NEGATIVE',
  0.9943777918815613,
  'https://finance.yahoo.com/news/musk-shows-tesla-cybercab-sees-085158169.html'],
 ['TSLA',
  'Tesla is one of the 10 stocks that will make you rich in 5-10 years.',
  'POSITIVE',
  0.9995711445808411,
  'https://finance.yahoo.com/news/tesla-inc-tsla-stock-rich-063044616.html'],
 ['TSLA',
  'Musk unveiled a larger, self-driving Robovan at Tesla event.',
  'POSITIVE',
  0.990311861038208,
  'https://finance.yahoo.com/news/teslas-cybercab-robotaxi-is-finally-here-with-a-30k-price-tag--plus-a-surprise-robovan-071844079.html'],
 ['TSLA',
  'Analysts expect Tesla to show off self-driving car at event. Most don’t expect a fully functional

In [59]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [60]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['TSLA',
  'Your information may be shared with third parties.',
  'NEGATIVE',
  0.9903545379638672,
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BTSLA%26tbm%3Dnws%26pccc%3D1'],
 ['TSLA',
  'Tesla unveils self-driving taxi prototypes in California. Investors expect more details on how the technology will work',
  'NEGATIVE',
  0.9943777918815613,
  'https://finance.yahoo.com/news/musk-shows-tesla-cybercab-sees-085158169.html'],
 ['TSLA',
  'Tesla is one of the 10 stocks that will make you rich in 5-10 years.',
  'POSITIVE',
  0.9995711445808411,
  'https://finance.yahoo.com/news/tesla-inc-tsla-stock-rich-063044616.html'],
 ['TSLA',
  'Musk unveiled a larger, self-driving Robovan at Tesla event.',
  'POSITIVE',
  0.990311861038208,
  'https://finance.yahoo.com/news/teslas-cybercab-robotaxi-is-finally-here-with-a-30k-price-tag--plus-a-surprise-robovan-071844079.html'],
 ['TSLA',
  'Analysts expect Tesla to show off self-driv

In [62]:
import csv
with open('articlesummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

Download **articlesummaries.csv** from the folder icon (📁) in the left panel.

##END