In [1]:
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# 1.model

In [6]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)



Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 2. gathering news of coins 

## 2.1 gathering links from yahoo.finance

In [7]:
coins = ['BTC', 'ETH', 'SOL']

In [8]:
def search_for_crypto_news_urls(coins):
    search_url = f"https://finance.yahoo.com/quote/{coins}-USD?p={coins}-USD"
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    a_all = soup.find_all('a')
    hrefs = [link['href'] for link in a_all]
    return hrefs 

In [9]:
raw_crypto_urls = {coin:search_for_crypto_news_urls(coin) for coin in coins}
raw_crypto_urls

{'BTC': ['https://www.yahoo.com/',
  'https://mail.yahoo.com/',
  'https://news.yahoo.com/',
  'https://finance.yahoo.com/',
  'https://sports.yahoo.com/',
  'https://www.yahoo.com/entertainment/',
  'https://search.yahoo.com/search/',
  'https://mobile.yahoo.com/',
  'https://www.yahoo.com/everything/',
  'https://finance.yahoo.com',
  '#Nav-0-DesktopNav',
  '#market-summary',
  '#Aside',
  'https://login.yahoo.com/config/login?.src=finance&.intl=us&.lang=en-US&.done=https%3A%2F%2Ffinance.yahoo.com%2Fquote%2FBTC-USD%3Ffailsafe%3D1%26ynet%3D0%26_device%3Ddesktop%26device%3Ddesktop&activity=uh-signin&pspid=1197805203',
  'https://mail.yahoo.com/?.intl=us&.lang=en-US&.partner=none&.src=finance&activity=uh-mail&pspid=1197805203',
  'https://login.yahoo.com/config/login?.src=finance&.intl=us&.lang=en-US&.done=https://finance.yahoo.com&activity=uh-mail&pspid=1197805203',
  '/',
  '/watchlists/',
  '/portfolios/',
  '/calendar/',
  '/news/',
  '/videos/',
  '/plus-dashboard?ncid=dcm_30615876

## 2.2 links preprocessing

In [12]:
def link_condition(link):
    if 'news' in link and '.html' in link:
        return True
    
def preprocess_link(coin):
    domain_name = 'https://finance.yahoo.com'
    return [domain_name + link for link in raw_crypto_urls.get(coin) if 
                                                                       link_condition(link)]

In [13]:
crypto_urls = {coin : preprocess_link(coin) for coin in coins}
crypto_urls

{'BTC': ['https://finance.yahoo.com/news/short-seller-jim-chanos-talks-163109367.html',
  'https://finance.yahoo.com/news/ether-spikes-10-highest-level-142126838.html',
  'https://finance.yahoo.com/news/blackrock-files-ether-etf-eth-092126413.html',
  'https://finance.yahoo.com/news/ethereum-gets-blackrock-boost-real-161955519.html',
  'https://finance.yahoo.com/news/bitcoin-just-wont-die-135408250.html',
  'https://finance.yahoo.com/news/crypto-scams-bitcoin-coinbase-binance-060027245.html',
  'https://finance.yahoo.com/news/microstrategy-stock-250-michael-saylor-202319811.html',
  'https://finance.yahoo.com/news/bitcoin-surges-above-37000-hitting-highest-mark-in-18-months-171158931.html',
  'https://finance.yahoo.com/news/ether-surges-above-2-000-163054732.html',
  'https://finance.yahoo.com/news/jpmorgan-dismantles-bullish-argument-bitcoin-152908417.html'],
 'ETH': ['https://finance.yahoo.com/news/bitcoin-surges-above-37000-hitting-highest-mark-in-18-months-171158931.html',
  'https

## 2.3 getting text data

In [14]:
def scrape_data(urls):
    articles = []
    for link in urls:
        r = requests.get(link)
        soup = BeautifulSoup(r.text, 'html.parser')
        p_all = soup.find_all('p')
        text = [p.text for p in p_all]
        words = ' '.join(text).split(' ')[:300]
        article = ' '.join(words)
        articles.append(article)
    return articles

In [15]:
coins_text = {coin: scrape_data(crypto_urls.get(coin)) for coin in coins}
coins_text

{'BTC': ['Short seller Jim Chanos spoke with the Institute for New Economic Thinking about crypto, tesla, and AI. "You have to understand that the crypto ecosystem is well-suited for the dark side of finance," he said. Here are his four best quotes on crypto and other tech developments. Though crypto markets are rebounding in the face of promising developments like the approval of the first US spot bitcoin ETF, the sector continues to suffer the same fundamental defects, famed short-seller Jim Chanos said. In a new interview with the Institute for New Economic Thinking, the Chanos & Company founder laid out how the crypto sphere can be an avenue for financial fraud, challenges facing Tesla, and his views on the nascent artificial intelligence space. Here are the legendary short-seller\'s best quotes from the interview: Crypto and the dark side of finance "You have to understand that the crypto ecosystem is well-suited for the dark side of finance for a lot of reasons. It\'s perfect for

# 3 summarzing all text

In [27]:
def summarize(text):
    if 'Thank you for your patience' not in text:
        
        summaries = []
        for article in text:
    #         article = article[:512]
            input_ids = tokenizer.encode(article, return_tensors='pt')
            output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
            summary = tokenizer.decode(output[0], skip_special_token=True)
            summary = summary[5:-4]
            summaries.append(summary)
        return summaries

In [28]:
summaries = {coin: summarize(coins_text.get(coin)) for coin in coins}
summaries

{'BTC': ['Short-seller Jim Chanos sees crypto as an avenue for financial fraud. Tesla, tesla, and AI also in the interview',
  'Asset manager has filed for an Ethereum ETF. Ether has had a year of banner growth, gaining 74%',
  'Ether rose 9.5% in the 24 hours leading up to 4:40 p.m.',
  'The cryptocurrency has been overshadowed by Bitcoin and Solana',
  "The cryptocurrency has more than doubled in price since the collapse of FTX. It's outperforming both the S&P 500 and the “Magnificent Seven”",
  'Social media platforms are main breeding ground for scams. Average amount lost in crypto scams is more than any other type of fraud',
  'Saylor says spot bitcoin ETFs and the 2024 halving event will push the token higher.',
  'Investors hope SEC will soon approve spot bitcoin ETF. Bitcoin has rallied more than 120% this year',
  'BlackRock has registered Ethereum Trust in Delaware. Ether is trading above $2,000 for the first time since April',
  "The possible launch of spot ETFs is unlikely 

# 4. sentiment analysis

In [29]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [30]:
scores = {coin: sentiment(summaries.get(coin)) for coin in coins}
scores

{'BTC': [{'label': 'NEGATIVE', 'score': 0.9572890400886536},
  {'label': 'POSITIVE', 'score': 0.9984733462333679},
  {'label': 'POSITIVE', 'score': 0.981963038444519},
  {'label': 'NEGATIVE', 'score': 0.9986457228660583},
  {'label': 'NEGATIVE', 'score': 0.8268928527832031},
  {'label': 'NEGATIVE', 'score': 0.9909327030181885},
  {'label': 'POSITIVE', 'score': 0.7460976839065552},
  {'label': 'POSITIVE', 'score': 0.986227810382843},
  {'label': 'NEGATIVE', 'score': 0.53252112865448},
  {'label': 'NEGATIVE', 'score': 0.998786985874176}],
 'ETH': [{'label': 'POSITIVE', 'score': 0.986227810382843},
  {'label': 'NEGATIVE', 'score': 0.9969011545181274},
  {'label': 'NEGATIVE', 'score': 0.9997627139091492},
  {'label': 'NEGATIVE', 'score': 0.9909327030181885},
  {'label': 'NEGATIVE', 'score': 0.9909033179283142},
  {'label': 'POSITIVE', 'score': 0.981963038444519},
  {'label': 'NEGATIVE', 'score': 0.9909033179283142},
  {'label': 'POSITIVE', 'score': 0.7460976839065552},
  {'label': 'NEGATIV

In [31]:
s_1 = pd.DataFrame([summaries]).T[0].explode().reset_index()
s_2 = pd.DataFrame([scores]).T[0].explode().reset_index()

In [32]:
s_2

Unnamed: 0,index,0
0,BTC,"{'label': 'NEGATIVE', 'score': 0.9572890400886..."
1,BTC,"{'label': 'POSITIVE', 'score': 0.9984733462333..."
2,BTC,"{'label': 'POSITIVE', 'score': 0.981963038444519}"
3,BTC,"{'label': 'NEGATIVE', 'score': 0.9986457228660..."
4,BTC,"{'label': 'NEGATIVE', 'score': 0.8268928527832..."
5,BTC,"{'label': 'NEGATIVE', 'score': 0.9909327030181..."
6,BTC,"{'label': 'POSITIVE', 'score': 0.7460976839065..."
7,BTC,"{'label': 'POSITIVE', 'score': 0.986227810382843}"
8,BTC,"{'label': 'NEGATIVE', 'score': 0.53252112865448}"
9,BTC,"{'label': 'NEGATIVE', 'score': 0.998786985874176}"


In [33]:
import json

In [34]:
df = pd.concat([s_1,
                pd.json_normalize(s_2[0])
               ], axis=1)
df.rename(columns={0:'article'}, inplace=True)
df

Unnamed: 0,index,article,label,score
0,BTC,Short-seller Jim Chanos sees crypto as an aven...,NEGATIVE,0.957289
1,BTC,Asset manager has filed for an Ethereum ETF. E...,POSITIVE,0.998473
2,BTC,Ether rose 9.5% in the 24 hours leading up to ...,POSITIVE,0.981963
3,BTC,The cryptocurrency has been overshadowed by Bi...,NEGATIVE,0.998646
4,BTC,The cryptocurrency has more than doubled in pr...,NEGATIVE,0.826893
5,BTC,Social media platforms are main breeding groun...,NEGATIVE,0.990933
6,BTC,Saylor says spot bitcoin ETFs and the 2024 hal...,POSITIVE,0.746098
7,BTC,Investors hope SEC will soon approve spot bitc...,POSITIVE,0.986228
8,BTC,BlackRock has registered Ethereum Trust in Del...,NEGATIVE,0.532521
9,BTC,The possible launch of spot ETFs is unlikely t...,NEGATIVE,0.998787


In [24]:
df.to_csv(f'crypto_news_sentiment_{datetime.datetime.now()}.csv')