In [1]:
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

In [1]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [None]:
def summarize(text):
    summaries = []
    for article in text:
#         article = article[:512]
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_token=True)
        summary = summary[5:-4]
        summaries.append(summary)
    return summaries

In [None]:
scores = {coin: sentiment(summaries.get(coin)) for coin in coins}
scores

In [None]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

In [None]:
scores = {coin: sentiment(summaries.get(coin)) for coin in coins}
scores

# 1.model

In [6]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)



Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 2.parsing crypto news

In [10]:
url = "https://finance.yahoo.com/crypto/"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [11]:
paragraphs[2].text

'Bitcoin pushed higher amid optimism about regulatory approval of a product that would allow investors to get exposure to the cryptocurrency without having to own it.'

In [12]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
article = ' '.join(words)
article

In [14]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [15]:
summary

'Meta Platforms faces lawsuits from 42 U.S. attorney generals. BitMEX co-founder says investors are offloading gold and crypto'

# 4. gathering news of coins 

## 4.1 gathering links from yahoo.finance

In [65]:
coins = ['BTC', 'ETH', 'SOL']

In [83]:
def search_for_crypto_news_urls(coins):
    search_url = f"https://finance.yahoo.com/quote/{coins}-USD?p={coins}-USD"
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    a_all = soup.find_all('a')
    hrefs = [link['href'] for link in a_all]
    return hrefs 

In [84]:
raw_crypto_urls = {coin:search_for_crypto_news_urls(coin) for coin in coins}
raw_crypto_urls

{'BTC': ['https://www.yahoo.com/',
  'https://mail.yahoo.com/',
  'https://news.yahoo.com/',
  'https://finance.yahoo.com/',
  'https://sports.yahoo.com/',
  'https://www.yahoo.com/entertainment/',
  'https://search.yahoo.com/search/',
  'https://mobile.yahoo.com/',
  'https://www.yahoo.com/everything/',
  'https://finance.yahoo.com',
  '#Nav-0-DesktopNav',
  '#market-summary',
  '#Aside',
  'https://login.yahoo.com/config/login?.src=finance&.intl=us&.lang=en-US&.done=https%3A%2F%2Ffinance.yahoo.com%2Fquote%2FBTC-USD%3Ffailsafe%3D1%26ynet%3D0%26_device%3Ddesktop%26device%3Ddesktop&activity=uh-signin&pspid=1197805203',
  'https://mail.yahoo.com/?.intl=us&.lang=en-US&.partner=none&.src=finance&activity=uh-mail&pspid=1197805203',
  'https://login.yahoo.com/config/login?.src=finance&.intl=us&.lang=en-US&.done=https://finance.yahoo.com&activity=uh-mail&pspid=1197805203',
  '/',
  '/watchlists/',
  '/portfolios/',
  '/calendar/',
  '/news/',
  '/videos/',
  '/plus-dashboard?ncid=dcm_30615876

## 4.2 links preprocessing

In [None]:
def link_condition(link):
    if 'news' in link and '.html' in link:
        return True
    
def preprocess_link(coin):
    domain_name = 'https://finance.yahoo.com'
    return [domain_name + link for link in raw_crypto_urls.get(coin) if 
                                                                       link_condition(link)]

In [None]:
crypto_urls = {coin : preprocess_link(coin) for coin in coins}
crypto_urls

## 4.3 getting text data

In [122]:
def scrape_data(urls):
    articles = []
    for link in urls:
        r = requests.get(link)
        soup = BeautifulSoup(r.text, 'html.parser')
        p_all = soup.find_all('p')
        text = [p.text for p in p_all]
        words = ' '.join(text).split(' ')[:300]
        article = ' '.join(words)
        articles.append(article)
    return articles

In [123]:
coins_text = {coin: scrape_data(crypto_urls.get(coin)) for coin in coins}
coins_text

{'BTC': ['Bitcoin (BTC-USD) surged above $35,000 in its largest single-day jump in 13 months, fueled by optimism that regulatory approval of the first spot bitcoin exchange-traded fund is nearing. The world’s largest cryptocurrency moved up more than 10% Monday, its biggest increase over a 24-hour period since September 2022. The price reached its highest point in a year and a half. The stocks of digital asset companies also pushed higher. Coinbase (COIN), the largest US crypto exchange, opened 13% higher Tuesday while MicroStrategy (MSTR) rose 12%. Bitcoin mining firms Riot Platforms (RIOT) and Marathon Digital (MARA) surged more than 13% and 17%, respectively. The rally appeared to be driven partly by speculation that the Securities and Exchange Commission was close to granting approval for a spot bitcoin ETF, which would allow investors to get exposure to the cryptocurrency without having to own it. BlackRock (BLK) is among the money managers that have recently applied to launch suc

# 5 summarzing all text

In [140]:
def summarize(text):
    summaries = []
    for article in text:
#         article = article[:512]
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_token=True)
        summary = summary[5:-4]
        summaries.append(summary)
    return summaries

In [141]:
summaries = {coin: summarize(coins_text.get(coin)) for coin in coins}
summaries

{'BTC': ['Largest cryptocurrency climbs more than 10% on Monday. Bitcoin mining companies also rally on hopes for an ETF',
  'Coin briefly breaks U.S.$35,000 support level on Monday. Grayscale says it hopes to ‘work constructively’ with SEC',
  'Co-founder of BitMEX blames hawkish U.S. policy for crypto rally. Bitcoin to rise on fears of global inflation: Hayes',
  'Decentralized finance sector has staged a quiet rally. Bitcoin rallied above $35,000 for the first time since 2022',
  'CoinDesk report shows $178 million in losses in 24 hours. Bitcoin jumped more than 12% Tuesday on speculation for an ETF',
  'Optimism surrounding spot ETF approval has been driving a rally. Bitcoin is up 28% this month, 107% year-to-date',
  'Some of the quirkier snippets from the stock and bond markets:',
  "Firm is awaiting approval for a Bitcoin ETF. Settlement comes as the SEC prepares to assess the firm's application",
  'DTCC adds BlackRock’s proposed ETF to its system. Bitcoin futures ETF along wit

# 6. sentiment analysis

In [52]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [142]:
scores = {coin: sentiment(summaries.get(coin)) for coin in coins}
scores

In [164]:
s_1 = pd.DataFrame([summaries]).T[0].explode().reset_index()
s_2 = pd.DataFrame([scores]).T[0].explode().reset_index()

In [176]:
s_2

Unnamed: 0,index,0
0,BTC,"{'label': 'NEGATIVE', 'score': 0.892184317111969}"
1,BTC,"{'label': 'NEGATIVE', 'score': 0.9634637236595..."
2,BTC,"{'label': 'NEGATIVE', 'score': 0.9877322912216..."
3,BTC,"{'label': 'POSITIVE', 'score': 0.9602352976799..."
4,BTC,"{'label': 'NEGATIVE', 'score': 0.993755578994751}"
5,BTC,"{'label': 'POSITIVE', 'score': 0.9253668785095..."
6,BTC,"{'label': 'POSITIVE', 'score': 0.9762342572212..."
7,BTC,"{'label': 'NEGATIVE', 'score': 0.9924166202545..."
8,BTC,"{'label': 'POSITIVE', 'score': 0.9961269497871..."
9,ETH,"{'label': 'NEGATIVE', 'score': 0.9997580647468..."


In [166]:
import json

In [179]:
df = pd.concat([s_1, pd.json_normalize(s_2[0])], axis=1)
df.rename(columns={0:'article'}, inplace=True)
df

Unnamed: 0,index,article,label,score
0,BTC,Largest cryptocurrency climbs more than 10% on...,NEGATIVE,0.892184
1,BTC,"Coin briefly breaks U.S.$35,000 support level ...",NEGATIVE,0.963464
2,BTC,Co-founder of BitMEX blames hawkish U.S. polic...,NEGATIVE,0.987732
3,BTC,Decentralized finance sector has staged a quie...,POSITIVE,0.960235
4,BTC,CoinDesk report shows $178 million in losses i...,NEGATIVE,0.993756
5,BTC,Optimism surrounding spot ETF approval has bee...,POSITIVE,0.925367
6,BTC,Some of the quirkier snippets from the stock a...,POSITIVE,0.976234
7,BTC,Firm is awaiting approval for a Bitcoin ETF. S...,NEGATIVE,0.992417
8,BTC,DTCC adds BlackRock’s proposed ETF to its syst...,POSITIVE,0.996127
9,ETH,Uniswap to charge 0.15% swap fee on website an...,NEGATIVE,0.999758


In [180]:
df.to_csv(f'crypto_news_sentiment_{datetime.datetime.now()}.csv')