# Template API Notebook

This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a neo4j tutorial the heading should be `Neo4j API`.

- Add description of what the notebook does.
- Point to references, e.g. (neo4j.API.md)
- Add citations.
- Keep the notebook flow clear.
- Comments should be imperative and have a period at the end.
- Your code should be well commented. 

The name of this notebook should in the following format:
- if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb`

Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [8]:
!pwd

/Users/arunbhyashaswi/Drive/Code/UMD/Data605/Project/tutorials/DATA605/Spring2025/projects/TutorTask106_Spring2025_Real_Time_Bitcoin_Sentiment_Analysis_Using_TextBlob


## Imports

In [18]:
import logging
# Import libraries in this section.
# Avoid imports like import *, from ... import ..., from ... import *, etc.

from src.logger import get_logger
from pycoingecko import CoinGeckoAPI
from newsapi import NewsApiClient
import os
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from datetime import datetime, timedelta
import time
from textblob import TextBlob
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

from src.logger import get_logger
from src.data_saver import save_data, load_data

## Configuration

In [None]:
logger = get_logger(__name__)

NEWS_API_KEY = os.getenv("NEWS_API_KEY", "")
if not NEWS_API_KEY:
    logger.warning("NEWS_API_KEY not set in environment variables")
RELEVANT_SOURCES = [
    # 'australian-financial-review',
    'bloomberg',
    'business-insider',
    # 'financial-post',
    'fortune',
    # 'the-wall-street-journal'
]
newsapi = NewsApiClient(api_key=NEWS_API_KEY) if NEWS_API_KEY else None
coingecko = CoinGeckoAPI()

## Make the notebook flow clear
Each notebook needs to follow a clear and logical flow, e.g:
- Load data
- Compute stats
- Clean data
- Compute stats
- Do analysis
- Show results

In [10]:
def fetch_bitcoin_news(start_date, end_date, refresh=False):
    """
    Fetch Bitcoin-related news articles from NewsAPI.

    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format
        end_date (str): End date in 'YYYY-MM-DD' format
        refresh (bool): Ignored, included for interface compatibility

    Returns:
        pandas.DataFrame: DataFrame containing news articles
    """
    start_dt = datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.strptime(end_date, '%Y-%m-%d')

    today = datetime.now().date()
    earliest_allowed = today - timedelta(days=29)
    if start_dt.date() < earliest_allowed:
        msg = f"Adjusting start date from {start_dt.date()} to {earliest_allowed} due to NewsAPI's 30-day limit"
        logger.warning(msg)
        start_dt = datetime.combine(earliest_allowed, datetime.min.time())

    if not newsapi:
        logger.error("NewsAPI key not available. Please set the NEWS_API_KEY environment variable.")
        return None

    articles = []
    try:
        try:
            key_status = newsapi.get_sources()
            if key_status.get('status') == 'ok':
                logger.info("Using valid NewsAPI key")
        except:
            logger.warning("Unable to verify NewsAPI key")

        current_date = start_dt
        total_articles = 0

        # Break into 1-day chunks to avoid 100 article cap
        while current_date <= end_dt:
            next_date = min(current_date + timedelta(days=1), end_dt)
            from_date = current_date.strftime('%Y-%m-%d')
            to_date = next_date.strftime('%Y-%m-%d')
            sources_param = ','.join(RELEVANT_SOURCES)
            
            try:
                response = newsapi.get_everything(
                    q='bitcoin OR crypto OR cryptocurrency',
                    language='en',
                    sources=sources_param,  # <-- NEW: filter by trusted sources
                    from_param=from_date,
                    to=to_date,
                    sort_by='publishedAt',
                    page=1,  # Only page 1 to avoid hitting free-tier limit
                    page_size=100
                )

                if response.get('articles'):
                    batch_articles = [{
                        'title': a.get('title', ''),
                        'description': a.get('description', ''),
                        'content': a.get('content', ''),
                        'source': a.get('source', {}).get('name', 'Unknown'),
                        'author': a.get('author', 'Unknown'),
                        'url': a.get('url', ''),
                        'publishedAt': a.get('publishedAt', '')
                    } for a in response['articles']]

                    articles.extend(batch_articles)
                    total_articles += len(batch_articles)

                    if len(batch_articles) == 100:
                        logger.warning(f"Hit 100-article cap for {from_date}. More articles likely exist but are not retrievable with free tier.")

                time.sleep(0.5)
            except Exception as e:
                logger.warning(f"Error fetching news for {from_date} to {to_date}: {str(e)}")

            current_date = next_date + timedelta(days=1)
            time.sleep(0.5)

        if articles:
            df = pd.DataFrame(articles)
            df['publishedAt'] = pd.to_datetime(df['publishedAt'], errors='coerce')
            df['date'] = df['publishedAt'].dt.date
            filtered_df = df[(df['date'] >= start_dt.date()) & (df['date'] <= end_dt.date())]
            logger.info(f"Retrieved {len(filtered_df)} articles from NewsAPI for {start_date} to {end_date}")
            return filtered_df
        else:
            logger.warning("No articles found for the specified date range.")
            return None

    except Exception as e:
        logger.error(f"Error fetching news: {str(e)}")
        return None

In [11]:
def analyze_sentiment(articles_df):
    """
    Analyze sentiment of news articles using TextBlob.

    Args:
        articles_df (pandas.DataFrame): DataFrame containing news articles.

    Returns:
        pandas.DataFrame: DataFrame with added sentiment scores and categories.
    """
    if articles_df is None or articles_df.empty:
        logger.warning("No articles to analyze.")
        return None

    logger.info("Analyzing sentiment of news articles...")

    df = articles_df.copy()
    df['polarity'] = np.nan
    df['subjectivity'] = np.nan

    def get_sentiment(text):
        if not isinstance(text, str) or not text.strip():
            return 0.0, 0.0
        try:
            blob = TextBlob(text)
            return blob.sentiment.polarity, blob.sentiment.subjectivity
        except Exception as e:
            logger.warning(f"Error analyzing sentiment: {e}")
            return 0.0, 0.0

    # Apply sentiment analysis
    for idx, row in df.iterrows():
        text = row.get('content') or row.get('description') or row.get('title')
        polarity, subjectivity = get_sentiment(text)
        df.at[idx, 'polarity'] = polarity
        df.at[idx, 'subjectivity'] = subjectivity

    # Classify sentiment category
    df['sentiment_category'] = pd.cut(
        df['polarity'],
        bins=[-1.1, -0.5, -0.1, 0.1, 0.5, 1.1],
        labels=['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']
    )

    logger.info(f"Sentiment analysis completed for {len(df)} articles.")
    return df

In [None]:

end_date = datetime.now().date()
start_date = end_date - timedelta(days=29)
start_date_str = start_date.strftime('%Y-%m-%d')
end_date_str = end_date.strftime('%Y-%m-%d')

new_articles_fetch = fetch_bitcoin_news(start_date_str, end_date_str, refresh=True)
new_articles_fetch.head()

2025-04-22 19:24:00 - __main__ - INFO - Using valid NewsAPI key
2025-04-22 19:24:17 - __main__ - INFO - Retrieved 35 articles from NewsAPI for 2025-03-24 to 2025-04-22


Unnamed: 0,title,description,content,source,author,url,publishedAt,date
0,Trump's latest crypto push: Getting into the s...,The Trump-backed World Liberty Financial plans...,The Trump family is diving into another corner...,Business Insider,fdemott@businessinsider.com (Filip De Mott),https://www.businessinsider.com/donald-trump-c...,2025-03-25 17:26:22+00:00,2025-03-25
1,Trump Media is up 9% after a deal to launch ne...,Trump Media is partnering Crypto.com to sell e...,Shares of Trump Media and Technology surged 9....,Business Insider,htan@insider.com (Huileng Tan),https://markets.businessinsider.com/news/stock...,2025-03-25 08:13:13+00:00,2025-03-25
2,The tech industry is souring on Trump,Even conservative executives are fed up with t...,Two months into Donald Trump's\r\n second term...,Business Insider,zbernard@businessinsider.com (Zoë Bernard),https://www.businessinsider.com/tech-leaders-s...,2025-03-25 08:06:01+00:00,2025-03-25
3,Female executives are fighting back against al...,Boardrooms are brimming with bro-ish bravado. ...,FreshSplash/Getty Images\r\n<ul><li>This post ...,Business Insider,Hallam Bullock,https://www.businessinsider.com/female-executi...,2025-03-24 11:47:17+00:00,2025-03-24
4,Here are the 20 startups that sponsor the most...,"In an ever-evolving tech landscape, the H-1B v...",OpenAI CEO Sam Altman.Joel Saget/Getty Images;...,Business Insider,"Melia Russell,Alex Nicoll",https://www.businessinsider.com/tech-startups-...,2025-03-24 09:00:01+00:00,2025-03-24


In [13]:
new_articles = analyze_sentiment(new_articles_fetch)
new_articles.head()

2025-04-22 19:25:26 - __main__ - INFO - Analyzing sentiment of news articles...
2025-04-22 19:25:26 - __main__ - INFO - Sentiment analysis completed for 35 articles.


Unnamed: 0,title,description,content,source,author,url,publishedAt,date,polarity,subjectivity,sentiment_category
0,Trump's latest crypto push: Getting into the s...,The Trump-backed World Liberty Financial plans...,The Trump family is diving into another corner...,Business Insider,fdemott@businessinsider.com (Filip De Mott),https://www.businessinsider.com/donald-trump-c...,2025-03-25 17:26:22+00:00,2025-03-25,0.0,0.0,Neutral
1,Trump Media is up 9% after a deal to launch ne...,Trump Media is partnering Crypto.com to sell e...,Shares of Trump Media and Technology surged 9....,Business Insider,htan@insider.com (Huileng Tan),https://markets.businessinsider.com/news/stock...,2025-03-25 08:13:13+00:00,2025-03-25,0.0,0.0,Neutral
2,The tech industry is souring on Trump,Even conservative executives are fed up with t...,Two months into Donald Trump's\r\n second term...,Business Insider,zbernard@businessinsider.com (Zoë Bernard),https://www.businessinsider.com/tech-leaders-s...,2025-03-25 08:06:01+00:00,2025-03-25,0.0,0.0,Neutral
3,Female executives are fighting back against al...,Boardrooms are brimming with bro-ish bravado. ...,FreshSplash/Getty Images\r\n<ul><li>This post ...,Business Insider,Hallam Bullock,https://www.businessinsider.com/female-executi...,2025-03-24 11:47:17+00:00,2025-03-24,0.1875,0.375,Positive
4,Here are the 20 startups that sponsor the most...,"In an ever-evolving tech landscape, the H-1B v...",OpenAI CEO Sam Altman.Joel Saget/Getty Images;...,Business Insider,"Melia Russell,Alex Nicoll",https://www.businessinsider.com/tech-startups-...,2025-03-24 09:00:01+00:00,2025-03-24,0.416667,0.383333,Positive


In [14]:
def aggregate_sentiment(articles_with_sentiment):
    """
    Aggregate sentiment scores by date.

    Args:
        articles_with_sentiment (pandas.DataFrame): DataFrame containing articles with sentiment scores

    Returns:
        pandas.DataFrame: DataFrame with aggregated sentiment scores by date
    """
    if articles_with_sentiment is None or articles_with_sentiment.empty:
        logger.warning("No sentiment data to aggregate.")
        return None

    logger.info("Aggregating sentiment data by date...")

    # Ensure 'date' is in datetime format
    if 'date' in articles_with_sentiment.columns:
        articles_with_sentiment['date'] = pd.to_datetime(articles_with_sentiment['date'])

    logger.info(f"Articles date range: {articles_with_sentiment['date'].min()} to {articles_with_sentiment['date'].max()}")
    logger.info(f"Total articles by date: {articles_with_sentiment.groupby('date').size().to_dict()}")

    # Aggregate numeric sentiment features
    aggregated = articles_with_sentiment.groupby('date').agg(
        avg_polarity=('polarity', 'mean'),
        avg_subjectivity=('subjectivity', 'mean'),
        max_polarity=('polarity', 'max'),
        min_polarity=('polarity', 'min'),
        article_count=('polarity', 'count')
    ).reset_index()

    # Add polarity volatility
    volatility = articles_with_sentiment.groupby('date')['polarity'].std().reset_index()
    volatility.columns = ['date', 'polarity_volatility']
    aggregated = pd.merge(aggregated, volatility, on='date', how='left')
    aggregated['polarity_volatility'] = aggregated['polarity_volatility'].fillna(0)

    # Add sentiment category distribution (as percentages)
    # sentiment_distribution = articles_with_sentiment.groupby(['date', 'sentiment_category']).size().unstack(fill_value=0)
    sentiment_distribution = (
        articles_with_sentiment
        .groupby(['date', 'sentiment_category'], observed=True)
        .size()
        .unstack(fill_value=0)
    )
    
    for category in sentiment_distribution.columns:
        sentiment_distribution[f'{category}_pct'] = sentiment_distribution[category] / sentiment_distribution.sum(axis=1) * 100
    sentiment_distribution = sentiment_distribution[[col for col in sentiment_distribution.columns if '_pct' in col]]
    sentiment_distribution = sentiment_distribution.reset_index()

    aggregated = pd.merge(aggregated, sentiment_distribution, on='date', how='left')

    # Ensure full daily continuity using interpolation
    aggregated['date'] = pd.to_datetime(aggregated['date'])
    min_date = aggregated['date'].min()
    max_date = aggregated['date'].max()
    full_dates = pd.date_range(start=min_date, end=max_date, freq='D')
    full_df = pd.DataFrame({'date': full_dates})

    # Merge and interpolate missing values
    aggregated = pd.merge(full_df, aggregated, on='date', how='left')
    aggregated = aggregated.set_index('date')
    aggregated = aggregated.ffill().bfill().reset_index()
    aggregated = aggregated.sort_values('date')

    logger.info(f"Interpolated data covers {len(aggregated)} days from {min_date.date()} to {max_date.date()}")
    logger.info(f"Sentiment aggregation completed for {len(aggregated)} days.")

    return aggregated

In [15]:
aggregated_sentiment = aggregate_sentiment(new_articles)


2025-04-22 19:26:49 - __main__ - INFO - Aggregating sentiment data by date...
2025-04-22 19:26:49 - __main__ - INFO - Articles date range: 2025-03-24 00:00:00 to 2025-04-21 00:00:00
2025-04-22 19:26:49 - __main__ - INFO - Total articles by date: {Timestamp('2025-03-24 00:00:00'): 2, Timestamp('2025-03-25 00:00:00'): 3, Timestamp('2025-03-26 00:00:00'): 2, Timestamp('2025-03-29 00:00:00'): 2, Timestamp('2025-03-30 00:00:00'): 1, Timestamp('2025-03-31 00:00:00'): 1, Timestamp('2025-04-01 00:00:00'): 3, Timestamp('2025-04-02 00:00:00'): 1, Timestamp('2025-04-03 00:00:00'): 1, Timestamp('2025-04-04 00:00:00'): 1, Timestamp('2025-04-06 00:00:00'): 1, Timestamp('2025-04-07 00:00:00'): 1, Timestamp('2025-04-08 00:00:00'): 2, Timestamp('2025-04-09 00:00:00'): 2, Timestamp('2025-04-10 00:00:00'): 1, Timestamp('2025-04-11 00:00:00'): 1, Timestamp('2025-04-13 00:00:00'): 1, Timestamp('2025-04-14 00:00:00'): 2, Timestamp('2025-04-15 00:00:00'): 1, Timestamp('2025-04-16 00:00:00'): 1, Timestamp('20

In [16]:
aggregated_sentiment.head()

Unnamed: 0,date,avg_polarity,avg_subjectivity,max_polarity,min_polarity,article_count,polarity_volatility,Negative_pct,Neutral_pct,Positive_pct,Very Positive_pct
0,2025-03-24,0.302083,0.379167,0.416667,0.1875,2.0,0.162045,0.0,0.0,100.0,0.0
1,2025-03-25,0.0,0.0,0.0,0.0,3.0,0.0,0.0,100.0,0.0,0.0
2,2025-03-26,0.2125,0.425,0.3,0.125,2.0,0.123744,0.0,0.0,100.0,0.0
3,2025-03-27,0.2125,0.425,0.3,0.125,2.0,0.123744,0.0,0.0,100.0,0.0
4,2025-03-28,0.2125,0.425,0.3,0.125,2.0,0.123744,0.0,0.0,100.0,0.0


In [19]:
use_cached_data = False 

end_date = datetime.now().date()
start_date = end_date - timedelta(days=29)
start_date_str = start_date.strftime('%Y-%m-%d')
end_date_str = end_date.strftime('%Y-%m-%d')

new_articles_fetch = fetch_bitcoin_news(start_date_str, end_date_str, refresh=True)
new_articles_fetch.head()


if use_cached_data:
    logger.info("Loading cached news articles and sentiment data.")
    articles = load_data("articles_data.csv")
    aggregated_sentiment = load_data("aggregated_sentiment.csv")
else:
    logger.info("Fetching Bitcoin-related news.")
    new_articles_fetch = fetch_bitcoin_news(start_date_str, end_date_str, refresh=True)
    new_articles = analyze_sentiment(new_articles_fetch)

    # Load old data and merge
    old_articles = load_data("articles_data.csv")
    if old_articles is not None:
        articles = pd.concat([old_articles, new_articles], ignore_index=True)
        # articles = articles.drop_duplicates(subset=['url'])
        
    else:
        articles = new_articles

    # Aggregate sentiment
    aggregated_sentiment = aggregate_sentiment(articles)

    # Save updated versions
    save_data(articles, "articles_data.csv")
    save_data(aggregated_sentiment, "aggregated_sentiment.csv")

2025-04-22 19:59:23 - __main__ - INFO - Using valid NewsAPI key
2025-04-22 19:59:40 - __main__ - INFO - Retrieved 35 articles from NewsAPI for 2025-03-24 to 2025-04-22
2025-04-22 19:59:40 - __main__ - INFO - Fetching Bitcoin-related news.
2025-04-22 19:59:41 - __main__ - INFO - Using valid NewsAPI key
2025-04-22 19:59:58 - __main__ - INFO - Retrieved 35 articles from NewsAPI for 2025-03-24 to 2025-04-22
2025-04-22 19:59:58 - __main__ - INFO - Analyzing sentiment of news articles...
2025-04-22 19:59:58 - __main__ - INFO - Sentiment analysis completed for 35 articles.
2025-04-22 19:59:58 - src.data_saver - INFO - Loaded data from data/articles_data.csv
2025-04-22 19:59:58 - __main__ - INFO - Aggregating sentiment data by date...
2025-04-22 19:59:58 - __main__ - INFO - Articles date range: 2025-03-24 00:00:00 to 2025-04-21 00:00:00
2025-04-22 19:59:58 - __main__ - INFO - Total articles by date: {Timestamp('2025-03-24 00:00:00'): 6, Timestamp('2025-03-25 00:00:00'): 9, Timestamp('2025-03-2

In [20]:
articles.head()

Unnamed: 0,title,description,content,source,author,url,publishedAt,date,polarity,subjectivity,sentiment_category
0,Female executives are fighting back against al...,Boardrooms are brimming with bro-ish bravado. ...,FreshSplash/Getty Images\r\n<ul><li>This post ...,Business Insider,Hallam Bullock,https://www.businessinsider.com/female-executi...,2025-03-24 11:47:17,2025-03-24,0.1875,0.375,Positive
1,Here are the 20 startups that sponsor the most...,"In an ever-evolving tech landscape, the H-1B v...",OpenAI CEO Sam Altman.Joel Saget/Getty Images;...,Business Insider,"Melia Russell,Alex Nicoll",https://www.businessinsider.com/tech-startups-...,2025-03-24 09:00:01,2025-03-24,0.416667,0.383333,Positive
2,Looking for a career in private equity? Here's...,"The Golden Age of private equity is over, but ...",private equityEmir Memedovski/Getty Images\r\n...,Business Insider,Hallam Bullock,https://www.businessinsider.com/private-equity...,2025-03-26 12:22:27,2025-03-26,0.125,0.375,Positive
3,GameStop is getting into bitcoin — and investo...,Videogame seller GameStop said it will now inc...,Videogame seller GameStop said it will now inc...,Business Insider,shubhangigoel@insider.com (Shubhangi Goel),https://www.businessinsider.com/gamestop-stock...,2025-03-26 06:49:04,2025-03-26,0.3,0.475,Positive
4,Trump's latest crypto push: Getting into the s...,The Trump-backed World Liberty Financial plans...,The Trump family is diving into another corner...,Business Insider,fdemott@businessinsider.com (Filip De Mott),https://www.businessinsider.com/donald-trump-c...,2025-03-25 17:26:22,2025-03-25,0.0,0.0,Neutral


In [21]:
aggregated_sentiment.head()

Unnamed: 0,date,avg_polarity,avg_subjectivity,max_polarity,min_polarity,article_count,polarity_volatility,Negative_pct,Neutral_pct,Positive_pct,Very Positive_pct
0,2025-03-24,0.302083,0.379167,0.416667,0.1875,6.0,0.12552,0.0,0.0,100.0,0.0
1,2025-03-25,0.0,0.0,0.0,0.0,9.0,0.0,0.0,100.0,0.0,0.0
2,2025-03-26,0.2125,0.425,0.3,0.125,6.0,0.095851,0.0,0.0,100.0,0.0
3,2025-03-27,0.2125,0.425,0.3,0.125,6.0,0.095851,0.0,0.0,100.0,0.0
4,2025-03-28,0.2125,0.425,0.3,0.125,6.0,0.095851,0.0,0.0,100.0,0.0


## The flow should be highlighted using headings in markdown
```
# Level 1
## Level 2
### Level 3
```

In [None]:
class Template:
    """
    Brief imperative description of what the class does in one line, if needed.
    """
    def __init__(self):
        pass
    
    def method1(self, arg1:int) -> None:
        """
        Brief imperative description of what the method does in one line.

        You can elaborate more in the method docstring in this section, for e.g. explaining 
        the formula/algorithm. Every method/function should have a docstring, typehints and include the
        parameters and return as follows:

        :param arg1: description of arg1
        :return: description of return
        """
        # Code bloks go here.
        # Make sure to include comments to explain what the code is doing.
        # No empty lines between code blocks.
        pass


def template_function(arg1:int) -> None:
    """
    Brief imperative description of what the function does in one line.

    You can elaborate more in the function docstring in this section, for e.g. explaining 
    the formula/algorithm. Every function should have a docstring, typehints and include the
    parameters and return as follows:

    :param arg1: description of arg1
    :return: description of return
    """
    # Code bloks go here.
    # Make sure to include comments to explain what the code is doing.
    # No empty lines between code blocks.
    pass
