In [None]:
%pip install yfinance
%pip install augini
%pip install PyPortfolioOpt
%pip install pandas

In [1]:
import yfinance as yf
import pandas as pd
from pypfopt import EfficientFrontier, expected_returns, risk_models
from augini import Augini
import os
from IPython.display import display, Markdown

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 0: Provide your API key (read from the OPENROUTER_TOKEN environment variable, or entered manually)
def get_api_key():
    """Return the API key used to configure Augini.

    The key is taken from the ``OPENROUTER_TOKEN`` environment variable when
    that variable holds a non-empty value. Otherwise the user is prompted to
    type it in.

    Returns:
        str: The key from the environment, or the key typed by the user with
        surrounding whitespace removed.
    """
    # Local import: getpass is only needed for the interactive fallback.
    import getpass

    api_key = os.environ.get('OPENROUTER_TOKEN')
    if api_key:
        print("Using API key from environment variable.")
        return api_key

    # getpass hides the typed value, so the secret is not echoed into the
    # cell output (and therefore not stored in the saved notebook).
    return getpass.getpass("Enter your API key manually: ").strip()

# Build the Augini client: resolve the key first, then route requests through OpenRouter
api_key = get_api_key()
augini = Augini(
    api_key=api_key,
    use_openrouter=True,
    model='openai/gpt-4o-mini',
)

Using API key from environment variable.


In [3]:
# Step 1: Initial Data Collection

# Ticker symbols analysed throughout the notebook
tickers = [
    'META',
    'AAPL',
    'AMZN',
    'NFLX',
    'GOOGL',
]

def get_stock_data(tickers, period='1y'):
    """Collect the closing prices of several tickers into one DataFrame.

    Each ticker's history is requested through ``yf.Ticker(...).history`` and
    its ``'Close'`` column is kept. The per-ticker series are joined once at
    the end, so the result index is the union of all returned dates; a ticker
    with a shorter (or missing) history no longer truncates the others.

    Args:
        tickers: Iterable of ticker symbols.
        period: Look-back period forwarded to ``history`` (default ``'1y'``).

    Returns:
        pandas.DataFrame: One column per ticker that returned data, in the
        order the tickers were given. Empty when no ticker returned data.
    """
    closes = {}
    for ticker in tickers:
        hist = yf.Ticker(ticker).history(period=period)
        # An empty history would otherwise yield a KeyError or an all-NaN column.
        if hist.empty or 'Close' not in hist.columns:
            print(f"No price history returned for {ticker}; skipping.")
            continue
        closes[ticker] = hist['Close']

    if not closes:
        # pd.concat refuses an empty list; keep the original "empty frame" result.
        return pd.DataFrame()

    # Explicit keys make the column names and their order unambiguous.
    return pd.concat(list(closes.values()), axis=1, keys=list(closes.keys()))

# Fetch the closing-price table for the configured tickers
stock_data = get_stock_data(tickers)

# Preview: first rows of the price table
price_preview = stock_data.head()
print("\nStock price data sample:")
print(price_preview)

# Descriptive statistics, one column per ticker
price_summary = stock_data.describe()
print("\nBasic statistics for each stock:")
print(price_summary)


Stock price data sample:
                                 META        AAPL        AMZN        NFLX  \
Date                                                                        
2023-12-27 00:00:00-05:00  356.468323  192.208359  153.339996  491.790009   
2023-12-28 00:00:00-05:00  356.956482  192.636276  153.380005  490.510010   
2023-12-29 00:00:00-05:00  352.613068  191.591385  151.940002  486.880005   
2024-01-02 00:00:00-05:00  344.972260  184.734970  149.929993  468.500000   
2024-01-03 00:00:00-05:00  343.159180  183.351746  148.470001  470.260010   

                                GOOGL  
Date                                   
2023-12-27 00:00:00-05:00  139.862976  
2023-12-28 00:00:00-05:00  139.723480  
2023-12-29 00:00:00-05:00  139.185440  
2024-01-02 00:00:00-05:00  137.670929  
2024-01-03 00:00:00-05:00  138.418228  

Basic statistics for each stock:
             META        AAPL        AMZN        NFLX       GOOGL
count  253.000000  253.000000  253.000000  253.000000 

In [4]:
# Step 2: News Data Collection using yfinance Search

# Column layout of the news table; fixed so that an empty result is still well-formed.
_NEWS_COLUMNS = ['ticker', 'title', 'description', 'link', 'publisher', 'date']


def _utc_now():
    """Return the current time as a timezone-naive UTC timestamp."""
    # pd.to_datetime(..., unit='s') yields naive UTC-based timestamps, so the
    # fallback dates use the same time base to keep the date sort meaningful.
    return pd.Timestamp.now(tz='UTC').tz_localize(None)


def _placeholder_news_row(ticker, title, description):
    """Build the stand-in row recorded when a ticker has no usable news."""
    return {
        'ticker': ticker,
        'title': title,
        'description': description,
        'link': '',
        'publisher': '',
        'date': _utc_now(),
    }


def _parse_news_item(ticker, news_item):
    """Convert one raw news item (a mapping) into a row of the news table."""
    published = news_item.get('providerPublishTime')
    if published is None:
        date = _utc_now()
    else:
        date = pd.to_datetime(published, unit='s')
    return {
        'ticker': ticker,
        'title': news_item.get('title', ''),
        'description': news_item.get('description', ''),
        'link': news_item.get('link', ''),
        'publisher': news_item.get('publisher', ''),
        'date': date,
    }


def get_stock_news(tickers, news_count=10):
    """Collect recent news items for each ticker via ``yf.Search``.

    Collection is best-effort: a failure for one ticker or one item is printed
    and does not abort the rest. A ticker with no news, or whose lookup
    failed, contributes a single placeholder row so that every requested
    ticker is represented.

    Args:
        tickers: Iterable of ticker symbols to search for.
        news_count: Passed to ``yf.Search`` as both ``news_count`` and
            ``max_results``.

    Returns:
        pandas.DataFrame: Columns ``ticker``, ``title``, ``description``,
        ``link``, ``publisher`` and ``date``, sorted by ``date`` with the most
        recent first. The original row labels are kept (the index is not
        reset). Empty, but with the same columns, when nothing was collected.
    """
    news_data = []

    for ticker in tickers:
        try:
            # Use Search class to get news
            search_results = yf.Search(
                query=ticker,
                news_count=news_count,
                max_results=news_count
            )
            news_items = getattr(search_results, 'news', None)

            if news_items:
                for news_item in news_items:
                    try:
                        news_data.append(_parse_news_item(ticker, news_item))
                    except Exception as e:
                        # Skip only the malformed item; keep the rest.
                        print(f"Error processing news item for {ticker}: {str(e)}")
                        continue
            else:
                print(f"No news found for {ticker}")
                news_data.append(_placeholder_news_row(
                    ticker, 'No recent news available', 'No description available'
                ))

        except Exception as e:
            print(f"Error fetching news for {ticker}: {str(e)}")
            news_data.append(_placeholder_news_row(
                ticker, 'Failed to fetch news', 'Error in data retrieval'
            ))

    # Explicit columns keep 'date' present even when no rows were collected,
    # so the sort below cannot raise KeyError on an empty result.
    news_df = pd.DataFrame(news_data, columns=_NEWS_COLUMNS)

    # Sort by date (most recent first)
    news_df = news_df.sort_values('date', ascending=False)

    return news_df

# Collect the news table for all configured tickers
news_df = get_stock_news(tickers)

# Headline numbers: total rows and rows per ticker
per_ticker_counts = news_df['ticker'].value_counts()
print("\nRecent news data summary:")
print(f"Total news articles retrieved: {len(news_df)}")
print("\nArticles per ticker:")
print(per_ticker_counts)

# Preview limited to the ticker, title and date columns
news_preview = news_df[['ticker', 'title', 'date']].head()
print("\nSample of recent news:")
print(news_preview)

# Data-quality check: number of missing values per column
missing_per_column = news_df.isnull().sum()
print("\nMissing values in each column:")
print(missing_per_column)

# Time span covered by the collected rows
print("\nNews date range:")
print(f"Earliest: {news_df['date'].min()}")
print(f"Latest: {news_df['date'].max()}")


Recent news data summary:
Total news articles retrieved: 50

Articles per ticker:
ticker
AMZN     10
AAPL     10
NFLX     10
GOOGL    10
META     10
Name: count, dtype: int64

Sample of recent news:
   ticker                                              title  \
24   AMZN  Amazon Shares Slip 2.5% as Market Awaits 2025 ...   
17   AAPL  The White House Estimates RealPage Software Ca...   
32   NFLX  Netflix Shatters Records with NFL Debut While ...   
20   AMZN  Analyst Explains Why Amazon.com (AMZN) Is The ...   
45  GOOGL  The AI stock trade is starting to shift beyond...   

                  date  
24 2024-12-27 15:32:20  
17 2024-12-27 15:30:16  
32 2024-12-27 15:24:19  
20 2024-12-27 15:21:58  
45 2024-12-27 15:11:33  

Missing values in each column:
ticker         0
title          0
description    0
link           0
publisher      0
date           0
dtype: int64

News date range:
Earliest: 2024-12-26 09:03:32
Latest: 2024-12-27 15:32:20


In [5]:
# Step 3: Synthetic Data Generation using Augini

sentiment_prompt = """
Analyze the news title to determine market sentiment.
Generate synthetic sentiment labels across three categories:
- Bearish (negative outlook)
- Neutral (stable outlook)
- Bullish (positive outlook)
Base the sentiment on market implications and investor perspective. The name of the column returned will be 'sentiment'.
"""

# Generate synthetic sentiment data: Augini adds a 'sentiment' column to the news table
augmented_df = augini.augment_single(
    news_df,
    column_name="sentiment",
    custom_prompt=sentiment_prompt,
    use_sync=False
)

print("\nOriginal vs Augmented Data Sample:")
print(augmented_df[['ticker', 'title', 'sentiment']].head())

# Convert sentiment to numerical scores
sentiment_scores = {
    'Bearish': -1,
    'Neutral': 0,
    'Bullish': 1
}

# The labels are model-generated, so tolerate case/whitespace drift
# ("bullish", " Bearish ") before looking them up.
normalized_sentiment = augmented_df['sentiment'].astype(str).str.strip().str.capitalize()
mapped_scores = normalized_sentiment.map(sentiment_scores)

# Anything still unmapped would become NaN and later turn the adjusted
# expected returns into NaN; report it and score it as Neutral instead.
unmapped = mapped_scores.isna()
if unmapped.any():
    unknown_labels = sorted(augmented_df.loc[unmapped, 'sentiment'].astype(str).unique())
    print(f"\nWarning: {int(unmapped.sum())} unrecognized sentiment label(s) scored as Neutral: {unknown_labels}")

augmented_df['sentiment_score'] = mapped_scores.fillna(sentiment_scores['Neutral']).astype(int)

print("\nSentiment Distribution:")
print(augmented_df['sentiment'].value_counts())


Original vs Augmented Data Sample:
   ticker                                              title sentiment
24   AMZN  Amazon Shares Slip 2.5% as Market Awaits 2025 ...   Bearish
17   AAPL  The White House Estimates RealPage Software Ca...   Bullish
32   NFLX  Netflix Shatters Records with NFL Debut While ...   Bullish
20   AMZN  Analyst Explains Why Amazon.com (AMZN) Is The ...   Bearish
45  GOOGL  The AI stock trade is starting to shift beyond...   Bearish

Sentiment Distribution:
sentiment
Bullish    22
Bearish    18
Neutral    10
Name: count, dtype: int64


In [10]:
# Ask Augini's chat for an overview of the augmented table, then render the reply as Markdown
overview_reply = augini.chat("Give me an overview of the data.", df=augmented_df)
display(Markdown(overview_reply))

## Data Overview 📊

**Dataset Characteristics:**
- **Shape:** 50 rows and 8 columns
- **Data Types:** 
  - `ticker`: object
  - `title`: object
  - `description`: object
  - `link`: object
  - `publisher`: object
  - `date`: datetime64[ns]
  - `sentiment`: object
  - `sentiment_score`: int64

### Column Statistics
- **Ticker:** 5 unique values, top categories include AMZN, AAPL, NFLX, GOOGL, META (each with 10 occurrences).
- **Title:** 43 unique values, most frequent titles are related to 'AI stock trade' and 'Magnificent Seven Stocks'.
- **Description:** Single unique value (empty string) across all rows, indicating lack of descriptive content.
- **Link:** 43 unique links, with some repeated multiple times.
- **Publisher:** 16 unique publishers, with 'Insider Monkey' (8 occurrences) being the most frequent.
- **Date:** 42 unique timestamps, indicating various publication times, predominantly on 2024-12-27.
- **Sentiment:** 3 categories (Bullish, Bearish, Neutral) with a majority being Bullish (22 occurrences).
- **Sentiment Score:** Mean of 0.08, median of 0.0, and a standard deviation of approximately 0.90. Scores range from -1.0 to 1.0 with slight negative skewness.

### Data Quality
- **Total Missing Values:** 0
- **Duplicated Rows:** 0

### Sample Data
Some entries include:
1. **Ticker:** AMZN, **Title:** Amazon Shares Slip 2.5% as Market Awaits 2025 Outlook, **Sentiment:** Bearish, **Sentiment Score:** -1
2. **Ticker:** AAPL, **Title:** The White House Estimates RealPage Software Caused U.S. Renters To Spend An Extra $3.8 Billion Last Year, **Sentiment:** Bullish, **Sentiment Score:** 1
3. **Ticker:** NFLX, **Title:** Netflix Shatters Records with NFL Debut While Squid Game 2 Disappoints, **Sentiment:** Bullish, **Sentiment Score:** 1

### Conclusion
The dataset is well-structured with no missing or duplicated entries. However, the description column lacks content which could limit the understanding of the articles referenced.

In [9]:
# Step 4: Portfolio Optimization with Sentiment Integration

# Calculate base returns and risk
returns = expected_returns.mean_historical_return(stock_data)
cov_matrix = risk_models.sample_cov(stock_data)

# Adjust expected returns based on sentiment.
# Every scored article contributes a factor (1 + sentiment_adjustment * score);
# the factors of all articles for the same ticker compound.
sentiment_adjustment = 0.02  # relative change in expected return per article, per unit of score

# Rows without a score are dropped so they cannot turn a return into NaN.
scored_news = augmented_df.dropna(subset=['sentiment_score'])
article_factors = 1 + sentiment_adjustment * scored_news['sentiment_score']
sentiment_multiplier = (
    article_factors
    .groupby(scored_news['ticker'])
    .prod()
    .reindex(returns.index, fill_value=1.0)  # tickers without scored news are left unchanged
)

# Apply the change to the magnitude of the return: for a positive return this
# equals returns * multiplier, and for a negative return bullish news still
# raises the expected return (plain multiplication would lower it further).
returns = returns + returns.abs() * (sentiment_multiplier - 1)

print("\nAdjusted Expected Returns:")
for ticker in returns.index:
    print(f"{ticker}: {returns[ticker]:.4f}")

# Optimize portfolio
ef = EfficientFrontier(returns, cov_matrix)
weights = ef.max_sharpe()
cleaned_weights = ef.clean_weights()

print("\nOptimized Portfolio Weights:")
for ticker, weight in cleaned_weights.items():
    print(f"{ticker}: {weight:.4f}")

# Calculate and display performance metrics
expected_annual_return, annual_volatility, sharpe_ratio = ef.portfolio_performance()
print("\nPortfolio Performance Metrics:")
print(f"Expected Annual Return: {expected_annual_return:.2%}")
print(f"Annual Volatility: {annual_volatility:.2%}")
print(f"Sharpe Ratio: {sharpe_ratio:.2f}")


Adjusted Expected Returns:
META: 0.7333
AAPL: 0.3205
AMZN: 0.4449
NFLX: 0.8776
GOOGL: 0.3478

Optimized Portfolio Weights:
META: 0.2336
AAPL: 0.1211
AMZN: 0.0000
NFLX: 0.6453
GOOGL: 0.0000

Portfolio Performance Metrics:
Expected Annual Return: 77.65%
Annual Volatility: 24.51%
Sharpe Ratio: 3.09
