In [None]:
%pip install yfinance
%pip install augini
%pip install PyPortfolioOpt
%pip install pandas

In [1]:
import yfinance as yf
import pandas as pd
from pypfopt import EfficientFrontier, expected_returns, risk_models
from augini import Augini
import os
from IPython.display import display, Markdown

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 0: Provide your API key
def get_api_key():
    """Return the API key used to configure Augini.

    The ``OPENROUTER_TOKEN`` environment variable is preferred. When it is
    unset or blank, the user is prompted for the key instead.

    The prompt uses ``getpass`` rather than ``input`` so the typed secret is
    not echoed back and therefore does not end up in the saved notebook output.

    Returns:
        str: The API key with surrounding whitespace removed (may be an empty
        string if the user submits nothing at the prompt).
    """
    # Imported locally so the function does not depend on the notebook's import cell.
    import getpass

    api_key = (os.environ.get('OPENROUTER_TOKEN') or '').strip()
    if api_key:
        print("Using API key from environment variable.")
        return api_key

    # Pasted keys often carry a trailing newline/space; strip it before use.
    return getpass.getpass("Enter your API key manually: ").strip()

# Resolve the API key first, then build the Augini client that the later
# augmentation and chat cells use.
api_key = get_api_key()
augini = Augini(
    model='openai/gpt-4o-mini',
    use_openrouter=True,
    api_key=api_key,
)

Using API key from environment variable.


In [3]:
# Step 1: Initial Data Collection

# Ticker symbols analysed throughout the notebook; passed to both the price
# download and the news search.
tickers = ['META', 'AAPL', 'AMZN', 'NFLX', 'GOOGL']

def get_stock_data(tickers, period='1y'):
    """Collect closing prices for several tickers into one DataFrame.

    Args:
        tickers: Iterable of ticker symbols to look up with ``yf.Ticker``.
        period: Look-back window forwarded to ``Ticker.history`` (default ``'1y'``).

    Returns:
        pandas.DataFrame: One column per ticker (in input order) holding the
        ``'Close'`` series, indexed by the union of all dates returned for any
        ticker and sorted by date. A ticker with no value on a given date has
        NaN there. An empty DataFrame is returned when ``tickers`` is empty.
    """
    closes = {}
    for ticker in tickers:
        hist = yf.Ticker(ticker).history(period=period)
        closes[ticker] = hist['Close']

    if not closes:
        # pd.concat raises on an empty mapping; mirror the original's empty frame.
        return pd.DataFrame()

    # Assigning columns one at a time into an empty frame aligns every later
    # ticker to the FIRST ticker's index, silently dropping dates the first
    # ticker lacks. Concatenating keeps the union of dates instead.
    return pd.concat(closes, axis=1).sort_index()

# Download closing prices for the configured tickers
stock_data = get_stock_data(tickers)

# Show the first rows, then summary statistics, each under its own heading
for heading, table in (
    ("\nStock price data sample:", stock_data.head()),
    ("\nBasic statistics for each stock:", stock_data.describe()),
):
    print(heading)
    print(table)


Stock price data sample:
                                 META        AAPL        AMZN        NFLX  \
Date                                                                        
2023-12-27 00:00:00-05:00  356.468292  192.208359  153.339996  491.790009   
2023-12-28 00:00:00-05:00  356.956421  192.636276  153.380005  490.510010   
2023-12-29 00:00:00-05:00  352.613037  191.591385  151.940002  486.880005   
2024-01-02 00:00:00-05:00  344.972229  184.734985  149.929993  468.500000   
2024-01-03 00:00:00-05:00  343.159149  183.351761  148.470001  470.260010   

                                GOOGL  
Date                                   
2023-12-27 00:00:00-05:00  139.862976  
2023-12-28 00:00:00-05:00  139.723480  
2023-12-29 00:00:00-05:00  139.185440  
2024-01-02 00:00:00-05:00  137.670929  
2024-01-03 00:00:00-05:00  138.418228  

Basic statistics for each stock:
             META        AAPL        AMZN        NFLX       GOOGL
count  253.000000  253.000000  253.000000  253.000000 

In [4]:
# Step 2: News Data Collection using yfinance Search

# Column layout of the news table; also used to build a well-formed empty frame.
_NEWS_COLUMNS = ['ticker', 'title', 'description', 'link', 'publisher', 'date']


def _utc_now_naive():
    """Return the current time as a tz-naive UTC timestamp.

    Article dates are built from epoch seconds, which yields tz-naive UTC
    values; fallback dates must use the same convention to sort correctly.
    """
    return pd.Timestamp.now(tz='UTC').tz_localize(None)


def _placeholder_news_row(ticker, title, description):
    """Build the stand-in row recorded when a ticker has no usable news."""
    return {
        'ticker': ticker,
        'title': title,
        'description': description,
        'link': '',
        'publisher': '',
        'date': _utc_now_naive()
    }


def get_stock_news(tickers, news_count=10):
    """Fetch recent news for each ticker via ``yf.Search``.

    Failures are handled on a best-effort basis: an error while searching a
    ticker, or an empty result, is printed and recorded as a placeholder row
    so every ticker that was searched appears in the output. An error while
    reading a single news item is printed and that item is skipped.

    Args:
        tickers: Iterable of ticker symbols used as the search query.
        news_count: Passed as both ``news_count`` and ``max_results`` to
            ``yf.Search`` (default 10).

    Returns:
        pandas.DataFrame: Columns ``ticker``, ``title``, ``description``,
        ``link``, ``publisher`` and ``date``, sorted by ``date`` descending.
        An empty frame with those columns is returned when nothing was
        collected (for example, when ``tickers`` is empty).
    """
    news_data = []

    for ticker in tickers:
        try:
            # Use Search class to get news
            search_results = yf.Search(
                query=ticker,
                news_count=news_count,
                max_results=news_count
            )

            if hasattr(search_results, 'news') and search_results.news:
                for news_item in search_results.news:
                    try:
                        published = news_item.get('providerPublishTime')
                        # Missing publish time falls back to "now", expressed in
                        # the same tz-naive UTC convention as real article dates.
                        if published is None:
                            article_date = _utc_now_naive()
                        else:
                            article_date = pd.to_datetime(published, unit='s')
                        news_data.append({
                            'ticker': ticker,
                            'title': news_item.get('title', ''),
                            'description': news_item.get('description', ''),
                            'link': news_item.get('link', ''),
                            'publisher': news_item.get('publisher', ''),
                            'date': article_date
                        })
                    except Exception as e:
                        print(f"Error processing news item for {ticker}: {str(e)}")
                        continue
            else:
                print(f"No news found for {ticker}")
                news_data.append(_placeholder_news_row(
                    ticker, 'No recent news available', 'No description available'
                ))

        except Exception as e:
            print(f"Error fetching news for {ticker}: {str(e)}")
            news_data.append(_placeholder_news_row(
                ticker, 'Failed to fetch news', 'Error in data retrieval'
            ))

    # Passing the column list keeps the 'date' column present even when no rows
    # were collected, so the sort below cannot raise KeyError.
    news_df = pd.DataFrame(news_data, columns=_NEWS_COLUMNS)

    # Sort by date (most recent first)
    news_df = news_df.sort_values('date', ascending=False)

    return news_df

# Retrieve the news table for every configured ticker
news_df = get_stock_news(tickers)

# Each report section is a heading followed by one printed value:
# overall count, per-ticker counts, a short preview, and a null-value check.
report_sections = [
    ("\nRecent news data summary:", f"Total news articles retrieved: {len(news_df)}"),
    ("\nArticles per ticker:", news_df['ticker'].value_counts()),
    ("\nSample of recent news:", news_df[['ticker', 'title', 'date']].head()),
    ("\nMissing values in each column:", news_df.isnull().sum()),
]
for heading, body in report_sections:
    print(heading)
    print(body)

# Span of publication dates covered by the retrieved articles
article_dates = news_df['date']
print("\nNews date range:")
print(f"Earliest: {article_dates.min()}")
print(f"Latest: {article_dates.max()}")


Recent news data summary:
Total news articles retrieved: 50

Articles per ticker:
ticker
AAPL     10
GOOGL    10
NFLX     10
AMZN     10
META     10
Name: count, dtype: int64

Sample of recent news:
   ticker                                              title  \
16   AAPL  Dow Jones Slides 280 Points As Treasury Yields...   
17   AAPL  Made In China, Paid In America: What Trump's N...   
48  GOOGL  OpenAI Wants to Create a ‘Public Benefit Corpo...   
30   NFLX  Netflix sets NFL streaming record with Christm...   
35   NFLX  Shareholders Don’t Vote as If Their Power Matt...   

                  date  
16 2024-12-27 14:32:50  
17 2024-12-27 13:58:42  
48 2024-12-27 13:57:00  
30 2024-12-27 13:52:32  
35 2024-12-27 13:30:00  

Missing values in each column:
ticker         0
title          0
description    0
link           0
publisher      0
date           0
dtype: int64

News date range:
Earliest: 2024-12-26 09:03:32
Latest: 2024-12-27 14:32:50


In [5]:
# Step 3: Synthetic Data Generation using Augini

# Instruction text handed to Augini as the custom prompt for the augmentation
# call below. It asks for one of three labels (Bearish / Neutral / Bullish)
# and names the output column 'sentiment'.
sentiment_prompt = """
Analyze the news title and description to determine market sentiment.
Generate synthetic sentiment labels across three categories:
- Bearish (negative outlook)
- Neutral (stable outlook)
- Bullish (positive outlook)
Base the sentiment on market implications and investor perspective. The name of the column returned will be 'sentiment'.
"""

# Ask Augini to add a 'sentiment' column to the news table, guided by the prompt
augment_options = {
    "column_name": "sentiment",
    "custom_prompt": sentiment_prompt,
    "use_sync": False,
}
augmented_df = augini.augment_single(news_df, **augment_options)

# Preview the new label next to the headline it was derived from
preview_columns = ['ticker', 'title', 'sentiment']
print("\nOriginal vs Augmented Data Sample:")
print(augmented_df[preview_columns].head())

# Convert sentiment to numerical scores
sentiment_scores = {
    'Bearish': -1,
    'Neutral': 0,
    'Bullish': 1
}

# The labels are generated by a language model, so their exact spelling is not
# guaranteed. Normalise whitespace and capitalisation before the lookup so that
# variants such as "bullish" or " Bullish " still map to a score.
normalized_sentiment = augmented_df['sentiment'].astype(str).str.strip().str.capitalize()
sentiment_score = normalized_sentiment.map(sentiment_scores)

# Series.map leaves unmatched labels as NaN, which would later turn the
# sentiment-adjusted expected returns into NaN. Report them and treat them as
# Neutral instead.
unrecognized = sentiment_score.isna()
if unrecognized.any():
    unknown_labels = sorted(augmented_df.loc[unrecognized, 'sentiment'].astype(str).unique())
    print(f"Warning: {int(unrecognized.sum())} unrecognized sentiment label(s) treated as Neutral: {unknown_labels}")

augmented_df['sentiment_score'] = sentiment_score.fillna(sentiment_scores['Neutral']).astype(int)

print("\nSentiment Distribution:")
print(augmented_df['sentiment'].value_counts())


Original vs Augmented Data Sample:
   ticker                                              title sentiment
16   AAPL  Dow Jones Slides 280 Points As Treasury Yields...   Bearish
17   AAPL  Made In China, Paid In America: What Trump's N...   Bullish
48  GOOGL  OpenAI Wants to Create a ‘Public Benefit Corpo...   Bearish
30   NFLX  Netflix sets NFL streaming record with Christm...   Bearish
35   NFLX  Shareholders Don’t Vote as If Their Power Matt...   Bearish

Sentiment Distribution:
sentiment
Bullish    20
Bearish    16
Neutral    14
Name: count, dtype: int64


In [6]:
# Ask Augini for an overview of the augmented table and render the reply as Markdown
overview_markdown = augini.chat("Give me an overview of the data.", df=augmented_df)
display(Markdown(overview_markdown))

## Data Overview 📊

The dataset consists of **50 rows** and **8 columns**. It contains news articles associated with various stock tickers, including their titles, descriptions, publication links, publishers, publication dates, sentiment analysis, and sentiment scores.

### Column Details
- **Ticker**: Represents stock tickers (5 unique values). The most frequent tickers are AAPL, GOOGL, NFLX, AMZN, and META, each appearing 10 times.
- **Title**: Contains titles of articles (44 unique values). The most common titles appear twice.
- **Description**: All entries are empty, indicating a lack of description content.
- **Link**: Contains URLs of articles (44 unique values). Several links are repeated twice.
- **Publisher**: Lists the publishers (15 unique values). Most articles are published by 'Motley Fool', followed by 'Insider Monkey' and 'Zacks'.
- **Date**: Indicates the publication date and time (43 unique values). The most frequent dates include several entries on '2024-12-27'.
- **Sentiment**: Categorizes the sentiment of the articles into three groups: Bullish (20), Bearish (16), Neutral (14).
- **Sentiment Score**: Numerical representation of sentiment with a mean score of **0.08** and a range from **-1.0 to 1.0**.

### Data Quality
- **Missing Values**: There are no missing values across all columns.
- **Duplicated Rows**: The dataset has no duplicate entries.

### Sample Data
Here are the first three rows:
1. **AAPL** - "Dow Jones Slides 280 Points As Treasury Yields Climb; Nvidia, Tesla Drop" - Sentiment: Bearish (Score: -1)
2. **AAPL** - "Made In China, Paid In America: What Trump's New Tariffs Mean For The US Economy" - Sentiment: Bullish (Score: 1)
3. **GOOGL** - "OpenAI Wants to Create a ‘Public Benefit Corporation.’ Here’s What That Means." - Sentiment: Bearish (Score: -1)

### Limitations
- The **description** column lacks content, limiting the depth of information provided by each article.

In [7]:
# Step 4: Portfolio Optimization with Sentiment Integration

# Calculate base returns and risk
returns = expected_returns.mean_historical_return(stock_data)
cov_matrix = risk_models.sample_cov(stock_data)

# Adjust expected returns based on sentiment
sentiment_adjustment = 0.02

# Every article contributes a factor of (1 + adjustment * score); the factors
# for one ticker are multiplied together. Missing scores count as neutral so a
# single NaN cannot wipe out a ticker's expected return.
article_factors = 1 + sentiment_adjustment * augmented_df['sentiment_score'].fillna(0)
ticker_factors = (
    article_factors
    .groupby(augmented_df['ticker'])
    .prod()
    .reindex(returns.index)  # ignore news for unknown tickers
    .fillna(1.0)             # tickers without news stay unadjusted
)

# Scale by the magnitude of the base return so positive sentiment always raises
# the expected return. Multiplying a negative return by a factor above 1 would
# push it further down instead. For positive base returns this is identical to
# multiplying by the factor.
returns = returns + returns.abs() * (ticker_factors - 1)

# Report the sentiment-adjusted expected return of every asset
print("\nAdjusted Expected Returns:")
for ticker, adjusted_return in returns.items():
    print(f"{ticker}: {adjusted_return:.4f}")

# Optimize portfolio: solve for the maximum-Sharpe allocation, then take the
# cleaned weights reported by the optimizer
ef = EfficientFrontier(returns, cov_matrix)
weights = ef.max_sharpe()
cleaned_weights = ef.clean_weights()

print("\nOptimized Portfolio Weights:")
for ticker, weight in cleaned_weights.items():
    print(f"{ticker}: {weight:.4f}")

# Calculate and display performance metrics
performance = ef.portfolio_performance()
expected_annual_return, annual_volatility, sharpe_ratio = performance

print("\nPortfolio Performance Metrics:")
print(f"Expected Annual Return: {expected_annual_return:.2%}")
print(f"Annual Volatility: {annual_volatility:.2%}")
print(f"Sharpe Ratio: {sharpe_ratio:.2f}")


Adjusted Expected Returns:
META: 0.6418
AAPL: 0.3627
AMZN: 0.4434
NFLX: 0.8460
GOOGL: 0.3919

Optimized Portfolio Weights:
META: 0.1619
AAPL: 0.2094
AMZN: 0.0000
NFLX: 0.6013
GOOGL: 0.0274

Portfolio Performance Metrics:
Expected Annual Return: 69.93%
Annual Volatility: 22.94%
Sharpe Ratio: 2.96
