# Investment Portfolio Dataset Exploration

This notebook explores the generated portfolio dataset and demonstrates how to use it with ML models.

In [None]:
# Import necessary libraries
import ast
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot appearance: matplotlib style sheet first, then the seaborn theme
# (statement order is kept exactly as before).
plt.style.use('dark_background')
sns.set_theme(style='darkgrid')

# pandas display: no cap on the number of columns shown, wide output lines
pd.options.display.max_columns = None
pd.options.display.width = 1000

## 1. Load the Dataset

First, let's load the portfolio dataset and market news data.

In [None]:
# Check if data directory exists, if not, run the generator script
if not os.path.exists('data/portfolio_dataset.csv'):
    print("Dataset not found. Running generator script...")
    %run portfolio_dataset_generator.py
    print("Dataset generation complete.")
else:
    print("Dataset found. Loading data...")

# Load portfolio data
portfolio_df = pd.read_csv('data/portfolio_dataset.csv')

# Convert sdg_alignment from string to list if needed
if 'sdg_alignment' in portfolio_df.columns:
    portfolio_df['sdg_alignment'] = portfolio_df['sdg_alignment'].apply(
        lambda x: eval(x) if isinstance(x, str) and x.strip() else []
    )

# Load market news data
news_df = pd.read_csv('data/market_news.csv')

# Try to load SDG data and sustainability trends if they exist
try:
    with open('data/sdg_data.json', 'r') as f:
        sdg_data = json.load(f)
    print(f"Loaded SDG data successfully")
except Exception as e:
    print(f"Note: SDG data file not found. This is not critical.")
    # Create a simple SDG dictionary for reference
    sdg_data = {str(i): {"name": f"SDG {i}"} for i in range(1, 18)}

try:
    with open('data/sustainability_trends.json', 'r') as f:
        sustainability_trends = json.load(f)
    print(f"Loaded sustainability trends successfully")
except Exception as e:
    print(f"Note: Sustainability trends file not found. This is not critical.")
    sustainability_trends = []

## 2. Explore Portfolio Data

Let's examine the portfolio dataset.

In [None]:
# Headline numbers: overall shape plus a row count for each asset type of interest
print(f"Portfolio dataset shape: {portfolio_df.shape}")
print(f"Number of stocks: {portfolio_df['asset_type'].eq('Stock').sum()}")
print(f"Number of cryptocurrencies: {portfolio_df['asset_type'].eq('Crypto').sum()}")

# Preview the leading rows (rendered as the cell's output)
portfolio_df.head()

In [None]:
# Print a per-column summary (dtype and non-null count) to spot missing values
portfolio_df.info()

In [None]:
# Summary statistics for the dataset's columns (shown as the cell's output)
portfolio_df.describe()

## 3. Visualize Portfolio Data

Let's create some visualizations to better understand the data.

In [None]:
# ESG score spread for each asset type, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=portfolio_df, x='asset_type', y='esg_score', ax=ax)
ax.set_title('Distribution of ESG Scores by Asset Type')
ax.set_xlabel('Asset Type')
ax.set_ylabel('ESG Score')
plt.show()

In [None]:
# ESG score against 1-year ROI; colour encodes asset type, marker size encodes market cap
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=portfolio_df,
    x='esg_score',
    y='roi_1y',
    hue='asset_type',
    size='market_cap_b',
    sizes=(50, 400),
    alpha=0.7,
    ax=ax,
)
ax.set_title('ESG Score vs. 1-Year ROI')
ax.set_xlabel('ESG Score')
ax.set_ylabel('1-Year ROI (%)')
ax.grid(True)
plt.show()

In [None]:
# Correlation heatmap of numerical features
numeric_cols = ['current_price', 'price_change_24h', 'market_cap_b', 'roi_1y', 'volatility',
                'environmental_score', 'social_score', 'governance_score', 'esg_score',
                'beta', 'sharpe_ratio', 'market_correlation', 'carbon_footprint']

plt.figure(figsize=(14, 10))
correlation = portfolio_df[numeric_cols].corr()

# Hide the diagonal and the upper triangle (they mirror the lower triangle).
# The mask is built from cell POSITIONS: the previous np.triu(correlation)
# derived it from the correlation VALUES, so an upper-triangle cell whose
# correlation was exactly 0 evaluated as "not masked" and was drawn.
mask = np.triu(np.ones_like(correlation, dtype=bool))

sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', mask=mask, vmin=-1, vmax=1)
plt.title('Correlation Matrix of Portfolio Features')
plt.tight_layout()
plt.show()

In [None]:
# Number of assets in each sector, most frequent sector first
sector_counts = portfolio_df['sector'].value_counts()

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(x=sector_counts.index, y=sector_counts.values, ax=ax)
ax.set_title('Distribution of Assets by Sector')
ax.set_xlabel('Sector')
ax.set_ylabel('Count')
# Tilt the sector labels and right-align them under their bars
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 4. Explore Market News Data

Let's examine the market news dataset.

In [None]:
# Size of the news table and how many items carry each sentiment label
sentiment_distribution = news_df['sentiment'].value_counts()
print(f"News dataset shape: {news_df.shape}")
print(f"Sentiment distribution:\n{sentiment_distribution}")

# Preview the leading rows (rendered as the cell's output)
news_df.head()

In [None]:
# One bar per sentiment label, coloured by a fixed label -> colour mapping.
# NOTE(review): recent seaborn releases reportedly warn when `palette` is given
# without `hue` - confirm against the installed version before changing the call.
sentiment_palette = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=news_df, x='sentiment', palette=sentiment_palette, ax=ax)
ax.set_title('Distribution of News Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

## 5. Prepare Data for ML Models

Now, let's prepare the data for use with ML models.

In [None]:
# Function to prepare data for portfolio recommendation model
def prepare_data_for_recommendation(portfolio_df, user_preferences):
    """
    Build the scored feature table used for portfolio recommendations.

    Args:
        portfolio_df: DataFrame of portfolio assets. Must contain every column
            named in ``features`` below. If it has an ``sdg_alignment`` column
            that still holds list literals as strings, that column is parsed
            in place on the caller's frame (same side effect as before).
        user_preferences: Dict of user preferences. Reads ``risk_tolerance``
            and ``sustainability_focus``; each defaults to 5 when absent.

    Returns:
        A copy of the feature columns with an added ``custom_score`` column
        (higher means a better match for the given preferences).

    Raises:
        KeyError: if a required feature column is missing.
        ValueError / SyntaxError: if an ``sdg_alignment`` string is not a
            valid Python literal.
    """
    # Convert SDG alignment from string to list if needed.
    # - The column is optional and the frame may be empty, so no .iloc[0] probe.
    # - SECURITY: ast.literal_eval replaces eval(); only literals are accepted,
    #   so a crafted cell value cannot execute code.
    if 'sdg_alignment' in portfolio_df.columns:
        sdg_column = portfolio_df['sdg_alignment']
        if sdg_column.map(lambda value: isinstance(value, str)).any():
            portfolio_df['sdg_alignment'] = sdg_column.map(
                lambda value: (ast.literal_eval(value) if value.strip() else [])
                if isinstance(value, str) else value
            )

    # Extract features for ML model
    features = [
        'ticker', 'name', 'asset_type', 'sector',
        'current_price', 'price_change_24h', 'market_cap_b', 'roi_1y', 'volatility',
        'environmental_score', 'social_score', 'governance_score', 'esg_score',
        'beta', 'sharpe_ratio', 'market_correlation', 'carbon_footprint'
    ]

    ml_data = portfolio_df[features].copy()

    # Apply user preferences
    risk_tolerance = user_preferences.get('risk_tolerance', 5)
    sustainability_focus = user_preferences.get('sustainability_focus', 5)

    # Calculate weights based on user preferences
    risk_weight = (11 - risk_tolerance) / 10  # 1.0 to 0.1
    esg_weight = sustainability_focus / 10  # 0.1 to 1.0
    # The return weight takes whatever is left over. For cautious,
    # sustainability-focused users the remainder is negative
    # (e.g. risk_tolerance=3, sustainability_focus=8 -> -0.2), which would
    # penalise assets for having a higher ROI, so it is floored at zero.
    return_weight = max(0.0, 1 - risk_weight - esg_weight / 2)

    # Calculate custom score: ESG term + stability term + return term
    ml_data['custom_score'] = (
        ml_data['esg_score'] * esg_weight +
        (100 - ml_data['volatility'] * 100) * risk_weight +
        ml_data['roi_1y'] * return_weight
    )

    return ml_data

# Sample investor profile; both numeric settings use a 1-10 scale
user_preferences = dict(
    risk_tolerance=7,  # higher = more risk tolerant
    sustainability_focus=8,  # higher = more sustainability focused
    investment_horizon='Long-term (> 3 years)',
)

# Score every asset for this profile and preview the result
ml_data = prepare_data_for_recommendation(portfolio_df, user_preferences)
ml_data.head()

In [None]:
# Rank assets from best to worst match for the user's preferences
recommendations = ml_data.sort_values(by='custom_score', ascending=False)

# Columns worth showing next to the score
recommendation_columns = [
    'name', 'ticker', 'asset_type', 'sector',
    'esg_score', 'roi_1y', 'volatility', 'custom_score',
]

# Show the five best-scoring assets
print("Top 5 Recommendations based on User Preferences:")
recommendations[recommendation_columns].head(5)

## 6. Visualize Recommendations

Let's tabulate how the top recommendations change with different user preferences.

In [None]:
# Function to get recommendations for different user preferences
def get_recommendations_for_preferences(portfolio_df, risk_tolerance, sustainability_focus):
    """
    Return the tickers of the three best-scoring assets for one preference pair.

    Args:
        portfolio_df: DataFrame of portfolio assets, passed straight to
            prepare_data_for_recommendation.
        risk_tolerance: Risk tolerance setting to score with.
        sustainability_focus: Sustainability focus setting to score with.

    Returns:
        List of up to three ticker symbols, highest custom_score first.
    """
    scored_assets = prepare_data_for_recommendation(
        portfolio_df,
        {
            'risk_tolerance': risk_tolerance,
            'sustainability_focus': sustainability_focus,
            'investment_horizon': 'Medium-term (1-3 years)',
        },
    )
    ranked_assets = scored_assets.sort_values('custom_score', ascending=False)
    top_three = ranked_assets.head(3)
    return top_three['ticker'].tolist()

# Preference levels to compare on each axis: low, medium, high
risk_values = [3, 5, 8]
sustainability_values = [3, 5, 8]

# One record per (risk, sustainability) pair, risk varying slowest
results = [
    {
        'Risk Tolerance': risk,
        'Sustainability Focus': sus,
        'Top Recommendations': ', '.join(
            get_recommendations_for_preferences(portfolio_df, risk, sus)
        ),
    }
    for risk in risk_values
    for sus in sustainability_values
]

results_df = pd.DataFrame(results)

# Show the comparison table
print("How Recommendations Change with Different User Preferences:")
results_df

## 7. Prepare Data for Risk Assessment Model

Now, let's prepare the data for the risk assessment model.

In [None]:
# Function to assess portfolio risk
def assess_portfolio_risk(portfolio_df, user_preferences):
    """
    Assess the risk of a portfolio

    Args:
        portfolio_df: DataFrame of portfolio assets. Reads the ``volatility``,
            ``beta`` and ``esg_score`` columns. If an ``allocation`` column is
            present it is used as the per-asset weight; otherwise every asset
            is weighted equally.
        user_preferences: Dict of user preferences. Reads ``risk_tolerance``
            and ``sustainability_focus`` (default 5 each); values outside the
            1-10 scale are clamped to it.

    Returns:
        Dict with ``risk_category`` ('Low', 'Moderate', 'High' or 'Very High'),
        ``risk_score`` (clamped to 0-100), ``risk_factors`` and
        ``portfolio_metrics``.
    """
    # Weight by allocation when available; np.average(weights=None) is a plain
    # mean, so a frame without the column no longer raises KeyError.
    weights = portfolio_df['allocation'] if 'allocation' in portfolio_df.columns else None

    # Calculate portfolio-level metrics
    portfolio_volatility = np.average(portfolio_df['volatility'], weights=weights)
    portfolio_beta = np.average(portfolio_df['beta'], weights=weights)
    portfolio_esg_risk = 100 - np.average(portfolio_df['esg_score'], weights=weights)

    # Read preferences and keep them on the 1-10 scale. Without the clamp a
    # risk_tolerance of 0 would divide by zero further down.
    risk_tolerance = min(max(user_preferences.get('risk_tolerance', 5), 1), 10)
    sustainability_focus = min(max(user_preferences.get('sustainability_focus', 5), 1), 10)

    # Higher risk tolerance = lower perceived risk
    risk_tolerance_factor = risk_tolerance / 5  # 0.2-2.0 range

    # Higher sustainability focus = higher sensitivity to ESG risk
    sustainability_factor = sustainability_focus / 5  # 0.2-2.0 range

    # Adjust risk components
    market_risk_weight = 1.0 / risk_tolerance_factor
    esg_risk_weight = sustainability_factor

    # Calculate adjusted risk score: 40% market, 40% ESG, 20% systematic (beta)
    risk_score = (portfolio_volatility * 100) * 0.4 * market_risk_weight + \
                 portfolio_esg_risk * 0.4 * esg_risk_weight + \
                 portfolio_beta * 20 * 0.2

    # Ensure risk score is within 0-100 range
    risk_score = min(max(risk_score, 0), 100)

    # Determine risk category from fixed score bands
    if risk_score < 25:
        risk_category = 'Low'
    elif risk_score < 50:
        risk_category = 'Moderate'
    elif risk_score < 75:
        risk_category = 'High'
    else:
        risk_category = 'Very High'

    # Prepare risk factors (unadjusted component values, not preference-weighted)
    risk_factors = {
        'Market Risk': portfolio_volatility * 100,
        'Systematic Risk': portfolio_beta * 50,
        'ESG Risk': portfolio_esg_risk
    }

    return {
        'risk_category': risk_category,
        'risk_score': risk_score,
        'risk_factors': risk_factors,
        'portfolio_metrics': {
            'volatility': portfolio_volatility,
            'beta': portfolio_beta,
            'esg_risk_score': portfolio_esg_risk
        }
    }

# Run the risk assessment for the example user
risk_assessment = assess_portfolio_risk(portfolio_df, user_preferences)

# Headline: category and overall score
category_line = f"Risk Category: {risk_assessment['risk_category']}"
score_line = f"Risk Score: {risk_assessment['risk_score']:.2f}"
print(category_line)
print(score_line)

# Breakdown: one indented line per risk factor
print("\nRisk Factors:")
risk_factor_items = risk_assessment['risk_factors'].items()
for factor, score in risk_factor_items:
    print(f"  {factor}: {score:.2f}")

## 8. Prepare Data for Sentiment Analysis Model

Finally, let's prepare the data for the sentiment analysis model.

In [None]:
# Function to analyze market sentiment
def analyze_market_sentiment(ticker, news_df, user_preferences):
    """
    Analyze market sentiment for a given ticker

    Args:
        ticker: Stock ticker symbol
        news_df: DataFrame of news items; reads the ``ticker`` and
            ``sentiment`` columns
        user_preferences: Dict of user preferences. Reads ``risk_tolerance``
            and ``sustainability_focus`` (default 5 each).

    Returns:
        Dict with ``ticker``, ``sentiment_score``, ``overall_sentiment``,
        ``sentiment_counts`` and ``news``. When no news matches the ticker the
        score is 0, the counts and news are empty, and an extra ``error`` key
        explains why.
    """
    # Filter news for the given ticker
    ticker_news = news_df[news_df['ticker'] == ticker]

    if len(ticker_news) == 0:
        # Same keys as the normal result so callers can iterate over
        # sentiment_counts / news without special-casing this branch.
        return {
            'ticker': ticker,
            'sentiment_score': 0,
            'overall_sentiment': 'Neutral',
            'sentiment_counts': {},
            'news': [],
            'error': 'No news found for this ticker'
        }

    # Base sentiment score: mean label value scaled to -100..100.
    # Labels are matched case-insensitively with surrounding whitespace removed;
    # labels that are still unrecognised (including missing values) are left
    # out of the average instead of raising KeyError.
    sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}
    normalized_labels = [str(label).strip().lower() for label in ticker_news['sentiment']]
    sentiment_values = [sentiment_map[label] for label in normalized_labels if label in sentiment_map]
    if sentiment_values:
        base_sentiment_score = sum(sentiment_values) / len(sentiment_values) * 100
    else:
        base_sentiment_score = 0.0

    # Adjust sentiment score based on user preferences
    risk_tolerance = user_preferences.get('risk_tolerance', 5)
    sustainability_focus = user_preferences.get('sustainability_focus', 5)

    # Higher sustainability focus = more sensitive to negative ESG news
    sustainability_factor = sustainability_focus / 5  # 0.2-2.0 range

    # Lower risk tolerance = more sensitive to negative news
    risk_sensitivity = (11 - risk_tolerance) / 5  # 0.2-2.0 range

    # Apply adjustments
    if base_sentiment_score < 0:
        # Negative sentiment is scaled by the larger of the two factors, so the
        # adjusted score can fall below -100 (down to -200 for 1-10 inputs).
        sentiment_score = base_sentiment_score * max(sustainability_factor, risk_sensitivity)
    elif sustainability_factor > 1:
        # Positive sentiment is slightly dampened for very sustainability-focused users
        sentiment_score = base_sentiment_score * (1 - (sustainability_factor - 1) * 0.1)
    else:
        sentiment_score = base_sentiment_score

    # Count sentiments (raw labels as they appear in the data)
    sentiment_counts = ticker_news['sentiment'].value_counts().to_dict()

    # Determine overall sentiment from fixed score bands
    if sentiment_score > 30:
        overall_sentiment = 'Bullish'
    elif sentiment_score > 10:
        overall_sentiment = 'Somewhat Bullish'
    elif sentiment_score > -10:
        overall_sentiment = 'Neutral'
    elif sentiment_score > -30:
        overall_sentiment = 'Somewhat Bearish'
    else:
        overall_sentiment = 'Bearish'

    return {
        'ticker': ticker,
        'sentiment_score': sentiment_score,
        'overall_sentiment': overall_sentiment,
        'sentiment_counts': sentiment_counts,
        'news': ticker_news.to_dict('records')
    }

# Get top recommendation ticker
top_ticker = recommendations['ticker'].iloc[0]

# Analyze market sentiment
sentiment_analysis = analyze_market_sentiment(top_ticker, news_df, user_preferences)

# Display sentiment analysis results
print(f"Market Sentiment for {top_ticker}: {sentiment_analysis['overall_sentiment']}")
print(f"Sentiment Score: {sentiment_analysis['sentiment_score']:.2f}")
if 'error' in sentiment_analysis:
    # The analysis returns an 'error' entry when the ticker has no news at all
    print(f"Note: {sentiment_analysis['error']}")

# 'sentiment_counts' and 'news' are read with .get() because the no-news
# result may not include them; indexing directly raised KeyError in that case.
print("\nSentiment Distribution:")
for sentiment, count in sentiment_analysis.get('sentiment_counts', {}).items():
    print(f"  {sentiment.capitalize()}: {count}")

# Show the first three news records as returned (no date sorting is applied here)
print("\nRecent News:")
for position, news in enumerate(sentiment_analysis.get('news', [])[:3], start=1):
    print(f"  {position}. {news['headline']} ({news['sentiment']})")

## 9. Conclusion

In this notebook, we've explored the portfolio dataset and demonstrated how to use it with ML models for:

1. Portfolio recommendations based on user preferences
2. Risk assessment of the portfolio
3. Market sentiment analysis for specific assets

These models can be integrated into the investment portfolio application to provide personalized recommendations and insights to users.