# Sustainable Investment Portfolio Analysis

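This notebook demonstrates the models used for portfolio recommendations and analysis: a preference-weighted recommendation score, a portfolio risk assessment, and news-based sentiment scoring. The models implemented below are rule-based scoring heuristics; no ML model is trained in this notebook.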

In [None]:
import ast
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Set plot style
# NOTE(review): sns.set_theme() also writes matplotlib rcParams, so calling it after
# plt.style.use() presumably overrides part of the 'dark_background' style —
# confirm which of the two looks is actually intended before reordering.
plt.style.use('dark_background')
sns.set_theme(style="darkgrid")

## 1. Load and Prepare Data

In [None]:
# Generate dataset if it doesn't exist
if not os.path.exists('data/portfolio_dataset.csv'):
    print("Dataset not found. Running generator script...")
    %run portfolio_dataset_generator.py
    print("Dataset generation complete.")
else:
    print("Dataset found. Loading data...")

# Load portfolio data
portfolio_df = pd.read_csv('data/portfolio_dataset.csv')

# Convert sdg_alignment from string to list if needed
if 'sdg_alignment' in portfolio_df.columns:
    portfolio_df['sdg_alignment'] = portfolio_df['sdg_alignment'].apply(
        lambda x: eval(x) if isinstance(x, str) and x.strip() else []
    )

# Load market news data
news_df = pd.read_csv('data/market_news.csv')

print("Dataset shapes:")
print(f"Portfolio data: {portfolio_df.shape}")
print(f"Market news data: {news_df.shape}")

# Display first few rows
portfolio_df.head()

## 2. Portfolio Analysis

In [None]:
# Headline counts and averages for the loaded portfolio
print("Portfolio Overview:")
print(f"Number of assets: {len(portfolio_df)}")
print(f"Number of stocks: {(portfolio_df['asset_type'] == 'Stock').sum()}")
print(f"Number of cryptocurrencies: {(portfolio_df['asset_type'] == 'Crypto').sum()}")
print(f"Sectors: {', '.join(portfolio_df['sector'].unique())}")
print(f"\nAverage ESG Score: {portfolio_df['esg_score'].mean():.2f}")
print(f"Average ROI: {portfolio_df['roi_1y'].mean():.2f}%")

# Bar chart: how many assets fall into each sector
fig, ax = plt.subplots(figsize=(12, 6))
sector_counts = portfolio_df['sector'].value_counts()
sns.barplot(x=sector_counts.index, y=sector_counts.values, ax=ax)
ax.set_title('Assets by Sector')
# Tilt the sector names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 3. ESG Analysis

In [None]:
# Box plot: spread of the overall ESG score within each asset type
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=portfolio_df, x='asset_type', y='esg_score', ax=ax)
ax.set_title('ESG Scores by Asset Type')
ax.set_xlabel('Asset Type')
ax.set_ylabel('ESG Score')
plt.show()

# Grouped bars: mean of each ESG component per asset type
esg_components = ['environmental_score', 'social_score', 'governance_score']
fig, ax = plt.subplots(figsize=(12, 6))
esg_data = portfolio_df.groupby('asset_type')[esg_components].mean().reset_index()
# Long format (one row per asset type / component pair) so seaborn can split bars by hue
esg_data_melted = esg_data.melt(
    id_vars='asset_type',
    value_vars=esg_components,
    var_name='ESG Component',
    value_name='Score',
)
sns.barplot(data=esg_data_melted, x='asset_type', y='Score', hue='ESG Component', ax=ax)
ax.set_title('Average ESG Component Scores by Asset Type')
ax.set_xlabel('Asset Type')
ax.set_ylabel('Score')
ax.legend(title='ESG Component')
plt.show()

## 4. Risk vs. Return Analysis

In [None]:
# Scatter: volatility against 1-year ROI, coloured by asset type;
# marker size is driven by the 'market_cap_b' column
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=portfolio_df,
    x='volatility',
    y='roi_1y',
    hue='asset_type',
    size='market_cap_b',
    sizes=(50, 400),
    alpha=0.6,
    ax=ax,
)
ax.set_title('Risk vs. Return by Asset Type')
ax.set_xlabel('Volatility (Risk)')
ax.set_ylabel('1-Year ROI (%)')
ax.grid(True)
ax.legend(title='Asset Type')
fig.tight_layout()
plt.show()

# Scatter: overall ESG score against 1-year ROI, same colour/size encoding
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=portfolio_df,
    x='esg_score',
    y='roi_1y',
    hue='asset_type',
    size='market_cap_b',
    sizes=(50, 400),
    alpha=0.7,
    ax=ax,
)
ax.set_title('ESG Score vs. 1-Year ROI')
ax.set_xlabel('ESG Score')
ax.set_ylabel('1-Year ROI (%)')
ax.grid(True)
plt.show()

## 5. Portfolio Recommendations

In [None]:
def get_portfolio_recommendations(df, risk_tolerance=5, sustainability_focus=5):
    """Rank assets by a preference-weighted score and attach a rating label.

    The score is a weighted sum of three terms: the ESG score, a stability term
    ``100 - volatility * 100`` (volatility is treated as a fraction), and the
    1-year ROI.

    Args:
        df: DataFrame of assets. It is copied, never modified. Any column from
            the required set that is missing is added with a placeholder value
            and a warning is printed.
        risk_tolerance: Preference value; the weighting assumes a 1-10 scale.
            Higher tolerance lowers the weight of the stability term.
        sustainability_focus: Preference value; the weighting assumes a 1-10
            scale. Higher focus raises the weight of the ESG score.

    Returns:
        A copy of ``df`` sorted by ``recommendation_score`` (descending) with
        two extra columns: ``recommendation_score`` and
        ``recommendation_strength`` (one of 'Strong Buy', 'Buy', 'Hold',
        'Underweight', 'Sell').
    """
    # Make a copy to avoid modifying the original dataframe
    portfolio_df = df.copy()

    # Ensure all required columns exist
    required_columns = [
        'ticker', 'name', 'asset_type', 'sector',
        'current_price', 'roi_1y', 'volatility',
        'esg_score', 'beta', 'sharpe_ratio'
    ]
    text_columns = {'ticker', 'name', 'asset_type', 'sector'}

    # Check for missing columns and add them with default values if needed
    for col in required_columns:
        if col not in portfolio_df.columns:
            print(f"Warning: Column '{col}' not found in dataset. Adding with default values.")
            portfolio_df[col] = f"Unknown {col}" if col in text_columns else 50.0

    # Normalize preferences to 0-1 scale
    risk_weight = (11 - risk_tolerance) / 10  # Higher risk tolerance = lower risk weight
    esg_weight = sustainability_focus / 10
    # The remainder goes to returns. Without the floor this becomes negative for
    # cautious, sustainability-focused users (e.g. tolerance 3 / focus 8 gives
    # -0.2), which would rank an asset lower *because* its ROI is higher.
    return_weight = max(0.0, 1 - risk_weight - esg_weight / 2)

    # Calculate weighted scores
    scores = (
        portfolio_df['esg_score'] * esg_weight +
        (100 - portfolio_df['volatility'] * 100) * risk_weight +
        portfolio_df['roi_1y'] * return_weight
    )
    portfolio_df['recommendation_score'] = scores

    # Rating bands are quintiles of the observed score range, not absolute levels
    min_score = scores.min()
    score_range = scores.max() - min_score

    if score_range > 0:
        band_floors = [
            (0.8, 'Strong Buy'),
            (0.6, 'Buy'),
            (0.4, 'Hold'),
            (0.2, 'Underweight'),
        ]
        # np.select picks the first matching band; rows matching none (including
        # rows whose score is NaN) fall through to 'Sell'.
        portfolio_df['recommendation_strength'] = np.select(
            [scores > (min_score + share * score_range) for share, _ in band_floors],
            [label for _, label in band_floors],
            default='Sell',
        )
    else:
        # A single asset, identical scores, or no usable scores: nothing
        # separates the rows, so a neutral rating is used instead of 'Sell'.
        portfolio_df['recommendation_strength'] = 'Hold'

    # Sort by recommendation score
    return portfolio_df.sort_values('recommendation_score', ascending=False)

# Columns shown in each recommendation preview
summary_columns = [
    'name', 'ticker', 'asset_type', 'sector', 'esg_score', 'roi_1y',
    'volatility', 'recommendation_score', 'recommendation_strength',
]

# Profile 1: low risk tolerance, strong sustainability focus
print("Conservative, Sustainability-Focused Portfolio:")
conservative_recs = get_portfolio_recommendations(
    portfolio_df, risk_tolerance=3, sustainability_focus=8
)
display(conservative_recs.head(5)[summary_columns])

# Profile 2: high risk tolerance, weak sustainability focus
print("\nAggressive, Return-Focused Portfolio:")
aggressive_recs = get_portfolio_recommendations(
    portfolio_df, risk_tolerance=8, sustainability_focus=3
)
display(aggressive_recs.head(5)[summary_columns])

## 6. Risk Assessment Model

In [None]:
def assess_portfolio_risk(portfolio_df, user_preferences=None):
    """Score a portfolio's risk on a 0-100 scale with a rule-based heuristic.

    No trained model is involved: the score is a weighted sum of the
    allocation-weighted volatility, ESG risk (``100 - esg_score``) and beta,
    with the weights adjusted by the user's preferences.

    Args:
        portfolio_df: DataFrame of holdings. It is copied, never modified.
            Uses the columns 'volatility' (treated as a fraction; it is
            multiplied by 100), 'beta', 'esg_score' and 'allocation'. A missing
            column is filled with a neutral default and a warning is printed;
            a missing 'allocation' means equal weights.
        user_preferences: Optional dict with 'risk_tolerance' and
            'sustainability_focus'. Values are clamped to the 1-10 scale;
            missing keys default to 5.

    Returns:
        dict with 'risk_category' ('Low', 'Moderate', 'High', 'Very High'),
        'risk_score' (0-100), 'risk_probabilities' (simulated category
        weights, normalised to sum to 1), 'risk_factors' and
        'portfolio_metrics'.

    Raises:
        ValueError: if the portfolio has no rows.
    """
    if len(portfolio_df) == 0:
        # Weighted averages are undefined without holdings; fail with a clear
        # message instead of a ZeroDivisionError from deep inside the maths.
        raise ValueError("Cannot assess risk for an empty portfolio.")

    # Make a copy to avoid modifying the original dataframe
    df = portfolio_df.copy()

    # Set default user preferences if not provided
    if user_preferences is None:
        user_preferences = {
            'risk_tolerance': 5,
            'sustainability_focus': 5
        }

    # Neutral fallback per required column, expressed in that column's own
    # units: each maps to the midpoint (50) of the 0-100 risk factors below.
    # A flat 50.0 for volatility would be read as 5000% and pin the score at 100.
    neutral_defaults = {
        'volatility': 0.5,
        'beta': 1.0,
        'esg_score': 50.0,
        'allocation': 1.0 / len(df),  # Equal allocation if missing
    }

    # Check for missing columns and add them with default values if needed
    for col, default in neutral_defaults.items():
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in dataset. Adding with default values.")
            df[col] = default

    # Calculate portfolio-level metrics (weighted by allocation)
    portfolio_volatility = np.average(df['volatility'], weights=df['allocation'])
    portfolio_beta = np.average(df['beta'], weights=df['allocation'])
    portfolio_esg_risk = 100 - np.average(df['esg_score'], weights=df['allocation'])

    # Clamp preferences to the 1-10 scale; a tolerance of 0 would otherwise
    # divide by zero when the market-risk weight is derived below.
    risk_tolerance = min(max(user_preferences.get('risk_tolerance', 5), 1), 10)
    sustainability_focus = min(max(user_preferences.get('sustainability_focus', 5), 1), 10)

    # Higher risk tolerance = lower perceived market risk
    risk_tolerance_factor = risk_tolerance / 5  # 0.2-2.0 range
    # Higher sustainability focus = higher sensitivity to ESG risk
    sustainability_factor = sustainability_focus / 5  # 0.2-2.0 range

    # Adjust risk components
    market_risk_weight = 1.0 / risk_tolerance_factor
    esg_risk_weight = sustainability_factor

    # Calculate adjusted risk score
    risk_score = (
        (portfolio_volatility * 100) * 0.4 * market_risk_weight
        + portfolio_esg_risk * 0.4 * esg_risk_weight
        + portfolio_beta * 20 * 0.2
    )

    # Ensure risk score is within 0-100 range
    risk_score = min(max(risk_score, 0), 100)

    # Determine risk category
    if risk_score < 25:
        risk_category = 'Low'
    elif risk_score < 50:
        risk_category = 'Moderate'
    elif risk_score < 75:
        risk_category = 'High'
    else:
        risk_category = 'Very High'

    # Simulated category weights: each peaks where the score is most typical
    # for that category and fades linearly over 25 points.
    raw_weights = {
        'Low': max(0, min(1, 1 - (risk_score / 25))),
        'Moderate': max(0, min(1, 1 - abs(risk_score - 37.5) / 25)),
        'High': max(0, min(1, 1 - abs(risk_score - 62.5) / 25)),
        'Very High': max(0, min(1, (risk_score - 75) / 25))
    }
    # Normalise so the values can be read as probabilities (they sum to 1)
    total_weight = sum(raw_weights.values())
    if total_weight > 0:
        risk_probabilities = {
            category: weight / total_weight for category, weight in raw_weights.items()
        }
    else:
        risk_probabilities = raw_weights

    # Prepare risk factors
    risk_factors = {
        'Market Risk': portfolio_volatility * 100,
        'Systematic Risk': portfolio_beta * 50,
        'ESG Risk': portfolio_esg_risk
    }

    return {
        'risk_category': risk_category,
        'risk_score': risk_score,
        'risk_probabilities': risk_probabilities,
        'risk_factors': risk_factors,
        'portfolio_metrics': {
            'volatility': portfolio_volatility,
            'beta': portfolio_beta,
            'esg_risk_score': portfolio_esg_risk
        }
    }

def _print_risk_summary(assessment):
    """Print the category, score and per-factor breakdown of one risk assessment."""
    print(f"Risk Category: {assessment['risk_category']}")
    print(f"Risk Score: {assessment['risk_score']:.2f}/100")
    print("\nRisk Factors:")
    for factor_name, factor_value in assessment['risk_factors'].items():
        print(f"  {factor_name}: {factor_value:.2f}")


# Conservative example: stocks whose volatility is below the median stock volatility
stocks_df = portfolio_df[portfolio_df['asset_type'] == 'Stock']
median_stock_volatility = stocks_df['volatility'].median()
conservative_portfolio = stocks_df[stocks_df['volatility'] < median_stock_volatility]

# Aggressive example: every crypto asset
aggressive_portfolio = portfolio_df[portfolio_df['asset_type'] == 'Crypto']

# Preference profiles paired with the two example portfolios
conservative_user = dict(risk_tolerance=3, sustainability_focus=7)
aggressive_user = dict(risk_tolerance=8, sustainability_focus=4)

print("Conservative Portfolio Risk Assessment:")
conservative_risk = assess_portfolio_risk(conservative_portfolio, conservative_user)
_print_risk_summary(conservative_risk)

print("\nAggressive Portfolio Risk Assessment:")
aggressive_risk = assess_portfolio_risk(aggressive_portfolio, aggressive_user)
_print_risk_summary(aggressive_risk)

## 7. Sentiment Analysis

In [None]:
def analyze_market_sentiment(ticker, news_df, user_preferences=None):
    """Summarise news sentiment for one ticker as a score from -100 to 100.

    The base score is the mean of the per-article labels (positive = 1,
    neutral = 0, negative = -1) times 100. Negative scores are scaled by the
    larger of the sustainability and risk-sensitivity factors; positive scores
    are slightly dampened for sustainability factors above 1. The result is
    clamped to [-100, 100].

    Args:
        ticker: Ticker symbol to filter the news by.
        news_df: DataFrame of news items. It is copied, never modified. Missing
            'ticker', 'headline', 'sentiment', 'publication_date' or 'source'
            columns are added with placeholder values and a warning is printed.
        user_preferences: Optional dict with 'risk_tolerance' and
            'sustainability_focus'. Values are clamped to the 1-10 scale;
            missing keys default to 5.

    Returns:
        dict with 'ticker', 'sentiment_score', 'overall_sentiment',
        'sentiment_counts' (recognised labels, lower-cased) and 'news' (list of
        row dicts). When no news matches the ticker, the score is 0, the
        sentiment is 'Neutral', the collections are empty and an 'error' key
        explains why.
    """
    # Set default user preferences if not provided
    if user_preferences is None:
        user_preferences = {
            'risk_tolerance': 5,
            'sustainability_focus': 5
        }

    # Make a copy to avoid modifying the original dataframe
    df = news_df.copy()

    # Ensure all required columns exist
    required_columns = ['ticker', 'headline', 'sentiment', 'publication_date', 'source']

    # Check for missing columns and add them with default values if needed
    for col in required_columns:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in news dataset. Adding with default values.")
            if col == 'ticker':
                df[col] = ticker
            elif col == 'sentiment':
                df[col] = 'neutral'
            elif col == 'publication_date':
                df[col] = pd.Timestamp.now().strftime('%Y-%m-%d')
            elif col == 'source':
                df[col] = 'Unknown Source'
            else:
                df[col] = f"No {col} available"

    # Filter news for the given ticker
    ticker_news = df[df['ticker'] == ticker]

    if ticker_news.empty:
        # Same keys as the normal result so callers can read them unconditionally
        return {
            'ticker': ticker,
            'sentiment_score': 0,
            'overall_sentiment': 'Neutral',
            'sentiment_counts': {},
            'news': [],
            'error': 'No news found for this ticker'
        }

    # Calculate sentiment score (-100 to 100)
    sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}
    # Normalise case/whitespace so e.g. 'Positive' is scored instead of raising
    labels = ticker_news['sentiment'].astype(str).str.strip().str.lower()
    polarity = labels.map(sentiment_map)
    recognised = polarity.notna()
    if not recognised.all():
        print(
            f"Warning: Ignoring {int((~recognised).sum())} news item(s) "
            f"with unrecognised sentiment labels for {ticker}."
        )
    if recognised.any():
        base_sentiment_score = float(polarity[recognised].mean() * 100)
    else:
        base_sentiment_score = 0.0

    # Clamp preferences to the 1-10 scale the factors below assume
    risk_tolerance = min(max(user_preferences.get('risk_tolerance', 5), 1), 10)
    sustainability_focus = min(max(user_preferences.get('sustainability_focus', 5), 1), 10)

    # Higher sustainability focus = more sensitive to negative news
    sustainability_factor = sustainability_focus / 5  # 0.2-2.0 range
    # Lower risk tolerance = more sensitive to negative news
    risk_sensitivity = (11 - risk_tolerance) / 5  # 0.2-2.0 range

    # Apply adjustments
    if base_sentiment_score < 0:
        # Negative sentiment is scaled by whichever sensitivity is larger
        sentiment_score = base_sentiment_score * max(sustainability_factor, risk_sensitivity)
    elif sustainability_factor > 1:
        # Positive sentiment is slightly dampened for very sustainability-focused users
        sentiment_score = base_sentiment_score * (1 - (sustainability_factor - 1) * 0.1)
    else:
        sentiment_score = base_sentiment_score

    # The scaling above can double a negative score; keep it in the -100..100 range
    sentiment_score = max(-100.0, min(100.0, sentiment_score))

    # Count sentiments
    sentiment_counts = labels[recognised].value_counts().to_dict()

    # Determine overall sentiment
    if sentiment_score > 30:
        overall_sentiment = 'Bullish'
    elif sentiment_score > 10:
        overall_sentiment = 'Somewhat Bullish'
    elif sentiment_score > -10:
        overall_sentiment = 'Neutral'
    elif sentiment_score > -30:
        overall_sentiment = 'Somewhat Bearish'
    else:
        overall_sentiment = 'Bearish'

    return {
        'ticker': ticker,
        'sentiment_score': sentiment_score,
        'overall_sentiment': overall_sentiment,
        'sentiment_counts': sentiment_counts,
        'news': ticker_news.to_dict('records')
    }

# Select a few tickers to analyze.
# The draw is seeded so Restart Kernel -> Run All analyses the same tickers each
# time, and the sample size is capped so a portfolio with fewer than three rows
# does not make .sample() raise.
SAMPLE_SEED = 42
N_SAMPLE_TICKERS = 3
tickers_to_analyze = (
    portfolio_df['ticker']
    .sample(n=min(N_SAMPLE_TICKERS, len(portfolio_df)), random_state=SAMPLE_SEED)
    .tolist()
)

for ticker in tickers_to_analyze:
    company_name = portfolio_df[portfolio_df['ticker'] == ticker]['name'].iloc[0]
    print(f"\nAnalyzing sentiment for {company_name} ({ticker}):")

    sentiment = analyze_market_sentiment(ticker, news_df)

    # A result carrying 'error' means no news matched; report it and move on
    if 'error' in sentiment:
        print(f"  {sentiment['error']}")
        continue

    print(f"  Overall Sentiment: {sentiment['overall_sentiment']}")
    print(f"  Sentiment Score: {sentiment['sentiment_score']:.2f}")

    print("  Sentiment Distribution:")
    for sentiment_type, count in sentiment['sentiment_counts'].items():
        print(f"    {sentiment_type.capitalize()}: {count}")

    # NOTE(review): these are the first three matching rows in file order, not
    # necessarily the most recent — sort by 'publication_date' if recency matters.
    print("  Recent News:")
    for i, news in enumerate(sentiment['news'][:3]):
        print(f"    {i+1}. {news['headline']} ({news['source']}, {news['publication_date']})")

## 8. Conclusion

In this notebook, we've demonstrated how to use rule-based scoring models for sustainable investment portfolio analysis:

1. **Portfolio Analysis**: We explored the dataset to understand the distribution of assets across sectors and their ESG profiles.

2. **ESG Analysis**: We analyzed ESG scores and their components to identify sustainable investment opportunities.

3. **Risk vs. Return Analysis**: We examined the relationship between risk, return, and ESG scores.

4. **Portfolio Recommendations**: We implemented a recommendation model that considers user preferences for risk tolerance and sustainability focus.

5. **Risk Assessment**: We developed a risk assessment model that evaluates portfolio risk based on various factors.

6. **Sentiment Analysis**: We analyzed market sentiment for specific assets based on news data.

These models can be integrated into the investment portfolio application to provide personalized recommendations and insights to users.