In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer

# Load datasets
static_df = pd.read_csv("/Users/isha/Datathon/venv/static_client_data.csv")
time_series_df = pd.read_csv("/Users/isha/Datathon/adjusted_timeseries_SCN_02.csv")


# Convert preferred_asset_classes to lists (if stored as strings)
static_df["preferred_asset_classes"] = static_df["preferred_asset_classes"].apply(lambda x: eval(x) if isinstance(x, str) else x)

# One-Hot Encode categorical columns
one_hot_cols = ["gender", "employment_status", "investment_goals"]
one_hot_encoder = OneHotEncoder(drop="first", sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(static_df[one_hot_cols])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out(one_hot_cols))

# Label Encode risk_appetite
label_encoder = LabelEncoder()
static_df["risk_appetite_encoded"] = label_encoder.fit_transform(static_df["risk_appetite"])

# Multi-Hot Encode preferred_asset_classes
mlb = MultiLabelBinarizer()
multi_hot_encoded = mlb.fit_transform(static_df["preferred_asset_classes"])
multi_hot_df = pd.DataFrame(multi_hot_encoded, columns=[f"asset_{cls}" for cls in mlb.classes_])

# Drop original categorical columns and concatenate encoded features
static_df.drop(columns=one_hot_cols + ["risk_appetite", "preferred_asset_classes"], inplace=True)
static_df = pd.concat([static_df, one_hot_df, multi_hot_df], axis=1)

# Merge with time-series data (preserving all time-series rows)
merged_df = time_series_df.merge(static_df, on="client_id", how="left")




# Save processed dataset
merged_df.to_csv("/Users/isha/Datathon/static_time_series_ps3.csv", index=False)

print("✅ Encoding & Merging complete! Merged dataset shape:", merged_df.shape)


✅ Encoding & Merging complete! Merged dataset shape: (36000, 31)


In [39]:
time_series_df.shape


(36000, 9)

Check stationarity for each client and feature

In [40]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

df = pd.read_csv("/Users/isha/Datathon/static_time_series_ps3.csv")

time_features = [
    'portfolio_value', 
    'equity_allocation_pct', 
    'fixed_income_allocation_pct', 
    'monthly_contribution', 
    'market_volatility_index', 
    'macroeconomic_score', 
    'sentiment_index'
]

# Initialize an empty list to store the results
all_results = []

# Check stationarity for each client and feature
for client_id in df['client_id'].unique():
    client_data = df[df['client_id'] == client_id]
    
    for feature in time_features:
        # Extract the time series for the current feature
        series = client_data[feature].values
        
        # Run ADF test
        result = adfuller(series, autolag='AIC')
        
        # Store results in the list
        all_results.append({
            'client_id': client_id,
            'feature': feature,
            'adf_statistic': result[0],
            'p_value': result[1],
            'is_stationary': result[1] < 0.05  # p-value < 0.05 → stationarity
        })

# Create the DataFrame after the loop
stationarity_results = pd.DataFrame(all_results)

# Save or display results
print(stationarity_results)


                                 client_id                      feature  \
0     2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2              portfolio_value   
1     2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2        equity_allocation_pct   
2     2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2  fixed_income_allocation_pct   
3     2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2         monthly_contribution   
4     2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2      market_volatility_index   
...                                    ...                          ...   
6995  41b1bc9d-b7ec-4092-803b-6ae2a1023605  fixed_income_allocation_pct   
6996  41b1bc9d-b7ec-4092-803b-6ae2a1023605         monthly_contribution   
6997  41b1bc9d-b7ec-4092-803b-6ae2a1023605      market_volatility_index   
6998  41b1bc9d-b7ec-4092-803b-6ae2a1023605          macroeconomic_score   
6999  41b1bc9d-b7ec-4092-803b-6ae2a1023605              sentiment_index   

      adf_statistic       p_value  is_stationary  
0         -1.729305  4.160802e-01          False

Apply differencing if non-stationary

In [34]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

# Load data
df = pd.read_csv("/Users/isha/Datathon/static_time_series_ps3.csv")

# List of time-series features to check
time_features = [
    'portfolio_value', 
    'equity_allocation_pct', 
    'fixed_income_allocation_pct', 
    'monthly_contribution', 
    'market_volatility_index', 
    'macroeconomic_score', 
    'sentiment_index'
]

# List to store transformed data
differenced_data = []

# Loop over each client
for client_id in df['client_id'].unique():
    client_data = df[df['client_id'] == client_id].copy()

    for feature in time_features:
        series = client_data[feature].values

        # ADF test
        try:
            p_value = adfuller(series, autolag='AIC')[1]
        except:
            p_value = 1  # Treat as non-stationary if test fails

        # Apply differencing if non-stationary
        if p_value >= 0.05:
            diff_series = np.diff(series)
            diff_series = np.insert(diff_series, 0, 0)  # Pad to maintain length
        else:
            diff_series = series

        # Add new stationary feature column
        client_data[feature + '_stationary'] = diff_series

    # Store updated client data
    differenced_data.append(client_data)

# Combine everything back
df_stationary = pd.concat(differenced_data, ignore_index=True)

# Save to file
df_stationary.to_csv("/Users/isha/Datathon/stationary_data1.csv", index=False)
print("✅ Stationarity transformation applied to all features and saved.")


✅ Stationarity transformation applied to all features and saved.


In [16]:
df_stationary

Unnamed: 0,client_id,month,portfolio_value,equity_allocation_pct,fixed_income_allocation_pct,monthly_contribution,market_volatility_index,macroeconomic_score,sentiment_index,age,...,asset_Mutual Funds,asset_Real Estate,asset_Stocks,portfolio_value_stationary,equity_allocation_pct_stationary,fixed_income_allocation_pct_stationary,monthly_contribution_stationary,market_volatility_index_stationary,macroeconomic_score_stationary,sentiment_index_stationary
0,2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2,01-03-2022,53094.36,63.89460,32.72620,1188.90,17.16,3.83,7.25,53,...,0,1,0,0.00,63.89460,32.72620,1188.90,17.16,3.83,7.25
1,2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2,01-04-2022,54580.94,57.82784,39.74048,1738.97,12.01,2.56,3.82,53,...,0,1,0,1486.58,57.82784,39.74048,1738.97,12.01,2.56,3.82
2,2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2,01-05-2022,56255.62,37.69136,63.02192,1175.53,24.98,4.62,6.21,53,...,0,1,0,1674.68,37.69136,63.02192,1175.53,24.98,4.62,6.21
3,2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2,01-06-2022,58008.47,44.51416,55.13352,1968.66,22.68,1.99,6.49,53,...,0,1,0,1752.85,44.51416,55.13352,1968.66,22.68,1.99,6.49
4,2eef3ba0-a7c7-42b1-a3ac-3c33aedcb5f2,01-07-2022,59831.30,36.42822,64.48234,1529.49,11.39,2.23,7.39,53,...,0,1,0,1822.83,36.42822,64.48234,1529.49,11.39,2.23,7.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35995,41b1bc9d-b7ec-4092-803b-6ae2a1023605,01-10-2024,56962.78,19.08540,84.53380,1862.76,20.85,1.45,5.99,64,...,0,1,0,1884.81,19.08540,84.53380,1862.76,20.85,1.45,5.99
35996,41b1bc9d-b7ec-4092-803b-6ae2a1023605,01-11-2024,59890.52,43.59216,56.19952,1926.25,22.79,3.21,5.02,64,...,0,1,0,2927.74,43.59216,56.19952,1926.25,22.79,3.21,5.02
35997,41b1bc9d-b7ec-4092-803b-6ae2a1023605,01-12-2024,60708.53,28.86782,73.22354,1958.70,15.64,3.41,3.82,64,...,0,1,0,818.01,28.86782,73.22354,1958.70,15.64,3.41,3.82
35998,41b1bc9d-b7ec-4092-803b-6ae2a1023605,01-01-2025,61984.00,40.88148,59.33356,1284.70,26.19,2.17,5.82,64,...,0,1,0,1275.47,40.88148,59.33356,1284.70,26.19,2.17,5.82


Feature engineering: lag and rolling features added to stationary columns.

In [17]:
import pandas as pd
import numpy as np

# Load the stationary dataset
df = pd.read_csv("/Users/isha/Datathon/stationary_data1.csv")

# ✅ Only apply to stationary-transformed columns
features_to_engineer = [
    'portfolio_value_stationary',
    'monthly_contribution_stationary',
    'market_volatility_index_stationary',
    'sentiment_index_stationary',
    'macroeconomic_score_stationary'
]

# ---------- Function: Add Lag Features ----------
def add_lag_features(df, features, lags=[1, 3, 6]):
    df_lagged = df.copy()
    for feature in features:
        for lag in lags:
            df_lagged[f"{feature}_lag_{lag}"] = df_lagged.groupby("client_id")[feature].shift(lag)
    return df_lagged

# ---------- Function: Add Rolling Statistics ----------
def add_rolling_features(df, features, windows=[3, 6]):
    df_rolled = df.copy()
    for feature in features:
        for window in windows:
            df_rolled[f"{feature}_roll_mean_{window}"] = (
                df_rolled.groupby("client_id")[feature].transform(lambda x: x.rolling(window).mean())
            )
            df_rolled[f"{feature}_roll_std_{window}"] = (
                df_rolled.groupby("client_id")[feature].transform(lambda x: x.rolling(window).std())
            )
    return df_rolled

# ---------- Apply Lag + Rolling Feature Engineering ----------
df = add_lag_features(df, features=features_to_engineer, lags=[1, 3, 6])
df = add_rolling_features(df, features=features_to_engineer, windows=[3, 6])

# ---------- Drop rows with NaNs introduced by shifting/rolling ----------
df = df.dropna().reset_index(drop=True)

# ---------- Save the engineered dataset ----------
df.to_csv("/Users/isha/Datathon/engineered_stationary_data1.csv", index=False)
print("✅ Feature engineering complete: lag and rolling features added to stationary columns.")


✅ Feature engineering complete: lag and rolling features added to stationary columns.


Engineer new features: portfolio_growth_rate, contribution_ratio, eq_fi_ratio

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError

# --- Config ---
test_file = "/Users/isha/Datathon/stationary_data_with_smart_features1.csv"
model_path = "/Users/isha/Datathon/lstm_portfolio_forecast_model.keras"  # Update if different
output_file = "/Users/isha/Datathon/predictions.csv"
sequence_length = 24



# --- Features used in training ---
selected_features = ['employment_status_Unemployed', 'investment_goals_Wealth Accumulation', 'asset_Real Estate',
                     'risk_appetite_encoded', 'market_volatility_index_stationary_roll_mean_6',
                     'investment_goals_Home Purchase', 'eq_fi_ratio', 'savings_rate',
                     'sentiment_index_stationary_roll_mean_6', 'age', 'employment_status_Self-Employed',
                     'annual_income', 'asset_Bonds', 'macroeconomic_score_stationary_roll_mean_3',
                     'debt_to_income_ratio', 'financial_knowledge_score', 'investment_goals_Retirement',
                     'asset_Mutual Funds', 'asset_Stocks', 'net_worth', 'monthly_contribution_stationary_roll_mean_6',
                     'gender_Male', 'gender_Other', 'monthly_contribution_stationary_roll_mean_3',
                     'dependents', 'investment_horizon_years', 'employment_status_Salaried',
                     'sentiment_index_stationary_roll_mean_3', 'market_volatility_index_stationary_roll_mean_3',
                     'asset_ETFs', 'macroeconomic_score_stationary_roll_mean_6']

# --- Load test data ---
df = pd.read_csv(test_file)

# Drop rows with missing values in selected features
df = df.dropna(subset=selected_features)

# Sort by client_id and month
df = df.sort_values(['client_id', 'month'])

# Normalize features
feature_scaler = MinMaxScaler()
df[selected_features] = feature_scaler.fit_transform(df[selected_features])

# Dummy scaler for inverse transform later (we just need shape and min/max)
target_scaler = MinMaxScaler()
target_scaler.fit(np.zeros((1, 3)))  # Just for inverse_transform to work

# --- Sequence Creation ---
def create_sequences(data, features, seq_len):
    X, client_ids = [], []
    grouped = data.groupby('client_id')
    for client_id, group in grouped:
        group = group.reset_index(drop=True)
        if len(group) >= seq_len:
            for i in range(len(group) - seq_len + 1):
                seq_x = group.loc[i:i+seq_len-1, features].values.astype(np.float32)
                X.append(seq_x)
                client_ids.append(client_id)
    return np.array(X), client_ids

X_test, client_ids = create_sequences(df, selected_features, sequence_length)

# --- Load model ---
model = load_model(model_path, compile=False)


# --- Predict ---
y_pred = model.predict(X_test)

# --- Save predictions ---
pred_df = pd.DataFrame(y_pred, columns=['forecasted_value_year_1', 'forecasted_value_year_2', 'forecasted_value_year_3'])
pred_df['client_id'] = client_ids
pred_df.to_csv(output_file, index=False)

print(f"✅ Predictions saved to: {output_file}")


[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
✅ Predictions saved to: /Users/isha/Datathon/predictions.csv


In [35]:
from joblib import load

target_scaler = load('/Users/isha/Datathon/target_scaler.save')  # adjust path if needed


# --- Inverse transform ---
y_pred_unscaled = target_scaler.inverse_transform(y_pred)

# --- Save unscaled predictions ---
unscaled_pred_df = pd.DataFrame(y_pred_unscaled, columns=['forecasted_value_year_1', 'forecasted_value_year_2', 'forecasted_value_year_3'])
unscaled_pred_df['client_id'] = client_ids
unscaled_pred_df.to_csv("/Users/isha/Datathon/predictions.csv", index=False)

print("📈 Unscaled predictions saved to: /Users/isha/Datathon/predictions.csv")



📈 Unscaled predictions saved to: /Users/isha/Datathon/predictions.csv


In [31]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd
import numpy as np

# Assuming 'df' is your original or stationary DataFrame
df = pd.read_csv("/Users/isha/Datathon/engineered_stationary_data1.csv")

# === Feature 1: Portfolio Growth Rate ===
df['portfolio_growth_rate'] = df.groupby('client_id')['portfolio_value_stationary'].pct_change()

# === Feature 2: Contribution Ratio ===
df['contribution_ratio'] = df['monthly_contribution_stationary'] / (df['portfolio_value_stationary'] + 1e-6)

# === Feature 3: Equity to Fixed Income Ratio ===
df['eq_fi_ratio'] = df['equity_allocation_pct_stationary'] / (df['fixed_income_allocation_pct_stationary'] + 1e-6)

# Optional: Fill NaNs from pct_change or divide-by-zero issues
df.fillna(0, inplace=True)

# Save updated DataFrame
df.to_csv("/Users/isha/Datathon/stationary_data_with_smart_features1.csv", index=False)
print("✅ Added 3 smart financial features and saved.")


✅ Added 3 smart financial features and saved.


In [28]:
print("X_test shape:", X_test.shape)


X_test shape: (7000, 24, 31)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load data
df = pd.read_csv("/Users/isha/Datathon/engineered_stationary_data.csv")

# Define features and targets
selected_features = [
    'employment_status_Unemployed', 'investment_goals_Wealth Accumulation', 'asset_Real Estate',
    'risk_appetite_encoded', 'market_volatility_index_stationary_roll_mean_6', 'investment_goals_Home Purchase',
    'savings_rate', 'sentiment_index_stationary_roll_mean_6', 'age', 'employment_status_Self-Employed',
    'annual_income', 'asset_Bonds', 'macroeconomic_score_stationary_roll_mean_3', 'debt_to_income_ratio',
    'financial_knowledge_score', 'investment_goals_Retirement', 'fixed_income_allocation_pct_stationary',
    'asset_Mutual Funds', 'asset_Stocks', 'net_worth', 'monthly_contribution_stationary_roll_mean_6', 'gender_Male',
    'equity_allocation_pct_stationary', 'gender_Other', 'monthly_contribution_stationary_roll_mean_3', 'dependents',
    'investment_horizon_years', 'employment_status_Salaried', 'sentiment_index_stationary_roll_mean_3',
    'market_volatility_index_stationary_roll_mean_3', 'asset_ETFs', 'macroeconomic_score_stationary_roll_mean_6'
]
target_cols = ['forecasted_value_year_1', 'forecasted_value_year_2', 'forecasted_value_year_3']

# Drop rows with missing values
df = df.dropna(subset=selected_features + target_cols)

# Normalize features
feature_scaler = MinMaxScaler()
df[selected_features] = feature_scaler.fit_transform(df[selected_features])

# Normalize target
target_scaler = MinMaxScaler()
df[target_cols] = target_scaler.fit_transform(df[target_cols])

# Sort by client and time step (assuming 'month' exists)
df = df.sort_values(['client_id', 'month'])

# Create sequences per client
sequence_length = 24
def create_sequences(data, features, targets, seq_len):
    X, y = [], []
    grouped = data.groupby('client_id')
    for _, group in grouped:
        group = group.reset_index(drop=True)
        if len(group) >= seq_len + 1:
            for i in range(len(group) - seq_len):
                seq_x = group.loc[i:i+seq_len-1, features].values.astype(np.float32)
                seq_y = group.loc[i+seq_len, targets].values.astype(np.float32)
                X.append(seq_x)
                y.append(seq_y)
    return np.array(X), np.array(y)

X, y = create_sequences(df, selected_features, target_cols, sequence_length)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = Sequential([
    LSTM(64, input_shape=(sequence_length, len(selected_features)), return_sequences=False),
    Dense(32, activation='relu'),
    Dense(3)
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
loss, mae = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test MAE:", mae)

# Predict and inverse transform
y_pred = model.predict(X_test)
y_pred_rescaled = target_scaler.inverse_transform(y_pred)
y_test_rescaled = target_scaler.inverse_transform(y_test)

print("Sample predicted vs actual values:")
for i in range(5):
    print(f"Predicted: {y_pred_rescaled[i]}, Actual: {y_test_rescaled[i]}")


In [20]:
print("Test Loss:", loss)
print("Test MAE:", mae)

Test Loss: 0.01979292370378971
Test MAE: 0.1096835657954216


In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load data
df = pd.read_csv("/Users/isha/Datathon/engineered_stationary_data.csv")

# Define features and targets
all_features = df.columns.tolist()
non_stationary_cols = [
    'monthly_contribution', 'equity_allocation_pct', 'fixed_income_allocation_pct',
    'market_volatility_index', 'sentiment_index', 'macroeconomic_score'
]
target_cols = ['forecasted_value_year_1', 'forecasted_value_year_2', 'forecasted_value_year_3']

# Remove target and non-stationary columns from features
selected_features = [col for col in all_features if col not in non_stationary_cols + target_cols + ['client_id', 'month']]

# Drop rows with missing values
df = df.dropna(subset=selected_features + target_cols)

# Normalize features
feature_scaler = MinMaxScaler()
df[selected_features] = feature_scaler.fit_transform(df[selected_features])

# Normalize target
target_scaler = MinMaxScaler()
df[target_cols] = target_scaler.fit_transform(df[target_cols])

# Sort by client and time step (assuming 'month' exists)
df = df.sort_values(['client_id', 'month'])

# Create sequences per client
sequence_length = 24
def create_sequences(data, features, targets, seq_len):
    X, y = [], []
    grouped = data.groupby('client_id')
    for _, group in grouped:
        group = group.reset_index(drop=True)
        if len(group) >= seq_len + 1:
            for i in range(len(group) - seq_len):
                seq_x = group.loc[i:i+seq_len-1, features].values.astype(np.float32)
                seq_y = group.loc[i+seq_len, targets].values.astype(np.float32)
                X.append(seq_x)
                y.append(seq_y)
    return np.array(X), np.array(y)

X, y = create_sequences(df, selected_features, target_cols, sequence_length)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = Sequential([
    LSTM(64, input_shape=(sequence_length, len(selected_features)), return_sequences=False),
    Dense(32, activation='relu'),
    Dense(3)
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
loss, mae = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test MAE:", mae)

# Predict and inverse transform
y_pred = model.predict(X_test)
y_pred_rescaled = target_scaler.inverse_transform(y_pred)
y_test_rescaled = target_scaler.inverse_transform(y_test)

print("Sample predicted vs actual values:")
for i in range(5):
    print(f"Predicted: {y_pred_rescaled[i]}, Actual: {y_test_rescaled[i]}")


Epoch 1/50


  super().__init__(**kwargs)


[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 18ms/step - loss: 0.0496 - mae: 0.1816 - val_loss: 0.0453 - val_mae: 0.1732
Epoch 2/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - loss: 0.0452 - mae: 0.1752 - val_loss: 0.0448 - val_mae: 0.1742
Epoch 3/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - loss: 0.0445 - mae: 0.1736 - val_loss: 0.0453 - val_mae: 0.1739
Epoch 4/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - loss: 0.0446 - mae: 0.1738 - val_loss: 0.0445 - val_mae: 0.1731
Epoch 5/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - loss: 0.0449 - mae: 0.1744 - val_loss: 0.0443 - val_mae: 0.1732
Epoch 6/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 0.0442 - mae: 0.1729 - val_loss: 0.0440 - val_mae: 0.1724
Epoch 7/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step

In [21]:
from sklearn.metrics import r2_score

# Calculate R² score for each forecast year
r2_scores = {}
for i, year in enumerate(['forecasted_value_year_1', 'forecasted_value_year_2', 'forecasted_value_year_3']):
    r2 = r2_score(y_test_rescaled[:, i], y_pred_rescaled[:, i])
    r2_scores[year] = r2
    print(f"R² score for {year}: {r2:.4f}")


R² score for forecasted_value_year_1: 0.5033
R² score for forecasted_value_year_2: 0.6203
R² score for forecasted_value_year_3: 0.5308


In [22]:
# Save the model
model.save("/Users/isha/Datathon/lstm_portfolio_forecast_model.h5")
print("✅ Model saved successfully!")




✅ Model saved successfully!


In [38]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

# Load the dataset
file_path = "/mnt/data/static_time_series.csv"
df = pd.read_csv(file_path)

# Ensure time series columns (excluding client_id) are numeric
df.iloc[:, 1:37] = df.iloc[:, 1:37].apply(pd.to_numeric, errors='coerce')

# Dictionary to store categorized clients
client_groups = {
    "increasing_trend": [],
    "fluctuating_trend": [],
    "declining_trend": []
}

# Function to classify clients based on trend
def classify_client(client_series):
    client_series = client_series[~np.isnan(client_series)]  # Remove NaN values
    
    if len(client_series) < 2:
        return "fluctuating_trend"  # Default to fluctuating if too short

    diff_series = np.diff(client_series)

    if np.all(diff_series > 0):  
        return "increasing_trend"
    elif np.all(diff_series < 0):  
        return "declining_trend"
    else:
        return "fluctuating_trend"

# Iterate through clients and classify them
for client in df['client_id'].unique():
    client_data = df[df['client_id'] == client].iloc[:, 1:37].values.flatten()  # Extract time series
    trend_type = classify_client(client_data)
    
    if len(client_groups[trend_type]) < 5:  # Select only 5 per category
        client_groups[trend_type].append(client)

# Function to check stationarity
def check_stationarity(time_series):
    time_series = time_series[~np.isnan(time_series)]  # Remove NaN values
    
    if len(time_series) < 2:
        return 1.0  # Default p-value for insufficient data (not stationary)

    result = adfuller(time_series)
    return result[1]  # If p < 0.05, the series is stationary

# Apply stationarity check to selected clients
for trend, clients in client_groups.items():
    print(f"\nChecking stationarity for {trend.upper()} clients:")
    for client in clients:
        client_series = df[df['client_id'] == client].iloc[:, 1:37].values.flatten()
        p_value = check_stationarity(client_series)
        print(f"Client {client}: p-value = {p_value:.5f} {'(Stationary)' if p_value < 0.05 else '(Non-Stationary)'}")


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/static_time_series.csv'

Lagged Correlations

In [27]:
# Load target data
df = pd.read_csv("/Users/isha/Datathon/static_time_series.csv")

# Compute lagged correlation
correlations = []
for lag in [1, 3, 6, 12]:
    for feature in features:
        df[f"{feature}_lag_{lag}"] = df.groupby("client_id")[feature].shift(lag)
        corr = df[[f"{feature}_lag_{lag}", "forecasted_value_year_1"]].corr().iloc[0, 1]
        correlations.append((feature, lag, corr))

# Convert to DataFrame and display
correlation_df = pd.DataFrame(correlations, columns=["Feature", "Lag", "Correlation"])
print(correlation_df.sort_values(by="Correlation", ascending=False))


                        Feature  Lag  Correlation
12              portfolio_value   12     0.002144
8               portfolio_value    6     0.001779
4               portfolio_value    3     0.001628
0               portfolio_value    1     0.001545
15         monthly_contribution   12     0.001063
1         equity_allocation_pct    1     0.000813
5         equity_allocation_pct    3     0.000764
14  fixed_income_allocation_pct   12     0.000658
9         equity_allocation_pct    6     0.000625
3          monthly_contribution    1     0.000541
7          monthly_contribution    3     0.000047
11         monthly_contribution    6    -0.000280
10  fixed_income_allocation_pct    6    -0.000625
13        equity_allocation_pct   12    -0.000658
6   fixed_income_allocation_pct    3    -0.000764
2   fixed_income_allocation_pct    1    -0.000813


First ORder DIfferencing on Portfolio Value because of non stationarity

In [11]:
import pandas as pd

# Compute first-order differencing for portfolio_value
df['portfolio_value_diff'] = df.groupby("client_id")['portfolio_value'].diff()




# Convert to DataFrame and display
correlation_df = pd.DataFrame(correlations, columns=["Feature", "Lag", "Correlation"])
print(correlation_df.sort_values(by="Correlation", ascending=False))


                        Feature  Lag  Correlation
12              portfolio_value   12     0.002144
8               portfolio_value    6     0.001779
4               portfolio_value    3     0.001628
0               portfolio_value    1     0.001545
15         monthly_contribution   12     0.001063
1         equity_allocation_pct    1     0.000813
5         equity_allocation_pct    3     0.000764
14  fixed_income_allocation_pct   12     0.000658
9         equity_allocation_pct    6     0.000625
3          monthly_contribution    1     0.000541
7          monthly_contribution    3     0.000047
11         monthly_contribution    6    -0.000280
10  fixed_income_allocation_pct    6    -0.000625
13        equity_allocation_pct   12    -0.000658
6   fixed_income_allocation_pct    3    -0.000764
2   fixed_income_allocation_pct    1    -0.000813


In [32]:
df

Unnamed: 0,client_id,month,portfolio_value,equity_allocation_pct,fixed_income_allocation_pct,monthly_contribution,market_volatility_index,macroeconomic_score,sentiment_index,age,...,fixed_income_allocation_pct_lag_12,monthly_contribution_lag_12,portfolio_value_diff_1,portfolio_value_diff_2,portfolio_value_diff_3,portfolio_value_diff,portfolio_value_diff_lag_1,portfolio_value_diff_lag_2,portfolio_value_diff_lag_6,portfolio_value_diff_lag_12
0,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-03-01,89775.68,56.07,43.93,1562.11,10.41,7.85,7.16,63,...,,,89685.96,95056.55,101759.46,,,,,
1,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-04-01,89685.96,32.74,67.26,772.74,13.67,4.52,5.62,63,...,,,90114.05,96907.97,102420.93,-89.72,,,,
2,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-05-01,90114.05,56.71,43.29,709.24,15.84,4.83,5.28,63,...,,,90338.07,97664.44,104344.57,428.09,-89.72,,,
3,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-06-01,90338.07,67.11,32.89,799.51,20.28,5.96,3.23,63,...,,,92449.25,98037.37,106055.05,224.02,428.09,-89.72,,
4,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-07-01,92449.25,23.90,76.10,1923.33,29.31,7.04,4.52,63,...,,,94386.87,100066.40,106411.38,2111.18,224.02,428.09,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359995,41b1bc9d-b7ec-4092-803b-6ae2a1023605,2024-10-01,56962.78,20.70,79.30,1862.76,20.85,3.09,5.88,64,...,34.23,690.39,59890.52,,,1884.81,792.98,432.81,1529.15,910.66
359996,41b1bc9d-b7ec-4092-803b-6ae2a1023605,2024-11-01,59890.52,47.28,52.72,1926.25,22.79,4.85,4.91,64,...,40.89,1555.90,60708.53,,,2927.74,1884.81,792.98,1937.97,1427.08
359997,41b1bc9d-b7ec-4092-803b-6ae2a1023605,2024-12-01,60708.53,31.31,68.69,1958.70,15.64,5.05,3.71,64,...,74.35,501.23,61984.00,,,818.01,2927.74,1884.81,897.60,323.24
359998,41b1bc9d-b7ec-4092-803b-6ae2a1023605,2025-01-01,61984.00,44.34,55.66,1284.70,26.19,3.81,5.71,64,...,39.77,1705.93,62743.82,,,1275.47,818.01,2927.74,1555.02,1893.89


In [19]:
# Forward fill missing values
df.fillna(method="ffill", inplace=True)

# If any NaNs still remain at the start (e.g., first row of each client), backfill them
df.fillna(method="bfill", inplace=True)


  df.fillna(method="ffill", inplace=True)
  df.fillna(method="bfill", inplace=True)


In [20]:
import pandas as pd


# Define the selected features and lags
selected_lags = {
    "portfolio_value_diff": [1, 3, 6, 12],  # Differenced values
    "equity_allocation_pct": [1],
    "fixed_income_allocation_pct": [1],
    "monthly_contribution": [1],
}

# Apply lags
for feature, lags in selected_lags.items():
    for lag in lags:
        df[f"{feature}_lag_{lag}"] = df.groupby("client_id")[feature].shift(lag)

# Display the first few rows to check the new features
df.head()


Unnamed: 0,client_id,month,portfolio_value,equity_allocation_pct,fixed_income_allocation_pct,monthly_contribution,market_volatility_index,macroeconomic_score,sentiment_index,age,...,fixed_income_allocation_pct_lag_12,monthly_contribution_lag_12,portfolio_value_diff,portfolio_value_diff_lag_1,portfolio_value_diff_lag_3,portfolio_value_diff_lag_6,portfolio_value_diff_lag_12,portfolio_value_diff_1,portfolio_value_diff_2,portfolio_value_diff_3
0,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-03-01,89775.68,56.07,43.93,1562.11,10.41,7.85,7.16,63,...,43.93,1562.11,-89.72,,,,,894.95,1968.29,
1,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-04-01,89685.96,32.74,67.26,772.74,13.67,4.52,5.62,63,...,43.93,1562.11,-89.72,-89.72,,,,661.47,685.26,
2,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-05-01,90114.05,56.71,43.29,709.24,15.84,4.83,5.28,63,...,43.93,1562.11,428.09,-89.72,,,,1923.64,273.14,
3,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-06-01,90338.07,67.11,32.89,799.51,20.28,5.96,3.23,63,...,43.93,1562.11,224.02,428.09,-89.72,,,1710.48,1144.78,
4,96c4c0a3-bb3f-4ac1-81ad-0850cd29911f,2022-07-01,92449.25,23.9,76.1,1923.33,29.31,7.04,4.52,63,...,43.93,1562.11,2111.18,224.02,-89.72,,,356.33,1654.98,


In [29]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler

# Define target variable (forecasting next 3 years)
target = ["portfolio_value_diff_1", "portfolio_value_diff_2", "portfolio_value_diff_3"]

# Create new target columns for the next 3 years
df["portfolio_value_diff_1"] = df.groupby("client_id")["portfolio_value"].shift(-12)
df["portfolio_value_diff_2"] = df.groupby("client_id")["portfolio_value"].shift(-24)
df["portfolio_value_diff_3"] = df.groupby("client_id")["portfolio_value"].shift(-36)

# Select features (lagged variables)
features = [col for col in df.columns if "_lag_" in col]

# Handle missing values: Drop only target-related NaNs
df_lstm = df.dropna(subset=target).copy()

# **NEW FIX: Replace remaining NaNs in features with 0 or interpolation**
df_lstm[features] = df_lstm[features].fillna(0)  # or use df_lstm[features].interpolate()

# Scale features
scaler = StandardScaler()
df_lstm[features] = scaler.fit_transform(df_lstm[features])

# Split data (24 months train, 6 months validation, 6 months test)
train_size = 24
val_size = 6
test_size = 6

X = df_lstm[features].values
y = df_lstm[target].values  # Now has 3 columns (year 1, 2, 3)

X_train, X_val, X_test = X[:train_size], X[train_size:train_size+val_size], X[train_size+val_size:train_size+val_size+test_size]
y_train, y_val, y_test = y[:train_size], y[train_size:train_size+val_size], y[train_size+val_size:train_size+val_size+test_size]

# Reshape input for LSTM (samples, timesteps, features)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(1, X_train.shape[2])),
    Dropout(0.2),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(3)  # 3 outputs for year 1, year 2, year 3
])

# Compile model
model.compile(optimizer="adam", loss="mse")

# Train model
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_val, y_val), verbose=1)

# Predict for next 3 years
y_pred = model.predict(X_test)

# Store predictions in DataFrame
df_lstm.loc[df_lstm.index[-len(y_pred):], ["forecasted_value_year_1", "forecasted_value_year_2", "forecasted_value_year_3"]] = y_pred

# Save results
df_lstm.to_csv("forecasted_lstm_results.csv", index=False)


ValueError: Found array with 0 sample(s) (shape=(0, 16)) while a minimum of 1 is required by StandardScaler.

In [34]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler

# --------------------------------------
# 1. Load Data
# --------------------------------------
df = pd.read_csv("static_time_series.csv")  # Update path if needed

# Ensure sorting by time for each client
df = df.sort_values(["client_id", "date"])

# --------------------------------------
# 2. Define Features & Target
# --------------------------------------
features = ["portfolio_value", "equity_allocation_pct", "fixed_income_allocation_pct", "monthly_contribution"]
target = ["portfolio_value_year_1", "portfolio_value_year_2", "portfolio_value_year_3"]

# Create shifted targets (forecast next 3 years)
df["portfolio_value_year_1"] = df.groupby("client_id")["portfolio_value"].shift(-12)
df["portfolio_value_year_1"] = df.groupby("client_id")["portfolio_value"].shift(-24)
df["portfolio_value_year_1"] = df.groupby("client_id")["portfolio_value"].shift(-36)

# Drop rows with NaNs (due to shifting)
df = df.dropna(subset=target).reset_index(drop=True)

# --------------------------------------
# 3. Scale Features & Target
# --------------------------------------
scaler_X = StandardScaler()
scaler_y = StandardScaler()

df[features] = scaler_X.fit_transform(df[features])
df[target] = scaler_y.fit_transform(df[target])

# --------------------------------------
# 4. Create Sequences for LSTM
# --------------------------------------
time_steps = 12  # Use last 12 months to predict future

X, y = [], []
for i in range(len(df) - time_steps):
    X.append(df[features].iloc[i : i + time_steps].values)  # 12 months history
    y.append(df[target].iloc[i + time_steps].values)  # Next 3 years

X, y = np.array(X), np.array(y)  # Convert to NumPy arrays

# --------------------------------------
# 5. Train/Test Split
# --------------------------------------
train_size = int(0.7 * len(X))
val_size = int(0.15 * len(X))

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size : train_size + val_size], y[train_size : train_size + val_size]
X_test, y_test = X[train_size + val_size :], y[train_size + val_size :]

# --------------------------------------
# 6. Define LSTM Model
# --------------------------------------
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(time_steps, len(features))),
    Dropout(0.2),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(3)  # 3 outputs for 1yr, 2yr, 3yr forecasts
])

model.compile(optimizer="adam", loss="mse")

# --------------------------------------
# 7. Train Model
# --------------------------------------
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_val, y_val), verbose=1)

# --------------------------------------
# 8. Make Predictions
# --------------------------------------
y_pred = model.predict(X_test)

# Inverse Transform to get real values
y_pred = scaler_y.inverse_transform(y_pred)

# Store Predictions
df_test = df.iloc[train_size + val_size + time_steps:].copy()
df_test[["forecasted_value_year_1", "forecasted_value_year_1", "forecasted_value_year_1"]] = y_pred

# Save to CSV
df_test.to_csv("forecasted_lstm_results.csv", index=False)


KeyError: 'date'