<a href="https://colab.research.google.com/github/superjoe96/LSTM/blob/main/lstm_odl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Pin the NumPy and TensorFlow global random seeds so repeated runs draw the same numbers
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [53]:
# Load the raw transactions from a CSV expected in the current working directory.
# Later cells read its 'Date', 'Transaction Type', 'Category' and 'Amount' columns.
df = pd.read_csv('personal_transactions_10000.csv')

In [54]:
# 2. Data Preprocessing
# Parse 'Date' into pandas datetimes so the `.dt` accessors used by the
# feature-engineering cell work; the column is overwritten in place on `df`.
df['Date'] = pd.to_datetime(df['Date'])

In [55]:
# Enhanced feature engineering: calendar attributes derived from each transaction date
dates = df['Date']
iso_calendar = dates.dt.isocalendar()  # DataFrame with ISO 8601 'year', 'week', 'day' columns

df['Year'] = dates.dt.year
df['Month'] = dates.dt.month
df['Day'] = dates.dt.day
df['DayOfWeek'] = dates.dt.dayofweek  # Monday=0 ... Sunday=6
df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)  # vectorised instead of a row-wise apply
df['Quarter'] = dates.dt.quarter
df['DayOfYear'] = dates.dt.dayofyear
df['WeekOfYear'] = iso_calendar['week']

# Weekly grouping key "<ISO year>-<zero-padded ISO week>".
# The ISO week number must be paired with the ISO year, not the calendar year:
# e.g. 2018-12-31 belongs to ISO week 1 of 2019, so keying it with the calendar
# year would produce '2018-01' and silently merge it with the first week of
# January 2018. 'Year' above stays the calendar year for other uses.
df['YearWeek'] = (
    iso_calendar['year'].astype(str)
    + '-'
    + iso_calendar['week'].astype(str).str.zfill(2)
)

In [56]:
# Model debits only: aggregate the debit rows per (week, category) into
# total amount, number of transactions and average amount.
is_debit = df['Transaction Type'] == 'debit'
weekly_debits = (
    df[is_debit]
    .groupby(['YearWeek', 'Category'])['Amount']
    .agg(['sum', 'count', 'mean'])
    .reset_index()
)

# Report spending as non-negative magnitudes regardless of the sign used in the file
for amount_stat in ('sum', 'mean'):
    weekly_debits[amount_stat] = weekly_debits[amount_stat].abs()

In [68]:
# Pivot the weekly aggregates so every category becomes a column, once per statistic.
# Week/category combinations with no debit rows are filled with 0.
weekly_sum, weekly_count, weekly_mean = (
    weekly_debits.pivot_table(index='YearWeek', columns='Category', values=stat, fill_value=0)
    for stat in ('sum', 'count', 'mean')
)

# Prefix the count/mean columns; the sum columns keep the bare category name
weekly_count.columns = [f'count_{col}' for col in weekly_count.columns]
weekly_mean.columns = [f'mean_{col}' for col in weekly_mean.columns]

# One row per week: sums first, then counts, then means
weekly_features = weekly_sum.join(weekly_count).join(weekly_mean).reset_index()


def _year_week_order(year_week):
    """Map a 'YYYY-WW' key to the integer YYYY*100 + WW used for chronological ordering."""
    year, week = year_week.split('-')
    return int(year) * 100 + int(week)


# Order rows chronologically (each key is parsed once; no temporary column needed)
weekly_features = weekly_features.sort_values(
    'YearWeek', key=lambda keys: keys.map(_year_week_order))

# Base numeric columns (everything except YearWeek), captured before derived columns exist
numeric_columns = weekly_features.columns[1:]

# Build all derived columns first and attach them with a single concat.
# Inserting them one at a time fragments the DataFrame (pandas PerformanceWarning).
# Column order matches the previous behaviour: lag1/lag2/lag4 per base column,
# followed by one pct_change column per base column.
# NOTE(review): shift() moves by rows, so a lag equals "N weeks ago" only if every
# calendar week is present in the data - confirm there are no debit-free weeks.
derived_columns = {}
for col in numeric_columns:
    for lag in (1, 2, 4):
        derived_columns[f'{col}_lag{lag}'] = weekly_features[col].shift(lag)
for col in numeric_columns:
    derived_columns[f'{col}_pct_change'] = weekly_features[col].pct_change()

weekly_features = pd.concat(
    [weekly_features, pd.DataFrame(derived_columns, index=weekly_features.index)],
    axis=1,
)

# Division by a zero previous value yields +/-inf and 0/0 yields NaN; leading lag rows
# are NaN as well. All of them are replaced by 0.
weekly_features = weekly_features.replace([np.inf, -np.inf], np.nan)
weekly_features = weekly_features.fillna(0)

In [58]:
def create_sequences(data, seq_length, forecast_horizon=1):
    """Slice a time-ordered array into (input window, flattened target) pairs.

    Window ``i`` uses rows ``data[i : i + seq_length]`` as the input and the
    ``forecast_horizon`` rows that immediately follow it as the target.

    Args:
        data: Array-like whose first axis is time (1-D, or 2-D as rows x features).
        seq_length: Number of consecutive rows in each input window.
        forecast_horizon: Number of rows after each window to predict.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_windows, seq_length, *data.shape[1:])`` and ``y`` has shape
        ``(n_windows, forecast_horizon * prod(data.shape[1:]))``.
        If ``data`` is too short to form a single window, both arrays are
        returned empty (``n_windows == 0``) with the same trailing dimensions
        instead of failing inside ``reshape``.
    """
    data = np.asarray(data)
    n_windows = len(data) - seq_length - forecast_horizon + 1
    feature_shape = data.shape[1:]
    target_width = forecast_horizon * int(np.prod(feature_shape, dtype=int))

    if n_windows <= 0:
        # Not enough rows for even one (window, target) pair
        empty_X = np.empty((0, seq_length) + feature_shape, dtype=data.dtype)
        empty_y = np.empty((0, target_width), dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[i:i + seq_length] for i in range(n_windows)])
    y = np.stack([data[i + seq_length:i + seq_length + forecast_horizon]
                  for i in range(n_windows)])
    # Flatten (horizon, features) into one target vector per window
    return X, y.reshape(n_windows, target_width)

In [67]:
# Get numeric data only (excluding the YearWeek column)
data = weekly_features.iloc[:, 1:].values

# Define sequence parameters
seq_length = 12  # Each input window spans 12 consecutive weekly rows
forecast_horizon = 1  # Predict next week

# Chronological 70/15/15 split of the sliding windows, decided on window indices
# BEFORE any scaling so the scaler can be restricted to training rows.
n_sequences = len(data) - seq_length - forecast_horizon + 1
sequence_ids = np.arange(n_sequences)
train_ids, temp_ids = train_test_split(sequence_ids, test_size=0.3, shuffle=False)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, shuffle=False)

# Fit the scaler only on rows that appear in a training window or training target.
# Fitting on the whole series would leak validation/test means and variances
# into the training inputs.
train_row_end = train_ids[-1] + seq_length + forecast_horizon  # exclusive row bound
scaler = StandardScaler()
scaler.fit(data[:train_row_end])
scaled_data = scaler.transform(data)

# Create sequences from the scaled series
X, y = create_sequences(scaled_data, seq_length, forecast_horizon)

# Hold out validation data for early stopping and test data for the final check
X_train, y_train = X[train_ids], y[train_ids]
X_temp, y_temp = X[temp_ids], y[temp_ids]
X_val, y_val = X[val_ids], y[val_ids]
X_test, y_test = X[test_ids], y[test_ids]

In [60]:
# 3. IMPROVED MODEL ARCHITECTURE
def build_advanced_lstm_model(input_shape, output_size):
    """Assemble and compile the stacked bidirectional LSTM regressor.

    Layer order: Bidirectional LSTM(64, returns sequences, L2 0.001) ->
    BatchNorm -> Dropout 0.3 -> Bidirectional LSTM(32) -> BatchNorm ->
    Dropout 0.3 -> Dense(64, relu, L2 0.001) -> BatchNorm -> Dropout 0.2 ->
    Dense(output_size) with no activation.

    Args:
        input_shape: Shape of one sample, passed to the first layer as ``input_shape``.
        output_size: Number of units in the final Dense layer.

    Returns:
        The compiled ``Sequential`` model (Adam at 5e-4, Huber loss, MAE metric).
    """
    model = Sequential()

    # First recurrent stage: reads each window forwards and backwards and
    # hands the full sequence on to the next recurrent stage
    model.add(Bidirectional(
        LSTM(64, activation='tanh', return_sequences=True,
             kernel_regularizer=tf.keras.regularizers.l2(0.001)),
        input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Second recurrent stage: collapses the sequence to a single vector
    model.add(Bidirectional(LSTM(32, activation='tanh')))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Fully connected head
    model.add(Dense(64, activation='relu',
                    kernel_regularizer=tf.keras.regularizers.l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    # Linear output, one unit per target value
    model.add(Dense(output_size))

    # Huber loss: quadratic for small errors, linear for large ones
    huber_loss = tf.keras.losses.Huber()
    adam = Adam(learning_rate=0.0005)
    model.compile(optimizer=adam, loss=huber_loss, metrics=['mean_absolute_error'])

    return model

In [61]:
# Build the improved model
# Each training sample is (time steps, features); the network emits one value per target column
n_timesteps, n_features = X_train.shape[1:3]
input_shape = (n_timesteps, n_features)
output_size = y_train.shape[1]
lstm_model = build_advanced_lstm_model(input_shape, output_size)

In [None]:
# 5. Train the model

# Stop once validation loss has not improved for 20 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 5 stagnant epochs, never going below 1e-5
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=0.00001,
    verbose=1,
)

# Write the model to disk whenever validation loss reaches a new minimum
best_model_checkpoint = ModelCheckpoint(
    'lstm_cashflow_model_improved.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

callbacks = [early_stopping, lr_scheduler, best_model_checkpoint]

# Up to 100 epochs in batches of 32; early stopping normally ends training sooner
print("\nTraining the improved model...")
history = lstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    shuffle=False,  # keep the windows in chronological order within each epoch
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Loss and MAE (both in scaled units) for each split, evaluated in train/val/test order
(train_loss, train_mae), (val_loss, val_mae), (test_loss, test_mae) = (
    lstm_model.evaluate(inputs, targets, verbose=0)
    for inputs, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

report_lines = [
    "\nModel Performance Metrics:",
    f"Training Loss: {train_loss:.4f}",
    f"Training MAE: {train_mae:.4f}",
    f"Validation Loss: {val_loss:.4f}",
    f"Validation MAE: {val_mae:.4f}",
    f"Test Loss: {test_loss:.4f}",
    f"Test MAE: {test_mae:.4f}",
]
print("\n".join(report_lines))

In [None]:
# Model outputs for every split, still in the scaler's standardized units
train_predictions, val_predictions, test_predictions = (
    lstm_model.predict(split_inputs, verbose=0)
    for split_inputs in (X_train, X_val, X_test)
)

# Undo the scaling on the targets ...
y_train_orig, y_val_orig, y_test_orig = (
    scaler.inverse_transform(targets.reshape(targets.shape[0], -1))
    for targets in (y_train, y_val, y_test)
)

# ... and on the predictions
train_pred_orig, val_pred_orig, test_pred_orig = (
    scaler.inverse_transform(predicted)
    for predicted in (train_predictions, val_predictions, test_predictions)
)

# Metrics on the original scale.
# NOTE(review): the targets contain every engineered column (sums, counts, means,
# lags, pct changes), so these figures pool all of them rather than dollar sums only.
scored_pairs = (
    (y_train_orig, train_pred_orig),
    (y_val_orig, val_pred_orig),
    (y_test_orig, test_pred_orig),
)
train_rmse, val_rmse, test_rmse = (
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in scored_pairs
)
train_r2, val_r2, test_r2 = (
    r2_score(actual, predicted) for actual, predicted in scored_pairs
)

summary_lines = [
    "\nModel Performance (original scale):",
    f"Training RMSE: ${train_rmse:.2f}",
    f"Validation RMSE: ${val_rmse:.2f}",
    f"Test RMSE: ${test_rmse:.2f}",
    f"Training R²: {train_r2:.4f}",
    f"Validation R²: {val_r2:.4f}",
    f"Test R²: {test_r2:.4f}",
]
print("\n".join(summary_lines))

In [None]:
# Plot training history: loss on the left, MAE on the right, train vs. validation in each
fig = plt.figure(figsize=(15, 6))

history_panels = (
    # (position, train key, validation key, train label, validation label, title, y label)
    (1, 'loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    (2, 'mean_absolute_error', 'val_mean_absolute_error',
     'Training MAE', 'Validation MAE', 'Model MAE', 'MAE'),
)

for position, train_key, val_key, train_label, val_label, panel_title, y_label in history_panels:
    ax = plt.subplot(1, 2, position)
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(panel_title, fontsize=15)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.legend(fontsize=12)
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('lstm_training_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# 7. COMPREHENSIVE VISUALIZATION OF MODEL PERFORMANCE

# Keep only the per-category weekly sum columns: drop anything carrying a
# count_/mean_ prefix or a lag / pct-change marker in its name
derived_prefixes = ('count_', 'mean_')
derived_markers = ('_lag', '_pct')
category_columns = [
    col for col in weekly_features.columns[1:]
    if not col.startswith(derived_prefixes)
    and not any(marker in col for marker in derived_markers)
]

print("\nVisualizing model performance on training and test data...")

# The sum columns come first in the feature matrix, so a leading slice selects them
n_categories = len(category_columns)

# Extract actual and predicted values for training and test sets
train_actual = y_train_orig[:, :n_categories]
train_predicted = train_pred_orig[:, :n_categories]
test_actual = y_test_orig[:, :n_categories]
test_predicted = test_pred_orig[:, :n_categories]

# Rank categories by total actual spending over train + validation + test
all_actual_values = np.vstack([train_actual, y_val_orig[:, :n_categories], test_actual])
category_totals = all_actual_values.sum(axis=0)
top_categories = 5
top_indices = np.argsort(category_totals)[-top_categories:]  # ascending; largest last

# 7.1 TRAINING SET VISUALIZATION
fig = plt.figure(figsize=(18, 12))
fig.suptitle('Model Performance on Training Dataset', fontsize=16)

for row, idx in enumerate(top_indices, start=1):
    ax = plt.subplot(top_categories, 1, row)

    # Actual vs. predicted series for this category
    actual_series = train_actual[:, idx]
    predicted_series = train_predicted[:, idx]
    weeks = range(len(actual_series))
    ax.plot(weeks, actual_series, 'b-', linewidth=2, label='Actual')
    ax.plot(weeks, predicted_series, 'r--', linewidth=2, label='Predicted')

    # Per-category fit quality on the training split
    cat_mae = mean_absolute_error(actual_series, predicted_series)
    cat_rmse = np.sqrt(mean_squared_error(actual_series, predicted_series))
    cat_r2 = r2_score(actual_series, predicted_series)

    ax.set_title(f'Category: {category_columns[idx]} - Training Data', fontsize=14)
    ax.set_ylabel('Amount ($)', fontsize=12)
    ax.legend(['Actual', f'Predicted (MAE: ${cat_mae:.2f}, R²: {cat_r2:.3f})'])
    ax.grid(True, alpha=0.3)

    # Mark the absolute error on every 5th prediction to keep the panel readable
    for j in weeks[::5]:
        error = abs(actual_series[j] - predicted_series[j])
        ax.errorbar(weeks[j], predicted_series[j], yerr=error, fmt='o', color='gray', alpha=0.5)

fig.tight_layout(rect=[0, 0, 1, 0.96])  # leave headroom for the suptitle
fig.savefig('training_set_performance.png', dpi=300, bbox_inches='tight')
plt.show()

# 7.2 TEST SET VISUALIZATION
fig = plt.figure(figsize=(18, 12))
fig.suptitle('Model Performance on Test Dataset', fontsize=16)

for row, idx in enumerate(top_indices, start=1):
    ax = plt.subplot(top_categories, 1, row)

    # Actual vs. predicted series for this category
    actual_series = test_actual[:, idx]
    predicted_series = test_predicted[:, idx]
    weeks = range(len(actual_series))
    ax.plot(weeks, actual_series, 'b-', linewidth=2, label='Actual')
    ax.plot(weeks, predicted_series, 'r--', linewidth=2, label='Predicted')

    # Per-category fit quality on the test split
    cat_mae = mean_absolute_error(actual_series, predicted_series)
    cat_rmse = np.sqrt(mean_squared_error(actual_series, predicted_series))
    cat_r2 = r2_score(actual_series, predicted_series)

    ax.set_title(f'Category: {category_columns[idx]} - Test Data', fontsize=14)
    ax.set_ylabel('Amount ($)', fontsize=12)
    ax.legend(['Actual', f'Predicted (MAE: ${cat_mae:.2f}, R²: {cat_r2:.3f})'])
    ax.grid(True, alpha=0.3)

    # Mark the absolute error on every test prediction
    for j in weeks:
        error = abs(actual_series[j] - predicted_series[j])
        ax.errorbar(weeks[j], predicted_series[j], yerr=error, fmt='o', color='gray', alpha=0.5)

fig.tight_layout(rect=[0, 0, 1, 0.96])  # leave headroom for the suptitle
fig.savefig('test_set_performance.png', dpi=300, bbox_inches='tight')
plt.show()

# 7.3 DIRECT COMPARISON OF TRAIN AND TEST PERFORMANCE
# One row per top category: training split on the left, test split on the right.
# Each panel overlays actual/predicted amounts (left y-axis) with the signed
# prediction error per sample (right y-axis).
plt.figure(figsize=(18, 15))
plt.suptitle('Comparing Model Performance: Training vs Test Sets', fontsize=16)

comparison_splits = (
    ('Training', train_actual, train_predicted),
    ('Test', test_actual, test_predicted),
)

for i, idx in enumerate(top_indices):
    for column, (split_name, actual, predicted) in enumerate(comparison_splits, start=1):
        ax = plt.subplot(top_categories, 2, i * 2 + column)

        actual_line, = ax.plot(actual[:, idx], 'b-', linewidth=2, label='Actual')
        predicted_line, = ax.plot(predicted[:, idx], 'r--', linewidth=2, label='Predicted')
        ax.set_title(f'{category_columns[idx]} - {split_name}', fontsize=14)
        ax.set_ylabel('Amount ($)', fontsize=12)
        ax.grid(True, alpha=0.3)

        # Signed error (predicted - actual) for every sample, on a secondary axis
        errors = predicted[:, idx] - actual[:, idx]
        ax2 = ax.twinx()
        error_points, = ax2.plot(errors, 'g.', alpha=0.5, label='Error')
        ax2.set_ylabel('Error ($)', color='g')
        ax2.tick_params(axis='y', labelcolor='g')

        # Pass the handles explicitly: the twin axes own only the error series, so a
        # labels-only legend on them would tag the green dots as 'Actual' and drop
        # the remaining labels.
        ax2.legend(handles=[actual_line, predicted_line, error_points])

plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for the suptitle
plt.savefig('train_test_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# 7.4 ERROR DISTRIBUTION ANALYSIS
fig = plt.figure(figsize=(15, 10))
fig.suptitle('Error Distribution Analysis', fontsize=16)

# Panels 1..len(top_indices) of a 3x2 grid: one histogram pair per top category
for panel, idx in enumerate(top_indices, start=1):
    ax = plt.subplot(3, 2, panel)

    # Signed errors (predicted - actual) for this category
    train_errors = train_predicted[:, idx] - train_actual[:, idx]
    test_errors = test_predicted[:, idx] - test_actual[:, idx]

    # Overlaid histograms of the two splits
    ax.hist(train_errors, bins=20, alpha=0.5, label='Training Errors')
    ax.hist(test_errors, bins=20, alpha=0.5, label='Test Errors')

    ax.set_title(f'Error Distribution - {category_columns[idx]}', fontsize=12)
    ax.set_xlabel('Prediction Error ($)', fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.legend()

    # Mean (bias) and standard deviation (spread) of each split's errors
    train_mean_error, train_std_error = np.mean(train_errors), np.std(train_errors)
    test_mean_error, test_std_error = np.mean(test_errors), np.std(test_errors)

    ax.annotate(f'Train μ={train_mean_error:.2f}, σ={train_std_error:.2f}\n'
                f'Test μ={test_mean_error:.2f}, σ={test_std_error:.2f}',
                xy=(0.05, 0.95), xycoords='axes fraction',
                fontsize=9, bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.8))

# Sixth panel: errors pooled over every category column
ax = plt.subplot(3, 2, 6)
all_train_errors = (train_predicted - train_actual).ravel()
all_test_errors = (test_predicted - test_actual).ravel()

ax.hist(all_train_errors, bins=30, alpha=0.5, label='All Training Errors')
ax.hist(all_test_errors, bins=30, alpha=0.5, label='All Test Errors')
ax.set_title('Overall Error Distribution (All Categories)', fontsize=12)
ax.set_xlabel('Prediction Error ($)', fontsize=10)
ax.set_ylabel('Frequency', fontsize=10)
ax.grid(True, alpha=0.3)
ax.legend()

fig.tight_layout(rect=[0, 0, 1, 0.96])  # leave headroom for the suptitle
fig.savefig('error_distribution.png', dpi=300, bbox_inches='tight')
plt.show()