# Model Training Notebook
## AI Model Web Integration - Regression Model

This notebook:
1. Loads and preprocesses data
2. Trains multiple regression models (Linear, Ridge, Lasso, ElasticNet)
3. Performs cross-validation and hyperparameter tuning
4. Saves the tuned Ridge model and the preprocessing objects (imputer, scaler, feature names)

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

# Set random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Display settings
pd.set_option('display.max_columns', None)
sns.set_style('darkgrid')
%matplotlib inline

## 1. Load Dataset

In [None]:
# Fetch the California Housing data and wrap the raw arrays in labelled
# pandas objects: X holds the feature columns, y the regression target.
print("Loading California Housing dataset...")
data = fetch_california_housing()
y = pd.Series(data.target, name='MedHouseVal')
X = pd.DataFrame(data.data, columns=data.feature_names)

# Quick orientation: dimensions, column names, and a preview of the rows.
print(f"Dataset shape: {X.shape}")
print(f"\nFeatures: {X.columns.tolist()}")
print(f"\nFirst few rows:")
X.head()

In [None]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max)
print("Dataset Statistics:")
feature_summary = X.describe()
feature_summary

## 2. Introduce Missing Values (for demonstration)

In [None]:
# Blank out roughly 5% of the entries in each feature column. Every column
# gets its own independent random mask, so a row may lose several values.
print("Introducing missing values...")
np.random.seed(RANDOM_STATE)  # re-seed so this cell is reproducible on its own
X_with_missing = X.copy()     # leave the pristine frame X untouched

n_rows = len(X_with_missing)
for col in X_with_missing.columns:
    # One uniform draw per row; rows falling below 0.05 are set to NaN
    mask = np.random.random(n_rows) < 0.05
    X_with_missing.loc[mask, col] = np.nan

missing_per_column = X_with_missing.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)
print(f"\nTotal missing values: {missing_per_column.sum()}")

## 3. Train-Test Split

In [None]:
# Hold out 20% of the rows as a test set; the remaining 80% is used for training.
# The split is seeded so the same rows land in each partition on every run.
split_options = dict(test_size=0.2, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X_with_missing, y, **split_options)

print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

## 4. Data Preprocessing

In [None]:
# Preprocessing objects: mean imputation for the NaNs, then standardisation.
# Both are kept as notebook-level variables because they are saved later.
scaler = StandardScaler()
imputer = SimpleImputer(strategy='mean')

# Learn the imputation means and scaling statistics from the training split only
print("Preprocessing training data...")
X_train_imputed = imputer.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train_imputed)

# Apply the already-fitted transformers to the test split (no refitting)
print("Preprocessing test data...")
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

print("\n‚úÖ Preprocessing complete!")
print(f"Training data shape: {X_train_scaled.shape}")
print(f"Test data shape: {X_test_scaled.shape}")

## 5. Model Training & Comparison

In [None]:
# Candidate linear models, all with default hyperparameters
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=RANDOM_STATE),
    'Lasso': Lasso(random_state=RANDOM_STATE),
    'ElasticNet': ElasticNet(random_state=RANDOM_STATE)
}

# One dict of metrics per model; turned into a DataFrame in a later cell
results = []

print("Training and evaluating models...\n")
print("="*80)

for name, model in models.items():
    print(f"\n{name}:")
    print("-" * 40)

    # 5-fold cross-validated R^2 on the (preprocessed) training split
    cv_scores = cross_val_score(
        model, X_train_scaled, y_train, scoring='r2', cv=5, n_jobs=-1
    )
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    # Refit on the whole training split, then predict both splits
    model.fit(X_train_scaled, y_train)
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Goodness-of-fit on train; R^2 and error metrics on the held-out test split
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)

    results.append({
        'Model': name,
        'CV Mean R¬≤': cv_mean,
        'CV Std R¬≤': cv_std,
        'Train R¬≤': train_r2,
        'Test R¬≤': test_r2,
        'Test MAE': test_mae,
        'Test RMSE': test_rmse
    })

    # Per-model report
    print(f"Cross-validation R¬≤ (mean ¬± std): {cv_mean:.4f} ¬± {cv_std:.4f}")
    print(f"Training R¬≤: {train_r2:.4f}")
    print(f"Test R¬≤: {test_r2:.4f}")
    print(f"Test MAE: {test_mae:.4f}")
    print(f"Test RMSE: {test_rmse:.4f}")

print("\n" + "="*80)

In [None]:
# Collect the per-model metric dicts into one table, rounded to 4 decimals
results_df = pd.DataFrame(results).round(4)
print("\nModel Comparison Results:")
results_df

## 6. Hyperparameter Tuning (Ridge)

In [None]:
# Pick the best baseline by its cross-validated R^2, not its test R^2.
# Choosing a model on test-set performance leaks the held-out data into model
# selection and makes the final test metrics optimistically biased.
best_model_name = results_df.loc[results_df['CV Mean R¬≤'].idxmax(), 'Model']
print(f"Best model: {best_model_name}\n")

# Only Ridge is tuned in this notebook, regardless of which baseline ranked
# first. Make that explicit so the output is not misread.
if best_model_name != 'Ridge':
    print(f"Note: {best_model_name} ranked first in cross-validation, "
          "but only Ridge is tuned below.\n")

# Hyperparameter tuning for Ridge (usually performs well)
print("Performing hyperparameter tuning for Ridge Regression...")
# Regularisation strengths to try, spanning six orders of magnitude
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
}

# Exhaustive search over the grid with 5-fold CV, scored by R^2,
# run on the training split only.
grid_search = GridSearchCV(
    Ridge(random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_scaled, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV R¬≤ score: {grid_search.best_score_:.4f}")

# Ridge estimator refitted by GridSearchCV on the full training split with
# the best alpha; later cells evaluate and save this object.
best_model = grid_search.best_estimator_

## 7. Final Model Evaluation

In [None]:
# Score the tuned estimator on both splits. The test-split predictions are
# kept in `y_test_pred_final` for the plotting cells.
y_train_pred_final = best_model.predict(X_train_scaled)
y_test_pred_final = best_model.predict(X_test_scaled)

train_r2_final = r2_score(y_train, y_train_pred_final)
test_r2_final = r2_score(y_test, y_test_pred_final)
test_mae_final = mean_absolute_error(y_test, y_test_pred_final)
test_rmse_final = np.sqrt(mean_squared_error(y_test, y_test_pred_final))

print("\n" + "="*80)
print("FINAL MODEL PERFORMANCE")
print("="*80)
print(f"\nModel: Ridge Regression (Optimized)")
print(f"Parameters: {grid_search.best_params_}")
print(f"\nTraining R¬≤: {train_r2_final:.4f}")
print(f"Test R¬≤: {test_r2_final:.4f}")
print(f"Test MAE: {test_mae_final:.4f}")
print(f"Test RMSE: {test_rmse_final:.4f}")

# Check for overfitting.
# Overfitting means the model does noticeably better on train than on test,
# so the check uses the signed gap. Taking the absolute value here would
# also flag a model that scores higher on test than on train.
OVERFIT_GAP_THRESHOLD = 0.1
generalization_gap = train_r2_final - test_r2_final
diff = abs(generalization_gap)  # reported magnitude, sign-independent
print(f"\nDifference between train and test R¬≤: {diff:.4f}")

if generalization_gap > OVERFIT_GAP_THRESHOLD:
    print("‚ö†Ô∏è WARNING: Possible overfitting detected!")
else:
    print("‚úÖ Model generalizes well!")

print("="*80)

## 8. Visualizations

In [None]:
# Scatter of true vs. predicted targets on the test split. Points on the
# dashed red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_test_pred_final, alpha=0.5)

# Identity line spanning the observed range of the true target
y_lo, y_hi = y_test.min(), y_test.max()
ax.plot([y_lo, y_hi], [y_lo, y_hi], 'r--', lw=2)

ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs Predicted Values')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Residual = actual minus predicted, computed on the test split
residuals = y_test - y_test_pred_final

# Residuals against predictions, with a dashed red zero line for reference
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_pred_final, residuals, alpha=0.5)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
ax.grid(True)
fig.tight_layout()
plt.show()

## 9. Save Model and Preprocessing Objects

In [None]:
# Output directory for the serialized artifacts (relative to the notebook's
# working directory); created on demand.
model_dir = '../backend/model'
os.makedirs(model_dir, exist_ok=True)

# Destination paths, one file per artifact
model_path = os.path.join(model_dir, 'trained_model.pkl')
scaler_path = os.path.join(model_dir, 'scaler.pkl')
imputer_path = os.path.join(model_dir, 'imputer.pkl')
features_path = os.path.join(model_dir, 'feature_names.pkl')

# Column order of the original feature frame
feature_names = list(X.columns)

# Tuned Ridge estimator
joblib.dump(best_model, model_path)
print(f"‚úÖ Model saved to: {model_path}")

# Fitted StandardScaler
joblib.dump(scaler, scaler_path)
print(f"‚úÖ Scaler saved to: {scaler_path}")

# Fitted SimpleImputer
joblib.dump(imputer, imputer_path)
print(f"‚úÖ Imputer saved to: {imputer_path}")

# Feature name list
joblib.dump(feature_names, features_path)
print(f"‚úÖ Feature names saved to: {features_path}")

print("\nüéâ All objects saved successfully!")

## 10. Save Validation Report

In [None]:
# Write the model-comparison table to CSV, creating the folder if needed
data_dir = '../data/processed'
os.makedirs(data_dir, exist_ok=True)

report_path = os.path.join(data_dir, 'model_validation_report.csv')
results_df.to_csv(report_path, index=False)
print(f"‚úÖ Validation report saved to: {report_path}")

# Closing banner with pointers to the rest of the project
banner = "="*80
next_steps = [
    "1. Run the Flask backend: cd backend && python app.py",
    "2. Open frontend/index.html in your browser",
    "3. Test the predictions!",
]

print("\n" + banner)
print("MODEL TRAINING COMPLETE!")
print(banner)
print("\nNext steps:")
for step in next_steps:
    print(step)
print(banner)