In [1]:
# Cell 1: Imports
import sys
from pathlib import Path

# Make the project root importable so the `src.*` imports below resolve.
# Path() is the current working directory, so this assumes the notebook is
# started from a folder one level below the project root.
project_root = Path().resolve().parent
# Guard the insert: re-running this cell must not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from src.data.data_loader import DataLoader, load_sample_data
from src.data.data_preprocessing import DataPreprocessor
from src.features.feature_builder import FeatureBuilder
from src.models.train import ModelTrainer
from src.utils.helpers import setup_logging, set_random_seed

setup_logging(log_level="INFO")
set_random_seed(42)
%matplotlib inline

print("Imports completed!")

[32m2025-11-27 15:22:30[0m | [1mINFO    [0m | [36mhelpers[0m:[36mset_random_seed[0m:[36m241[0m - [1mRandom seed set to 42[0m


Imports completed!


In [2]:
# Cell 2: Prepare Data
# Load the configured dataset; if the CSV is missing, fall back to generated
# sample data so the notebook still runs end to end.
data_loader = DataLoader()
try:
    df = data_loader.load_csv()
except FileNotFoundError:
    # Surface the fallback in the cell output itself: metrics computed on the
    # sample data are illustrative only and must not be read as real results.
    print("WARNING: dataset CSV not found - falling back to generated sample data.")
    df = load_sample_data()

# Clean first, then derive features, using one shared preprocessor instance.
preprocessor = DataPreprocessor()
df_clean = preprocessor.clean_data(df)
df_features = preprocessor.create_features(df_clean)

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df_features, test_size=0.2, random_state=42)

# Prepare model inputs: fit=True for the training split, fit=False for the test split.
# NOTE(review): presumably `fit` decides whether the preprocessor's transformers
# are (re)fitted or only applied - confirm in DataPreprocessor.prepare_features.
X_train, y_train = preprocessor.prepare_features(train_df, fit=True)
X_test, y_test = preprocessor.prepare_features(test_df, fit=False)

# Both splits must expose the same number of feature columns, otherwise the
# models trained below cannot be evaluated on the test split.
assert X_train.shape[1] == X_test.shape[1], "train/test feature columns differ"

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

[32m2025-11-27 15:22:33[0m | [1mINFO    [0m | [36mconfig_loader[0m:[36m__init__[0m:[36m47[0m - [1mConfiguration loaded from D:\practice-1\src\config\config.yaml[0m
[32m2025-11-27 15:22:33[0m | [1mINFO    [0m | [36mdata_loader[0m:[36m__init__[0m:[36m62[0m - [1mDataLoader initialized[0m
[32m2025-11-27 15:22:33[0m | [1mINFO    [0m | [36mdata_loader[0m:[36mload_sample_data[0m:[36m333[0m - [1mCreated sample data with 10 rows[0m
[32m2025-11-27 15:22:33[0m | [1mINFO    [0m | [36mdata_preprocessing[0m:[36m__init__[0m:[36m46[0m - [1mDataPreprocessor initialized[0m
[32m2025-11-27 15:22:33[0m | [1mINFO    [0m | [36mdata_preprocessing[0m:[36mclean_data[0m:[36m59[0m - [1mStarting data cleaning. Initial shape: (10, 11)[0m
[32m2025-11-27 15:22:33[0m | [1mINFO    [0m | [36mdata_preprocessing[0m:[36m_remove_outliers[0m:[36m236[0m - [1mRemoved 2 outlier rows (20.00%)[0m
[32m2025-11-27 15:22:33[0m | [1mINFO    [0m | [36mdata_pre

Training set: (6, 16)
Test set: (2, 16)


In [3]:
# Cell 3: Train All Models
trainer = ModelTrainer()

# Fit every model the trainer is configured with; hyperparameter tuning is
# switched off for this run.
models = trainer.train_all_models(
    X_train,
    y_train,
    tune_hyperparameters=False,
)

n_trained = len(models)
print(f"\nTrained {n_trained} models")

[32m2025-11-27 15:22:39[0m | [1mINFO    [0m | [36mtrain[0m:[36m__init__[0m:[36m57[0m - [1mModelTrainer initialized[0m
[32m2025-11-27 15:22:39[0m | [1mINFO    [0m | [36mtrain[0m:[36mtrain_all_models[0m:[36m76[0m - [1mStarting training for all configured models[0m
[32m2025-11-27 15:22:39[0m | [1mINFO    [0m | [36mtrain[0m:[36m_train_linear_regression[0m:[36m110[0m - [1mTraining Linear Regression model[0m
[32m2025-11-27 15:22:43[0m | [1mINFO    [0m | [36mtrain[0m:[36m_train_linear_regression[0m:[36m127[0m - [1mLinear Regression - CV Score: 95998.3015 (+/- 38983.9983)[0m
[32m2025-11-27 15:22:43[0m | [1mINFO    [0m | [36mhelpers[0m:[36mwrapper[0m:[36m219[0m - [1m_train_linear_regression executed in 4.0156 seconds[0m
[32m2025-11-27 15:22:43[0m | [1mINFO    [0m | [36mtrain[0m:[36m_train_random_forest[0m:[36m149[0m - [1mTraining Random Forest model[0m
[32m2025-11-27 15:22:46[0m | [1mINFO    [0m | [36mtrain[0m:[36m_t


Trained 3 models


In [4]:
# Cell 4: Training Summary
# Fetch the trainer's summary table and render it as plain text without the index.
summary = trainer.get_training_summary()
summary_text = summary.to_string(index=False)
print("\nTraining Summary (Cross-Validation RMSE):", summary_text, sep="\n")


Training Summary (Cross-Validation RMSE):
            model  mean_cv_rmse   std_cv_rmse   min_cv_rmse   max_cv_rmse
linear_regression  95998.301512  38983.998321  42406.314854 148305.180532
          xgboost 133667.417187  55669.194302  89733.187500 239981.218750
    random_forest 255725.888700 100902.839645 115900.000000 428500.000000


In [5]:
# Cell 5: Select Best Model
# Ask the trainer for its best model, passing the held-out split.
# NOTE(review): the RMSE logged for the winner matches the mean CV RMSE from the
# training summary, so the test split may not actually drive the selection -
# confirm in ModelTrainer.select_best_model.
selection = trainer.select_best_model(X_test, y_test)
best_name, best_model = selection
print(f"\nBest Model: {best_name}")

[32m2025-11-27 15:22:56[0m | [1mINFO    [0m | [36mtrain[0m:[36mselect_best_model[0m:[36m433[0m - [1mBest model: linear_regression with RMSE: 95998.3015[0m



Best Model: linear_regression


In [6]:
# Cell 6: Feature Importance
# Look up per-feature importance for the selected model and plot the top entries.
feature_names = preprocessor.get_feature_names()
importance_df = trainer.get_feature_importance(model_name=best_name, feature_names=feature_names)

if importance_df.empty:
    # Say so explicitly instead of leaving the cell with no output at all.
    print(f"No feature importance available for {best_name}; skipping plot.")
else:
    top_n = min(15, len(importance_df))
    # Take the first top_n rows once and reverse them: barh draws the first
    # entry at the bottom, so reversing puts the frame's first row at the top.
    # NOTE(review): assumes importance_df is already sorted by importance - confirm.
    top_features = importance_df.head(top_n).iloc[::-1]

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(top_features['feature'], top_features['importance'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Top {top_n} Feature Importance ({best_name})')
    fig.tight_layout()
    plt.show()



In [7]:
# Cell 7: Save Models
# Persist every trained model and the preprocessor under <project_root>/models.
save_path = project_root / 'models'
preprocessor_path = save_path / 'preprocessor.pkl'

trainer.save_models(save_all=True, save_path=save_path)
preprocessor.save_preprocessor(preprocessor_path)

print(f"\nModels saved to {save_path}")

[32m2025-11-27 15:23:03[0m | [1mINFO    [0m | [36mhelpers[0m:[36msave_pickle[0m:[36m115[0m - [1mObject saved to D:\practice-1\models\linear_regression_model.pkl[0m
[32m2025-11-27 15:23:03[0m | [1mINFO    [0m | [36mtrain[0m:[36msave_models[0m:[36m460[0m - [1mSaved linear_regression model to D:\practice-1\models\linear_regression_model.pkl[0m
[32m2025-11-27 15:23:04[0m | [1mINFO    [0m | [36mhelpers[0m:[36msave_pickle[0m:[36m115[0m - [1mObject saved to D:\practice-1\models\random_forest_model.pkl[0m
[32m2025-11-27 15:23:04[0m | [1mINFO    [0m | [36mtrain[0m:[36msave_models[0m:[36m460[0m - [1mSaved random_forest model to D:\practice-1\models\random_forest_model.pkl[0m
[32m2025-11-27 15:23:04[0m | [1mINFO    [0m | [36mhelpers[0m:[36msave_pickle[0m:[36m115[0m - [1mObject saved to D:\practice-1\models\xgboost_model.pkl[0m
[32m2025-11-27 15:23:04[0m | [1mINFO    [0m | [36mtrain[0m:[36msave_models[0m:[36m460[0m - [1mSaved x


Models saved to D:\practice-1\models
