# Vietnam Housing Price Prediction - Model Training

This notebook demonstrates model training and evaluation for housing price prediction.

## Steps:
1. Load processed data
2. Feature engineering
3. Train multiple models
4. Model comparison
5. Hyperparameter tuning (optional, skipped by default)
6. Final evaluation
7. Save best model

## 1. Import Libraries

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Make the project's src/ directory importable.
# The location is resolved to an absolute path so the entry stays valid even if
# the working directory changes later, and it is added only once so re-running
# this cell does not keep growing sys.path.
SRC_DIR = str(Path('../src').resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from model import HousingPriceModel
from preprocessing import HousingDataPreprocessor
import utils

# Set visualization style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'; older
# releases only ship the un-versioned name. Prefer the versioned name and fall
# back to the legacy one only when that is what the installed version provides.
PLOT_STYLE = 'seaborn-v0_8-darkgrid'
if PLOT_STYLE not in plt.style.available and 'seaborn-darkgrid' in plt.style.available:
    PLOT_STYLE = 'seaborn-darkgrid'
plt.style.use(PLOT_STYLE)
sns.set_palette('husl')

print("✓ Libraries imported successfully")

## 2. Load and Prepare Data

In [None]:
# Location of the preprocessed dataset, relative to the working directory
data_path = '../data/processed_housing_data.csv'

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # No processed file on disk: build a seeded synthetic frame with the same
    # column names so the remaining cells can still be executed end to end.
    print("⚠️ Processed data not found. Creating sample data...")

    np.random.seed(42)
    n_samples = 1000

    # The columns are drawn one after another from the global NumPy RNG, so the
    # order of the statements below determines the generated values.
    sample_data = {}
    sample_data['Quận'] = np.random.choice(
        ['Ba Đình', 'Hoàn Kiếm', 'Đống Đa', 'Hai Bà Trưng', 'Cầu Giấy', 'Thanh Xuân'],
        n_samples,
    )
    sample_data['Huyện'] = np.random.choice(['Ba Đình', 'Hoàn Kiếm', 'Đống Đa'], n_samples)
    sample_data['Giá'] = np.random.uniform(2e9, 15e9, n_samples)
    sample_data['Diện tích'] = np.random.uniform(40, 150, n_samples)
    sample_data['Giá/m²'] = np.random.uniform(30e6, 120e6, n_samples)
    sample_data['Số tầng'] = np.random.randint(1, 5, n_samples)        # upper bound is exclusive: 1..4
    sample_data['Số phòng ngủ'] = np.random.randint(2, 5, n_samples)   # upper bound is exclusive: 2..4
    sample_data['Dài'] = np.random.uniform(5, 15, n_samples)
    sample_data['Rộng'] = np.random.uniform(4, 12, n_samples)
    sample_data['Loại hình nhà ở'] = np.random.choice(
        ['Nhà riêng', 'Nhà mặt phố', 'Biệt thự'], n_samples
    )
    sample_data['Giấy tờ pháp lý'] = np.random.choice(
        ['Sổ đỏ/ Sổ hồng', 'Hợp đồng mua bán'], n_samples
    )

    df = pd.DataFrame(sample_data)
    print(f"✓ Sample data created: {df.shape}")
else:
    print(f"✓ Processed data loaded: {df.shape}")

# Column names, dtypes and non-null counts of whichever frame was obtained
print("\nDataset Info:")
df.info()

## 3. Feature Engineering

In [None]:
# Give the preprocessor its own copy of the loaded frame so that whatever it
# does to its `df` attribute cannot alter the notebook-level `df`.
preprocessor = HousingDataPreprocessor()
preprocessor.df = df.copy()

# Encode categorical features.
# NOTE(review): presumably operates on `preprocessor.df` and returns the encoded
# frame — confirm against src/preprocessing.py.
df_encoded = preprocessor.encode_categorical()

# Preview the first rows of the encoded frame
print("\nEncoded Dataset:", df_encoded.head(), sep="\n")

## 4. Prepare Training and Test Sets

In [None]:
# Trainer object used by every following cell; seeded for repeatable runs
model_trainer = HousingPriceModel(random_state=42)

# Split the encoded frame into train/test parts with 'Giá' as the target and
# 20% of the rows requested for testing.
# NOTE(review): the frame may still contain 'Giá/m²'; if prepare_data() does not
# drop it, that column together with 'Diện tích' could leak the target — verify
# in src/model.py.
X_train, X_test, y_train, y_test = model_trainer.prepare_data(
    df_encoded, target_col='Giá', test_size=0.2
)

# Report the size of each split and the number of feature columns
print(
    "\nData Split Summary:",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"Features: {X_train.shape[1]}",
    sep="\n",
)

## 5. Initialize Models

In [None]:
# Ask the trainer for its model collection; the result is used below as a
# mapping of display name -> model object.
models = model_trainer.initialize_models()

# List each registered model next to the class that implements it
print("\nInitialized Models:")
for model_name, estimator in models.items():
    print("  - {}: {}".format(model_name, type(estimator).__name__))

## 6. Train All Models

In [None]:
# Train and evaluate all models.
# `results` is consumed by the comparison cell below, which treats it as a
# mapping of model name -> metrics mapping (it is turned into a DataFrame,
# sorted by an 'R2' entry, and iterated with .items()).
# NOTE(review): presumably fits on the training split and scores on the test
# split produced by prepare_data() — confirm in src/model.py.
results = model_trainer.train_all_models(evaluate=True)

## 7. Model Comparison

In [None]:
# One row per model, one column per metric, best R2 first
results_df = pd.DataFrame(results).T.sort_values('R2', ascending=False)

print("\nModel Comparison:", results_df, sep="\n")

# Per-model breakdown, printed in the order the models appear in `results`
# (not in the sorted order of `results_df`)
banner = "=" * 80
print(f"\n{banner}\nDETAILED RESULTS\n{banner}")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    utils.print_metrics(metrics, title="")

In [None]:
# Visualize results.
# Delegates all drawing to the trainer's own plotting helper.
# NOTE(review): figsize is presumably forwarded to matplotlib as
# (width, height) in inches — confirm in src/model.py.
model_trainer.plot_results(figsize=(14, 10))

## 8. Cross-Validation

In [None]:
# 5-fold cross-validation for a fixed shortlist of models.
# Names missing from the trainer's model collection are silently skipped.
print("Performing Cross-Validation...\n")

cv_results = {}
for model_name in ('Random Forest', 'XGBoost', 'LightGBM'):
    if model_name not in model_trainer.models:
        continue
    print(f"\n{model_name}:")
    cv_results[model_name] = model_trainer.cross_validate_model(model_name, cv=5)

# Summary: mean score and standard deviation for every model that was validated
rule = "=" * 60
print("\n" + rule, "CROSS-VALIDATION SUMMARY", rule, sep="\n")
for model_name, result in cv_results.items():
    mean_score, std_score = result['mean_score'], result['std_score']
    print(f"{model_name}: {mean_score:.2f} (+/- {std_score:.2f})")

## 9. Feature Importance

In [None]:
# Name of the model the trainer currently records as best; later cells reuse
# `best_model_name` in plot titles and log messages.
best_model_name = model_trainer.best_model_name
print("Best Model: {}\n".format(best_model_name))

# Tabular view of the 15 highest-ranked features
importance_df = model_trainer.get_feature_importance(top_n=15)
print("Top 15 Most Important Features:", importance_df, sep="\n")

# Same ranking as a chart
model_trainer.plot_feature_importance(top_n=15)

## 10. Hyperparameter Tuning (Optional)

In [None]:
# Example: tune Random Forest.
# The search is slow, so it is switched off by default. Set the flag below to
# True to run it; no code needs to be edited or uncommented.
RUN_HYPERPARAMETER_TUNING = False

# Candidate values per hyper-parameter (2 * 3 * 2 * 2 = 24 combinations)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [15, 20, 25],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

if RUN_HYPERPARAMETER_TUNING:
    # NOTE(review): presumably runs a cross-validated search (cv=3) over
    # param_grid_rf and returns the tuned estimator — confirm in src/model.py.
    best_rf = model_trainer.hyperparameter_tuning(
        'Random Forest',
        param_grid_rf,
        cv=3
    )
else:
    print("Hyperparameter tuning skipped (set RUN_HYPERPARAMETER_TUNING = True to run)")

## 11. Prediction Visualization

In [None]:
# Predict on the held-out test features
y_pred = model_trainer.predict(X_test)

# Two panels side by side; every price is divided by 1e9 so the axes read in
# the "tỷ VNĐ" unit used in the labels.
fig, (ax_scatter, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: actual vs predicted, with the y = x reference line spanning the
# range of the actual prices
price_min = y_test.min() / 1e9
price_max = y_test.max() / 1e9
ax_scatter.scatter(y_test / 1e9, y_pred / 1e9, alpha=0.5)
ax_scatter.plot([price_min, price_max], [price_min, price_max], 'r--', lw=2)
ax_scatter.set_xlabel('Actual Price (tỷ VNĐ)')
ax_scatter.set_ylabel('Predicted Price (tỷ VNĐ)')
ax_scatter.set_title(f'Actual vs Predicted - {best_model_name}')
ax_scatter.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction,
# with a zero line for reference
residuals = (y_test - y_pred) / 1e9
ax_resid.scatter(y_pred / 1e9, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted Price (tỷ VNĐ)')
ax_resid.set_ylabel('Residuals (tỷ VNĐ)')
ax_resid.set_title('Residual Plot')
ax_resid.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 12. Make Sample Predictions

In [None]:
# Describe one example property and let the helper turn it into an input record
sample_spec = dict(
    district='Ba Đình',
    property_type='Nhà riêng',
    area=100.0,
    floors=3,
    bedrooms=3,
    length=10.0,
    width=10.0,
)
sample_input = utils.create_sample_input(**sample_spec)

# Echo the record field by field
print("Sample Input:")
for field, field_value in sample_input.items():
    print(f"  {field}: {field_value}")

# The record is only displayed here: it is not encoded, so it is not passed to
# the model in this notebook.
print(
    "\n⚠️ To make actual predictions, the input needs to be encoded using the same",
    "encoders used during training. See the Streamlit app for full implementation.",
    sep="\n",
)

## 13. Save Best Model

In [None]:
# Save the best model
model_path = '../models/best_housing_model.pkl'

# Create the target directory first: empty directories are not tracked by git,
# so '../models' may be missing on a fresh checkout. The call is a no-op when
# the directory already exists.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

model_trainer.save_model(filepath=model_path)

print(f"\n✓ Best model ({best_model_name}) saved successfully!")
print(f"Model path: {model_path}")

## 14. Test Loading the Saved Model

In [None]:
# Round-trip check: load the saved file into a fresh trainer and predict with it.
# NOTE(review): the .pkl extension suggests pickle/joblib serialization — only
# load model files you produced yourself.
test_trainer = HousingPriceModel()
loaded_model = test_trainer.load_model(model_path)

# Predict for the first five test rows only
test_pred = test_trainer.predict(X_test[:5])

# Compare each prediction with the true price, in tỷ VNĐ (value / 1e9)
print("\nTest Predictions (first 5):")
for sample_no, (actual, predicted) in enumerate(zip(y_test[:5], test_pred), start=1):
    abs_error = abs(actual - predicted)
    print(
        f"Sample {sample_no}:",
        f"  Actual:    {actual/1e9:.2f} tỷ VNĐ",
        f"  Predicted: {predicted/1e9:.2f} tỷ VNĐ",
        f"  Error:     {abs_error/1e9:.2f} tỷ VNĐ\n",
        sep="\n",
    )

## Summary

In this notebook, we:
1. ✓ Loaded processed data
2. ✓ Performed feature engineering
3. ✓ Trained 4 different ML models
4. ✓ Compared model performance
5. ✓ Performed cross-validation
6. ✓ Analyzed feature importance
7. ✓ Saved the best model

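**Best Model**: determined at run time as `model_trainer.best_model_name` (printed in Section 9, Feature Importance, and again when the model is saved in Section 13).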

**Next Step**: Use the Streamlit app (`streamlit run app/streamlit_app.py`) for interactive predictions!