# House Prices Regression Techniques

**Goal:** Predict house sale prices using 79 explanatory variables

**Evaluation Metric:** Root-mean-squared error (RMSE) between the logarithm of the predicted price and the logarithm of the observed sale price

## 1. Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import skew

# Settings
# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option('display.max_columns', None)
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
import warnings
# Suppresses ALL warnings from this point on, so deprecation notices raised
# by the libraries used below will not be shown either.
warnings.filterwarnings('ignore')

# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print("✓ Libraries loaded")

## 2. Load Data

In [None]:
# Load datasets
# Both paths are relative to the current working directory.
train_df = pd.read_csv('Data/train.csv')
test_df = pd.read_csv('Data/test.csv')

# Shapes are (rows, columns); a quick check that both files were read.
print(f"Train: {train_df.shape}")
print(f"Test: {test_df.shape}")

# Display first few rows
# (a bare expression: it is only rendered as the last statement of a notebook cell)
train_df.head()

## 3. Exploratory Data Analysis

In [None]:
# Target variable distribution: raw sale prices next to their log1p transform
hist_style = dict(bins=50, edgecolor='black', alpha=0.7)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
raw_ax, log_ax = axes

# Left panel: prices on their original scale.
raw_ax.hist(train_df['SalePrice'], **hist_style)
raw_ax.set_title('Sale Price Distribution')
raw_ax.set_xlabel('Sale Price')

# Right panel: the same prices after log1p, i.e. log(1 + price).
log_ax.hist(np.log1p(train_df['SalePrice']), color='orange', **hist_style)
log_ax.set_title('Log-Transformed Sale Price')
log_ax.set_xlabel('Log(Sale Price)')

plt.tight_layout()
plt.show()

# Skewness of the untransformed target.
print(f"Skewness: {train_df['SalePrice'].skew():.2f}")

In [None]:
# Missing values analysis
def check_missing(df, name='Dataset'):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a one-line count of the affected columns and returns the details.

    Args:
        df: DataFrame to inspect.
        name: Label used in the printed summary line.

    Returns:
        DataFrame indexed by column name with ``Missing`` (number of null
        entries) and ``Percent`` (that number as a percentage of the rows),
        sorted by ``Missing`` in descending order. Columns without any null
        entry are left out.
    """
    null_counts = df.isnull().sum()
    # Keep only the affected columns, most incomplete first.
    null_counts = null_counts[null_counts > 0].sort_values(ascending=False)

    result = pd.DataFrame({
        'Missing': null_counts,
        'Percent': (null_counts / len(df)) * 100,
    })
    print(f"\n{name}: {len(result)} features with missing values")
    return result

# Summarise missing values in the training data, then preview the first ten
# rows of the summary (bare expression: rendered as notebook cell output).
missing_train = check_missing(train_df, 'Training Data')
missing_train.head(10)

## 4. Data Preprocessing

In [None]:
# Keep the target and the test IDs aside before they are dropped below
y_train = train_df['SalePrice'].copy()
test_ids = test_df['Id'].copy()

# Neither the row identifier nor the target may remain among the features
train_df = train_df.drop(columns=['Id', 'SalePrice'])
test_df = test_df.drop(columns=['Id'])

# Stack train on top of test so both receive identical preprocessing;
# n_train marks the row where the test part starts.
n_train = train_df.shape[0]
all_data = pd.concat((train_df, test_df), ignore_index=True)

print(f"Combined data: {all_data.shape}")

In [None]:
# Handle missing values
# All fill statistics below are computed on the combined train + test frame.

# 1. Features where NA means "None"
none_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 'MasVnrType']

# 2. Numerical features to fill with 0
zero_features = ['GarageYrBlt', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea', 'MasVnrArea']

# Apply both constant fills; columns absent from the frame are skipped.
for fill_value, feature_group in (('None', none_features), (0, zero_features)):
    for col in feature_group:
        if col in all_data.columns:
            all_data[col] = all_data[col].fillna(fill_value)

# 3. LotFrontage: fill with median by neighborhood
if 'LotFrontage' in all_data.columns:
    frontage_by_neighborhood = all_data.groupby('Neighborhood')['LotFrontage']
    all_data['LotFrontage'] = frontage_by_neighborhood.transform(
        lambda frontage: frontage.fillna(frontage.median()))

# 4. Other categorical: fill with mode
for col in all_data.select_dtypes(include=['object']).columns:
    if all_data[col].isnull().any():
        most_common = all_data[col].mode()[0]
        all_data[col] = all_data[col].fillna(most_common)

# 5. Remaining numerical: fill with median
for col in all_data.select_dtypes(include=['int64', 'float64']).columns:
    if all_data[col].isnull().any():
        middle_value = all_data[col].median()
        all_data[col] = all_data[col].fillna(middle_value)

print(f"Remaining missing values: {all_data.isnull().sum().sum()}")

In [None]:
# Feature engineering
# Aggregate area / bathroom counts across floors and basement.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
# Half baths count as 0.5 of a full bath.
all_data['TotalBath'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                         all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])
all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
                            all_data['EnclosedPorch'] + all_data['ScreenPorch'] +
                            all_data['WoodDeckSF'])

# Ages relative to the sale year. These become negative whenever the recorded
# build / remodel year is later than the sale year.
all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['YearsSinceRemod'] = all_data['YrSold'] - all_data['YearRemodAdd']

# 0/1 indicators: 1 when the source quantity is strictly positive.
# Insertion order is kept so the new columns are appended in the same order.
indicator_sources = {
    'HasPool': 'PoolArea',
    'Has2ndFloor': '2ndFlrSF',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
}
for flag_name, source_col in indicator_sources.items():
    all_data[flag_name] = (all_data[source_col] > 0).astype(int)

# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print(f"✓ Features engineered: {all_data.shape}")

In [None]:
# Handle skewed features
# Skewness is measured per non-object column, ignoring NaNs; columns whose
# absolute skew exceeds 0.75 are log1p-transformed in place.
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[abs(skewed_feats) > 0.75]

# np.log1p is only finite for inputs > -1 (it yields -inf at -1 and NaN
# below), so a column containing such values is left untouched instead of
# being corrupted. Differences of years, for example, can be negative.
log_safe = all_data[skewed_feats.index].min() > -1
unsafe_feats = log_safe[~log_safe].index.tolist()
skewed_feats = skewed_feats[log_safe]

print(f"Skewed features: {len(skewed_feats)}")
if unsafe_feats:
    print(f"Skipped (values <= -1, log1p undefined): {unsafe_feats}")

for feat in skewed_feats.index:
    all_data[feat] = np.log1p(all_data[feat])

# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print("✓ Skewness corrected")

In [None]:
# One-hot encoding
# Encoding the combined frame gives the train and test parts the same dummy
# columns; drop_first removes the first level of each categorical feature.
all_data = pd.get_dummies(all_data, drop_first=True)

# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print(f"✓ Encoded features: {all_data.shape}")

In [None]:
# Split back to train and test
# The combined frame holds the training rows first, so the split is purely
# positional: rows [0, n_train) are train, the remainder is test.
X_train = all_data.iloc[:n_train].copy()
X_test = all_data.iloc[n_train:].copy()
# The models are fitted on log1p(price); predictions have to be mapped back
# with expm1.
y_train_log = np.log1p(y_train)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print("\n✓ Preprocessing complete")

## 5. Model Training

In [None]:
# Helper functions
def rmse_cv(model, X, y, cv=5):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        cv: Passed through as the ``cv`` argument of ``cross_val_score``.

    Returns:
        Array with the root-mean-squared error of each fold.
    """
    neg_mse_per_fold = cross_val_score(model, X, y,
                                       scoring="neg_mean_squared_error", cv=cv)
    # The scorer reports the MSE negated; flip the sign before the root.
    return np.sqrt(-neg_mse_per_fold)

def train_and_evaluate(model, X, y, name):
    """Cross-validate ``model`` and print a one-line RMSE summary.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Target values.
        name: Label shown at the start of the printed line.

    Returns:
        Tuple ``(mean, std)`` of the per-fold RMSE values.
    """
    scores = rmse_cv(model, X, y)
    summary = (scores.mean(), scores.std())
    print(f"{name:20s} | CV RMSE: {scores.mean():.6f} (+/- {scores.std():.6f})")
    return summary

# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print("✓ Helper functions defined")

In [None]:
# Train and compare models
print("=" * 70)
print("MODEL COMPARISON")
print("=" * 70)

# Candidate estimators, keyed by the label used in the printed report.
models = {
    'Ridge': Ridge(alpha=10.0, random_state=42),
    'Lasso': Lasso(alpha=0.0005, random_state=42, max_iter=10000),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=15,
                                           min_samples_split=5, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                                   max_depth=4, min_samples_split=5,
                                                   random_state=42)
}

# results maps model label -> (mean CV RMSE, std of CV RMSE), which is exactly
# the tuple train_and_evaluate returns.
results = {}
for name, model in models.items():
    results[name] = train_and_evaluate(model, X_train, y_train_log, name)

# Find best model: the lowest mean CV RMSE wins (first one on a tie).
best_model_name = min(results, key=lambda x: results[x][0])
# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print(f"\n✓ Best Model: {best_model_name}")

## 6. Final Model & Predictions

In [None]:
# Train final model on full training data
final_model = models[best_model_name]
final_model.fit(X_train, y_train_log)

# Make predictions
# The model was fitted on log1p(SalePrice), so expm1 maps its output back to
# the price scale.
y_pred_log = final_model.predict(X_test)
y_pred = np.expm1(y_pred_log)

# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print(f"✓ Final model trained: {best_model_name}")
print("\nPrediction Statistics:")
print(f"  Mean: ${y_pred.mean():,.2f}")
print(f"  Median: ${np.median(y_pred):,.2f}")
print(f"  Range: ${y_pred.min():,.2f} - ${y_pred.max():,.2f}")

In [None]:
# Create submission file
# One row per test house: its original Id next to the predicted sale price.
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred,
})

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)
# The check mark was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
print("✓ Submission file created: submission.csv")
submission.head(10)

## 7. Results Summary

In [None]:
# Visualize model comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Styling shared by both bar charts.
bar_style = dict(alpha=0.8, edgecolor='black', linewidth=2)

# Model scores
model_names = list(results)
mean_scores = [results[m][0] for m in model_names]
std_scores = [results[m][1] for m in model_names]

# The best model gets the stronger colour; error bars show the CV std.
colors = ['#4ECDC4' if m == best_model_name else '#95E1D3' for m in model_names]
axes[0].barh(model_names, mean_scores, xerr=std_scores, color=colors, **bar_style)
axes[0].set_xlabel('CV RMSE')
axes[0].set_title('Model Performance Comparison', fontweight='bold')
axes[0].invert_yaxis()  # first model at the top
axes[0].grid(axis='x', alpha=0.3)

# Feature importance (if available)
if hasattr(final_model, 'feature_importances_'):
    importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': final_model.feature_importances_
    }).sort_values('importance', ascending=False).head(15)

    bar_positions = range(len(importance))
    axes[1].barh(bar_positions, importance['importance'],
                 color='#FF6B6B', **bar_style)
    axes[1].set_yticks(bar_positions)
    axes[1].set_yticklabels(importance['feature'], fontsize=9)
    axes[1].set_xlabel('Importance')
    axes[1].set_title('Top 15 Feature Importances', fontweight='bold')
    axes[1].invert_yaxis()  # most important feature at the top
    axes[1].grid(axis='x', alpha=0.3)
else:
    # The selected model exposes no feature_importances_ attribute: show a
    # placeholder message instead of an empty chart.
    axes[1].text(0.5, 0.5, 'Feature importance\nnot available for this model',
                 ha='center', va='center', fontsize=12)
    axes[1].axis('off')

# The house emoji was stored mis-encoded (UTF-8 bytes decoded as Mac Roman).
plt.suptitle('🏠 House Prices Prediction - Results', fontsize=15, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Final summary
# The check marks and emoji below were stored mis-encoded (UTF-8 bytes decoded
# as Mac Roman) and are restored to the intended characters.
print("=" * 70)
print("PROJECT COMPLETE")
print("=" * 70)
print(f"\n✓ Training samples: {len(X_train)}")
print(f"✓ Test predictions: {len(y_pred)}")
print(f"✓ Features: {X_train.shape[1]}")
print(f"✓ Best model: {best_model_name}")
# Mean cross-validated RMSE of the selected model (computed on log1p prices).
print(f"✓ CV RMSE: {results[best_model_name][0]:.6f}")
print("\n📁 Submission file: submission.csv")
print("\n🎯 Next steps:")
print("   1. Upload submission.csv to Kaggle")
print("   2. Review leaderboard score")
print("   3. Iterate with hyperparameter tuning")
print("=" * 70)