In [2]:
#This notebook is for Support Vector Machines.

# Data cleaning and engineering

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierCapper(BaseEstimator, TransformerMixin):
    def __init__(self, lower_quantile=0.01, upper_quantile=0.99):
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.lower_bounds = None
        self.upper_bounds = None

    def fit(self, X, y=None):
        self.lower_bounds = np.quantile(X, self.lower_quantile, axis=0)
        self.upper_bounds = np.quantile(X, self.upper_quantile, axis=0)
        return self

    def transform(self, X):
        return np.clip(X, self.lower_bounds, self.upper_bounds)

class YearConverter(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.min_year = None
        self.max_year = None

    def fit(self, X, y=None):
        X_flat = X.ravel() if X.ndim > 1 else X
        self.min_year = np.min(X_flat)
        self.max_year = min(np.max(X_flat), pd.Timestamp.now().year)
        return self

    def transform(self, X):
        X_numeric = pd.to_numeric(X.ravel() if X.ndim > 1 else X, errors='coerce')
        X_clipped = np.clip(X_numeric, self.min_year, self.max_year)
        return X_clipped.reshape(X.shape)

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Create new features
        X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
        
        current_year = pd.Timestamp.now().year
        X['HouseAge'] = current_year - X['YearBuilt']
        X['TimeSinceRemodel'] = current_year - X['YearRemodAdd']
        
        X['TotalBathrooms'] = X['FullBath'] + (0.5 * X['HalfBath']) + X['BsmtFullBath'] + (0.5 * X['BsmtHalfBath'])
        X['IsNewHouse'] = (X['YearBuilt'] == X['YrSold']).astype(int)
        X['HasPool'] = (X['PoolArea'] > 0).astype(int)
        X['TotalPorchSF'] = X['OpenPorchSF'] + X['EnclosedPorch'] + X['3SsnPorch'] + X['ScreenPorch']
        X['OverallHouseCondition'] = X['OverallQual'] * X['OverallCond']
        
        # Create interaction features
        X['TotalSF_OverallQual'] = X['TotalSF'] * X['OverallQual']
        X['GrLivArea_TotRmsAbvGrd'] = X['GrLivArea'] * X['TotRmsAbvGrd']
        X['HouseAge_OverallQual'] = X['HouseAge'] * X['OverallQual']
        X['GarageArea_GarageCars'] = X['GarageArea'] * X['GarageCars']
        X['YearBuilt_YearRemodAdd'] = X['YearBuilt'] * X['YearRemodAdd']
        X['TotalSF_HouseAge'] = X['TotalSF'] * X['HouseAge']
        X['1stFlrSF_2ndFlrSF'] = X['1stFlrSF'] * X['2ndFlrSF']
        X['TotalSF_OverallCond'] = X['TotalSF'] * X['OverallCond']
        
        # Interaction with categorical variable (requires encoding)
        X['GrLivArea_Neighborhood'] = X['GrLivArea'] * pd.factorize(X['Neighborhood'])[0]
        
        return X

def pandas_to_numpy(X):
    return X.to_numpy() if isinstance(X, pd.DataFrame) else X

def preprocess_and_engineer(X):
    # Apply FeatureEngineer first
    feature_engineer = FeatureEngineer()
    X_engineered = feature_engineer.fit_transform(X.copy())
    
    # Identify numeric, categorical, and year columns
    numeric_features = X_engineered.select_dtypes(include=['int64', 'float64']).columns.drop(['YearBuilt', 'YearRemodAdd', 'YrSold'])
    categorical_features = X_engineered.select_dtypes(include=['object']).columns
    year_features = ['YearBuilt', 'YearRemodAdd', 'YrSold']
    
    print("Number of features before preprocessing:")
    print(f"Numeric: {len(numeric_features)}")
    print(f"Categorical: {len(categorical_features)}")
    print(f"Year: {len(year_features)}")
    
    # Create preprocessing steps
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('outlier_capper', OutlierCapper()),
        ('scaler', StandardScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore', max_categories=10)),
    ])

    year_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('converter', YearConverter()),
    ])

    # Create and fit the preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('year', year_transformer, year_features)
        ])
    
    X_preprocessed = preprocessor.fit_transform(X_engineered)
    
    # Generate feature names
    numeric_feature_names = list(numeric_features)
    categorical_feature_names = []
    onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    
    print("\nCategorical feature encoding details:")
    for i, feature in enumerate(categorical_features):
        categories = onehot_encoder.categories_[i]
        n_categories = min(len(categories), 10)  # Account for max_categories=10
        n_encoded = n_categories - 1  # Subtract 1 due to drop='first'
        print(f"{feature}: {n_categories} categories, {n_encoded} encoded features")
        categorical_feature_names.extend([f"{feature}_{cat}" for cat in categories[1:n_categories]])
    
    year_feature_names = list(year_features)
    
    feature_names = (numeric_feature_names + 
                     categorical_feature_names + 
                     year_feature_names)
    
    print("\nNumber of features after preprocessing:")
    print(f"Numeric: {len(numeric_feature_names)}")
    print(f"Categorical (one-hot encoded): {len(categorical_feature_names)}")
    print(f"Year: {len(year_feature_names)}")
    
    print(f"\nTotal number of features: {len(feature_names)}")
    print(f"Number of columns in preprocessed data: {X_preprocessed.shape[1]}")
    
    # Ensure the number of feature names matches the number of columns in X_preprocessed
    if len(feature_names) != X_preprocessed.shape[1]:
        print(f"\nWarning: Number of feature names ({len(feature_names)}) "
              f"does not match number of columns in preprocessed data ({X_preprocessed.shape[1]})")
        print("Adjusting feature names...")
        if len(feature_names) > X_preprocessed.shape[1]:
            feature_names = feature_names[:X_preprocessed.shape[1]]
        else:
            feature_names += [f'Unknown_{i}' for i in range(X_preprocessed.shape[1] - len(feature_names))]
    
    # Store feature names as an attribute of the DataFrame
    df = pd.DataFrame(X_preprocessed, columns=feature_names, index=X.index)
    df.attrs['feature_names'] = feature_names
    
    return df

# Load the data
df = pd.read_csv('/Users/ttanaka/Desktop/Website/house-prices-advanced-regression-techniques/train.csv')

# Separate features and target
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# Full pipeline
full_pipeline = Pipeline([
    ('preprocess_and_engineer', FunctionTransformer(preprocess_and_engineer, validate=False)),
    ('to_numpy', FunctionTransformer(pandas_to_numpy))
])

# Apply the pipeline
X_processed = full_pipeline.fit_transform(X)

# Validation
print("\nFinal validation:")
print("Shape after preprocessing:", X_processed.shape)
print("Missing values after preprocessing:", np.isnan(X_processed).sum())

# Access feature names from the intermediate DataFrame
intermediate_df = full_pipeline.named_steps['preprocess_and_engineer'].transform(X)
feature_names = intermediate_df.attrs.get('feature_names', [])
print("Number of features:", len(feature_names))
print("First 10 feature names:", feature_names[:10])
print("Last 10 feature names:", feature_names[-10:])

Number of features before preprocessing:
Numeric: 51
Categorical: 43
Year: 3

Categorical feature encoding details:
MSZoning: 5 categories, 4 encoded features
Street: 2 categories, 1 encoded features
Alley: 3 categories, 2 encoded features
LotShape: 4 categories, 3 encoded features
LandContour: 4 categories, 3 encoded features
Utilities: 2 categories, 1 encoded features
LotConfig: 5 categories, 4 encoded features
LandSlope: 3 categories, 2 encoded features
Neighborhood: 10 categories, 9 encoded features
Condition1: 9 categories, 8 encoded features
Condition2: 8 categories, 7 encoded features
BldgType: 5 categories, 4 encoded features
HouseStyle: 8 categories, 7 encoded features
RoofStyle: 6 categories, 5 encoded features
RoofMatl: 8 categories, 7 encoded features
Exterior1st: 10 categories, 9 encoded features
Exterior2nd: 10 categories, 9 encoded features
MasVnrType: 4 categories, 3 encoded features
ExterQual: 4 categories, 3 encoded features
ExterCond: 5 categories, 4 encoded features

In [4]:
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score


# Split the processed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Initialize the model
svm_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Fit the model
svm_model.fit(X_train, y_train)

# Define custom scoring metrics
scoring = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'MAPE': make_scorer(lambda y, y_pred: np.mean(np.abs((y - y_pred) / y)) * 100, greater_is_better=False),
    'MedAE': 'neg_median_absolute_error',
    'R2': 'r2',
    'RMSE': make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)), greater_is_better=False)
}

# Perform cross-validation
cv_results = cross_validate(svm_model, X_train, y_train, cv=5, scoring=scoring)

# Print results
for metric, scores in cv_results.items():
    if metric.startswith('test_'):
        metric_name = metric[5:]
        mean_score = scores.mean()
        std_score = scores.std()
        print(f"{metric_name}: {-mean_score:.4f} (+/- {std_score * 2:.4f})")

# Calculate RMSE from MSE for comparison
mse = -cv_results['test_MSE'].mean()
rmse = np.sqrt(mse)
print(f"RMSE (calculated from MSE): {rmse:.4f}")

MAE: 54617.0820 (+/- 4973.2525)
MSE: 6245487573.4001 (+/- 1705847121.9797)
MAPE: 31.1824 (+/- 6.3484)
MedAE: 37949.0237 (+/- 5500.0619)
R2: 0.0526 (+/- 0.0669)
RMSE: 78846.6130 (+/- 10714.3243)
RMSE (calculated from MSE): 79028.3973


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform

param_distributions = {
    'C': loguniform(1e-3, 1e3),
    'gamma': loguniform(1e-4, 1e0),
    'epsilon': uniform(0, 1)
}

random_search = RandomizedSearchCV(
    SVR(kernel='rbf'), 
    param_distributions=param_distributions,
    n_iter=100, 
    cv=5, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)

In [8]:
best_params = random_search.best_params_
print("Best parameters found:")
for param, value in best_params.items():
    print(f"{param}: {value}")

Best parameters found:
C: 834.2988013047346
epsilon: 0.7722447692966574
gamma: 0.0006235377135673155


In [10]:
best_svm = random_search.best_estimator_

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

scoring = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'MAPE': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'MedAE': 'neg_median_absolute_error',
    'R2': 'r2'
}

cv_results = cross_validate(best_svm, X_train, y_train, cv=5, scoring=scoring)

# Print results
print("\nCross-validation results for the best SVM model:")
for metric, scores in cv_results.items():
    if metric.startswith('test_'):
        metric_name = metric[5:]
        mean_score = scores.mean()
        std_score = scores.std()
        if metric_name in ['MAE', 'MSE', 'MAPE', 'MedAE']:
            print(f"{metric_name}: {-mean_score:.4f} (+/- {std_score * 2:.4f})")
        else:
            print(f"{metric_name}: {mean_score:.4f} (+/- {std_score * 2:.4f})")

# Calculate RMSE
mse = -cv_results['test_MSE'].mean()
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")


Cross-validation results for the best SVM model:
MAE: 40781.2855 (+/- 3696.0260)
MSE: 4107569522.1179 (+/- 1386421715.3452)
MAPE: 22.0351 (+/- 4.3208)
MedAE: 25938.2489 (+/- 4341.1270)
R2: 0.3111 (+/- 0.0269)
RMSE: 64090.3232


In [14]:
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import time

best_params = {
    'C': 834.2988013047346,
    'epsilon': 0.7722447692966574,
    'gamma': 0.0006235377135673155
}

kernels = ['linear', 'rbf', 'sigmoid']

for kernel in kernels:
    print(f"Starting evaluation for {kernel} kernel...")
    start_time = time.time()
    
    try:
        if kernel == 'linear':
            # Linear kernel doesn't use gamma
            svm_model = SVR(kernel=kernel, C=best_params['C'], epsilon=best_params['epsilon'])
        else:
            svm_model = SVR(kernel=kernel, C=best_params['C'], epsilon=best_params['epsilon'], gamma=best_params['gamma'])
        
        scores = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        mse = -scores.mean()
        rmse = np.sqrt(mse)
        
        print(f"Kernel: {kernel}")
        print(f"Mean MSE: {mse:.2f}")
        print(f"RMSE: {rmse:.2f}")
    except Exception as e:
        print(f"An error occurred with {kernel} kernel: {str(e)}")
    
    end_time = time.time()
    print(f"Time taken for {kernel} kernel: {end_time - start_time:.2f} seconds")
    print("---")

Starting evaluation for linear kernel...
Kernel: linear
Mean MSE: 799574664.23
RMSE: 28276.75
Time taken for linear kernel: 1.81 seconds
---
Starting evaluation for rbf kernel...
Kernel: rbf
Mean MSE: 4107569522.12
RMSE: 64090.32
Time taken for rbf kernel: 0.44 seconds
---
Starting evaluation for sigmoid kernel...
Kernel: sigmoid
Mean MSE: 6245490652.89
RMSE: 79028.42
Time taken for sigmoid kernel: 0.30 seconds
---


In [16]:
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
import pandas as pd

# Define the best parameters
best_params = {
    'C': 834.2988013047346,
    'epsilon': 0.7722447692966574,
    'gamma': 0.0006235377135673155
}

# Create the SVM model with linear kernel
svm_model = SVR(kernel='linear', C=best_params['C'], epsilon=best_params['epsilon'])

# Define custom MAPE scorer
def custom_mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(custom_mape, greater_is_better=False)

# Define the scoring metrics
scoring = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'MAPE': mape_scorer,
    'MedAE': 'neg_median_absolute_error',
    'R2': 'r2'
}

# Perform cross-validation
cv_results = cross_validate(svm_model, X_train, y_train, cv=5, scoring=scoring)

# Function to calculate mean and standard error
def mean_and_se(scores):
    mean = np.mean(scores)
    se = np.std(scores) / np.sqrt(len(scores))
    return f"{-mean:.4f} (±{se*2:.4f})"

# Calculate metrics
metrics = {
    'MAE': mean_and_se(cv_results['test_MAE']),
    'MSE': mean_and_se(cv_results['test_MSE']),
    'RMSE': mean_and_se(np.sqrt(-cv_results['test_MSE'])),
    'MAPE': mean_and_se(cv_results['test_MAPE']),
    'MedAE': mean_and_se(cv_results['test_MedAE']),
    'R2': mean_and_se(-cv_results['test_R2'])  # R2 is already positive
}

# Create a DataFrame for easy viewing
results_df = pd.DataFrame.from_dict(metrics, orient='index', columns=['Value (±SE)'])

print("SVM Linear Kernel Metrics (with Standard Errors):")
print(results_df)

# If you want to access individual values
for metric, value in metrics.items():
    print(f"{metric}: {value}")

SVM Linear Kernel Metrics (with Standard Errors):
                            Value (±SE)
MAE             15927.6613 (±1229.8022)
MSE    799574664.2282 (±295950843.2339)
RMSE           -27667.5758 (±5221.4874)
MAPE                   9.1859 (±0.8514)
MedAE             9916.5237 (±756.9828)
R2                     0.8691 (±0.0365)
MAE: 15927.6613 (±1229.8022)
MSE: 799574664.2282 (±295950843.2339)
RMSE: -27667.5758 (±5221.4874)
MAPE: 9.1859 (±0.8514)
MedAE: 9916.5237 (±756.9828)
R2: 0.8691 (±0.0365)
