## House Prices - Advanced Regression Techniques

****** HERE WILL BE ID *******

### TL;DR

### Part 1 Imports and Definitions


In [None]:
# import numpy, matplotlib, etc. 
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import sweetviz as sw
import os
from tqdm import tqdm


# sklearn imports
from sklearn import metrics
from sklearn import pipeline
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import neural_network
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split


# Apply seaborn's theme first, then override matplotlib's text and figure
# defaults in a single update.
sns.set_theme()
plt.rcParams.update(
    {
        "font.size": 20,
        "axes.labelsize": 20,
        "xtick.labelsize": 20,
        "ytick.labelsize": 20,
        "legend.fontsize": 20,
        "legend.markerscale": 1.5,
        "figure.figsize": (20, 10),
        "legend.title_fontsize": 20,
    }
)


- define the input and output folders


In [None]:
# Folder that holds the input CSV files.
input_folder = "input/"

# Build both CSV paths from the same folder in one pass.
train_data_path, test_data_path = (
    os.path.join(input_folder, file_name) for file_name in ("train.csv", "test.csv")
)

  - define the show graphs variable


In [None]:
# Switch for the plotting cells: cells that check this flag only draw their chart when it is True.
SHOW_GRAPHS = False

#### Load the training data
  - Load the csv data to variables


In [None]:
# Read the training CSV into a DataFrame
train_data = pd.read_csv(train_data_path)

# Read the test CSV into a DataFrame
test_data = pd.read_csv(test_data_path)

# display the first few rows of the training data
train_data.head()

### Part 2 Data Investigation EDA


#### Data Cleaning

- Remove the id column


In [None]:
# Remove the "Id" column from the training frame.
train_data = train_data.drop(columns="Id")

# Keep the test ids in test_id before removing the column from the test frame.
test_id = test_data["Id"]
test_data = test_data.drop(columns="Id")


train_data.head()

- Count the number of features

In [None]:
# shape[1] counts every column still in train_data; the SalePrice target
# (read from this frame later in the notebook) is one of them.
print(f"Number of features: {train_data.shape[1]}")

Get summary statistics for the training dataset (numerical columns only)


In [None]:
# Summary statistics from DataFrame.describe() with its default arguments
train_data.describe()


- Get the data types of the columns in the training dataset


In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render "None".
train_data.info()

We can see that most of the columns have the `object` dtype

- Check for missing values

In [None]:
def show_missing_data_with_percentage(data):
    """Print a per-column summary of the missing values in ``data``.

    Prints the row count, the 20 columns with the most missing values
    (count and percentage of rows) and the total number of missing cells.

    Parameters:
    - data: DataFrame to inspect.
    """
    total_rows = len(data)
    print("Missing values in the dataset:")
    print("-----------------------------------------")
    print("Total Rows: ", total_rows)
    print("_________________________________________")
    # Count the missing values in each column
    missing_values = data.isnull().sum()
    # Percentages are relative to the frame being inspected (not to the global
    # train_data), so the report is also correct for the test set.
    missing_percentage = (missing_values / total_rows) * 100
    missing_data = pd.concat([missing_values, missing_percentage], axis=1, keys=['Missing Values', 'Percentage'])
    missing_data = missing_data.sort_values(by='Missing Values', ascending=False)
    print(missing_data.head(20))

    print("\n\nTotal missing values: ", missing_data['Missing Values'].sum())
    
    


In [None]:
# Print the missing-value summary (counts and percentages) for the training dataset
show_missing_data_with_percentage(train_data)

We can see that the columns "Alley", "PoolQC", "Fence", "MiscFeature" have a lot of missing values.

- Handle the missing data<br><br>
The first step is to remove the features whose share of missing values is above a threshold

In [None]:
def drop_highly_missing_features(data, fetures_to_drop):
    """Return a copy of ``data`` without the columns named in ``fetures_to_drop``."""
    return data.drop(columns=fetures_to_drop)


def find_features_with_missing_values_threshold(data, threshold):
    """Return the columns of ``data`` whose share of missing values exceeds ``threshold``.

    Parameters:
    - data: DataFrame to inspect.
    - threshold: Percentage (0-100); a column is selected when its percentage
      of missing rows is strictly greater than this value.

    Returns:
    - Index of column names, ordered by number of missing values (descending).
    """
    missing_values = data.isnull().sum()
    # Use the row count of the frame being inspected, not of the global
    # train_data, so the percentages are right for any DataFrame passed in.
    missing_percentage = (missing_values / len(data)) * 100
    missing_data = pd.concat([missing_values, missing_percentage], axis=1, keys=['Missing Values', 'Percentage'])
    missing_data = missing_data.sort_values(by='Missing Values', ascending=False)
    features_to_drop = missing_data[missing_data['Percentage'] > threshold].index
    return features_to_drop


In [None]:
# Percentage of missing values above which a feature is removed
threshold = 80
# With threshold = 80 this is expected to return
# ["Alley", "PoolQC", "Fence", "MiscFeature"]
drop_features = find_features_with_missing_values_threshold(train_data, threshold) 


# Drop the selected columns from the training data
train_data = drop_highly_missing_features(train_data, drop_features)

# Drop the same columns (selected on the training data) from the test data
test_data = drop_highly_missing_features(test_data, drop_features)

print("Remove this features: ", drop_features)

In [None]:
# Print the missing-value summary for the training dataset again
show_missing_data_with_percentage(train_data)

Handling Missing Values for Numerical Features

In [None]:
# fill missing numerical values with median
def handle_missing_values_numerical(data):
    """Fill the missing values of every numeric column with that column's median.

    The columns are rewritten in ``data`` itself, and the same object is returned.

    Parameters:
    - data: DataFrame whose numeric columns should be filled.

    Returns:
    - The same DataFrame, with numeric gaps filled.
    """
    for column in data.select_dtypes(include=[np.number]).columns:
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the selected column: the chained in-place form is deprecated and
        # leaves the DataFrame unchanged when pandas Copy-on-Write is active.
        data[column] = data[column].fillna(data[column].median())
    return data

Handling Missing Values for Categorical Features

In [None]:
# Fill missing categorical values with most frequent value
def handle_missing_values_categorical(data):
    """Fill the missing values of every object-dtype column with its most frequent value.

    The columns are rewritten in ``data`` itself, and the same object is returned.
    A column in which every value is missing has no most frequent value and is
    left untouched.

    Parameters:
    - data: DataFrame whose object-dtype columns should be filled.

    Returns:
    - The same DataFrame, with categorical gaps filled where possible.
    """
    for column in data.select_dtypes(include=[object]).columns:
        modes = data[column].mode()
        if modes.empty:
            # All values are missing: mode() is empty, so indexing it would fail.
            continue
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the selected column: the chained in-place form is deprecated and
        # leaves the DataFrame unchanged when pandas Copy-on-Write is active.
        data[column] = data[column].fillna(modes.iloc[0])
    return data

One function to handle the missing values

In [None]:
def handle_missing_values(data):
    """Fill the numerical gaps first, then the categorical ones, and return the result."""
    return handle_missing_values_categorical(handle_missing_values_numerical(data))

In [None]:
# fill the missing values in the train data (numerical columns, then categorical columns)
train_data = handle_missing_values(train_data)

# fill the missing values in the test data the same way
test_data = handle_missing_values(test_data)

Verify No More Missing Values

In [None]:
# Total number of missing cells left in the training data
print("\nMissing values in the training dataset after filling:")
print(train_data.isnull().sum().sum())

# Total number of missing cells left in the test data
print("\nMissing values in the test dataset after filling:")
print(test_data.isnull().sum().sum())


In [None]:
# Show the first rows of the training data
train_data.head()

Convert Categorical Features to Numeric Using One-Hot Encoding

In [None]:
# List the dtype of every column in the training data
print("Data types in training data:")
print(train_data.dtypes)


In [None]:
# Columns with the object dtype are treated as categorical.
categorical_columns = train_data.select_dtypes(include=['object']).columns

# One-hot encode those columns in both frames.
train_data_encoded = pd.get_dummies(train_data, columns=categorical_columns)
test_data_encoded = pd.get_dummies(test_data, columns=categorical_columns)

# Take the target (SalePrice) out of the encoded training frame so that it
# is not part of the alignment.
sale_price = train_data_encoded.pop('SalePrice')

# Keep only the columns that both frames share.
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='inner', axis=1)

# Put the target back as the last column of the training frame.
train_data_encoded['SalePrice'] = sale_price


In [None]:
# Rebind the original variable names to the encoded DataFrames
train_data = train_data_encoded
test_data = test_data_encoded

# Display the first few rows of the training data after encoding
train_data.head()

#### Feature Analysis & Visualization 

Descriptive Statistics

In [None]:
# Summary statistics of the training data, printed as text
desc_stats = train_data.describe()
print("Descriptive Statistics:\n", desc_stats)


Histogram for SalePrice

In [None]:
# Histogram of SalePrice; drawn only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    fig = px.histogram(train_data, x='SalePrice', title='Distribution of SalePrice')
    fig.show()

We can see the distribution of SalePrice

- Box plot for SalePrice

In [None]:
# Box plot of SalePrice; drawn only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    fig = px.box(train_data, y='SalePrice', title='Boxplot of SalePrice')
    fig.show()

GrLivArea: Above grade (ground) living area square feet

In [None]:
# Scatter plot of GrLivArea against SalePrice; drawn only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    fig = px.scatter(train_data, x='GrLivArea', y='SalePrice', title='GrLivArea vs SalePrice')
    fig.show()


In [None]:
# Same scatter plot with a marginal histogram on each axis; drawn only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    fig = px.scatter(train_data, x='GrLivArea', y='SalePrice', marginal_x='histogram', marginal_y='histogram', title='Joint Plot of GrLivArea vs SalePrice')
    fig.show()

TotalBsmtSF: Total square feet of basement area

In [None]:
# Scatter plot of TotalBsmtSF against SalePrice; drawn only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    fig = px.scatter(train_data, x='TotalBsmtSF', y='SalePrice', title='TotalBsmtSF vs SalePrice')
    fig.show()

OverallQual: Rates the overall material and finish of the house

In [None]:

# Bar chart of the mean SalePrice per OverallQual value, with the same means
# overlaid as a line; drawn only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    overall_qual_mean = train_data.groupby('OverallQual')['SalePrice'].mean()
    fig = px.bar(overall_qual_mean, x=overall_qual_mean.index, y='SalePrice', title='OverallQual vs SalePrice')
    fig.add_trace(go.Scatter(x=overall_qual_mean.index, y=overall_qual_mean.values, mode='lines', name='lines'))
    fig.show()


We calculate the correlation of each feature with SalePrice and sort them to identify the strongest relationships.

In [None]:
# Absolute pairwise correlations between all columns of train_data.
correlation_matrix = train_data.corr().abs()
# Each column's absolute correlation with SalePrice, largest first.
correlation_with_target = correlation_matrix['SalePrice'].sort_values(ascending=False)


- Change N to the number of top features you want to analyze 

In [None]:
N = 10
# Position 0 of the ranking is skipped; the N entries after it are kept.
top_features = correlation_with_target.index[1:N + 1]
# Training data restricted to those columns plus the target.
filtered_data = train_data[[*top_features, 'SalePrice']]


Top correlation with SalePrice

In [None]:
# The first N + 1 entries of the ranking are printed (one more than N)
print(f"Top {N} features with the highest correlation with SalePrice:")
print(correlation_with_target.head(N + 1))

Descriptive Statistics with the N Top correlation

In [None]:
# Summary statistics restricted to the columns of filtered_data
desc_stats = filtered_data.describe()
print("Descriptive Statistics for Top Features:\n", desc_stats)


Pairplot Visualizing Correlation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_and_calculate_statistics(filtered_data):
    """
    Plot every column of filtered_data except 'SalePrice' against 'SalePrice' and print one statistic for it.

    A column with the object dtype or at most 20 distinct values gets a boxplot and the
    mean 'SalePrice' per value; any other column gets a scatterplot and its correlation
    with 'SalePrice'.

    Parameters:
    - filtered_data: DataFrame holding the columns to inspect plus a 'SalePrice' column.
    """
    for feature in filtered_data.columns:
        # The target itself is not plotted against itself
        if feature == 'SalePrice':
            continue

        values = filtered_data[feature]
        # Decide once whether the column is handled as categorical
        few_values = values.dtype == 'object' or len(values.unique()) <= 20

        # Draw the chart that matches the kind of column
        plt.figure(figsize=(10, 6))
        draw = sns.boxplot if few_values else sns.scatterplot
        draw(x=feature, y='SalePrice', data=filtered_data)
        plt.title(f'Sale Price by {feature}')
        plt.show()

        # Print the statistic that matches the kind of column
        if not few_values:
            correlation = filtered_data[[feature, 'SalePrice']].corr().iloc[0, 1]
            print(f"Correlation between {feature} and Sale Price: {correlation:.2f}")
            continue

        mean_prices = filtered_data.groupby(feature)['SalePrice'].mean().sort_values(ascending=False)
        print(f"Mean Sale Price for each {feature}:")
        print(mean_prices)


In [None]:
# Plot and print the per-feature statistics; runs only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    plot_and_calculate_statistics(filtered_data)

Heatmap for correlation matrix


In [None]:
import plotly.graph_objects as go

def show_top_correlated_features(train_data, n):
    """
    Show an annotated Plotly heatmap of the correlations among 'SalePrice' and the n columns
    ranked highest by absolute correlation with it; 'SalePrice' is listed last.

    Parameters:
    - train_data: DataFrame that contains a 'SalePrice' column.
    - n: How many of the highest-ranked columns to include.
    """
    # Rank every column by absolute correlation with the target; position 0 is skipped
    ranked = train_data.corr()['SalePrice'].abs().sort_values(ascending=False)
    features_to_display = [*ranked.iloc[1:n+1].index, 'SalePrice']

    # Correlations among the selected columns only
    corr_subset = train_data[features_to_display].corr()

    # One two-decimal label per heatmap cell
    cell_labels = [["{:.2f}".format(val) for val in row] for row in corr_subset.values]

    heatmap = go.Heatmap(
        z=corr_subset.values,
        x=corr_subset.columns,
        y=corr_subset.index,
        colorscale='Viridis',
        colorbar=dict(title='Correlation'),
        text=cell_labels,
        texttemplate="%{text}",
        hoverinfo="none",  # only the cell labels are shown, no hover box
    )
    fig = go.Figure(data=heatmap)

    def tick_settings():
        # A fresh dict per axis: one tick per displayed feature, labelled with its name
        return dict(tickmode="array", tickvals=list(range(len(features_to_display))), ticktext=features_to_display)

    fig.update_layout(
        title=f'Top {n} Features Correlated with SalePrice (Including SalePrice)',
        xaxis_title="Features",
        yaxis_title="Features",
        xaxis=tick_settings(),
        yaxis=tick_settings(),
    )

    fig.show()

In [None]:
# Correlation heatmap over filtered_data; runs only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    show_top_correlated_features(filtered_data, N)

#### Feature Engineering

Total Square Footage
- We create a new total square footage feature (stored in the column `TotalSqureF`) by summing up the total basement square footage, first floor square footage, second floor square footage, and garage area. This feature represents the total square footage of the house.

In [None]:
# Collects the names of the engineered columns as they are created
features_engineering_list = []

# create TotalSF feature
def create_TotalSF_feature(data, features_engineering_list=None):
    """Add a 'TotalSqureF' column: basement + first floor + second floor + garage area.

    The column is written into ``data`` itself, which is also returned. When a
    list is passed as ``features_engineering_list``, the new column name is
    appended to it.
    """
    new_column = 'TotalSqureF'
    area_columns = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GarageArea']

    # Add the area columns left to right, one at a time
    total = data[area_columns[0]]
    for area_column in area_columns[1:]:
        total = total + data[area_column]
    data[new_column] = total

    if features_engineering_list is None:
        return data
    features_engineering_list.append(new_column)
    return data

# Create the feature for the train data and record its column name in features_engineering_list
train_data = create_TotalSF_feature(train_data, features_engineering_list)

# Create the same feature for the test data (no list is passed, so nothing is recorded)
test_data = create_TotalSF_feature(test_data)

Age of the House
- We calculate the age of the house at the time of sale by subtracting the year the house was built from the year it was sold.

In [None]:
# create age_of_house feature
def create_age_of_house_feature(data, features_engineering_list=None):
    """Add an 'AgeOfHouse' column: 'YrSold' minus 'YearBuilt'.

    The column is written into ``data`` itself, which is also returned. When a
    list is passed as ``features_engineering_list``, the new column name is
    appended to it.
    """
    new_column = 'AgeOfHouse'
    year_sold = data['YrSold']
    year_built = data['YearBuilt']
    data[new_column] = year_sold - year_built

    if features_engineering_list is None:
        return data
    features_engineering_list.append(new_column)
    return data


# Create the AgeOfHouse feature for the train data and record its name in features_engineering_list
train_data = create_age_of_house_feature(train_data, features_engineering_list)

# Create the same feature for the test data (no list is passed, so nothing is recorded)
test_data = create_age_of_house_feature(test_data)

Age of the Renovation
- We calculate the age of the house since its most recent renovation by subtracting the year of the most recent renovation from the year it was sold.

In [None]:
# create age_of_renovation feature
def create_age_of_renovation_feature(data, features_engineering_list=None):
    """Add an 'AgeOfRenovation' column: 'YrSold' minus 'YearRemodAdd'.

    The column is written into ``data`` itself, which is also returned. When a
    list is passed as ``features_engineering_list``, the new column name is
    appended to it.
    """
    new_column = 'AgeOfRenovation'
    year_sold = data['YrSold']
    year_remodelled = data['YearRemodAdd']
    data[new_column] = year_sold - year_remodelled

    if features_engineering_list is None:
        return data
    features_engineering_list.append(new_column)
    return data

# Create the AgeOfRenovation feature for the train data and record its name in features_engineering_list
train_data = create_age_of_renovation_feature(train_data, features_engineering_list)

# Create the same feature for the test data (no list is passed, so nothing is recorded)
test_data = create_age_of_renovation_feature(test_data)

Total Bathrooms
- We create a new feature TotalBath by summing up the number of full and half bathrooms in the basement and above grade, with half bathrooms counted as 0.5.

In [None]:
# create TotalBath feature
def create_TotalBath_feature(data, features_engineering_list=None):
    """Add a 'TotalBath' column: full baths plus half baths weighted by 0.5.

    Uses 'FullBath', 'HalfBath', 'BsmtFullBath' and 'BsmtHalfBath'. The column
    is written into ``data`` itself, which is also returned. When a list is
    passed as ``features_engineering_list``, the new column name is appended to it.
    """
    new_column = 'TotalBath'
    # Same left-to-right order of additions as a single expression
    running_total = data['FullBath'] + 0.5 * data['HalfBath']
    running_total = running_total + data['BsmtFullBath']
    data[new_column] = running_total + 0.5 * data['BsmtHalfBath']

    if features_engineering_list is None:
        return data
    features_engineering_list.append(new_column)
    return data

# Create the TotalBath feature for the train data and record its name in features_engineering_list
train_data = create_TotalBath_feature(train_data, features_engineering_list)

# Create the same feature for the test data (no list is passed, so nothing is recorded)
test_data = create_TotalBath_feature(test_data)

Total Porch Area
- We create a new feature TotalPorchSF by summing up the area of all porch-related features, representing the total porch area of the house.

In [None]:
# create TotalPorchSF feature
def create_TotalPorchSF_feature(data, features_engineering_list=None):
    """Add a 'TotalPorchSF' column: sum of four porch area columns.

    Uses 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch' and 'ScreenPorch'. The
    column is written into ``data`` itself, which is also returned. When a list
    is passed as ``features_engineering_list``, the new column name is appended to it.
    """
    new_column = 'TotalPorchSF'
    porch_columns = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

    # Add the porch columns left to right, one at a time
    total = data[porch_columns[0]]
    for porch_column in porch_columns[1:]:
        total = total + data[porch_column]
    data[new_column] = total

    if features_engineering_list is None:
        return data
    features_engineering_list.append(new_column)
    return data

# Create the TotalPorchSF feature for the train data and record its name in features_engineering_list
train_data = create_TotalPorchSF_feature(train_data, features_engineering_list)

# Create the same feature for the test data (no list is passed, so nothing is recorded)
test_data = create_TotalPorchSF_feature(test_data)

Display the New Features

In [None]:
# First rows of the columns whose names were collected in features_engineering_list
print(train_data[features_engineering_list].head())


In [None]:
# Correlation heatmap over the full training data; runs only when SHOW_GRAPHS is True
if SHOW_GRAPHS:
    show_top_correlated_features(train_data, N)

### Part 3 Regression Models & Cross Validation

- Split the data

In [None]:
from sklearn.model_selection import cross_val_score


# Separate the target (y) from the predictors (X).
y = train_data['SalePrice']
X = train_data.drop(columns='SalePrice')

# One-hot encode both the predictors and the test data.
X, X_test = pd.get_dummies(X), pd.get_dummies(test_data)

# Give the test set exactly the training columns; columns it lacks are filled with 0.
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

# Hold out 20% of the training rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

- Standardize the features


In [None]:
# Fit the scaler on the training split only, then apply it to every split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

- Define a function for computing RMSE


In [None]:
def rmse_cv(model):
    """Return the mean 5-fold cross-validated RMSE of ``model`` on the scaled training split.

    Reads the notebook-level ``X_train_scaled`` and ``y_train``.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    # The scorer returns negated MSE, so flip the sign before taking the root
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

#### Regression models

We will implement different regression models,<br> evaluate them using cross-validation, and compute the RMSE

-  Linear Regression

In [None]:
# Linear Regression: cross-validated RMSE on the training split
lin_reg = LinearRegression()
lin_reg_rmse = rmse_cv(lin_reg)
print(f"Linear Regression RMSE: {lin_reg_rmse}")

# Train on the training split, then score the held-out validation split
y_pred = lin_reg.fit(X_train_scaled, y_train).predict(X_val_scaled)
lin_reg_val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Linear Regression Validation RMSE: {lin_reg_val_rmse}")


- SGD Regressor

In [None]:
# SGD Regressor: cross-validated RMSE on the training split
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
sgd_reg_rmse = rmse_cv(sgd_reg)
print(f"SGD Regressor RMSE: {sgd_reg_rmse}")

# Train on the training split, then score the held-out validation split
y_pred = sgd_reg.fit(X_train_scaled, y_train).predict(X_val_scaled)
sgd_reg_val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"SGD Regressor Validation RMSE: {sgd_reg_val_rmse}")


- Ridge Regression

In [None]:
# Ridge Regression
from sklearn.linear_model import Ridge


# Ridge model: cross-validated RMSE on the training split
ridge_reg = Ridge(alpha=1.0)
ridge_reg_rmse = rmse_cv(ridge_reg)
print(f"Ridge Regression RMSE: {ridge_reg_rmse}")

# Train on the training split, then score the held-out validation split
y_pred = ridge_reg.fit(X_train_scaled, y_train).predict(X_val_scaled)
ridge_reg_val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Ridge Regression Validation RMSE: {ridge_reg_val_rmse}")


- Lasso Regression

In [None]:
# Lasso Regression
from sklearn.linear_model import Lasso


# Lasso model: cross-validated RMSE on the training split
lasso_reg = Lasso(alpha=0.1)
lasso_reg_rmse = rmse_cv(lasso_reg)
print(f"Lasso Regression RMSE: {lasso_reg_rmse}")

# Train on the training split, then score the held-out validation split
y_pred = lasso_reg.fit(X_train_scaled, y_train).predict(X_val_scaled)
lasso_reg_val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Lasso Regression Validation RMSE: {lasso_reg_val_rmse}")


- Elastic Net Regression

In [None]:
# Elastic Net Regression
from sklearn.linear_model import ElasticNet


# Elastic Net model: cross-validated RMSE on the training split
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net_rmse = rmse_cv(elastic_net)
print(f"Elastic Net Regression RMSE: {elastic_net_rmse}")

# Train on the training split, then score the held-out validation split
y_pred = elastic_net.fit(X_train_scaled, y_train).predict(X_val_scaled)
elastic_net_val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Elastic Net Regression Validation RMSE: {elastic_net_val_rmse}")


# Fix the jump!