# House Prices - Advanced Regression Techniques
1. [General](#general)
2. [Data Overview](#summary)
3. [Data Processing and Cleaning](#dpc)
4. [Feature Selection](#feature)
5. [Modeling with Machine Learning](#modeling)

<a id="general"></a>
## General

In [None]:
!pip3 install pandas
!pip3 install plotly
!pip3 install statsmodels
!pip3 install sklearn
!pip3 install matplotlib

### Imports 

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Plotting
import plotly.express as px
import plotly.graph_objects as go

# Feature Engineering
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Modeling and Machine Learning
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Load the training and test sets from the working directory
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Column to predict, kept as a one-element list
target = ['SalePrice']

### Functions

In [None]:
# Computational Functions
def vif(df, features):
    """Print the variance inflation factor (VIF) of each feature, highest first.

    Args:
        df: DataFrame containing the columns named in ``features``.
        features: Column names to evaluate.

    A constant column is appended to the design matrix before the factors
    are computed; no VIF is reported for that constant itself.
    """
    # The constant is the last column, so positions 0..len(features)-1 are the features
    design = df[features].assign(const=1).values
    table = pd.DataFrame(
        {
            "variable": features,
            "value": [
                variance_inflation_factor(design, position)
                for position, _ in enumerate(features)
            ],
        }
    )
    print(table.sort_values('value', ascending=False))

In [None]:
# Figure and Plotting Functions
def heatmap(df):
    """Show ``df`` as an interactive Plotly heatmap.

    Args:
        df: DataFrame whose values fill the cells. Column labels go on the
            x axis and row labels on the y axis.
    """
    fig = go.Figure(
        data=go.Heatmap(
            z=df,
            x=df.columns,
            # Rows of z correspond to the frame's index, not its columns;
            # labelling them with df.columns is wrong for non-square frames.
            y=df.index,
            hoverongaps=False,
        )
    )

    fig.show()

def scattermatrix(df, height, width):
    """Show a Plotly scatter matrix of the columns of ``df``.

    Args:
        df: DataFrame whose columns are plotted against each other.
        height: Figure height passed to the layout.
        width: Figure width passed to the layout.
    """
    layout = dict(
        dragmode='select',
        width=width,
        height=height,
        hovermode='closest',
    )

    figure = px.scatter_matrix(df)
    figure.update_layout(**layout)
    figure.show()

In [None]:
# Helper Functions
def printMLScore(algo, score, opt_score):
    """Print one line with a model's name, its score and its grid-search score.

    Args:
        algo: Display name of the model.
        score: Score of the plain model, printed with four decimals.
        opt_score: Score of the GridSearchCV model, printed with four decimals.
    """
    line = f"Model: {algo} \t Score: {score:.4f} \t Score (GridSearchCV): {opt_score:.4f}"
    print(line)

def saveSubmission(scores, ids, algo):
    """Write predictions to a CSV file named after the model.

    Args:
        scores: Predicted sale prices, one per id.
        ids: Identifiers written as the ``Id`` index column.
        algo: Model name inserted into the output file name.
    """
    out_path = f'House Price Submission (Kaggle) - {algo}.csv'
    submission = pd.DataFrame(
        {"Id": ids, "SalePrice": scores},
        columns=["Id", "SalePrice"],
    ).set_index("Id")
    submission.to_csv(out_path)

<a id="summary"></a>
## Data Overview

In [None]:
# Apply one hot encoding for nominal (non-numeric) columns.
#
# Train and test are encoded separately, so each nominal column is first cast
# to a categorical dtype holding the categories seen in EITHER frame. That way
# get_dummies emits the same dummy columns, and drops the same first level,
# for both frames even when a category is missing from one of them.

train_nom = train.select_dtypes(exclude=['int64', 'float64']).columns
test_nom = test.select_dtypes(exclude=['int64', 'float64']).columns

shared_dtypes = {}
for nom in train_nom:
    levels = pd.concat([train[nom], test[nom]]).dropna().unique()
    # Sorted to keep the same level order get_dummies uses for plain columns
    shared_dtypes[nom] = pd.CategoricalDtype(sorted(levels))

train_dummy = pd.get_dummies(train.astype(shared_dtypes), drop_first=True, columns=list(train_nom))
test_dummy = pd.get_dummies(test.astype(shared_dtypes), drop_first=True, columns=list(train_nom))

train = train_dummy
test = test_dummy

# TODO: ordinal features are not given a separate encoding yet

In [None]:
# Preview the first five rows of the training data
train.head(5)

In [None]:
# Remove the Id column from the training data (the test data keeps it)
train = train.drop(columns=['Id'])

In [None]:
# (rows, columns) of the training and test data
train.shape, test.shape

In [None]:
# Summary statistics of the training columns
train.describe()

In [None]:
# Column labels of the training data
train.columns

In [None]:
# Data type of each training column
train.dtypes

In [None]:
# Histogram of the target column using 50 bins
hist = train[target].hist(bins=50)

In [None]:
# Skewness and kurtosis of the target column.
# The column is selected as a Series so skew()/kurt() return scalars; calling
# float() on the one-element Series produced by train[target] is deprecated.
sale_price = train[target[0]]
skew = float(sale_price.skew())
kurt = float(sale_price.kurt())

print(f"Skewness: {skew:.2f} \t Kurtosis: {kurt:.2f}")

In [None]:
# Pairwise correlation between all columns of the training data
corr = train.corr()

# Names of the columns whose absolute correlation with SalePrice exceeds 0.5
# (SalePrice itself is part of the result)
corr_sp = corr[target].abs()
index = corr_sp.index[corr_sp['SalePrice'] > 0.5].tolist()
index

In [None]:
# Plot the correlations among the selected columns only. Both rows and columns
# are restricted to `index` so the matrix is square and matches the axis labels.
heatmap(corr.loc[index, index])

In [None]:
# Scatter matrix of the selected columns in a 1300 x 1300 figure
scattermatrix(train[index], height=1300, width=1300)

<a id="dpc"></a>
## Data Processing and Cleaning

### Duplicates

In [None]:
# Report when at least one row repeats an earlier row
if train.duplicated().any():
    print("Found duplicates")

### Deleting Columns of Null Values

In [None]:
# Number of missing values per column in each data set
train_nan = pd.isna(train).sum()
test_nan = pd.isna(test).sum()

# Columns that have missing values in the train data, the test data, or both.
# Sorted so the table and the lists derived below have the same order on every run.
nan_features = sorted(set(train_nan[train_nan > 0].index) | set(test_nan[test_nan > 0].index))

# Display the table summary of null values
nan_table = pd.concat([train_nan.loc[nan_features], test_nan.loc[nan_features]], axis=1, keys=["Train", "Test"])
print(nan_table)

# Columns with missing values that are also among the highly correlated ones
imp_features = sorted(set(nan_features) & set(index))

# Exclude important features from the ones that will be deleted
nan_features = sorted(set(nan_features) - set(index))

In [None]:
# Remove the columns listed in nan_features from both data sets
train, test = (frame.drop(columns=nan_features) for frame in (train, test))

### Imputation of Missing Values

In [None]:
# Impute missing values present in Test Data (TotalBsmtSF, GarageCars, GarageArea)
# with the column mean. The filled column is assigned back explicitly:
# fillna(inplace=True) on test[feature] acts on an intermediate object and is
# not guaranteed to update `test` itself.

for feature in imp_features:
    test[feature] = test[feature].fillna(test[feature].mean())

<a id="feature"></a>
## Feature Selection

In [None]:
# Create variable for all selected columns without the target.
# Built as a list rather than a set difference so the column order follows
# `index` and is identical on every run.
features = [name for name in index if name not in target]

In [None]:
# Histogram (50 bins) of the natural log of the sale price;
# the log transform produces a more "normal" looking distribution
hist_log = np.log(train["SalePrice"]).hist(bins=50)

In [None]:
# Add a LogPrice column holding the natural log of the sale price.
# ML algorithms usually perform better on normally distributed values.
train["LogPrice"] = np.log(train["SalePrice"])
# Display the new column
train["LogPrice"]

In [None]:
# Print the VIF of each selected feature to detect multicollinearity
vif(train, features)

In [None]:
# Remove two features so that all remaining variables have a VIF below 5.
# Filtering the list (instead of a set difference) keeps the remaining
# features in their existing order.
features = [name for name in features if name not in ('GarageCars', 'GrLivArea')]
vif(train, features)

<a id="modeling"></a>
## Modeling with Machine Learning

### Splitting for Training and Testing

In [None]:
# Predictors are the selected feature columns; the response is the log of the sale price
train_x, train_y = train[features], train['LogPrice']

In [None]:
# Hold out part of the training data for validation: 70% is used for fitting,
# with a fixed seed so the split is repeatable
s_train_x, s_test_x, s_train_y, s_test_y = train_test_split(
    train_x,
    train_y,
    train_size=0.7,
    random_state=0,
)

### Linear Regression

In [None]:
# Linear regression with default settings, fitted on the training split
lm = LinearRegression().fit(s_train_x, s_train_y)

# Grid search over the intercept and copy options, using "r2" as the scoring metric
param_lm = dict(fit_intercept=[True, False], copy_X=[True, False])
model_lm = GridSearchCV(lm, param_lm, scoring="r2", verbose=1).fit(s_train_x, s_train_y)

# Compare the default model and the grid-searched model on the validation split
printMLScore(
    "Linear Regression",
    lm.score(s_test_x, s_test_y),
    model_lm.score(s_test_x, s_test_y),
)

In [None]:
# Predict log prices for the test data, convert them back with exp and save them
pred_lm = model_lm.predict(test[features])
saveSubmission(scores=np.exp(pred_lm), ids=test['Id'], algo="Linear Regression")

### Decision Tree Regression

In [None]:
# Create Decision Tree Regression Model
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(s_train_x, s_train_y)

# Optimize hyperparameters with GridSearchCV.
# max_features=None considers every feature at each split, which is what the
# former "auto" option meant for regressors; "auto" is rejected by current
# scikit-learn releases.
param_dtr = {
    "criterion" : ["friedman_mse"],
    "splitter" : ["best", "random"],
    "min_samples_split" : [2, 3, 5, 10],
    "max_features" : [None, "log2"]
}

model_dtr = GridSearchCV(dtr, param_dtr, verbose=1, scoring="r2")
model_dtr.fit(s_train_x, s_train_y)

# Compare the default model and the grid-searched model on the validation split
printMLScore("Decision Tree Regression", dtr.score(s_test_x, s_test_y), model_dtr.score(s_test_x, s_test_y))

In [None]:
# Predict log prices for the test data, convert them back with exp and save them
pred_dtr = model_dtr.predict(test[features])
saveSubmission(scores=np.exp(pred_dtr), ids=test['Id'], algo="Decision Tree Regression")

### Random Forest Regression

In [None]:
# Create Random Forest Regression Model
rfr = RandomForestRegressor(random_state=0)
rfr.fit(s_train_x, s_train_y)

# Optimize hyperparameters with GridSearchCV.
# max_features=None uses every feature at each split, which is what the former
# "auto" option meant for regressors; "auto" is rejected by current
# scikit-learn releases.
param_rfr = {
    "n_estimators" : [5, 10, 15, 20],
    "min_samples_split" : [2, 3, 5, 10],
    "max_features" : [None, "log2"]
}

model_rfr = GridSearchCV(rfr, param_rfr, verbose=1, scoring="r2")
model_rfr.fit(s_train_x, s_train_y)

# Compare the default model and the grid-searched model on the validation split
printMLScore("Random Forest Regression", rfr.score(s_test_x, s_test_y), model_rfr.score(s_test_x, s_test_y))

In [None]:
# Predict log prices for the test data, convert them back with exp and save them
pred_rfr = model_rfr.predict(test[features])
saveSubmission(scores=np.exp(pred_rfr), ids=test['Id'], algo="Random Forest Regression")

### Ridge Regression

In [None]:
# Ridge regression with default settings, fitted on the training split
rid = Ridge().fit(s_train_x, s_train_y)

# Grid search over the intercept and copy options (solver fixed to "auto"),
# using "r2" as the scoring metric
param_ridge = dict(
    fit_intercept=[True, False],
    copy_X=[True, False],
    solver=["auto"],
)
model_rid = GridSearchCV(rid, param_ridge, scoring="r2", verbose=1).fit(s_train_x, s_train_y)

# Compare the default model and the grid-searched model on the validation split
printMLScore(
    "Ridge Regression",
    rid.score(s_test_x, s_test_y),
    model_rid.score(s_test_x, s_test_y),
)

In [None]:
# Predict log prices for the test data, convert them back with exp and save them
pred_rid = model_rid.predict(test[features])
saveSubmission(scores=np.exp(pred_rid), ids=test['Id'], algo="Ridge Regression")

### Gradient Boosting Regression

In [None]:
# # Create Gradient Boosting Regression Model

# model_gbr = GradientBoostingRegressor()
# model_gbr.fit(s_train_x, s_train_y)

# printMLScore("Gradient Boosting Regression", gbr.score(s_test_x, s_test_y), model_gbr.score(s_test_x, s_test_y))

In [None]:
# # Predictions
# pred_gbr = model_gbr.predict(test[features])
# saveSubmission(np.exp(pred_gbr), test['Id'], "Gradient Boosting Regression")