# House Prices

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn Imports
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import *


In [None]:
# Ignoring the warnings, pretending everything is fine *whistle*
import warnings

warnings.filterwarnings("ignore")


## Importing the dataframe

In [None]:
# Loading the raw training set and previewing its first ten rows
dataset_origin = pd.read_csv(filepath_or_buffer="Data/train.csv")
dataset_origin.head(n=10)


In [None]:
# Working on a deep copy so that the dataframe loaded from disk stays untouched
dataset = dataset_origin.copy(deep=True)


We will start from the version of the dataframe in which the "NaN" values have already been dealt with.

After that, we will see how our previous analysis can be used to adapt and improve our data selection and transformation.

In [None]:
# Dataframe cleaning up that we did in the previous notebook
#
# Every step assigns its result back to the dataframe instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place form
# operates on a temporary object under pandas Copy-on-Write and would leave
# `dataset` unchanged.

# Step 1 : setting a new index (guarded so that re-running the cell is harmless)
if "Id" in dataset.columns:
    dataset = dataset.set_index("Id")

# Step 2 : replacing "NaN" values with "NA" as it was intended.
dataset_nan_values = [
    "Alley",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PoolQC",
    "Fence",
    "MiscFeature",
]

dataset[dataset_nan_values] = dataset[dataset_nan_values].fillna("NA")

# Step 3 : replacing "NaN" values in LotFrontage column with most frequent data
dataset["LotFrontage"] = dataset["LotFrontage"].fillna(
    dataset["LotFrontage"].value_counts().index[0]
)

# Step 4 : replacing "NaN" values in MasVnrType column with most frequent data
dataset["MasVnrType"] = dataset["MasVnrType"].fillna(
    dataset["MasVnrType"].value_counts().index[0]
)

# Step 5 : replacing "NaN" values in MasVnrArea column with most frequent data
dataset["MasVnrArea"] = dataset["MasVnrArea"].fillna(
    dataset["MasVnrArea"].value_counts().index[0]
)

# Step 6 : replacing "NaN" values in Electrical column with most frequent data
dataset["Electrical"] = dataset["Electrical"].fillna(dataset["Electrical"].mode()[0])

# Step 7 : replacing "NaN" values in "GarageYrBlt" column with "YearBuilt" data
dataset["GarageYrBlt"] = dataset["GarageYrBlt"].fillna(dataset["YearBuilt"])


In [None]:
# `DataFrame.info()` prints its report itself and returns None, so it is called
# on its own line rather than interpolated into the message (which printed "None").
print("Non-Null Count Verification:")
dataset.info()


Now that the dataframe is a bit cleaned up, we can start trying out different regression algorithms.

## Machine Learning Prediction

### Regression Models

We'll use the "big_fit" function that was used in Titanic and we'll adapt it to the regression algorithms that we will try out first.

I've selected:
- Gradient Boosting Regressor
- Random Forest Regressor
- Decision Tree Regressor

Others could be tested but you've got to start somewhere, right?

### Big Fit Function for our algorithms

In [None]:
# Function to train different algorithms with the following parameters
def _regression_metrics(y_true, y_pred):
    """Return the four regression metrics reported by `big_fit` as a dictionary.

    The keys ("mea", "mse", "medae", "r2") are the ones `big_fit` exposes; "mea"
    holds the mean absolute error.
    """
    return {
        "mea": mean_absolute_error(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
        "medae": median_absolute_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
    }


def big_fit(X_train, y_train, X_test, y_test, transformer):
    """Fit three regressors with default settings, then tune each with a grid search.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target used for every reported metric.
    transformer : preprocessing step placed in front of each regressor, both in
        the default pipelines and in the grid-searched ones.

    Returns
    -------
    tuple of six dictionaries, each keyed by the estimator's short name
    ("gbr", "rfr", "dtr"), in this order:
        default_error   : test-set metrics of the untuned pipeline
        best_scores     : best cross-validated R² of the grid search, in percent
        error           : per-sample absolute errors of the tuned model on the test set
        predict_error   : test-set metrics of the tuned model
        best_params     : best hyperparameters found by the grid search
        best_estimators : refitted best pipelines
    """
    # Regression models that we will try out
    estimators = [
        ("gbr", GradientBoostingRegressor(random_state=42)),
        ("rfr", RandomForestRegressor(random_state=42)),
        ("dtr", DecisionTreeRegressor(random_state=42)),
    ]

    # Scores of each algorithm with its default hyperparameters
    default_error = dict()

    for estimator in estimators:
        pipe = Pipeline(steps=[("transformer", transformer), estimator])
        pipe.fit(X_train, y_train)

        name = estimator[0]
        default_error[name] = _regression_metrics(y_test, pipe.predict(X_test))

    # Hyperparameters for each algorithm (same order as `estimators`)
    param_grids = [
        {
            "gbr__n_estimators": [100, 250, 500],
            "gbr__min_samples_split": np.arange(2, 5),
            "gbr__min_samples_leaf": np.arange(1, 4),
            "gbr__max_features": ["sqrt", "log2", 0.2, 0.3, 0.4],
        },
        {
            "rfr__n_estimators": [100, 250, 500],
            "rfr__min_samples_split": np.arange(2, 5),
            "rfr__min_samples_leaf": np.arange(1, 4),
            "rfr__max_features": ["sqrt", "log2", 0.2, 0.3, 0.4],
        },
        {
            "dtr__min_samples_split": np.arange(2, 5),
            "dtr__min_samples_leaf": np.arange(1, 4),
            "dtr__max_features": ["sqrt", "log2", 0.2, 0.3, 0.4],
        },
    ]

    # Dictionaries to store training scores
    best_estimators = dict()
    best_params = dict()
    best_scores = dict()
    predict_error = dict()
    error = dict()

    # Gridsearch
    score = "r2"

    for estimator, param_grid in zip(estimators, param_grids):

        grid = GridSearchCV(
            # Use the transformer received as argument, not a notebook-level global
            Pipeline(steps=[("preprocessor", transformer), estimator]),
            param_grid,
            cv=5,
            scoring=score,
            return_train_score=True,
            verbose=0,
            n_jobs=2,
        )

        name = estimator[0]
        print(f"Trainning with {name}")

        # Fit train
        grid.fit(X_train, y_train)

        # Bests
        best_estimators[name] = grid.best_estimator_
        best_params[name] = grid.best_params_
        best_scores[name] = round(grid.best_score_ * 100, 2)

        # Prediction
        y_pred = grid.predict(X_test)

        # Predict Error & Errors
        predict_error[name] = _regression_metrics(y_test, y_pred)
        error[name] = [abs(x - y) for x, y in zip(y_test, y_pred)]

    return (
        default_error,
        best_scores,
        error,
        predict_error,
        best_params,
        best_estimators,
    )


In [None]:
# Alternative version of big_fit, kept for reference inside a string literal:
# none of the code below is executed.
""" Big fit alternative to keep in mind

def big_fit(X_train, y_train, X_test, y_test):
    estimators = [
        ("rfr", RandomForestRegressor(random_state=42)),
        ("gbr", GradientBoostingRegressor(random_state=42)),
    ]

    default_scores = dict()

    for estimator in estimators:
        model = estimator[1]
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        scores = [
            mean_squared_error(y_test, y_pred),
            median_absolute_error(y_test, y_pred),
            mean_absolute_error(y_test, y_pred),
            r2_score(y_test, y_pred),
        ]
        name = estimator[0]

        print(f"Trainning with {name}")
        default_scores[name] = [round(s, 2) for s in scores]

    param_grids = [
        {
            "n_estimators": np.arange(100, 400, 100),
        },
        {
            "n_estimators": np.arange(100, 400, 100),
        },
    ]

    best_estimators = dict()
    best_params = dict()
    best_scores = dict()
    predict_scores = dict()

    for estimator, param_grid in zip(estimators, param_grids):
        grid = GridSearchCV(
            estimator[1],
            param_grid,
            cv=5,
            scoring="neg_mean_squared_error",
            return_train_score=True,
            verbose=1,
            n_jobs=8,
        )
        name = estimator[0]
        grid.fit(X_train, y_train)
        best_estimators[name] = grid.best_estimator_
        best_params[name] = grid.best_params_
        best_scores[name] = round(grid.best_score_, 2)

        y_pred = grid.predict(X_test)
        scores = [
            mean_squared_error(y_test, y_pred),
            median_absolute_error(y_test, y_pred),
            mean_absolute_error(y_test, y_pred),
            r2_score(y_test, y_pred),
        ]
        predict_scores[name] = [round(s, 2) for s in scores]

    return default_scores, best_scores, predict_scores, best_params, best_estimators"""


### Plotting Learning Curves Function

In [None]:
# Function for plotting the learning curves


def learning_curve(name, model, X_train, y_train):
    """Plot the mean training and cross-validation R² of `model` against train size.

    Parameters
    ----------
    name : label inserted in the plot title.
    model : estimator (or pipeline) evaluated at each training size.
    X_train, y_train : data from which the learning curve is computed.
    """
    # This function has the same name as scikit-learn's `learning_curve`, so the
    # notebook-level import is shadowed as soon as this cell runs. Importing the
    # library function under an alias here avoids calling ourselves by mistake.
    from sklearn.model_selection import learning_curve as sk_learning_curve

    # Using the default scoring method "r2" then we'll see
    score = "r2"

    train_sizes, train_scores, test_scores = sk_learning_curve(
        model,
        X_train,
        y_train,
        train_sizes=np.linspace(0.1, 1, 20),
        cv=5,
        random_state=42,
        scoring=score,
    )

    # One column per CV fold: average across folds for each training size
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    # Plot details
    plt.title(f"Learning Curve with {name}")
    plt.xlabel("Train size")
    plt.ylabel("Score")

    plt.plot(train_sizes, train_scores_mean, label="Training score")
    plt.plot(train_sizes, test_scores_mean, label="Cross validation score")
    plt.legend()

    plt.show()


### Plotting Error Histograms Function

In [None]:
# Function for plotting error histograms (or Errorists)


def error_hist(name, model):
    """Show a 50-bin histogram of `model` under a title mentioning `name`.

    `model` is passed straight to `plt.hist`; presumably it holds the per-sample
    errors of one algorithm rather than an estimator - confirm at the call site.
    """
    plt.hist(x=model, bins=50)
    plt.title(label=f"Error Distribution for {name}")
    plt.show()


### Preprocessing our data

In [None]:
# Column selectors: numeric columns on one side, every other dtype on the other
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

# Numeric branch: KNN imputation, then a scaler that copes better with outliers
numerical_pipeline = make_pipeline(KNNImputer(), RobustScaler())

# Categorical branch: most-frequent imputation, then one-hot encoding
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop="first", handle_unknown="ignore"),
)

# Both branches combined in a single column transformer
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
)

# Target column on one side, every remaining column as features on the other
features = dataset.drop(columns="SalePrice")
target = dataset["SalePrice"]

# Holding out a quarter of the rows for testing
feat_train, feat_test, target_train, target_test = train_test_split(
    features, target, random_state=42, test_size=0.25
)

print("Let's PAR-TAY with your Big Fit")


In [None]:
# Alternative preprocessing kept for reference inside a string literal: none of the
# code below is executed. The literal is a raw string so that the backslash in
# "\o/" is not read as an (invalid) escape sequence; the text itself is unchanged.
r""" Preprocessing alternative to keep in mind

transformer = make_column_transformer(
    # Ici éventuellement vos Imputers...
    (RobustScaler(), numeric_cols), # Ou autre
    (OneHotEncoder(drop="first"), categorical_cols),  # Ou autre (C'est ce que fait pd.get_dummies())
)

df_train_X, df_train_y = df_train.drop("SalePrice", axis=1), df_train["SalePrice"]

df_train_X_enc = transformer.fit_transform(df_train_X)

trans_features = transformer.get_feature_names_out()
print(f"{trans_features.size} features after trans")

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(df_train_new_X_enc, df_train_new_y, test_size=0.3, random_state=42)

print("Let's BIG FIT")

default_scores, best_scores, predict_scores, best_params, best_estimators = big_fit(
    X_train_new, y_train_new, X_test_new, y_test_new
)
print("Train Finished! \o/")"""


### Testing the algorithms with our transformed data

In [None]:
# `big_fit` returns exactly six dictionaries, each keyed by the algorithm's short
# name; unpack them in the order of its `return` statement.
(
    default_error,
    best_scores,
    error,
    predict_error,
    best_params,
    best_estimators,
) = big_fit(feat_train, target_train, feat_test, target_test, preprocessor)

# Per-metric views read by the following cells.
# R² is expressed in percent, on the same scale as the cross-validation scores
# stored in `best_scores`.
default_scores = {name: round(m["r2"] * 100, 2) for name, m in default_error.items()}
predict_scores = {name: round(m["r2"] * 100, 2) for name, m in predict_error.items()}
mae = {name: m["mea"] for name, m in predict_error.items()}
rmse = {name: np.sqrt(m["mse"]) for name, m in predict_error.items()}
meae = {name: m["medae"] for name, m in predict_error.items()}
err = error
# NOTE: `big_fit` does not compute feature names or feature scores, so the former
# `feat_names` / `feat_scores` variables are no longer defined here.

print("Train Finished! Let's pop the champagne!")


In [None]:
# Gathering every score in a single table, one row per algorithm
algorithm_scores = pd.DataFrame(
    data={
        "Name": list(best_estimators),
        "Default Test Score": [*default_scores.values()],
        "Grid Cross-Validation Score": [*best_scores.values()],
        "Grid Test Score": [*predict_scores.values()],
        "Mean Abs Error": [*mae.values()],
        "RMSE": [*rmse.values()],
        "Median Abs Error": [*meae.values()],
    }
)

# Best grid test score first
algorithm_scores.sort_values("Grid Test Score", ascending=False)


In [None]:
# Picking the algorithm whose entry in `predict_scores` is the highest
best_algo = max(predict_scores.items(), key=lambda item: item[1])[0]
print(f"The algorithm that performed the best is : {best_algo}")
winners = dict(default=best_estimators[best_algo])


In [None]:
# Listing the best hyperparameters found for each algorithm, with the
# "<step>__" prefix stripped from every parameter name
for name, it in best_params.items():
    print(
        f"{name}: "
        + " ; ".join(f"{key.split('__')[1]}: {value}" for key, value in it.items())
    )


In [None]:
# Learning curve of the best-performing algorithm
learning_curve(
    name=best_algo,
    model=best_estimators[best_algo],
    X_train=feat_train,
    y_train=target_train,
)


Because I do not discriminate against low performing algorithms and because they tried their best, I'm still going to look at their learning curves.

That's how I roll.

In [None]:
# Learning curve of the tuned gradient boosting pipeline
learning_curve(
    name="gbr",
    model=best_estimators["gbr"],
    X_train=feat_train,
    y_train=target_train,
)
