# EDA and model building for the House Prices competition on Kaggle

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

def ShowDataGraph(data_frame):
    """Plot the value distribution of every column in a two-column subplot grid.

    Columns whose dtype is ``object`` are drawn as a bar chart of their value
    counts; every other column is drawn as a histogram of its raw values.
    The figure is displayed with ``fig.show()`` and nothing is returned.

    Args:
        data_frame: pandas DataFrame; one subplot is created per column, in
            column order (left to right, then top to bottom).
    """
    grid_cols = 2
    columns_name = list(data_frame.columns)
    len_columns = len(columns_name)

    # Ceiling division so an even number of columns does not leave an empty
    # trailing row; keep at least one row so the grid is always valid.
    grid_rows = max(1, (len_columns + grid_cols - 1) // grid_cols)

    fig = make_subplots(rows=grid_rows, cols=grid_cols, subplot_titles=tuple(columns_name))

    for position in range(len_columns):
        # Derive the cell from the column position instead of looking the name
        # up again: a name lookup is O(n) per column and maps every duplicate
        # column name onto the first match.
        row_offset, col_offset = divmod(position, grid_cols)
        series = data_frame.iloc[:, position]

        if series.dtype == "object":
            # Count once and reuse for both axes.
            counts = series.value_counts(sort=False)
            trace = go.Bar(x=counts.index.tolist(), y=counts.tolist())
        else:
            trace = go.Histogram(x=list(series))

        fig.add_trace(trace, row=row_offset + 1, col=col_offset + 1)

    # 200 px per grid row; never zero, even for an empty frame.
    fig.update_layout(height=200 * grid_rows, width=900, title="Feature values", template="plotly_white", showlegend=False)

    fig.show()

# Load data

In [None]:
# Locations of the competition CSV files (raw strings keep the Windows backslashes literal).
train_path = r"D:\Coding_practice\_Data\Housing_prices_competition\train.csv"
test_path = r"D:\Coding_practice\_Data\Housing_prices_competition\test.csv"

# Both files are read with their 'Id' column as the row index.
X_full = pd.read_csv(train_path, index_col="Id")
X_test_full = pd.read_csv(test_path, index_col="Id")


## Simple preprocess
- ### Drop sparse columns (a column is dropped when it has fewer than len(X_full)//4 non-missing values)
- ### Drop y value

In [None]:
y_column = "SalePrice"
X_train_full = X_full.copy()

# Drop every training column that has fewer than len(X_full)//4 non-missing values.
X_train_full.dropna(axis=1, inplace=True, thresh=len(X_full)//4)

# Separate the target from the features.
y = X_train_full[y_column]
X_train_full.drop([y_column], axis=1, inplace=True)

# Give the test frame exactly the training feature columns, in the same order.
# Thinning the test frame with its own threshold could remove a different set
# of columns than the training frame and break the later column-equality check.
X_test = X_test_full[list(X_train_full.columns)].copy()

print("Done")
print(f"Numerical columns: {len([cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64','float64']])}")
print(f"Categorical columns: {len([cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['object']])}")

In [None]:
# Bare expression: shows the filtered test feature frame as the cell output.
X_test

In [None]:
# Plot one subplot per training feature column (bars for object columns, histograms otherwise).
ShowDataGraph(X_train_full)

## Building pipeline
- ### Split Train and Validation
- ### Preprocess
    - [x] Imputation (for numerical value) 
        - Median imputation
    - [x] Handle categorical variables (Label encoding / One-hot encoding)
        - ~~Try with One-hot encoding first~~
        - Use Label encoding because One-hot encoding produces too many features
    - [x] Handle outlier
    - [ ] Consider removing leakage data
- ### Building model
    - [] Simple Random Forest
    - [] Build Ensemble model

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
## Using simple Random Forest Regressor as base model to evaluate the preprocess methods
def method_evaluate(X, y, method_formulas, method_names, list_estimator, list_random_state):
    """Vote for the preprocessing step that most often gives the lowest CV error.

    For every (random state, forest size) pair, each candidate in
    ``method_formulas`` is chained in front of a ``RandomForestRegressor`` and
    scored with 5-fold cross-validation on mean squared log error. The square
    root of the mean fold score is the candidate's score for that run, and the
    candidate with the lowest score wins the run. The candidate that wins the
    most runs is reported with ``print``; nothing is returned.

    Args:
        X: feature data passed to ``cross_val_score``.
        y: target values passed to ``cross_val_score``.
        method_formulas: candidate preprocessing steps, one pipeline each.
        method_names: display names, parallel to ``method_formulas``.
        list_estimator: forest sizes (``n_estimators``) to try.
        list_random_state: random seeds to try.

    Raises:
        AssertionError: if the two method lists differ in length.
        ValueError: if no run was executed (an empty seed or size list).
    """
    assert len(method_formulas) == len(method_names)
    run_winners = []

    for seed in list_random_state:
        print(f"Running state: {seed}")
        for tree_count in list_estimator:
            forest = RandomForestRegressor(n_estimators=tree_count, random_state=seed)

            print(f"The number of trees in the forest: {tree_count}")

            run_scores = []
            for step in method_formulas:
                pipeline = make_pipeline(step, forest)
                # The scorer returns negated errors; flip the sign back.
                fold_errors = -1 * cross_val_score(pipeline, X, y, scoring="neg_mean_squared_log_error", cv=5)
                run_scores.append(np.sqrt(fold_errors.mean()))

            run_winners.append(np.argmin(np.array(run_scores)))
            print("-"*20)

    # Distinct winners in first-seen order, so ties go to the earliest winner.
    distinct_winners = list(dict.fromkeys(run_winners))
    win_counts = [run_winners.count(winner) for winner in distinct_winners]
    most_wins = max(win_counts)
    best_method = distinct_winners[win_counts.index(most_wins)]

    print(f"{method_names[best_method]} is the best method with {most_wins} time max on {len(list_random_state) * len(list_estimator)}")

## Handle outlier for numerical variables
def handle_outlier(data_frame):
    """Return a copy of ``data_frame`` with wide-ranging numeric columns winsorized.

    A column is processed only when its dtype is ``int64`` or ``float64`` and
    it holds more than 100 distinct values. Values above the column's 95th
    percentile are replaced by that percentile and values below the 5th
    percentile are replaced by that one. Missing values are left untouched.

    Args:
        data_frame: pandas DataFrame; it is not modified.

    Returns:
        A new DataFrame with the same index and columns.
    """
    numeric_cols = [cname for cname in data_frame.columns if data_frame[cname].dtype in ['int64', 'float64']]
    df_ = data_frame.copy()

    for cname in numeric_cols:
        column = df_[cname]
        if len(column.unique()) > 100:
            lower_lim = column.quantile(.05)
            upper_lim = column.quantile(.95)

            # clip() returns a new Series, so an integer column can take
            # fractional limits without writing floats into an int64 column
            # in place (an upcast that newer pandas versions deprecate).
            df_[cname] = column.clip(lower=lower_lim, upper=upper_lim)

    return df_

## Get the mean and standard deviation of every column
def get_mean_n_std(data_frame):
    """Collect each column's mean and standard deviation in a two-row frame.

    Args:
        data_frame: pandas DataFrame whose columns all support ``mean()`` and ``std()``.

    Returns:
        A DataFrame with the same column names, where row 0 holds the column
        mean and row 1 holds the column standard deviation.
    """
    stats_by_column = {
        cname: [data_frame[cname].mean(), data_frame[cname].std()]
        for cname in data_frame.columns
    }
    return pd.DataFrame(stats_by_column)

## Standardize every column of the dataframe (z-score)
def get_z_score(data_frame, mean_n_std_frame):
    """Standardize every column with externally supplied statistics.

    Each column becomes ``(value - mean) / std``. The mean is read from row 0
    and the standard deviation from row 1 of the same-named column in
    ``mean_n_std_frame``.

    Args:
        data_frame: pandas DataFrame to standardize; it is not modified.
        mean_n_std_frame: DataFrame with one column per column of
            ``data_frame`` (row 0 = mean, row 1 = standard deviation).

    Returns:
        A standardized copy of ``data_frame``.

    Raises:
        AssertionError: if the two frames have a different number of columns.
    """
    assert len(data_frame.columns) == len(mean_n_std_frame.columns)

    standarize_df = data_frame.copy()

    for cname in data_frame.columns:
        column_stats = mean_n_std_frame[cname]
        center, spread = column_stats[0], column_stats[1]
        standarize_df[cname] = (standarize_df[cname] - center) / spread

    return standarize_df

In [None]:
# X_testing = pd.get_dummies(X_pp)
# X_testing
# X_testing["SaleType_ConLw"].dtype

# Quick look at the mean, maximum and minimum of the "LotFrontage" training column.
print(X_train_full["LotFrontage"].mean())
print(X_train_full["LotFrontage"].max())
print(X_train_full["LotFrontage"].min())


In [None]:
## Compare imputation strategies on the numerical features
# Display names, parallel to the imputers defined right below.
imputation_names = [
    "Minus Imputation",
    "Mean Imputation",
    "Median Imputation",
    "KNN Imputation",
    "Iterative Imputation",
]

imputation_methods = [
    SimpleImputer(strategy="constant", fill_value=-1),
    SimpleImputer(strategy="mean"),
    SimpleImputer(strategy="median"),
    KNNImputer(),
    IterativeImputer(random_state=0, n_nearest_features=5),
]

# Only int64/float64 columns take part in the imputation comparison.
numeric_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ('int64', 'float64')
]
X_imputation = X_train_full[numeric_cols]

method_evaluate(
    X_imputation,
    y,
    imputation_methods,
    imputation_names,
    [200, 500, 1000, 1500],
    [0, 10, 100],
)

## Split train and validate
## Only use this split to build a final model to predict test set
# X_train, X_valid, y_train, y_valid = train_test_split(X_pp, y, train_size=0.8, random_state=0)

In [None]:
## Handle numerical data
def preprocess_numerical_data(train_data, test_data):
    """Impute, winsorize and standardize the numerical columns of both frames.

    The imputer is fitted on the training columns and applied to both frames.
    Each frame is then passed through ``handle_outlier`` and z-scored with the
    mean and standard deviation of the processed training columns.
    Non-numerical columns are passed through unchanged.

    Args:
        train_data: training features (pandas DataFrame).
        test_data: test features with exactly the same columns, same order.

    Returns:
        ``(train_data_frame, test_data_frame)``: copies of the inputs whose
        ``int64``/``float64`` columns hold the processed values.

    Raises:
        AssertionError: if the two frames do not have identical column lists.
    """
    assert list(train_data.columns) == list(test_data.columns)
    numerical_cols = [cname for cname in train_data.columns if train_data[cname].dtype in ['int64', 'float64']]

    train_data_frame = train_data.copy()
    test_data_frame = test_data.copy()

    # Nothing to impute or scale: hand back plain copies instead of fitting an
    # imputer on zero features.
    if not numerical_cols:
        return train_data_frame, test_data_frame

    X_train = train_data[numerical_cols]
    X_test = test_data[numerical_cols]

    ## Imputation
    imputation = IterativeImputer(random_state=0, n_nearest_features=5)
    # imputation = SimpleImputer(strategy="median")
    # Reuse the original row labels and column names. A fabricated RangeIndex
    # lines up only when the row labels are contiguous (and the training ones
    # start at 1); any other layout would misalign the write-back below.
    imputed_X_train = pd.DataFrame(imputation.fit_transform(X_train),
                                   index=X_train.index, columns=X_train.columns)
    imputed_X_test = pd.DataFrame(imputation.transform(X_test),
                                  index=X_test.index, columns=X_test.columns)

    ## Outlier
    # NOTE(review): the test frame is clipped with its own percentiles, while
    # the scaling below uses training statistics. Confirm this is intended.
    outlier_X_train = handle_outlier(imputed_X_train)
    outlier_X_test = handle_outlier(imputed_X_test)

    ## Scaling (statistics come from the training frame only)
    mean_n_std_df = get_mean_n_std(outlier_X_train)
    scaled_X_train = get_z_score(outlier_X_train, mean_n_std_df)
    scaled_X_test = get_z_score(outlier_X_test, mean_n_std_df)

    train_data_frame[numerical_cols] = scaled_X_train[numerical_cols]
    test_data_frame[numerical_cols] = scaled_X_test[numerical_cols]

    return train_data_frame, test_data_frame

## Handle categorical data
def preprocess_categorical_data(train_data_frame, test_data_frame):
    """Replace every ``object`` column with integer codes shared by train and test.

    For each ``object``-dtype column of the training frame, the train and test
    values are concatenated and factorized together with sorted categories,
    so the same category gets the same code in both frames. Missing values
    get the code ``-1``. The name of every categorical column that has
    missing values in the training frame is printed.

    Args:
        train_data_frame: training features (pandas DataFrame); not modified.
        test_data_frame: test features containing the same categorical
            columns; not modified.

    Returns:
        ``(train_df, test_df)``: encoded copies of the two inputs.
    """
    train_df = train_data_frame.copy()
    test_df = test_data_frame.copy()
    n_train = len(train_df)

    categorical_cols = [cname for cname in train_data_frame.columns if train_data_frame[cname].dtype in ['object']]

    # Label encoding
    for cname in categorical_cols:
        if train_df[cname].isna().values.any():
            print(cname)

        combined = pd.concat([train_df[cname], test_df[cname]])
        codes, _ = combined.factorize(sort=True)

        # The concatenation puts the training rows first, so split the codes
        # by row count. Slicing by index label values is correct only when the
        # training labels are exactly 1..n and the test labels continue
        # contiguously after them.
        train_df[cname] = codes[:n_train]
        test_df[cname] = codes[n_train:]

    return train_df, test_df

In [None]:
# Process the numerical columns first, then integer-encode the object columns
# of the resulting frames; X_pp / X_test_pp are the fully preprocessed features.
X_train_numerical, X_test_numerical = preprocess_numerical_data(X_train_full, X_test)
X_pp, X_test_pp = preprocess_categorical_data(X_train_numerical, X_test_numerical)
# print(X_temp)
# print(X_pp)
# print(X_test_numerical["MSSubClass"])

In [None]:
# List the (row label, column name) pairs of X_pp that are still missing after preprocessing.
X_pp.isna().stack()[lambda x:x].index.tolist()

## Model evaluation
- ### [] Using Cross-Validation to choose parameters for model
    - Choose the best ensemble models
    - Tune the models with right hyper parameters
- ### [] Export submission file on Test data


In [None]:
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn import metrics
from xgboost import XGBRegressor

In [None]:
# Candidate regressors, scored below with 5-fold cross-validated mean absolute error.
# NOTE(review): criterion="mae" and tree_method='gpu_hist' / gpu_id are spellings
# that newer scikit-learn / XGBoost releases appear to have renamed. Confirm
# against the installed versions.
my_Model = [
    BaggingRegressor(
        DecisionTreeRegressor(criterion="mae", min_samples_split=12),
        n_estimators=1000,
        max_samples=0.8,
        max_features=1.0,
        random_state=0,
    ),
    RandomForestRegressor(
        n_estimators=1000,
        criterion="mae",
        min_samples_split=12,
        max_samples=0.8,
        random_state=0,
    ),
    GradientBoostingRegressor(
        learning_rate=0.1,
        n_estimators=1000,
        criterion="mae",
        min_samples_split=12,
        random_state=0,
    ),
    AdaBoostRegressor(
        DecisionTreeRegressor(criterion='mae', min_samples_split=12),
        n_estimators=1000,
        learning_rate=1.0,
        random_state=0,
    ),
    ExtraTreesRegressor(
        n_estimators=1000,
        criterion="mae",
        min_samples_split=12,
        max_samples=0.8,
        random_state=0,
    ),
    XGBRegressor(
        n_estimators=200,
        random_state=0,
        learning_rate=0.3,
        tree_method='gpu_hist',
        gpu_id=0,
        gamma=10,
    ),
]

# The voting ensemble wraps the six base models above, keyed "0".."5",
# and is evaluated as a seventh candidate.
list_model = [(str(position), base_model) for position, base_model in enumerate(my_Model)]
ensemble_regressor = VotingRegressor(list_model)
my_Model.append(ensemble_regressor)

# Display names, parallel to my_Model (ensemble last).
model_Names = [
    "Bagging Regressor",
    "Random Forest Regressor",
    "Gradient Boosting Regressor",
    "Ada Boost Regressor",
    "Extra Trees Regressor",
    "XGB Regressor",
    "Voting Regressor",
]

print("Mean_cross_validate:\n")
assert len(model_Names) == len(my_Model)

for i, regressor in enumerate(my_Model):
    # The scorer returns negated errors; flip the sign back before averaging.
    scores = -1 * cross_val_score(regressor, X_pp, y, scoring='neg_mean_absolute_error', cv=5)

    print(f"{model_Names[i]}: \t{np.mean(scores)}")

In [None]:
# Predict_model = GradientBoostingRegressor(learning_rate=0.1, n_estimators=1000, criterion="mae",min_samples_split=12, random_state=0)
# Final model: a voting ensemble of six regressors, fitted on the full preprocessed training set.
# NOTE(review): criterion="mse" and tree_method='gpu_hist' / gpu_id are spellings
# that newer scikit-learn / XGBoost releases appear to have renamed. Confirm
# against the installed versions.
my_Model = [
    BaggingRegressor(
        DecisionTreeRegressor(criterion="mse", min_samples_split=12),
        n_estimators=1000,
        max_samples=0.8,
        max_features=1.0,
        random_state=0,
    ),
    RandomForestRegressor(
        n_estimators=1000,
        criterion="mse",
        min_samples_split=12,
        max_samples=0.8,
        random_state=0,
    ),
    GradientBoostingRegressor(
        learning_rate=0.1,
        n_estimators=1000,
        criterion="mse",
        min_samples_split=12,
        random_state=0,
    ),
    AdaBoostRegressor(
        DecisionTreeRegressor(criterion='mse', min_samples_split=12),
        n_estimators=1000,
        learning_rate=1.0,
        random_state=0,
    ),
    ExtraTreesRegressor(
        n_estimators=1000,
        criterion="mse",
        min_samples_split=12,
        max_samples=0.8,
        random_state=0,
    ),
    XGBRegressor(
        n_estimators=200,
        random_state=0,
        learning_rate=0.3,
        tree_method='gpu_hist',
        gpu_id=0,
        gamma=10,
    ),
]

# Each base model is registered under its list position ("0".."5").
list_model = [(str(position), base_model) for position, base_model in enumerate(my_Model)]
Predict_model = VotingRegressor(list_model)
Predict_model.fit(X_pp, y)

In [None]:
# Predict the target for the preprocessed test features.
preds_test = Predict_model.predict(X_test_pp)

# The prediction column must be spelled exactly like the target column
# ("SalePrice"); the previous 'Saleprice' header did not match it.
output = pd.DataFrame({
    'Id': X_test.index,
    'SalePrice': preds_test
})

# Write the two-column submission file without the positional row index.
output.to_csv('submission.csv', index=False)