In [73]:
# TODO 
# 0. Create a class to preprocess data.
# 1. Use entire training data to determine model accuracy with cross validation and and XGBoost model.
# 2. Used optimized model parameters to make predictions on test data.
# 3. Submit predictions to Kaggle Competition: https://www.kaggle.com/competitions/home-data-for-ml-course/leaderboard#


# Notes:
# Train and test data must pass through same preprocessing .... features that train on should be present in test
# Save model into JSON format.
# clf.save_model("clf.json")

In [74]:
import pandas as pd
import numpy as np
from collections import Counter
pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',500)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone

In [75]:
# test_model = XGBRegressor()
# test_model

In [76]:

class PreprocessData:
    """ 
        - The PreProcessData class reads a CSV file and processes the data therein. 
        - PreprocessData can take in either a train or test data set.
        - Encoders applied to training data can be passed to test data to apply the same transformations.
        - Preprocessing occurs before a train/valid/test split.
        - The class returns target (y), predictors (X), and transformer objects
        (imputer, ordinal encoder, one-hot encoder) applied to the data.

    """
    def __init__(self,imputer=None, ordinal_encoder = None,one_hot_encoder=None, is_train = True, new_feature_names=None):
        # target and predictors
        self.full_data = None
        self.y = None
        self.X = None

        # columns
        self.numerical_cols = []
        self.categorical_cols = []
        self.ordinal_cols = []

        # column transformers
        self.imputer = imputer
        self.ordinal_encoder = ordinal_encoder
        self.one_hot_encoder = one_hot_encoder
        

        # dataframes
        self.imputed_df = None
        self.ordinal_encoded_df = None

        self.new_feature_names = new_feature_names
        self.one_hot_encoded_df = None

        self.is_train = is_train

    def get_original_data(self):
        return self.full_data
    
    def set_data(self,directory,index_column=None):
        self.full_data = pd.read_csv(directory, index_col=index_column)
    

    def set_X_y(self,target_column_name=None):
        if self.is_train and target_column_name:
            all_data = self.get_original_data().copy()
            # remove rows with no y values
            all_data.dropna(axis=0,subset=[target_column_name],inplace=True)
            # set target
            self.y = all_data[target_column_name]
            # drop SalePrice column from predictors
            self.X  = all_data.drop([target_column_name],axis=1,inplace=False)
        else: #Note: Kaggle test data has no y-values
            self.X = self.get_original_data().copy()


    def get_y(self):
        return self.y
    
    def get_X(self):
        return self.X

    def handle_missing_values(self,input_df,strat):
        if self.imputer:
            self.imputed_df = pd.DataFrame(self.imputer.transform(input_df))
            self.imputed_df.columns = input_df.columns
            self.imputed_df.index = input_df.index
            #preserve data types
            self.imputed_df = self.imputed_df.astype(input_df.dtypes.to_dict())
            return self.imputed_df
        elif self.imputer == None: # Note: no imputer provided thus instantiate
            self.imputer = SimpleImputer(strategy=strat)
            self.imputed_df = pd.DataFrame(self.imputer.fit_transform(input_df))
            self.imputed_df.columns = input_df.columns
            self.imputed_df.index = input_df.index
            #preserve data types
            self.imputed_df = self.imputed_df.astype(input_df.dtypes.to_dict())
            return self.imputed_df,self.imputer
            

    def get_imputed_df(self):
        return self.imputed_df
    
    def get_imputer(self):
        return self.imputer
    

    def set_numerical_columns(self, input_df):
        self.numerical_cols =  [ col for col in input_df.columns.values
                        if input_df[col].dtype == "float64" or \
                            input_df[col].dtype == "int64"]

    def set_categorical_columns(self,input_df):
        self.categorical_cols = [col for col in input_df.columns.values
                if input_df[col].dtype == "object"]
    

    def set_ordinal_columns(self, column_list):
        self.ordinal_cols = column_list
        # if self.get_categorical_columns():
        #     self.categorical_cols = [ col for col in self.get_categorical_columns() if col not in column_list]

    def get_ordinal_columns(self):
        return self.ordinal_cols

    def get_numerical_columns(self):
        return self.numerical_cols

    def get_categorical_columns(self):
        return self.categorical_cols
    

    def get_ordinal_encoder(self):
        return self.ordinal_encoder

    def encode_ordinal_data(self,input_df):
        if self.ordinal_encoder:
            self.ordinal_encoded_df = pd.DataFrame(self.ordinal_encoder.transform(input_df))
            self.ordinal_encoded_df.index = input_df.index
            self.ordinal_encoded_df.columns = input_df.columns
            return self.ordinal_encoded_df
        elif self.ordinal_encoder == None: #Note: training dataframe
            self.ordinal_encoder = OrdinalEncoder()
            self.ordinal_encoded_df = pd.DataFrame(self.ordinal_encoder.fit_transform(input_df))
            self.ordinal_encoded_df.index = input_df.index
            self.ordinal_encoded_df.columns = input_df.columns
            return self.ordinal_encoded_df, self.ordinal_encoder

    def get_low_high_cardinality_columns(self, input_df,column_list, max_cardinality):
        low_card = []
        high_card = []
        for col in column_list:
            if input_df[col].nunique() <= max_cardinality and input_df[col].dtype == "object":
                low_card.append(col)
            else:
                high_card.append(col)
        return low_card, high_card
    


    def reduce_cardinality(self,column, threshold = 0.70, return_categories = True):
        threshold_value = threshold*len(column)
        frequency_sum = 0

        new_category_list = []
        counts=Counter(column)
        most_common = dict(counts.most_common())

        for category,count in most_common.items():
            frequency_sum = frequency_sum + count
            new_category_list.append(category)
            if frequency_sum >= threshold_value:
                break
        # Test
        new_category_list.append("Other")
        new_column = column.apply(lambda x: x if x in new_category_list else "Other")
        
        if return_categories:
            return new_column,new_category_list
        else:
            return new_column
        
    def create_low_cardinality_df(self,input_df,low_card_col,high_card_col):
        low_card_df = input_df[low_card_col]
        for col in high_card_col:
            transformed_col = self.reduce_cardinality(input_df[col],threshold=0.7,return_categories=False)
            low_card_df = pd.concat([low_card_df,transformed_col.to_frame()],axis=1)
    
        return low_card_df
    

    def one_hot_encode(self,input_df):
        if self.one_hot_encoder == None: # Note: training data
            self.one_hot_encoder = OneHotEncoder(handle_unknown="ignore",sparse_output=False)
            self.one_hot_encoded_df = pd.DataFrame(self.one_hot_encoder.fit_transform(input_df))
            self.one_hot_encoded_df.index = input_df.index
            self.new_feature_names = self.one_hot_encoder.get_feature_names_out(input_df.columns.values)
            self.one_hot_encoded_df.columns = self.new_feature_names
            self.one_hot_encoded_df.columns = self.one_hot_encoded_df.columns.astype(str)
            return self.one_hot_encoded_df,self.one_hot_encoder,self.new_feature_names

        elif self.one_hot_encoder != None: #Note: test data
            self.one_hot_encoded_df = pd.DataFrame(self.one_hot_encoder.transform(input_df))
            self.one_hot_encoded_df.index = input_df.index
            self.one_hot_encoded_df.columns = self.new_feature_names
            self.one_hot_encoded_df.columns = self.one_hot_encoded_df.columns.astype(str)
            return self.one_hot_encoded_df
        

    # def convert_column_dtypes(self,input_df, current_dtype,new_dtype):
    #     columns_to_change = input_df.loc[:,input_df.dtypes==input_dtype]



In [77]:
#TODO test as python script ... if __name__ == "main": run the code below

# Note: read data and setting X and y
process_training_data = PreprocessData()
process_training_data.set_data("./home-data-for-ml-course/train.csv",index_column="Id")
process_training_data.set_X_y("SalePrice")
y_train = process_training_data.get_y()
X_train = process_training_data.get_X()



# Note: impute missing values
imputed_train_df, imputer = process_training_data.handle_missing_values(X_train,strat="most_frequent")

# Note: determined examining data ... 
ordinal_columns = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2",\
"HeatingQC","KitchenQual","Functional","FireplaceQu","GarageQual","GarageCond"]

# Test set and get of numerical, categorical, ordinal columns
process_training_data.set_numerical_columns(imputed_train_df)
process_training_data.set_categorical_columns(imputed_train_df)
process_training_data.set_ordinal_columns(ordinal_columns) 


# TODO these should be class attributes
train_numerical_columns = process_training_data.get_numerical_columns()
train_categorical_columns = process_training_data.get_categorical_columns()
train_ordinal_columns = process_training_data.get_ordinal_columns()

ordinal_encoded_df, ordinal_encoder = \
process_training_data.encode_ordinal_data(imputed_train_df[ordinal_columns])

one_hot_columns = [col for col in train_categorical_columns if col not in train_ordinal_columns]

low_card, high_card =\
    process_training_data.get_low_high_cardinality_columns(imputed_train_df,one_hot_columns,6)

df_to_1hot_encode = process_training_data.\
    create_low_cardinality_df(imputed_train_df,low_card, high_card)


one_hot_encoded_df,one_hot_encoder, encoded_features = process_training_data.\
                    one_hot_encode(df_to_1hot_encode)


train_dfs_to_combine = [ imputed_train_df[train_numerical_columns],ordinal_encoded_df,one_hot_encoded_df]

X_train_df = pd.concat(train_dfs_to_combine,axis=1)


# TODO
# instantiate process test data after all steps for training data complete ... so can pass imputer, encoder ....
# where and when to use previously written valid_categorical_columns() .... 
# maybe can simple use  handle_unknown="use_encoded_value", unknown_value=10
# how to provide custom dictionary to encoder ... maybe must provided the encoded list for column i



In [78]:
# TODO fit and score function to use XGBoost early stopping, to prevent overfit, and cross-val

#Notes: https://xgboost.readthedocs.io/en/latest/python/sklearn_estimator.html
# Notes: https://www.kaggle.com/code/robikscube/cross-validation-visualized-youtube-tutorial/notebook
# Notes: https://xgboost.readthedocs.io/en/stable/python/python_api.html
# 

def fit_and_score(model,X_train,X_test, y_train,y_test):
    model.fit(X_train,y_train,eval_set=[(X_test,y_test)], verbose=False)
    r_squared = model.score(X_test,y_test) #coefficient of determination of the prediction
    #Note: If early stopping occurs: best_score, best_iteration and best_ntree_limit
    return model.best_score, model.best_ntree_limit, r_squared

kf = KFold(n_splits=5,shuffle=True,random_state=1)
my_model = XGBRegressor(random_state=1,n_estimators=500,\
                        learning_rate=0.05,\
                        early_stopping_rounds=5,\
                        eval_metric=mean_absolute_error, # from sklearn
                        # verbosity=0
                        )

results = {}

for fold,(train_idx, val_idx) in enumerate(kf.split(X_train_df,y_train)):

    X_tr = X_train_df.iloc[train_idx]
    y_tr = y_train.iloc[train_idx]

    X_val = X_train_df.iloc[val_idx]
    y_val = y_train.iloc[val_idx]
    
    best_score, best_num_trees, r2 = fit_and_score(clone(my_model),X_tr,X_val,y_tr,y_val)
    results[best_num_trees] = [best_score, r2]

    

#print(results)

    # Note: fit model on training data
    # my_model = XGBRegressor(random_state=0,n_estimators=100,learning_rate=0.1)
    #my_model.fit(X_train_df,y_train)
    # Note: make predictions on validation data
    #prediction = my_model.predict(X_val)
    #print("Fold: ",fold, "Mean Absolute Error: ", mean_absolute_error(prediction,y_val))


    # Test
    #my_model = XGBRegressor(random_state=0,n_estimators=500,learning_rate=0.05)
    #my_model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=5,verbose=False)
    #train_score = my_model.score(X_tr,y_tr)
    #valid_score = my_model.score(X_val,y_val)
    #print(train_score,valid_score)
    # Test

  


In [79]:
results

{171: [16122.106445, 0.8444595581132778],
 145: [16563.832031, 0.8964752968373668],
 151: [17535.564453, 0.8662301253799178],
 143: [17532.054688, 0.8191237436112073],
 170: [13657.378906, 0.925793041530367]}

In [80]:
# TODO
# 1. Preprocess test data used for competition: https://www.kaggle.com/competitions/home-data-for-ml-course/leaderboard#
# 2. With optimal hyperparemeters determined with train/val data, fit model on all training data
# 3. Generate predictions for competition on test data using final model

# Note: initialize class with column transformers used on training data
process_test_data = PreprocessData(imputer=imputer,ordinal_encoder=ordinal_encoder,\
                                   one_hot_encoder=one_hot_encoder,\
                                   new_feature_names = encoded_features,\
                                    is_train=False)
process_test_data.set_data("./home-data-for-ml-course/test.csv",index_column="Id")
process_test_data.set_X_y()

X_test = process_test_data.get_X()

#print(X_test.shape)

imputed_test_df = process_test_data.handle_missing_values(X_test,strat="most_frequent")

#print(imputed_test_df.shape)

# Note: determined by examining data ... 
ordinal_columns = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2",\
"HeatingQC","KitchenQual","Functional","FireplaceQu","GarageQual","GarageCond"]

# Note: set and get numerical, categorical, ordinal columns
process_test_data.set_numerical_columns(imputed_test_df)
process_test_data.set_categorical_columns(imputed_test_df)
process_test_data.set_ordinal_columns(ordinal_columns) 

test_numerical_columns = process_test_data.get_numerical_columns()
test_categorical_columns = process_test_data.get_categorical_columns()
test_ordinal_columns = process_test_data.get_ordinal_columns()

ordinal_encoded_test_df = \
process_training_data.encode_ordinal_data(imputed_test_df[ordinal_columns])

#print(ordinal_encoded_test_df.shape)

one_hot_test_cols = [col for col in test_categorical_columns if col not in test_ordinal_columns]

low_c_test, high_c_test =\
    process_training_data.get_low_high_cardinality_columns(imputed_train_df,one_hot_columns,6)

test_df_to_1hot_encode = process_test_data.\
    create_low_cardinality_df(imputed_test_df,low_c_test, high_c_test)


one_hot_encoded_test_df = process_training_data.\
                    one_hot_encode(test_df_to_1hot_encode)


test_dfs_to_combine = [ imputed_test_df[test_numerical_columns],ordinal_encoded_test_df,one_hot_encoded_test_df]

X_test_df = pd.concat(test_dfs_to_combine,axis=1)

# 2. With optimal hyperparameters determined with train/val data, fit model on all training data
my_competition_model = XGBRegressor(random_state=1,n_estimators=170,learning_rate=0.05)
my_competition_model.fit(X_train_df,y_train)

# 3. Generate predictions for competition on test data using final model
competition_predictions = my_competition_model.predict(X_test_df)

print(X_test_df.shape, len(competition_predictions))


(1459, 170) 1459


In [81]:
# Save test predictions to file for Kaggle competition submission: https://www.kaggle.com/c/home-data-for-ml-course
output = pd.DataFrame({'Id': X_test_df.index,
                       'SalePrice': competition_predictions})
output.to_csv('submission_cross_val_xgboos_early_stopt.csv', index=False)