In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier  
from sklearn.ensemble import RandomForestRegressor 




In [2]:
# Data processing

def _dense_one_hot_encoder(**encoder_kwargs):
    """Create a OneHotEncoder that yields a dense array on old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 names the flag `sparse_output`; the old `sparse`
        # keyword was removed in 1.4 and raises TypeError there.
        return OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        # Older releases only understand the original `sparse` keyword.
        return OneHotEncoder(sparse=False, **encoder_kwargs)


def data_process(X,category):
    """Convert a predictor DataFrame into a dense float matrix.

    Columns whose dtype is not ``object`` are copied through as a single
    column; ``object`` columns are one-hot encoded. The pieces are stacked in
    the original column order and every NaN in the result is replaced by 0.

    Parameters
    ----------
    X : pd.DataFrame
        Predictors only (no PID, no price).
    category : False or dict
        ``False``: learn the categories of every ``object`` column from ``X``
        and return them alongside the matrix.
        dict: mapping ``column name -> categories`` as returned by an earlier
        call with ``category=False``; ``object`` columns are encoded against
        these fixed categories with ``handle_unknown='ignore'``.

    Returns
    -------
    When ``category`` is ``False``: ``(X_new, column_names_new, category_dict)``
    where ``column_names_new`` lists the output columns (the original name for
    pass-through columns, the raw category values for encoded columns) and
    ``category_dict`` maps each encoded column to its ``encoder.categories_``.
    Otherwise: ``X_new`` only.
    """
    row = X.shape[0]
    # Only the literal False selects "learn categories" mode; an (even empty)
    # dict of categories always selects "reuse categories" mode.
    learn_categories = category is False

    pieces = []
    column_names_new = []
    category_dict = {}
    for column_name in X.columns:
        if X[column_name].dtype != 'object':
            pieces.append(X[column_name].to_numpy().reshape(row, -1))
            column_names_new.append(column_name)
            continue

        if learn_categories:
            encoder = _dense_one_hot_encoder(categories='auto')
        else:
            encoder = _dense_one_hot_encoder(
                categories=category[column_name], handle_unknown='ignore'
            )
        pieces.append(encoder.fit_transform(X[[column_name]]))
        if learn_categories:
            column_names_new.extend(encoder.categories_[0])
            category_dict[column_name] = encoder.categories_

    # Stack once instead of re-copying the growing matrix for every column.
    if pieces:
        X_new = np.concatenate(pieces, axis=1).astype(np.float64, copy=False)
    else:
        X_new = np.empty((row, 0))
    X_new = np.nan_to_num(X_new, nan=0)

    if learn_categories:
        return X_new, column_names_new, category_dict
    return X_new


                

In [3]:
# Data input
#
# Every fold directory is read for three files:
#   train.csv  - PID, predictors, Sale_Price
#   test.csv   - PID, predictors
#   test_y.csv - PID, Sale_Price
# Prices are stored on the log scale. Predictors are converted to numeric
# matrices with data_process; the test split reuses the categories learned
# on the matching training split.

folder_address = './proj1/'
name_list = ['/fold{}'.format(fold_number) for fold_number in range(1, 11)]
train_name = '/train.csv'
test_name = '/test.csv'
testy_name = '/test_y.csv'

# One entry per fold, in name_list order.
train_data = []
train_PID = []
train_price = []
train_data_name = []
train_data_category = []
test_data = []
test_PID = []
true_price = []
true_price_PID = []

for fold_name in name_list:
    fold_prefix = folder_address + fold_name

    # Training split: keep the PID aside, log-transform the response.
    train_df = pd.read_csv(fold_prefix + train_name)
    train_PID.append(train_df['PID'])
    train_price.append(np.log(np.array(train_df['Sale_Price'])))
    train_matrix, column_names_new, category_dict = data_process(
        train_df.drop(['PID', 'Sale_Price'], axis=1), False
    )
    train_data.append(train_matrix)
    train_data_category.append(category_dict)
    train_data_name.append(column_names_new)

    # Test split: encode with the categories learned from the training split.
    test_df = pd.read_csv(fold_prefix + test_name)
    test_PID.append(test_df['PID'])
    test_matrix = data_process(test_df.drop(['PID'], axis=1), category_dict)
    if test_matrix.shape[1] != train_matrix.shape[1]:
        raise ValueError(
            'Fold {}: test matrix has {} columns but training matrix has {}'.format(
                fold_name, test_matrix.shape[1], train_matrix.shape[1]
            )
        )
    test_data.append(test_matrix)

    # True test prices: these are compared with predictions row by row later
    # on, so they must list the same PIDs in the same order as test.csv.
    truth_df = pd.read_csv(fold_prefix + testy_name)
    if not np.array_equal(truth_df['PID'].to_numpy(), test_df['PID'].to_numpy()):
        raise ValueError(
            'Fold {}: PIDs in test_y.csv do not match the row order of test.csv'.format(
                fold_name
            )
        )
    true_price_PID.append(truth_df['PID'])
    true_price.append(np.log(np.array(truth_df['Sale_Price'])))



In [5]:
# Attempt one: Random Forest for the price prediction task
# Fit on each fold's training matrix and report the RMSE of the
# log-price predictions on that fold's test matrix.
np.random.seed(2417)

reg = RandomForestRegressor(random_state=42, n_estimators=500)
price_pred = []  # nothing is appended to this list in this cell
rmse = []

for fold_X_train, fold_y_train, fold_X_test, fold_y_true in zip(
    train_data, train_price, test_data, true_price
):
    reg.fit(fold_X_train, fold_y_train)
    fold_prediction = reg.predict(fold_X_test)
    fold_mse = mean_squared_error(fold_y_true, fold_prediction)
    fold_rmse = np.sqrt(fold_mse)
    print(fold_rmse)
    rmse.append(fold_rmse)


0.13851241807832565
0.14101569277155732
0.1328242415439633
0.13853494110710635
0.12796759149354486
0.14911653891320378
0.15281254281552256
0.1443927646814941
0.1486760173467215
0.13922031023642378


In [9]:
# Attempt two: XGboost model for the price prediction task
# Same per-fold protocol as the first attempt: fit on the fold's training
# matrix, predict its test matrix, record the RMSE on the log-price scale.

np.random.seed(2417)

xgb_model = XGBRegressor(subsample=0.5, eta=0.05, n_estimators=500)
rmse_xgb = []

for fold_X_train, fold_y_train, fold_X_test, fold_y_true in zip(
    train_data, train_price, test_data, true_price
):
    xgb_model.fit(fold_X_train, fold_y_train)
    fold_prediction = xgb_model.predict(fold_X_test)
    fold_mse = mean_squared_error(fold_y_true, fold_prediction)
    fold_rmse = np.sqrt(fold_mse)
    print(fold_rmse)
    rmse_xgb.append(fold_rmse)

0.11378088176721403
0.11755640524084741
0.11589603336767859
0.11843562084609424
0.10784543122492683
0.12866615199770726
0.1325701213273024
0.12798373482307962
0.13178518650464946
0.11686705308435665


In [10]:
# Mean RMSE across folds: Random Forest first, then XGBoost.
for fold_scores in (rmse, rmse_xgb):
    print(sum(fold_scores) / len(fold_scores))

0.14130730589878632
0.12113866201838568
