In [97]:
import numpy as np
import pandas as pd
import sklearn
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA


In [98]:
# Read the training and test sets.
X_raw = pd.read_csv('train.csv')
X_test_raw = pd.read_csv('test.csv')

# Keep the target aside before it is removed from the feature frame.
y = X_raw['SalePrice']
X_raw = X_raw.drop(['Id'], axis=1)
X_test_raw = X_test_raw.drop(['Id'], axis=1)

X_raw = X_raw.drop(['SalePrice'], axis=1)

# Stack the training rows on top of the test rows so that both parts go through
# identical preprocessing (recorded combined shape: 2919 rows x 79 columns).
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
# Row labels are kept as they are (so they repeat across the two parts), exactly
# as append did by default; later cells split the frame by position, not label.
X_mix = pd.concat([X_raw, X_test_raw])


(2919, 79)


In [99]:

# clean
def clean(x):
    """Drop sparse columns and fill missing values in the housing feature frame.

    Steps, in order:
      1. Drop 'Alley', 'FireplaceQu', 'PoolQC', 'Fence' and 'MiscFeature'.
      2. Fill 'LotFrontage' with the mean of that column.
      3. Fill a fixed set of categorical / area columns with constant defaults.
      4. Fill 'GarageYrBlt' with the mean of that column.

    The means are computed over the frame that is passed in, so they depend on
    which rows the caller supplies.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing every column named below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified, because the initial
        drop already produces a separate object.

    Raises
    ------
    KeyError
        If any of the referenced columns is absent from ``x``.
    """
    dropped_columns = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

    # (columns, replacement) pairs, applied in this order.
    constant_fills = [
        (['MasVnrType'], 'None'),
        (['MasVnrArea'], 0),
        (['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond'], 'TA'),
        (['BsmtExposure'], 'No'),
        (['BsmtFinType1', 'BsmtFinType2', 'GarageFinish'], 'Unf'),
        (['Electrical'], 'SBrkr'),
        (['GarageType'], 'Attchd'),
    ]

    def fill_with_column_mean(frame, column):
        # Replace the gaps of one numeric column with that column's own mean.
        frame[column] = frame[column].fillna(frame[column].mean())
        return frame

    x = x.drop(dropped_columns, axis=1)
    x = fill_with_column_mean(x, 'LotFrontage')

    for columns, fill_value in constant_fills:
        x[columns] = x[columns].fillna(fill_value)

    # The mean is taken over every row. An earlier version also built an unused
    # "GarageType == 'Attchd'" subset here; it never influenced the result and
    # has been removed.
    x = fill_with_column_mean(x, 'GarageYrBlt')
    return x

def encoding(x):
    """Turn the categorical columns of the housing frame into numeric columns.

    * The nominal columns listed in ``dummy_features`` are one-hot encoded with
      the first level dropped; the new columns are named ``<feature>_<level>``
      and are placed after the remaining columns, in ``dummy_features`` order.
    * The quality columns are mapped onto 1 ('Po') .. 5 ('Ex').
    * 'BsmtExposure', 'CentralAir' and 'LandSlope' are mapped with their own
      small lookup tables.

    Values that are missing from a lookup table come out as NaN.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing every column named below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If any of the referenced columns is absent from ``x``.
    """
    dummy_features = ['RoofMatl', 'HouseStyle', 'Heating', 'Exterior1st', 'Exterior2nd', 'Electrical',
                      'Condition2', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
                      'LotConfig', 'Neighborhood', 'Condition1', 'BldgType', 'RoofStyle',
                      'MasVnrType', 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Functional',
                      'GarageType', 'GarageFinish', 'PavedDrive', 'SaleType', 'SaleCondition']
    quality_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',
                        'GarageQual', 'GarageCond']
    quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    ordinal_maps = {
        'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1},
        'CentralAir': {'Y': 1, 'N': 0},
        'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
    }

    # A single get_dummies call replaces the former per-feature concat/drop loop.
    # That loop passed `axis` to DataFrame.drop positionally, which pandas 2.0
    # rejects; it also rebuilt the whole frame once per feature.
    x = pd.get_dummies(x, columns=dummy_features, drop_first=True)

    for feature in quality_features:
        x[feature] = x[feature].map(quality_map)
    for feature, mapping in ordinal_maps.items():
        x[feature] = x[feature].map(mapping)
    return x

# Run the shared preprocessing on the stacked train+test frame.
X_clean = clean(X_mix)
X = encoding(X_clean)

# Split back by position. The training rows were stacked first and there is one
# target value per training row, so the boundary is len(y) rather than a
# hard-coded row count that would silently go stale if the input files change.
n_train = len(y)
X_train = X.iloc[:n_train, :]
X_test = X.iloc[n_train:, :]


In [100]:

# Fit a gradient-boosted regressor with library defaults on the training rows.
model = XGBRegressor()
model.fit(X_train, y)

# Predict the held-out rows and write them out under the ids 1461..2919.
y_test = model.predict(X_test)
result = pd.DataFrame(
    {
        'Id': range(1461, 2920),
        'SalePrice': y_test,
    }
)
result.to_csv('result.csv', index=False)