# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.io.pytables import dropna_doc
import seaborn as sns  
import missingno as msno
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from  sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Read both CSV splits from the HousePrices directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("HousePrices/train.csv", "HousePrices/test.csv")
)
# Keep the test-set identifiers now: clean() drops the 'Id' column, and the
# submission file written at the end of the script needs them.
testIds = test_data.loc[:, 'Id']


# print(train_data.isnull().any())
# print(test_data.isnull().any())
# print(test_data.isna().sum())
# print(train_data.isnull().values.sum())
# print(test_data.isnull().values.sum())
# msno.bar(train_data)
# print(train_data['MiscFeature'].isna().sum())


# Columns removed outright before modelling.
DROP_COLS = [
    'Id', 'MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'Alley',
    'BsmtFinType1', 'BsmtFinType2', 'MoSold', 'SaleType', 'SaleCondition',
    'GarageYrBlt', 'PoolArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'MiscVal', 'Condition1', 'Condition2', 'Street',
    'LandSlope', 'MasVnrType', 'LotConfig', 'HouseStyle',
]

# Numeric columns whose missing values are replaced by the column median.
NUM_COLS = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'YrSold',
]

# Text columns from which every space character is removed.
STRIP_SPACE_COLS = ['MSZoning', 'Exterior1st', 'Exterior2nd']

# Text columns whose missing values are replaced by the literal string 'NA'.
NA_FILL_COLS = [
    'GarageCond', 'GarageQual', 'GarageType', 'BsmtCond', 'BsmtQual',
    'GarageFinish', 'Functional', 'KitchenQual', 'Electrical', 'BsmtExposure',
    'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
]


def clean(data):
    """Return a cleaned copy of a house-price frame; the input is not modified.

    Steps, in order:
      1. drop the columns listed in ``DROP_COLS``;
      2. remove spaces from the values of ``STRIP_SPACE_COLS``;
      3. fill missing values in ``NUM_COLS`` with that column's median,
         computed on the frame being cleaned;
      4. fill missing values in ``NA_FILL_COLS`` with the string ``'NA'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in the lists above.

    Returns
    -------
    pandas.DataFrame
        New frame without the dropped columns. ``'NA'`` is an ordinary
        category value in the result, so any encoder applied afterwards has
        to know that label.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``data``.
    """
    # drop() returns a new frame, so the caller's object stays untouched.
    data = data.drop(columns=DROP_COLS)

    for col in STRIP_SPACE_COLS:
        data[col] = data[col].str.replace(" ", "")

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form is deprecated and has
    # no effect under pandas copy-on-write.
    for col in NUM_COLS:
        data[col] = data[col].fillna(data[col].median())

    # The previous code stored the return value of fillna(..., inplace=True),
    # which is None, into Exterior1st/Exterior2nd and wiped both columns.
    data[NA_FILL_COLS] = data[NA_FILL_COLS].fillna('NA')

    return data
        
            

# Apply the same cleaning to both splits.
train_data = clean(train_data)
test_data = clean(test_data)

# One-hot encode the nominal columns that have only a few levels.
OneH_cols = ['LandContour','BldgType']
train_data = pd.get_dummies(train_data, columns=OneH_cols)
test_data = pd.get_dummies(test_data, columns=OneH_cols)

# get_dummies only creates columns for categories present in the frame it is
# given, so a level missing from one split would leave train and test with
# different feature columns. Align the test frame to the training layout
# (minus the target): dummies absent from test are added as all-zero columns,
# and dummies unknown to the training data are dropped.
feature_columns = train_data.columns.drop('SalePrice', errors='ignore')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Nominal columns converted to integer codes.
LaEn_cols = ['Neighborhood', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Heating', 'CentralAir', 
            'Foundation', 'PavedDrive']

for Lcol in LaEn_cols:
    # Fit on the labels of BOTH splits: LabelEncoder.transform raises
    # ValueError for a label it has not seen, so fitting on the training
    # column alone fails whenever a value occurs only in the test split.
    le = LabelEncoder()
    le.fit(pd.concat([train_data[Lcol], test_data[Lcol]], ignore_index=True))
    train_data[Lcol] = le.transform(train_data[Lcol])
    test_data[Lcol] = le.transform(test_data[Lcol])


# Columns whose text levels are replaced by hand-picked numeric ranks.
OrEn_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'Electrical', 'KitchenQual', 
            'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'Utilities', 'LotShape','MSZoning']

# Flat level -> rank lookup shared by the ordinal columns.
# 'Gd' used to appear twice in this literal (4, then 3). A dict keeps only the
# last value, so 'Gd' and 'TA' both became 3 in every quality column. The
# 0-3 scale that also uses 'Gd' now lives in BsmtExposure_dict below.
Or_dict = {
    'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0,
    'Av': 2, 'Mn': 1, 'No': 0,
    'SBrkr': 2, 'FuseA': 2, 'FuseF': 1, 'FuseP': 1, 'Mix': 1.5,
    'Typ': 5, 'Min1': 4, 'Min2': 4, 'Mod': 3, 'Maj1': 2, 'Maj2': 2, 'Sev': 1, 'Sal': 0,
    '2Types': 3, 'CarPort': 2, 'Detchd': 2, 'BuiltIn': 1, 'Basment': 1, 'Attchd': 1,
    'Fin': 3, 'RFn': 2, 'Unf': 0,
    'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4,
    'A': 1, 'C': 5, 'FV': 3, 'I': 1, 'RH': 4, 'RL': 2, 'RP': 2, 'RM': 3, 'C(all)': 5,
    'AllPub': 4, 'NoSewr': 3, 'NoSeWa': 2, 'ELO': 1,
}

# Same lookup with 'Gd' on the Gd/Av/Mn/No scale (Gd=3).
# NOTE(review): BsmtExposure is assumed to be the only column that uses this
# scale, per the Ames housing data dictionary -- confirm.
BsmtExposure_dict = dict(Or_dict, Gd=3)

for orcol in OrEn_cols:
    lookup = BsmtExposure_dict if orcol == 'BsmtExposure' else Or_dict
    # Series.map turns any level missing from the lookup into NaN.
    train_data[orcol] = train_data[orcol].map(lookup)
    test_data[orcol] = test_data[orcol].map(lookup)
    
# for orcol in OrEn_cols:
#     # print(test_data[orcol].isnull().sum().sum())
#     print(train_data[orcol].isnull().sum().sum())

# Separate the features from the target column.
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

seed = 7
n_trees = 30
kfold = KFold(n_splits=8, random_state=42, shuffle=True)

clf = XGBRegressor(n_estimators=n_trees, random_state=seed)

# Cross-validated mean absolute error. The earlier loop over kfold.split()
# only reassigned the split variables, so nothing was evaluated and the model
# ended up trained on the last fold's training portion alone.
# cross_val_score fits clones of clf, leaving clf itself unfitted.
xgb_cv_mae = -cross_val_score(clf, X, y, cv=kfold, scoring='neg_mean_absolute_error')
print("XGBoost CV MAE: {:.2f} (std {:.2f})".format(xgb_cv_mae.mean(), xgb_cv_mae.std()))

# Final model for prediction: fitted on every training row.
model = clf.fit(X, y)
y_pred1 = model.predict(test_data)


# Seeded like the XGBoost model so repeated runs give the same submission.
rf = RandomForestRegressor(n_estimators=60, random_state=seed)

# Cross-validated mean absolute error on the shared KFold splitter. The
# earlier loop over kfold.split() evaluated nothing and left the forest
# trained on the last fold's training portion alone.
rf_cv_mae = -cross_val_score(rf, X, y, cv=kfold, scoring='neg_mean_absolute_error')
print("RandomForest CV MAE: {:.2f} (std {:.2f})".format(rf_cv_mae.mean(), rf_cv_mae.std()))

# Final model for the submission: fitted on every training row.
rf.fit(X, y)
y_pred2 = rf.predict(test_data)


# Pair each saved test Id with its random-forest prediction and write the
# result to Houseprices.csv without the index column.
submission_columns = {'Id': testIds.values, 'SalePrice': y_pred2}
df = pd.DataFrame(submission_columns)
df.to_csv('Houseprices.csv', index=False)