# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

### Train Data

In [22]:
train_data = pd.read_csv('./train.csv', index_col='Id')
train_data
# train_data.shape
train_data = train_data.dropna(axis=1)


X = train_data.dropna(axis=0, subset="SalePrice")

y = train_data.SalePrice

X = X.drop("SalePrice", axis=1)



X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [24]:
print(X.head())

X.shape

    MSSubClass MSZoning  LotArea Street LotShape LandContour Utilities  \
Id                                                                       
1           60       RL     8450   Pave      Reg         Lvl    AllPub   
2           20       RL     9600   Pave      Reg         Lvl    AllPub   
3           60       RL    11250   Pave      IR1         Lvl    AllPub   
4           70       RL     9550   Pave      IR1         Lvl    AllPub   
5           60       RL    14260   Pave      IR1         Lvl    AllPub   

   LotConfig LandSlope Neighborhood  ... OpenPorchSF EnclosedPorch 3SsnPorch  \
Id                                   ...                                       
1     Inside       Gtl      CollgCr  ...          61             0         0   
2        FR2       Gtl      Veenker  ...           0             0         0   
3     Inside       Gtl      CollgCr  ...          42             0         0   
4     Corner       Gtl      Crawfor  ...          35           272         0   
5

(1460, 60)

# Preprocessing

In [51]:
features = ["MSSubClass",	"MSZoning",	"LotArea",	"Street",	"LotShape",	"LandContour",	"Utilities",	"LotConfig",	"LandSlope",	"Neighborhood",
 "EnclosedPorch",	"3SsnPorch",	"ScreenPorch",	"PoolArea",	"MiscVal", "MoSold","YrSold", "SaleType", "SaleCondition"]

features_1 = [
	"WoodDeckSF",	"OpenPorchSF",	"EnclosedPorch",	"3SsnPorch",	"ScreenPorch",	"PoolArea",	"MiscVal",	"MoSold",	"YrSold",	"SaleCondition",
    	"MSSubClass",	"LotArea",	"Street",	"LotShape",	"LandContour",	"LotConfig",	"LandSlope",	"Neighborhood",	"Condition1",	"Condition2",
]

features_2 = [ 'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond',
'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
'1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
'Fireplaces', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
'MoSold', 'YrSold', 'SaleCondition']

feature_3 = list(set(X.columns) - set(['BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtUnfSF',
 'Exterior1st',
 'Exterior2nd',
 'Functional',
 'GarageArea',
 'GarageCars',
 'KitchenQual',
 'MSZoning',
 'SaleType',
 'TotalBsmtSF',
 'Utilities']))

refined_X = X[feature_3]

numerical_cols = refined_X.select_dtypes(include="number").columns

print("Numerical cols: \n", numerical_cols)

categorical_cols = refined_X.select_dtypes(include="object").columns

print("\nCategorical cols: \n", categorical_cols)

Numerical cols: 
 Index(['2ndFlrSF', 'HalfBath', 'MiscVal', 'PoolArea', 'FullBath',
       'LowQualFinSF', 'ScreenPorch', 'EnclosedPorch', 'OverallCond',
       '3SsnPorch', 'MoSold', 'LotArea', 'WoodDeckSF', 'BedroomAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'MSSubClass', 'YrSold', 'GrLivArea',
       'OverallQual', 'YearRemodAdd', '1stFlrSF', 'YearBuilt', 'KitchenAbvGr',
       'OpenPorchSF'],
      dtype='object')

Categorical cols: 
 Index(['Neighborhood', 'ExterCond', 'LandContour', 'PavedDrive', 'RoofMatl',
       'RoofStyle', 'ExterQual', 'Foundation', 'HeatingQC', 'HouseStyle',
       'LandSlope', 'BldgType', 'CentralAir', 'Condition2', 'SaleCondition',
       'Condition1', 'Street', 'LotConfig', 'Heating', 'LotShape'],
      dtype='object')


In [54]:
from xgboost import XGBRegressor

numerical_transformer = StandardScaler()

categorical_transformer = Pipeline(steps=[  
    # ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

mlpipe = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', RandomForestRegressor(n_estimators=1000, random_state=0))
])

mlpipe.fit(X_train, y_train)

preds = mlpipe.predict(X_valid)

mae = mean_absolute_error(y_valid, preds)


print("MAE:", mae)

MAE: 17999.492087573388


In [55]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(mlpipe, f)


In [56]:
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

model

### Making Predictions on Test Data

In [58]:
test_data = pd.read_csv('./test.csv', index_col='Id')
X_test = test_data.dropna(axis=1)

X_test = X_test[feature_3]

preds_test = mlpipe.predict(X_test)
preds_test

array([124090.023, 156056.885, 179238.789, ..., 149933.078, 128585.8  ,
       239014.927])

In [59]:
out = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test.astype(int)})

out = out.to_csv('submission.csv', index=False)

In [63]:
out