# Import Libraries

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### Train Data

In [2]:
# Load the training set (indexed by Id) and keep only the columns that have
# no missing values at all.
train_data = pd.read_csv('./train.csv', index_col='Id').dropna(axis=1)

# Regression target.
y = train_data['SalePrice']

# Columns removed from the feature matrix: the target itself plus features
# excluded from modelling.
# NOTE(review): the excluded features are presumably the ones that contain
# missing values in test.csv — TODO confirm.
dropped_features = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
    'GarageArea',
    'SalePrice',
    'Exterior1st',
    'Exterior2nd',
    'Functional',
    'KitchenQual',
    'MSZoning',
    'SaleType',
    'Utilities',
]

X = train_data.drop(columns=dropped_features)

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Preprocessing, Model Training, and Validation

In [3]:
# Split the feature columns by dtype so each group gets its own preprocessing.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include='object').columns

# Numeric columns: fill gaps using SimpleImputer's default strategy.
numerical_transformer = SimpleImputer()

# Text columns: fill gaps with a constant placeholder, then one-hot encode.
# handle_unknown='ignore' keeps prediction from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer (columns are addressed by name).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full pipeline: preprocessing followed by a seeded 100-tree random forest.
mlpipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0)),
])
mlpipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out validation rows.
preds = mlpipe.predict(X_valid)
mae2 = mean_absolute_error(y_valid, preds)
print("MAE:", mae2)

MAE: 18327.196130136985


In [4]:
# Persist the fitted pipeline (preprocessing + model) to disk so it can be
# reloaded without retraining. `pickle` is imported in the notebook's import
# cell rather than here, so every cell that uses it works after a fresh
# Restart & Run All regardless of which cell first needed it.
with open('model.pkl', 'wb') as f:
    pickle.dump(mlpipe, f)


In [5]:
# Reload the pipeline from disk.
# NOTE: pickle.load can execute arbitrary code; only load files that are
# trusted (e.g. ones written by this notebook).
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Display the reloaded pipeline as the cell output.
model

### Test Data

In [6]:
test_data = pd.read_csv('./test.csv', index_col='Id')

# Feed the model exactly the features it was trained on. Choosing test columns
# with an independent dropna(axis=1) ties the feature set to whichever test
# columns happen to contain NaNs; the pipeline already imputes missing values,
# so no test column needs to be dropped for that reason.
missing_features = set(X.columns) - set(test_data.columns)
assert not missing_features, (
    f"test.csv is missing training features: {sorted(missing_features)}"
)
X_test = test_data[X.columns]

preds_test = model.predict(X_test)

In [7]:



# Build the submission table: one row per test Id with its predicted price.
# astype(int) discards the fractional part of each prediction.
out = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test.astype(int)})

# DataFrame.to_csv returns None when it is given a path, so its result must
# not be assigned back to `out`; keeping `out` bound to the DataFrame lets
# later cells inspect what was written.
out.to_csv('submission.csv', index=False)

In [8]:
# Display whatever `out` is currently bound to as this cell's output.
out