In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # dirname is the directory currently being visited by os.walk.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# read Data
def get_inputs(data_dir='/kaggle/input/home-data-for-ml-course'):
    """Load the training and test CSVs and return them pre-processed.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to
        the Kaggle input directory this notebook was written against.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid)``: the training features (without
        ``SalePrice``) after ``_pre_process``, the ``SalePrice`` column taken
        from the training file, and the test features after ``_pre_process``.
    """
    x_train = pd.read_csv(f'{data_dir}/train.csv')
    # pop() removes the target column and returns it in a single step, which
    # replaces the attribute lookup followed by an in-place drop.
    y_train = x_train.pop('SalePrice')
    x_valid = pd.read_csv(f'{data_dir}/test.csv')
    return _pre_process(x_train), y_train, _pre_process(x_valid)

In [None]:
# data cleaning
def _pre_process(input_df):
    # adding logic based NA values
    input_df.loc[:, 'GarageYrBlt'] = 1900
    input_df.loc[:, 'MasVnrArea'] = 0
    
    # drop cols for large missing value columns
    drop_cols = ['FireplaceQu',
                        'Fence',
                        'Alley',
                        'MiscFeature',
                        'PoolQC']

    return input_df.drop(columns=drop_cols)

In [None]:
def get_categorical_columns():
    """Return a new list with the names of the categorical feature columns.

    The order of the names is fixed; callers receive a fresh list on each call.
    """
    return [
        'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
        'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
        'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
        'KitchenQual', 'Functional',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
        'SaleType', 'SaleCondition',
    ]

In [None]:
def get_numerical_cols():
    """Return a new list with the names of the numerical feature columns.

    The order of the names is fixed; callers receive a fresh list on each call.
    """
    return [
        'MSSubClass', 'LotFrontage', 'LotArea',
        'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
        'MasVnrArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
        'GarageYrBlt', 'GarageCars', 'GarageArea',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
        'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
    ]

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [None]:
# Categorical gaps are filled with the literal 'Missing' before encoding.
cat_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
# removed the old name. Prefer the new keyword and fall back to the old one so
# the cell runs on both older and newer releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Numerical gaps are filled with the column mean.
num_mean_imputer = SimpleImputer(strategy='mean')
# garage_imputer = SimpleImputer(strategy='min')
# num_zero_imputer = SimpleImputer(strategy='constant', fill_value='0')

### Create Pipeline

In [None]:
# Data Pipeline
# Categorical columns: impute first, then encode.
cat_transform = Pipeline([('imputer', cat_imputer), ('encoding', cat_encoder)])
# Numerical columns: imputation only.
num_transform = Pipeline([('imputer', num_mean_imputer)])

# Send each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', num_transform, get_numerical_cols()),
    ('cat', cat_transform, get_categorical_columns()),
])

In [None]:
# from sklearn.compose import make_column_transformer

# preprocessor = make_column_transformer(
#     (SimpleImputer(strategy='constant', fill_value=1900), ['GarageYrBlt']),
#     (SimpleImputer(strategy='constant', fill_value=0), ['MasVnrArea'])
# )

In [None]:
# model pipeline
# Gradient-boosted regressor chained behind the shared preprocessing step.
model = XGBRegressor(learning_rate=0.05, n_estimators=150)
ml_pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

In [None]:
# start pipeline testing and evaluation
# get data
train_df, Y, test_df = get_inputs()

# Cross-validate the whole pipeline (cv=5), scored by negated mean absolute error.
score = cross_val_score(
    estimator=ml_pipeline,
    X=train_df,
    y=Y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

In [None]:
# The scores come from the 'neg_mean_absolute_error' scorer, so this mean is a
# negated MAE: it is negative, and values closer to zero are better.
print(score.mean())

In [None]:
# Fit the full pipeline (preprocessing + model) on the entire training set.
ml_pipeline.fit(train_df, Y)

In [None]:
# Predict with the fitted pipeline on the test frame returned by get_inputs().
prediction = ml_pipeline.predict(test_df)

In [None]:
# Pair each test Id with its predicted price and write the submission file.
output = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': prediction})
output.to_csv('submission.csv', index=False)
