
![Ames Housing dataset image](https://i.imgur.com/lTJVG4e.png)

This notebook is a fork of the "Exercise: Pipelines" notebook from the [Intermediate Machine Learning](https://www.kaggle.com/learn/intermediate-machine-learning) course.


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)


/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/test.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/data_description.txt


In [2]:
# import my own script
import utility as util


In [5]:
# Show the docstring and the list of functions of the custom `utility` module.
help(util)

Help on module utility:

NAME
    utility

DESCRIPTION
    # %% [code]
    ## utility.py
    ## This code is made by Tomo Shimobayashi.
    ## Usage of Functions:
    ## 
    ## - load_data(path, filename)
    ##   import cvs to dataframe 
    ##   Returns pd.Dataframe
    ##
    ## - check_df(df)
    ##   checking the numbers of NaNs and Duplicates in the data frame
    ##
    ## - num_cat_colname(X)
    ##   Returns two arrays: num_array, cat_array as an array
    ##       numerical value array and categorical value array
    ##   input: X should be a Dafaframe
    ##   
    ## - regres_models(X_train, y_train)
    ##   Try some regression models: Random Forest and Gradient Boosting
    ##   The scoring method is neg_mean_abs_error
    ##

FUNCTIONS
    check_df(df)
    
    load_data(path, filename)
    
    num_cat_colname(X)
    
    regres_models(X_train, y_train)

FILE
    /kaggle/usr/lib/utility/utility.py




In [6]:
# Load the training and test CSV files through the custom helper.
path = '/kaggle/input/home-data-for-ml-course/'
X_full, X_test_full = (
    util.load_data(path, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Remove rows with missing target, separate target from predictors.
# New frames are built instead of mutating X_full in place, so this cell can
# be re-run safely: an in-place drop of 'SalePrice' makes a second run fail
# because the column is already gone.
X_labeled = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_labeled['SalePrice']
X_predictors = X_labeled.drop(columns=['SalePrice'])

# Break off validation set from training data (80/20 split, fixed seed)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_predictors, y, train_size=0.8, test_size=0.2, random_state=0
)

# Column selection is delegated to the custom helper, which returns the
# numerical column names first and the categorical column names second.
# NOTE(review): the tutorial kept only object-dtype columns with fewer than 10
# unique values ("low cardinality") plus int64/float64 columns; confirm that
# util.num_cat_colname applies an equivalent filter.
numerical_cols, categorical_cols = util.num_cat_colname(X_train_full)

# Keep selected columns only (copies, so later edits do not touch the originals)
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [8]:
# Preview the first rows of the selected training features.
X_train.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,843,468,81,0,0,0,0,0,1,2006


The next code cell uses code from the tutorial to preprocess the data and train a model.

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Numerical columns: fill missing entries with a constant value.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing entries with the most frequent value,
# then one-hot encode (unknown categories are ignored).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Random forest with a fixed seed so the score is reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Chain preprocessing and the model into a single estimator.
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit the whole pipeline on the training split.
clf.fit(X_train, y_train)

# Predict on the validation split and report the error.
preds = clf.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 17740.290308219177


The code yields a mean absolute error (MAE) of about 17,740.  In the next step, I amend the code to do better.

# Step 1: Improve the performance


In [10]:
# Numerical columns: this time fill missing entries with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: most-frequent imputation followed by one-hot encoding.
mode_imputer = SimpleImputer(strategy='most_frequent')
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(
    steps=[('imputer', mode_imputer), ('onehot', onehot_encoder)]
)

# Combine both transformers, each applied to its own set of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Random forest regressor with a fixed seed.
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [11]:
# Chain the preprocessor and the model into one estimator.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
my_pipeline = Pipeline(steps=pipeline_steps)

# Fit the whole pipeline on the training split.
my_pipeline.fit(X_train, y_train)

# Predict on the validation split.
preds = my_pipeline.predict(X_valid)

# Report the mean absolute error on the validation split.
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)


MAE: 17721.08506849315


In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the validation and test splits.
X_train_p = preprocessor.fit_transform(X_train)
X_valid_p, X_test_p = (
    preprocessor.transform(split) for split in (X_valid, X_test)
)

In [None]:
# Try several regressors on the preprocessed training data with the custom helper
# (per its module docs: Random Forest and Gradient Boosting, scored with neg_mean_abs_error).
util.regres_models(X_train_p, y_train)

# Step 2: Generate test predictions


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit gradient boosting on the preprocessed training split (fixed seed).
gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(X_train_p, y_train)

# Score the model on the held-out validation split before using it for the
# submission, so it can be compared with the Random Forest MAE printed earlier.
gbr_valid_mae = mean_absolute_error(y_valid, gbr.predict(X_valid_p))
print('GBR validation MAE:', gbr_valid_mae)

# Predict on the preprocessed test set for the submission file.
preds_test = gbr.predict(X_test_p)

In [None]:
# Save test predictions to file.
# 'Id' is read from the unfiltered test frame rather than from X_test: X_test
# only holds the columns selected for modelling, so relying on it would break
# if 'Id' were not among the selected features. Both frames share the same
# rows, so the values line up with preds_test.
# NOTE(review): if 'Id' is currently among the selected columns it is also
# being used as a model feature — confirm that this is intended.
output = pd.DataFrame({'Id': X_test_full['Id'],
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)

In [None]:
# Disabled exploratory cell: permutation importance with eli5. Every line below
# is commented out, so nothing in this cell runs.
# NOTE(review): if this is re-enabled, `model` must already be fitted, and
# `get_feature_names` may need to become `get_feature_names_out` on newer
# scikit-learn releases — confirm against the installed version.
# import eli5
# from eli5.sklearn import PermutationImportance
# X_train_p = preprocessor.transform(X_train)
# imputer = SimpleImputer(strategy='most_frequent')
# X_train_imp = imputer.fit_transform(X_train[categorical_cols])
# onehot = OneHotEncoder(handle_unknown='ignore')
# onehot.fit(X_train_imp)

# perm = PermutationImportance(model, random_state=1).fit(X_train_p, y_train)
# column_array = X_train[numerical_cols].columns.tolist()
# column_array.extend(onehot.get_feature_names(X_train[categorical_cols].columns.tolist()))
# eli5.show_weights(perm, feature_names = column_array)