In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

from sklearn import set_config

In [3]:
df_train_raw = pd.read_csv('./datasets/train.csv')

df = df_train_raw.copy()

In [4]:
df_test_raw = pd.read_csv('./datasets/test.csv')
df_test = df_test_raw.copy()

In [5]:
#column_filter = pd.Series(df_dummies.corr()['SalePrice']).sort_values(ascending=False) > 0.5

In [6]:
df.loc[:, df.isnull().sum() > 0]

Unnamed: 0,Lot Frontage,Alley,Mas Vnr Type,Mas Vnr Area,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,...,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Pool QC,Fence,Misc Feature
0,,,BrkFace,289.0,TA,TA,No,GLQ,533.0,Unf,...,Attchd,1976.0,RFn,2.0,475.0,TA,TA,,,
1,43.0,,BrkFace,132.0,Gd,TA,No,GLQ,637.0,Unf,...,Attchd,1997.0,RFn,2.0,559.0,TA,TA,,,
2,68.0,,,0.0,TA,TA,No,GLQ,731.0,Unf,...,Detchd,1953.0,Unf,1.0,246.0,TA,TA,,,
3,73.0,,,0.0,Gd,TA,No,Unf,0.0,Unf,...,BuiltIn,2007.0,Fin,2.0,400.0,TA,TA,,,
4,82.0,,,0.0,Fa,Gd,No,Unf,0.0,Unf,...,Detchd,1957.0,Unf,2.0,484.0,TA,TA,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,79.0,,,0.0,Gd,TA,Av,GLQ,1011.0,Unf,...,Attchd,2007.0,Fin,2.0,520.0,TA,TA,,,
2047,,,,0.0,TA,TA,No,BLQ,262.0,Unf,...,Detchd,1961.0,Unf,2.0,539.0,TA,TA,,,
2048,57.0,,,0.0,TA,TA,No,Unf,0.0,Unf,...,Detchd,1929.0,Unf,2.0,342.0,Fa,Fa,,,
2049,80.0,,,0.0,TA,TA,No,Rec,155.0,LwQ,...,Attchd,1956.0,Unf,1.0,294.0,TA,TA,,,


In [7]:
# Fixing Nulls by creating a third categeory 'other' which preserves the null meaning

In [8]:
df.loc[:, df.dtypes == object] = df.loc[:, df.dtypes == object].fillna('Other')

In [9]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,Other,IR1,Lvl,...,0,0,Other,Other,Other,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,Other,IR1,Lvl,...,0,0,Other,Other,Other,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,Other,Reg,Lvl,...,0,0,Other,Other,Other,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,Other,Reg,Lvl,...,0,0,Other,Other,Other,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,Other,IR1,Lvl,...,0,0,Other,Other,Other,0,3,2010,WD,138500


In [10]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

In [11]:
# Setting up Parameters, into a pipe, into a GridSearch for optimization

In [12]:
ss = SimpleImputer()

In [13]:
X

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,109,533352170,60,RL,,13517,Pave,Other,IR1,Lvl,...,0,0,0,Other,Other,Other,0,3,2010,WD
1,544,531379050,60,RL,43.0,11492,Pave,Other,IR1,Lvl,...,0,0,0,Other,Other,Other,0,4,2009,WD
2,153,535304180,20,RL,68.0,7922,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,1,2010,WD
3,318,916386060,60,RL,73.0,9802,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,4,2010,WD
4,255,906425045,50,RL,82.0,14235,Pave,Other,IR1,Lvl,...,0,0,0,Other,Other,Other,0,3,2010,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,Other,IR1,HLS,...,0,0,0,Other,Other,Other,0,1,2008,WD
2047,785,905377130,30,RL,,12342,Pave,Other,IR1,Lvl,...,0,0,0,Other,Other,Other,0,3,2009,WD
2048,916,909253010,50,RL,57.0,7558,Pave,Other,Reg,Bnk,...,0,0,0,Other,Other,Other,0,3,2009,WD
2049,639,535179160,20,RL,80.0,10400,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,11,2009,WD


In [14]:
ColumnTransformer

sklearn.compose._column_transformer.ColumnTransformer

In [15]:
SimpleImputer

sklearn.impute._base.SimpleImputer

In [16]:
StandardScaler

sklearn.preprocessing._data.StandardScaler

In [17]:
OneHotEncoder

sklearn.preprocessing._encoders.OneHotEncoder

In [18]:
KNeighborsRegressor

sklearn.neighbors._regression.KNeighborsRegressor

In [19]:
list(X.loc[:, X.dtypes == object].columns)

['MS Zoning',
 'Street',
 'Alley',
 'Lot Shape',
 'Land Contour',
 'Utilities',
 'Lot Config',
 'Land Slope',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Exter Qual',
 'Exter Cond',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Cond',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin Type 2',
 'Heating',
 'Heating QC',
 'Central Air',
 'Electrical',
 'Kitchen Qual',
 'Functional',
 'Fireplace Qu',
 'Garage Type',
 'Garage Finish',
 'Garage Qual',
 'Garage Cond',
 'Paved Drive',
 'Pool QC',
 'Fence',
 'Misc Feature',
 'Sale Type']

In [20]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
#numeric_features = list(X.loc[:, X.dtypes == object].columns)
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler', StandardScaler())])

#cat_features = list(X.loc[:, X.dtypes != object].columns)
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessing = ColumnTransformer(transformers=[
    ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include='object'))
])

In [21]:
params={
    #'simpleimputer__strategy': ['mean', 'median', 'most_frequent', 'constant'],
    #'simpleimputer__strategy': ['mean'],
    #'simpleimputer__fill_value': [0, 1],
    'kneighborsregressor__n_neighbors': list(range(3, 31, 2)),
    #'kneighborsregressor__leaf_size': list(range(10,100))
    'kneighborsregressor__leaf_size': list(range(20,51))
}

In [22]:
pipe = make_pipeline(preprocessing, KNeighborsRegressor())

In [23]:
gs = GridSearchCV(pipe, param_grid=params, n_jobs=-1)

In [24]:
#set_config(display='diagram')
set_config(display='text')
gs

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('num_imputer',
                                                                                          SimpleImputer()),
                                                                                         ('num_scaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f04e46fdac0>),
                                                                        ('cat',
                                                                         Pipeline(steps=[('cat_imputer',
                                                                                       

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [26]:
X_train

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
1862,286,909100080,30,RL,67.0,4853,Pave,Other,Reg,Bnk,...,0,0,0,Other,MnPrv,Other,0,5,2010,WD
1275,874,907285020,60,RL,,9375,Pave,Other,Reg,Lvl,...,0,0,0,Other,GdPrv,Other,0,2,2009,WD
361,1701,528118040,60,RL,118.0,13654,Pave,Other,IR1,Lvl,...,0,0,0,Other,Other,Other,0,5,2007,WD
254,2813,907414040,20,RL,65.0,8772,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,9,2006,New
131,657,535376060,60,RL,60.0,10800,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,8,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453,1447,907202010,20,RL,,12250,Pave,Other,IR1,Lvl,...,0,0,0,Other,Other,Other,0,5,2008,WD
1522,2388,528118020,60,RL,108.0,13418,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,9,2006,WD
1591,1981,902128160,30,RM,60.0,7200,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,5,2007,WD
1097,799,905480210,70,RL,60.0,9084,Pave,Other,Reg,Lvl,...,0,0,0,Other,MnPrv,Other,0,10,2009,WD


In [27]:
X_test

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
319,1108,528365080,60,RL,91.0,10010,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,7,2008,WD
2030,306,911202100,50,C (all),66.0,8712,Pave,Pave,Reg,HLS,...,0,0,0,Other,Other,Other,0,1,2010,WD
1328,1949,535378080,50,RL,60.0,10284,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,8,2007,WD
1475,2491,532376080,20,RL,,9308,Pave,Other,IR1,Lvl,...,0,0,0,Other,MnPrv,Other,0,7,2006,WD
780,1575,916382120,20,RL,71.0,9520,Pave,Other,IR1,HLS,...,0,0,0,Other,Other,Other,0,4,2008,New
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,2386,528116030,20,RL,98.0,16033,Pave,Other,IR1,Lvl,...,0,0,0,Other,Other,Other,0,3,2006,WD
415,2045,904100100,70,RL,107.0,12888,Pave,Other,Reg,Bnk,...,0,0,0,Other,Other,Other,0,4,2007,WD
1053,596,534226120,20,RL,70.0,9100,Pave,Other,Reg,Lvl,...,0,0,0,Other,GdWo,Other,0,7,2009,WD
1449,321,916475110,60,RL,85.0,14191,Pave,Other,Reg,Lvl,...,0,0,0,Other,Other,Other,0,4,2010,WD


In [28]:
gs.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('num_imputer',
                                                                                          SimpleImputer()),
                                                                                         ('num_scaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f04e46fdac0>),
                                                                        ('cat',
                                                                         Pipeline(steps=[('cat_imputer',
                                                                                       

In [29]:
gs.best_params_

{'kneighborsregressor__leaf_size': 20, 'kneighborsregressor__n_neighbors': 9}

In [30]:
gs.score(X_test, y_test)

0.8376346813085992

In [31]:
predictions = pd.DataFrame(gs.predict(df_test))

In [34]:
predictions = predictions.join(df_test['Id'])

In [39]:
predictions = predictions.rename({0:'SalePrice'}, axis=1)

In [41]:
predictions = predictions[['Id', 'SalePrice']]

In [44]:
predictions

Unnamed: 0,Id,SalePrice
0,2658,134722.222222
1,2718,143522.222222
2,2414,196637.777778
3,1989,107733.333333
4,625,167155.555556
...,...,...
873,1662,209044.444444
874,1234,210833.333333
875,1373,124922.222222
876,1672,124195.333333


In [46]:
predictions.to_csv('./predictions/0918-1-nguyen.csv', index=False)