In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

from sklearn import set_config

In [4]:
# Set up Data

In [2]:
df_train_raw = pd.read_csv('./datasets/train.csv')

df = df_train_raw.copy()

In [3]:
df_test_raw = pd.read_csv('./datasets/test.csv')
df_test = df_test_raw.copy()

In [5]:
# Set up Pipeline

In [6]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
#numeric_features = list(X.loc[:, X.dtypes == object].columns)
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler', StandardScaler())])

#cat_features = list(X.loc[:, X.dtypes != object].columns)
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessing = ColumnTransformer(transformers=[
    ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include='object'))
])

In [7]:
# Set up Params

In [8]:
params={
    #'simpleimputer__strategy': ['mean', 'median', 'most_frequent', 'constant'],
    #'simpleimputer__strategy': ['mean'],
    #'simpleimputer__fill_value': [0, 1],
    'kneighborsregressor__n_neighbors': list(range(3, 31, 2)),
    #'kneighborsregressor__leaf_size': list(range(10,100))
    'kneighborsregressor__leaf_size': list(range(20,51))
}

In [9]:
# Set up Pipeline, GridSearch

In [10]:
pipe = make_pipeline(preprocessing, KNeighborsRegressor())
gs = GridSearchCV(pipe, param_grid=params, n_jobs=-1)

In [11]:
# See Layout

In [12]:
#set_config(display='diagram')
set_config(display='text')
gs

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('num_imputer',
                                                                                          SimpleImputer()),
                                                                                         ('num_scaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f644df8efd0>),
                                                                        ('cat',
                                                                         Pipeline(steps=[('cat_imputer',
                                                                                       

In [27]:
df

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000


In [28]:
a = pd.Series(df.corr()['SalePrice']).sort_values(ascending=False) > .50

In [41]:
df_nums = df.loc[:, df.dtypes != object]

In [43]:
df_nums_corr = df_nums.loc[:, a]

In [61]:
df_nums

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
0,109,533352170,60,,13517,6,8,1976,2005,289.0,...,0,44,0,0,0,0,0,3,2010,130500
1,544,531379050,60,43.0,11492,7,5,1996,1997,132.0,...,0,74,0,0,0,0,0,4,2009,220000
2,153,535304180,20,68.0,7922,5,7,1953,2007,0.0,...,0,52,0,0,0,0,0,1,2010,109000
3,318,916386060,60,73.0,9802,5,5,2006,2007,0.0,...,100,0,0,0,0,0,0,4,2010,174000
4,255,906425045,50,82.0,14235,6,8,1900,1993,0.0,...,0,59,0,0,0,0,0,3,2010,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,79.0,11449,8,5,2007,2007,0.0,...,0,276,0,0,0,0,0,1,2008,298751
2047,785,905377130,30,,12342,4,5,1940,1950,0.0,...,158,0,0,0,0,0,0,3,2009,82500
2048,916,909253010,50,57.0,7558,6,6,1928,1950,0.0,...,0,0,0,0,0,0,0,3,2009,177000
2049,639,535179160,20,80.0,10400,4,5,1956,1956,0.0,...,0,189,140,0,0,0,0,11,2009,144000


In [75]:
df_dummies = pd.get_dummies(df)

In [79]:
pd.Series(df_dummies.corr()['SalePrice']).sort_values(ascending=False).head(17)

SalePrice           1.000000
Overall Qual        0.800207
Gr Liv Area         0.697038
Garage Area         0.650270
Garage Cars         0.648220
Total Bsmt SF       0.628925
1st Flr SF          0.618486
Bsmt Qual_Ex        0.586497
Year Built          0.571849
Kitchen Qual_Ex     0.551284
Year Remod/Add      0.550370
Full Bath           0.537969
Garage Yr Blt       0.533922
Foundation_PConc    0.529047
Mas Vnr Area        0.512230
TotRms AbvGrd       0.504014
Exter Qual_Ex       0.493861
Name: SalePrice, dtype: float64

In [101]:
X = df[['Overall Qual', 'Gr Liv Area', 'Total Bsmt SF']]
y = df['SalePrice']

In [102]:
# Split Training Data into 2 sets

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [104]:
gs.fit(X_train, y_train);

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('num_imputer',
                                                                                          SimpleImputer()),
                                                                                         ('num_scaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f644df8efd0>),
                                                                        ('cat',
                                                                         Pipeline(steps=[('cat_imputer',
                                                                                       

In [105]:
gs.best_params_

{'kneighborsregressor__leaf_size': 39, 'kneighborsregressor__n_neighbors': 9}

In [106]:
gs.score(X_test, y_test)

0.8657077430905257

In [109]:
df_test_submit = df_test[['Overall Qual', 'Gr Liv Area', 'Total Bsmt SF']]

In [113]:
predictions = pd.DataFrame(gs.predict(df_test_submit))

In [114]:
predictions = predictions.join(df_test['Id'])

In [115]:
predictions = predictions.rename({0:'SalePrice'}, axis=1)

In [116]:
predictions = predictions[['Id', 'SalePrice']]

In [118]:
predictions.to_csv('./predictions/0918-2-nguyen.csv', index=False)