In [163]:
import pandas as pd
# classifiers / models
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV

# other
from sklearn.metrics import accuracy_score, log_loss, make_scorer, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)

In [164]:
X_train = pd.read_csv('data/X_train.csv')
X_valid =  pd.read_csv('data/X_valid.csv')
y_train = pd.read_csv('data/y_train.csv')
y_valid = pd.read_csv('data/y_valid.csv')

In [165]:
X_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1217,90,RM,68.0,8930,Pave,no_access,Reg,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,4,2010,WD,Normal
1,340,20,RL,66.0,12400,Pave,no_access,IR1,Lvl,AllPub,...,234,0,no_pool,no_fence,none,0,6,2009,WD,Normal
2,1058,60,RL,0.0,29959,Pave,no_access,IR2,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,1,2009,WD,Normal
3,483,70,RM,50.0,2500,Pave,Pave,Reg,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,6,2009,WD,Normal
4,530,20,RL,0.0,32668,Pave,no_access,IR1,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,3,2007,WD,Alloca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,1395,120,RL,53.0,4045,Pave,no_access,Reg,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,10,2006,New,Partial
1163,1345,60,RL,85.0,11103,Pave,no_access,IR1,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,7,2007,New,Partial
1164,528,60,RL,67.0,14948,Pave,no_access,IR1,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,11,2008,New,Partial
1165,1150,70,RM,50.0,9000,Pave,no_access,Reg,Lvl,AllPub,...,0,0,no_pool,no_fence,none,0,7,2009,WD,Normal


In [166]:
cat_cols = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', # possibly add LotShape,Landslope to ord
            'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
           'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
           'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
           'Heating', 'Electrical', 'Fireplaces', 'GarageType', 'GarageFinish',
            'PavedDrive', 'MiscFeature','BsmtQual','GarageQual', 'GarageCond', 
           'BsmtCond','BsmtExposure', 'BsmtFinType1', 'BsmtFinType2','PoolQC', 'Fence',]

num_cols = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
           'BsmtFinSF1', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
           'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 
           'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt','GarageCars', 
           'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 
            'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', ]

ord_cols = ['ExterQual', 'ExterCond', 'HeatingQC',
           'KitchenQual', 'Functional', 'FireplaceQu']

pass_cols = ['OverallQual', 'OverallCond']

bin_cols = ['CentralAir']

drop_feats = ['Unnamed: 0', 'Id']

In [167]:
X_train['CentralAir']

0       Y
1       Y
2       Y
3       Y
4       Y
       ..
1162    Y
1163    Y
1164    Y
1165    Y
1166    Y
Name: CentralAir, Length: 1167, dtype: object

In [168]:
# Sorting Ordinal Columns 

std_grading = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
function_grading = ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']
fire_grading = ['no_fireplace'] + std_grading 
ordinates = std_grading, std_grading, std_grading, std_grading, function_grading, fire_grading

ordinates

(['Po', 'Fa', 'TA', 'Gd', 'Ex'],
 ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
 ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
 ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
 ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
 ['no_fireplace', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])

### Steps to take here 

1. Scale numeric columns 
2. One-hot encode categorical columns
3. create ordinate for ordinal categories
4. create binary features
5. pass through o

In [169]:
from sklearn.compose import ColumnTransformer, make_column_transformer

# Classifiers
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# classifiers / models
from sklearn.linear_model import LogisticRegression

# other
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.dummy import DummyClassifier, DummyRegressor

In [170]:
preprocessor = make_column_transformer(
    (StandardScaler(), num_cols), 
    (OrdinalEncoder(categories=ordinates), ord_cols),
    (OneHotEncoder(handle_unknown="ignore", sparse=False), cat_cols),
    (OneHotEncoder(drop='if_binary', sparse=False), bin_cols),
    #(passthrough, pass_cols)
)

In [179]:
preprocessor.named_transformers_

{'standardscaler': StandardScaler(),
 'ordinalencoder': OrdinalEncoder(categories=(['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                            ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                            ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                            ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                            ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev',
                             'Sal'],
                            ['no_fireplace', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])),
 'onehotencoder-1': OneHotEncoder(handle_unknown='ignore', sparse=False),
 'onehotencoder-2': OneHotEncoder(drop='if_binary', sparse=False),
 'remainder': 'drop'}

In [171]:
transfeat_names = num_cols + ord_cols + list(preprocessor.named_transformers_['onehotencoder-1'].get_feature_names()) + list(preprocessor.named_transformers_['onehotencoder-2'].get_feature_names())

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

In [178]:
X = preprocessor.fit_transform(X_train)

In [153]:
pd.DataFrame(X, columns=transfeat_names)

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF1.1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,x34_Ex,x34_Fa,x34_Gd,x34_no_pool,x35_GdPrv,x35_GdWo,x35_MnPrv,x35_MnWw,x35_no_fence,x0_Y
0,0.282813,-0.165377,0.225474,-0.330645,-0.573301,-0.959581,-0.959581,-1.299380,-2.409798,0.383378,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.225574,0.154812,-0.436318,0.641669,0.383437,0.296815,0.296815,0.129407,0.340543,0.122152,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2,-1.663329,1.775038,0.754908,0.447206,-0.573301,0.318292,0.318292,-0.442108,-0.207261,-0.476386,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
3,-0.232342,-0.758694,-1.859170,0.981979,-0.573301,-0.317423,-0.317423,0.086317,-0.349872,-0.636165,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,-1.663329,2.025006,-0.469407,-0.476492,-0.573301,1.658448,1.658448,0.551240,2.196741,3.419180,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,-0.146483,-0.616132,1.151983,1.030595,-0.328681,1.338443,1.338443,-0.650756,0.659719,0.844962,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1163,0.769348,0.035133,1.151983,1.030595,-0.573301,-0.959581,-0.959581,0.351663,-0.761857,-1.112966,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1164,0.254193,0.389924,1.218162,1.127826,0.883550,1.896841,1.896841,-1.022694,0.877030,0.784093,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1165,-0.232342,-0.158918,-1.693722,0.155512,-0.573301,0.380575,0.380575,-1.240414,-0.938422,-0.849204,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [70]:
pipe = make_pipeline(preprocessor, DummyRegressor())

In [71]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  ['LotFrontage', 'LotArea',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF1', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'LowQualFinSF',
                                                   'GrLivArea', 'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfB...
                                  

In [73]:
pipe.score(X_valid, y_valid)

-0.0027929685199812315

In [77]:
pipe = make_pipeline(preprocessor, RidgeCV())
pipe.fit(X_train, y_train)
pipe.score(X_valid, y_valid)

0.3792260978076283