In [83]:
# System
import os
# import yaml
import datetime
from itertools import combinations

# Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data Analysis
import re
import numpy as np
import pandas as pd

# Modeling
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler,LabelEncoder,RobustScaler,MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge,Lasso,ElasticNet,LinearRegression
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Neural Network
import tensorflow
tensorflow.random.set_seed(1)
from tensorflow.python.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor

In [90]:
class Preprocessor(object):
    """
    """
    
    def __init__(self,target,stage):
        try:
            assert stage in ['dev','test','prod']
        except:
            raise Exception('Unknown stage')
        self.stage = stage
        
        cwd = os.getcwd()
        kaggle = os.path.join(cwd,'kaggle')
        
        self.target = target
        self.train_df = pd.read_csv(os.path.join(kaggle,'train.csv'),header=0,index_col=0)
        self.test_df = pd.read_csv(os.path.join(kaggle,'test.csv'),header=0,index_col=0)
        
        # with open(os.path.join(os.getcwd(),'features.yml'),'r') as file:
        #     self.features_yml = yaml.load(file, Loader=yaml.FullLoader)
        self.features = None
        self._get_features()
        
        self.X_train = None
        self.y_train = None
        self.idx_train = None
        self.X_test = None
        self.y_test = None
        self.idx_test = None
        
        self.train_dummies = None
        self.test_dummies = None
        self.dummies = None
        
        self.train_encoded = None
        self.test_encoded = None
        
        self.scaler = None
        self.train_standardized = None
        self.test_standardized = None
        
        self.train_combined = None
        self.test_combined = None
        
        self.selector = None
        self.estimator= None
        self.selected_features = None
        self.train_selected = None
        self.test_selected = None
        
        self.X_fit = None
        self.y_fit = None
        self.X_score = None
        self.y_score = None
        self.X_pred = None
        self.y_pred = None
        
                    
    def _get_features(self,refresh=False):
        # features = {'all':[]}
        # for feature,info in self.features_yml.items():
        #     features['all'].append(feature)
        #     if not features.get(info['dtype'],None):
        #         features[info['dtype']] = []
        #     features[info['dtype']].append(feature)
        # self.features = features
        drop = [
            # 'Alley','LandContour','LandSlope',
            # 'Condition2','MSSubClass','HouseStyle',
            # 'YearRemodAdd','RoofStyle','RoofMatl',
            # 'Exterior1st','Exterior2nd','MasVnrType',
            # 'MasVnrArea','BsmtExposure','BsmtFinType1',
            # 'BsmtFinSF1','BsmtFinType2','BsmtFinSF2',
            # 'BsmtUnfSF','TotRmsAbvGrd','Fireplaces',
            # 'FireplaceQu','GarageType','GarageYrBlt',
        ]
        
        features = {
            'all':[],
            'target':['Target'],
            'cat':[],
            'num':[],
            'encoded':[],
            'other':[]
        }
        if refresh:
            df = self.X_train
        else:
            df = self.train_df
        
        for c in df.columns:
            if c not in drop:
                features['all'].append(c)

                if c==self.target:
                    None # features['target'].append(c)
                elif len(df[c].unique()) > 30:
                    features['num'].append(c)
                elif type(df[c].dropna().values[0])==str:
                    features['cat'].append(c)
                elif type(df[c].dropna().values[0])==np.int64:
                    features['encoded'].append(c)
                else:
                    features['other'].append(c)
        
        if len(features['other']) > 0:
            raise Exception('Uncategorized features')
        
        self.features = features
    
    def clean(self):
        df = self.train_df[self.features['all']].rename({self.target:'Target'},axis=1)
        df = df.dropna(axis=1,thresh=(df.shape[0]*0.6)).fillna('0')
        df.index = df.index.astype(str)
        
        for dtype,fields in self.features.items():
            for field in fields:
                if field not in df.columns:
                    fields.remove(field)
        
        self.y_train = df.Target
        self.X_train = df.drop(self.features['target'],axis=1)
        self.idx_train = df.index
                
        df = self.test_df[self.features['all']].fillna('0')
        df.index = df.index.astype(str)
        self.X_test = df
        self.idx_test = self.X_test.index
        
        self._get_features(refresh=True)
        
    def dummy(self):
        self.train_dummies = pd.get_dummies(self.X_train[self.features['cat']])
        self.test_dummies = pd.get_dummies(self.X_test[self.features['cat']])
        self.dummies = list(set(list(self.train_dummies.columns)+list(self.test_dummies.columns)))
        
        for column in self.dummies:
            if column not in self.train_dummies:
                self.train_dummies[column] = 0
            if column not in self.test_dummies:
                self.test_dummies[column] = 0
        
        self.train_dummies = self.train_dummies[self.dummies]
        self.test_dummies = self.test_dummies[self.dummies]
    
    def encode(self):
        train = pd.DataFrame(index=self.idx_train)
        test = pd.DataFrame(index=self.idx_test)
        for c in self.features['cat']:
            encoder = LabelEncoder()
            encoder.fit(self.X_train[c].values)
            train[c] = encoder.transform(self.X_train[c].values)
            test[c] = encoder.transform(self.X_test[c].values)
        self.train_encoded = train
        self.test_encoded = test
    
    def standardize(self,scaler):
        if scaler == 'standard':
            self.scaler = StandardScaler()
        elif scaler == 'robust': 
            self.scaler = RobustScaler()
        elif scaler == 'minmax':
            self.scaler = MinMaxScaler()
        else:
            raise Exception('Unknown scaler option')
        self.scaler.fit(self.X_train[self.features['num']+self.features['encoded']])
        self.scaler.fit(self.X_test[self.features['num']+self.features['encoded']])
        
        self.train_standardized = pd.DataFrame(
            self.scaler.transform(self.X_train[self.features['num']+self.features['encoded']])
            ,columns=self.features['num']+self.features['encoded'],index=self.idx_train
        )
        self.test_standardized = pd.DataFrame(
            self.scaler.transform(self.X_test[self.features['num']+self.features['encoded']])
            ,columns=self.features['num']+self.features['encoded'],index=self.idx_test
        )
    
    def combine(self):
        self.train_combined = self.train_dummies.join(self.train_standardized,how='inner')
        self.test_combined = self.test_dummies.join(self.test_standardized,how='inner')
    
    def select(self,num_columns=120):
        self.estimator = SVR(kernel='linear')
        self.selector = RFE(self.estimator, n_features_to_select=num_columns, step=1)
        self.selector.fit(self.train_combined,self.y_train)
        self.selected_features = self.selector.get_feature_names_out(self.selector.feature_names_in_)
        
        self.train_selected = self.train_combined[self.selected_features]
        self.test_selected = self.test_combined[self.selected_features]
        
    def split(self):
        (self.X_fit,self.X_score,
        self.y_fit,self.y_score) = train_test_split(
            self.train_selected,self.y_train,
            test_size=0.2,random_state=42
        )
        self.X_pred = self.test_selected
    
    def audit(self):
        None

In [91]:
class Model():
    """
    """
    def __init__(self,model_type,stage,X_fit,y_fit,X_score,y_score,X_pred):
        try:
            assert model_type in [
                'gbr','knn','ridge',
                'xgb','rfr','kernel',
                'enet','lgb','linreg'
            ]
        except:
            raise Exception('Unknown model type')
        self.model_type = model_type
        
        try:
            assert stage in ['dev','test','prod']
        except:
            raise Exception('Unknown stage')
        self.stage = stage
        
        if self.model_type == 'gbr':
            self.model = GradientBoostingRegressor()
            if self.stage == 'dev':
                self.parameters = {
                    'n_estimators': 1000,'max_depth': 5,
                    'min_samples_split': 5,'learning_rate': 0.1,
                    'loss': 'squared_error','criterion': 'friedman_mse',
                }
            elif self.stage == 'test':
                self.parameters = {
                    'n_estimators': [1000],'max_depth': [5],
                    'min_samples_split': [5],'learning_rate': [0.1],
                    'loss': ['squared_error'],'criterion': ['friedman_mse'],
                }
            elif self.stage == 'prod':
                self.parameters = {
                    'n_estimators': [500*i for i in range(1,4)],
                    'max_depth': [2+i for i in range(20)],
                    'min_samples_split': [3+i for i in range(8)],
                    'learning_rate': [0.06+(0.02*i) for i in range(5)],
                    'loss': ['squared_error','absolute_error'],#'huber','quantile'],
                    'criterion': ['friedman_mse','squared_error','mae'],
                }
        
        elif self.model_type == 'knn':
            self.model = KNeighborsRegressor()
            if self.stage == 'dev':
                self.parameters = {'n_neighbors': 7}
            elif self.stage == 'test':
                self.parameters = {
                    'n_neighbors': [7],'weights': ['uniform'],
                    'algorithm': ['auto'],'leaf_size': [15],
                    'p': [1],'metric': ['minkowski'],
                }
            elif self.stage == 'prod':
                self.parameters = {
                    'n_neighbors': [3+i for i in range(10)],
                    'weights': ['uniform'],#'distance'],
                    'algorithm': ['auto'],#'ball_tree','kd_tree'],
                    'leaf_size': [10+i for i in range(10)],
                    'p': [1,2],
                    'metric': ['minkowski'],
                }
        
        elif self.model_type == 'ridge':
            self.model = Ridge()
            if self.stage == 'dev':
                self.parameters = {'alpha': 0.5,'solver': 'auto',}
            elif self.stage == 'test':
                self.parameters = {'alpha': [0.5],'solver': ['auto'],}
            elif self.stage == 'prod':
                self.parameters = {
                    'alpha': [0.1*i for i in range(1,20)],
                    'solver': ['auto']
                }
        
        elif self.model_type == 'xgb':
            self.model = XGBRegressor(nthread=-1)
            if self.stage == 'dev':
                self.parameters = {
                    'min_child_weight': 5,'gamma': 0.5,
                    'subsample': 1,'colsample_bytree': 0.7,
                    'max_depth': 3,'reg_lambda': 0.8,
                    'reg_alpha': 0.5,
                }
            elif self.stage == 'test':
                self.parameters = {
                    'min_child_weight': [5],'gamma': [0.5],
                    'subsample': [1],'colsample_bytree': [0.7],
                    'max_depth': [3],'reg_lambda': [0.8],
                    'reg_alpha': [0.5],
                }
            elif self.stage == 'prod':
                self.parameters = {
                    'min_child_weight': [4,5],
                    'gamma': [i/10.0 for i in range(3,6)],
                    'subsample': [i/10.0 for i in range(6,11)],
                    'colsample_bytree': [i/10.0 for i in range(4,11)],
                    'max_depth': [2,3,4],
                    'reg_lambda': [i/10.0 for i in range(7,9)],
                    'reg_alpha': [i/10.0 for i in range(4,7)],
                }
        
        elif self.model_type == 'rfr':
            self.model = RandomForestRegressor()
            if self.stage == 'dev':
                self.parameters = {}
            elif self.stage == 'test':
                self.parameters = {}
            elif self.stage == 'prod':
                self.parameters = {}
        
        elif self.model_type == 'enet':
            self.model = ElasticNet()
            if self.stage == 'dev':
                self.parameters = {}
            elif self.stage == 'test':
                self.parameters = {}
            elif self.stage == 'prod':
                self.parameters = {}
        
        elif self.model_type == 'linreg':
            self.model = LinearRegression()
            if self.stage == 'dev':
                self.parameters = {}
            elif self.stage == 'test':
                self.parameters = {}
            elif self.stage == 'prod':
                self.parameters = {}
        
        elif self.model_type == 'kernel':
            self.model = KernelRidge()
            if self.stage == 'dev':
                self.parameters = {
                    'alpha': 0.6,
                    'kernel': 'polynomial',
                    'degree': 2,
                    'coef0': 2.5
                }
            elif self.stage == 'test':
                self.parameters = {
                    'alpha': [0.6],
                    'kernel': ['polynomial'],
                    'degree': [2],
                    'coef0': [2.5]
                }
            elif self.stage == 'prod':
                self.parameters = {
                    'alpha': [0.6],
                    'kernel': ['polynomial'],
                    'degree': [2],
                    'coef0': [2.5]
                }
        
        elif self.model_type == 'lbg':
            self.model = LGBMRegressor()
            if self.stage == 'dev':
                self.parameters = {}
            elif self.stage == 'test':
                self.parameters = {}
            elif self.stage == 'prod':
                self.parameters = {}
        
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score
        self.X_pred = X_pred
        self.X_pred_idx = X_pred.index
        
        self.grid = None
        self.corr = None
        self.mse = None
        
        self.score_prediction = None
        self.fit_prediction = None
        self.prediction = None
    
    def tune(self):
        if self.stage == 'dev':
            self.model.fit(X=self.X_fit,y=self.y_fit)
        
        elif self.stage in ['test','prod']:
            self.grid = GridSearchCV(self.model,self.parameters)
            self.grid.fit(X=self.X_fit,y=self.y_fit)
            self.model = self.grid.best_estimator_
        
        self.score_prediction = pd.Series(self.model.predict(self.X_score),name=self.model_type,index=self.X_score.index)
        self.corr = round(self.model.score(X=self.X_score,y=self.y_score),4)
        self.mse = round(mean_squared_error(self.y_score, self.score_prediction),4)
        
        self.fit_prediction = pd.Series(self.model.predict(self.X_fit),name=self.model_type,index=self.X_fit.index)
    
    def predict(self):
        self.prediction = pd.Series(self.model.predict(self.X_pred),name=self.model_type,index=self.X_pred.index)
    
    def audit(self):
        None

In [92]:
class NeuralNetwork(object):
    """
    """
    def __init__(self,X_fit,y_fit,X_score,y_score,X_pred):
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score
        self.X_pred = X_pred
                
        self.model = Sequential()
        
    def layer(self):
        self.model.add(Dense(self.X_fit.shape[1], input_dim=self.X_fit.shape[1], kernel_initializer='normal', activation='relu'))
        self.model.add(Dense(1500, activation='relu'))
        self.model.add(Dense(1000, activation='relu'))
        self.model.add(Dense(500, activation='relu'))
        self.model.add(Dense(1, activation='linear'))
        
    def run(self):
        self.model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
        self.model.fit(self.X_fit, self.y_fit, epochs=100, batch_size=150, verbose=1, validation_split=0.2)
        
    def predict(self):
        self.fit_prediction = pd.Series(self.model.predict(self.X_fit).flatten(),name='neural',index=self.X_fit.index)
        self.score_prediction = pd.Series(self.model.predict(self.X_score).flatten(),name='neural',index=self.X_score.index)
        self.prediction = pd.Series(self.model.predict(self.X_pred).flatten(),name='neural',index=self.X_pred.index)

In [93]:
class MetaModel(object):
    """
    """
    def __init__(self,model_type,X_fit,y_fit,X_score,y_score,X_pred):
        try:
            assert model_type in ['lasso','rfr','gbr','xgb']
        except:
            raise Exception('Unknown model type')
        
        self.model_type = model_type
        self.X_fit = X_fit
        self.y_fit = y_fit
        
        self.X_score = X_score
        self.y_score = y_score
        
        self.X_pred = X_pred
        self.prediction = None
        
        self.mse = None
        self.selected_features = None
        
        self.X_fit_final = None
        self.X_score_final = None
        self.X_pred_final = None
        
        self.score_prediction = None
        self.prediction = None
        
        if self.model_type == 'lasso':
            self.parameters = {
                'alpha': [0.0005],
                'random_state': [2],
            }
            self.model = Lasso()
            self.grid = GridSearchCV(self.model,self.parameters)
        
        elif self.model_type == 'rfr':
            self.parameters = {}
            self.model = RandomForestRegressor()
            self.grid = GridSearchCV(self.model,self.parameters)
        
        elif self.model_type == 'xgb':
            self.parameters = {}
            self.model = XGBRegressor()
            self.grid = GridSearchCV(self.model,self.parameters)
        
        elif self.model_type == 'gbr':
            self.parameters = {}
            self.model = GradientBoostingRegressor()
            self.grid = GridSearchCV(self.model,self.parameters)
    
    def standardize(self):
        self.scaler = StandardScaler()
        self.scaler.fit(self.X_fit)
        self.scaler.fit(self.X_score)
        self.scaler.fit(self.X_pred)
        
        self.X_fit_std = pd.DataFrame(
            self.scaler.transform(self.X_fit)
            ,columns=self.X_fit.columns,index=self.X_fit.index
        )
        self.X_score_std = pd.DataFrame(
            self.scaler.transform(self.X_score)
            ,columns=self.X_score.columns,index=self.X_score.index
        )
        self.X_pred_std = pd.DataFrame(
            self.scaler.transform(self.X_pred)
            ,columns=self.X_pred.columns,index=self.X_pred.index
        )
    
    def select(self,num_columns=5):
        self.estimator = SVR(kernel='linear')
        self.selector = RFE(self.estimator, n_features_to_select=num_columns, step=1)
        self.selector.fit(self.X_fit_std,self.y_fit)
        
        self.selected_features = self.selector.get_feature_names_out(self.selector.feature_names_in_)
        self.X_fit_final = self.X_fit_std[self.selected_features]
        self.X_score_final = self.X_score_std[self.selected_features]
        self.X_pred_final = self.X_pred_std[self.selected_features]
        
    def predict(self):
        self.model.fit(self.X_fit_final,self.y_fit)
        self.score_prediction = pd.Series(self.model.predict(self.X_score_final),name='SalePrice',index=self.X_score_final.index)
        self.mse = round(mean_squared_error(self.y_score, self.score_prediction),4)
        self.prediction = pd.Series(self.model.predict(self.X_pred_final),name='SalePrice',index=self.X_pred_final.index)
        
    def write(self):
        self.prediction.to_csv(os.path.join(os.getcwd(),'submission.csv'),index=True) # .format(datetime.datetime.now().strftime('%Y%m%d.%H%M'))

In [94]:
stage = 'test' # dev, test, prod

In [96]:
preprocessor = Preprocessor(target='SalePrice',stage=stage)
preprocessor.clean()
preprocessor.dummy()
preprocessor.standardize(scaler='robust')
preprocessor.combine()
preprocessor.select(num_columns=preprocessor.train_combined.shape[1])
preprocessor.split()

model_data = {
    'X_fit': preprocessor.X_fit,
    'y_fit': preprocessor.y_fit,
    'X_score': preprocessor.X_score,
    'y_score': preprocessor.y_score,
    'X_pred': preprocessor.X_pred,
}

In [97]:
models = {
    'knn': None,
    'ridge': None,
    'linreg': None,
    'rfr': None,
    'kernel': None,
    'enet': None,
    'xgb': None,
    'gbr': None,
}
for model_type in models.keys():
    start = datetime.datetime.now()
    
    model = Model(model_type,stage=stage,**model_data)
    model.tune()
    model.predict()
    
    models[model_type] = model
    
    print('{} regression'.format(model_type))
    print('Correlation:',model.corr,'; MSE:',model.mse)
    print('Runtime:',round((datetime.datetime.now() - start).total_seconds(), 2))

knn regression
Correlation: 0.7197 ; MSE: 2149679636.6333
Runtime: 1.67
ridge regression
Correlation: 0.8929 ; MSE: 821202625.6264
Runtime: 0.1
linreg regression
Correlation: -13849102529487.752 ; MSE: 1.062271340779591e+23
Runtime: 0.16
rfr regression
Correlation: 0.8907 ; MSE: 838502051.9718
Runtime: 10.11
kernel regression
Correlation: 0.7991 ; MSE: 1541073744.0032
Runtime: 0.29
enet regression
Correlation: 0.8181 ; MSE: 1395330650.8785
Runtime: 0.12
xgb regression
Correlation: 0.9122 ; MSE: 673576577.3597
Runtime: 1.37
gbr regression
Correlation: 0.9126 ; MSE: 670489328.5076
Runtime: 60.83


In [98]:
nn = NeuralNetwork(**model_data)
nn.layer()
nn.run()
nn.predict()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [99]:
# models = [ridge,rfr,kernel,xgb,gbr]
exclude = ['linreg', 'enet', 'knn','nn',]
meta_data = {
    'X_fit': pd.concat([v.fit_prediction for k,v in models.items() if k not in exclude],axis=1),
    'y_fit': preprocessor.y_fit,
    'X_score': pd.concat([v.score_prediction for k,v in models.items() if k not in exclude],axis=1),
    'y_score': preprocessor.y_score,
    'X_pred': pd.concat([v.prediction for k,v in models.items() if k not in exclude],axis=1),
}

meta = MetaModel('rfr',**meta_data)
meta.standardize()
meta.select(num_columns=6)
meta.predict()
print(meta.mse)

636154334.4939


In [100]:
meta.write()

In [None]:
class Analysis(object):
    """
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    pd.set_option('display.max_columns', None)

    def __init__(self,train_df,test_df,features):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        
    
    def eda(self):
        nulls = self.train_df.isna().sum()
        
        display(self.train_df[self.features['target']].describe())
        display(self.train_df[self.features['int']].describe())
        for column in self.features['cat']:
            values = self.train_df[column].unique()
            missing = nulls.loc[nulls.index==column].values[0]
            print(column)
            if missing>0:
                print('Missing values:',missing)
            print(len(values))
            print(values[:5])
            print('\n')
        display(self.train_df.groupby('MSZoning').mean().SalePrice)
        print('\n')
    
    def plot(self,version):
        if version == 'target':
            sns.histplot(self.train_df.Target)
        elif version == 'quality':
            data = self.train_df[['OverallQual','SalePrice']]
            plt.figure(figsize=(8,6))
            sns.boxplot(x='OverallQual',y='SalePrice',data=data)
        else:
            raise Exception('Unknown plot version')

In [None]:
analysis = Analysis(preprocessor.train_df,preprocessor.test_df,preprocessor.features)

In [None]:
analysis.plot('target')

In [None]:
analysis.eda()