In [8]:
# System
import os
import yaml
import datetime

# Data Analysis
import re
import numpy as np
import pandas as pd

# Machine Learning
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

In [21]:
class Preprocessor(object):
    """
    """
    
    def __init__(self,target,stage):
        try:
            assert stage in ['dev','test','prod']
        except:
            raise Exception('Unknown stage')
        self.stage = stage
        
        cwd = os.getcwd()
        kaggle = os.path.join(cwd,'kaggle')
        
        self.target = target
        self.train_df = pd.read_csv(os.path.join(kaggle,'train.csv'),header=0,index_col=0)
        self.test_df = pd.read_csv(os.path.join(kaggle,'test.csv'),header=0,index_col=0)
        
        with open(os.path.join(os.getcwd(),'features.yml'),'r') as file:
            self.features_yml = yaml.load(file, Loader=yaml.FullLoader)
        self.features = None
        self._get_features()
        
        self.X_train = None
        self.y_train = None
        self.idx_train = None
        self.X_test = None
        self.y_test = None
        self.idx_test = None
        
        self.train_dummies = None
        self.test_dummies = None
        self.dummies = None
        
        self.train_encoded = None
        self.test_encoded = None
        
        self.scaler = None
        self.train_normalized = None
        self.test_normalized = None
        
        self.train_combined = None
        self.test_combined = None
        
        self.selector = None
        self.estimator= None
        self.selected_features = None
        self.train_selected = None
        self.test_selected = None
        
        self.X_fit = None
        self.y_fit = None
        self.X_score = None
        self.y_score = None
        self.X_pred = None
        self.y_pred = None
        
        self._clean()
        self._dummy()
        #self._encode()
        self._normalize()
        self._combine()
                    
    def _get_features(self):
        features = {'all':[]}
        for feature,info in self.features_yml.items():
            features['all'].append(feature)
            if not features.get(info['dtype'],None):
                features[info['dtype']] = []
            features[info['dtype']].append(feature)
        self.features = features
    
    def _clean(self):
        df = self.train_df.fillna(0).rename({self.target:'Target'},axis=1)
        self.y_train = df.Target
        self.X_train = df.drop(self.features['target'],axis=1)
        self.idx_train = df.index
        
        self.X_test = self.test_df.fillna(0)
        self.idx_test = self.X_test.index
        
    def _dummy(self):
        self.train_dummies = pd.get_dummies(self.X_train[self.features['cat']])
        self.test_dummies = pd.get_dummies(self.X_test[self.features['cat']])
        self.dummies = list(set(list(self.train_dummies.columns)+list(self.test_dummies.columns)))
        
        for column in self.dummies:
            if column not in self.train_dummies:
                self.train_dummies[column] = 0
            if column not in self.test_dummies:
                self.test_dummies[column] = 0
        
        self.train_dummies = self.train_dummies[self.dummies]
        self.test_dummies = self.test_dummies[self.dummies]
    
    def _encode(self):
        train = pd.DataFrame(index=self.idx_train)
        test = pd.DataFrame(index=self.idx_test)
        for c in self.features['cat']:
            encoder = LabelEncoder()
            encoder.fit(self.X_train[c].values)
            train[c] = encoder.transform(self.X_train[c].values)
            test[c] = encoder.transform(self.X_test[c].values)
        self.train_encoded = train
        self.test_encoded = test
    
    def _normalize(self):
        self.scaler = StandardScaler()
        self.scaler.fit(self.X_train[self.features['int']])
        self.scaler.fit(self.X_test[self.features['int']])
        
        self.train_normalized = pd.DataFrame(
            self.scaler.transform(self.X_train[self.features['int']])
            ,columns=self.features['int'],index=self.idx_train
        )
        self.test_normalized = pd.DataFrame(
            self.scaler.transform(self.X_test[self.features['int']])
            ,columns=self.features['int'],index=self.idx_test
        )
    
    def _combine(self):
        self.train_combined = self.train_dummies.join(self.train_normalized,how='inner')
        self.test_combined = self.test_dummies.join(self.test_normalized,how='inner')
    
    def select(self,num_columns=120):
        self.estimator = SVR(kernel='linear')
        self.selector = RFE(self.estimator, n_features_to_select=num_columns, step=1)
        self.selector.fit(self.train_combined,self.y_train)
        self.selected_features = self.selector.get_feature_names_out(self.selector.feature_names_in_)
        
        self.train_selected = self.train_combined[self.selected_features]
        self.test_selected = self.test_combined[self.selected_features]
        
    def split(self):
        (self.X_fit,self.X_score,
        self.y_fit,self.y_score) = train_test_split(
            self.train_selected,self.y_train,
            test_size=0.2,random_state=42
        )
        self.X_pred = self.test_selected
    
    def audit(self):
        None

In [29]:
class Model(object):
    """
    """
    def __init__(self,model_type,stage,X_fit,y_fit,X_score,y_score,X_pred):
        try:
            assert model_type in ['gbr','knn','ridge','xgb']
        except:
            raise Exception('Unknown model type')
        self.model_type = model_type
        
        try:
            assert stage in ['dev','test','prod']
        except:
            raise Exception('Unknown stage')
        self.stage = stage
        
        if self.model_type == 'gbr':
            self.model = GradientBoostingRegressor()
            if self.stage == 'dev':
                self.parameters = {
                    'n_estimators': 1000,'max_depth': 5,
                    'min_samples_split': 5,'learning_rate': 0.3,
                    'loss': 'squared_error','criterion': 'friedman_mse',
                }
            elif self.stage == 'test':
                self.parameters = {
                    'n_estimators': [1000],'max_depth': [5],
                    'min_samples_split': [5],'learning_rate': [0.3],
                    'loss': ['squared_error'],'criterion': ['friedman_mse'],
                }
            elif self.stage == 'prod':
                self.parameters = {
                    'n_estimators': [500*i for i in range(1,8)],
                    'max_depth': [2+i for i in range(20)],
                    'min_samples_split': [3+i for i in range(8)],
                    'learning_rate': [0.1*i for i in range(1,30)],
                    'loss': ['squared_error','absolute_error'],#'huber','quantile'],
                    'criterion': ['friedman_mse','squared_error','mae'],
                }
        
        elif self.model_type == 'knn':
            self.model = KNeighborsRegressor()
            if self.stage == 'dev':
                self.parameters = {
                    'n_neighbors': 3,'weights': 'uniform',
                    'algorithm': 'auto','leaf_size': 15,
                    'p': 1,'metric': 'minkowski',
                }
            elif self.stage == 'test':
                self.parameters = {
                    'n_neighbors': [3],'weights': ['uniform'],
                    'algorithm': ['auto'],'leaf_size': [15],
                    'p': [1],'metric': ['minkowski'],
                }
            elif self.stage == 'prod':
                self.parameters = {
                    'n_neighbors': [1+i for i in range(50)],
                    'weights': ['uniform'],#'distance'],
                    'algorithm': ['auto'],#'ball_tree','kd_tree'],
                    'leaf_size': [10+i for i in range(40)],
                    'p': [1,2],
                    'metric': ['minkowski'],
                }
        
        elif self.model_type == 'ridge':
            self.model = Ridge()
            if self.stage == 'dev':
                self.parameters = {'alpha': 0.5,'solver': 'auto',}
            elif self.stage == 'test':
                self.parameters = {'alpha': [0.5],'solver': ['auto'],}
            elif self.stage == 'prod':
                self.parameters = {
                    'alpha': [0.1*i for i in range(1,50)],
                    'solver': ['auto']
                }
        
        elif self.model_type == 'xgb':
            self.model = XGBRegressor(nthread=-1)
            if self.stage == 'dev':
                self.parameters = {
                    'min_child_weight': 5,'gamma': 0.5,
                    'subsample': 1,'colsample_bytree': 0.7,
                    'max_depth': 3,'reg_lambda': 0.8,
                    'reg_alpha': 0.5,
                }
            elif self.stage == 'test':
                self.parameters = {
                    'min_child_weight': [5],'gamma': [0.5],
                    'subsample': [1],'colsample_bytree': [0.7],
                    'max_depth': [3],'reg_lambda': [0.8],
                    'reg_alpha': [0.5],
                }
            elif self.stage == 'prod':
                self.parameters = {
                    'min_child_weight': [4,5],
                    'gamma': [i/10.0 for i in range(3,6)],
                    'subsample': [i/10.0 for i in range(6,11)],
                    'colsample_bytree': [i/10.0 for i in range(4,11)],
                    'max_depth': [2,3,4],
                    'reg_lambda': [i/10.0 for i in range(7,9)],
                    'reg_alpha': [i/10.0 for i in range(4,7)],
                }
        
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score
        self.X_pred = X_pred
        self.X_pred_idx = X_pred.index
        
        self.grid = None
        self.corr = None
        self.mse = None
        self.prediction = None
    
    def tune(self):
        if self.stage == 'dev':
            self.model.fit(X=self.X_fit,y=self.y_fit)
            self.prediction = self.model.predict(self.X_score)
            self.corr = round(self.model.score(X=self.X_score,y=self.y_score),4)
            self.mse = round(mean_squared_error(self.y_score, self.prediction),4)
        
        elif self.stage in ['test','prod']:
            self.grid = GridSearchCV(self.model,self.parameters)
            self.grid.fit(X=self.X_fit,y=self.y_fit)
            self.prediction = self.grid.best_estimator_.predict(self.X_score)
            self.corr = round(self.grid.best_estimator_.score(X=self.X_score,y=self.y_score),4)
            self.mse = round(mean_squared_error(self.y_score, self.prediction),4)

    def refresh(self,X_fit,y_fit,X_score,y_score):
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score
    
    def audit(self):
        None

In [23]:
stage = 'test' # dev, test, prod

In [31]:
preprocessor = Preprocessor(target='SalePrice',stage=stage)
preprocessor.select(num_columns=120)
preprocessor.split()

In [32]:
data = {
    'X_fit': preprocessor.X_fit,
    'y_fit': preprocessor.y_fit,
    'X_score': preprocessor.X_score,
    'y_score': preprocessor.y_score,
    'X_pred': preprocessor.X_pred,
}

# ridge regression
start_ridge = datetime.datetime.now()
ridge = Model('ridge',stage=stage,**data)
ridge.tune()
print('Ridge regression correlation:',ridge.corr)
print('Runtime:',round((datetime.datetime.now() - start_ridge).total_seconds(), 2))

# k nearest neighbors regression
start_knn = datetime.datetime.now()
knn = Model('knn',stage=stage,**data)
knn.tune()
print('KNN regression correlation:',knn.corr)
print('Runtime:',round((datetime.datetime.now() - start_knn).total_seconds(), 2))

# gradient boosted regression
start_gbc = datetime.datetime.now()
gbr = Model('gbr',stage=stage,**data)
gbr.tune()
print('GBDT regression correlation:',gbr.corr)
print('Runtime:',round((datetime.datetime.now() - start_gbc).total_seconds(), 2))

# xgb regression
start_xgb = datetime.datetime.now()
xgb = Model('xgb',stage=stage,**data)
xgb.tune()
print('XGB regression correlation:',xgb.corr)
print('Runtime:',round((datetime.datetime.now() - start_xgb).total_seconds(), 2))

Ridge regression correlation: 0.8465
Runtime: 0.06
KNN regression correlation: 0.7409
Runtime: 0.28
GBDT regression correlation: 0.8759
Runtime: 23.31


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGB regression correlation: 0.9001
Runtime: 0.73


In [None]:
class Analysis(object):
    """
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    pd.set_option('display.max_columns', None)

    def __init__(self,train_df,test_df,features):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        
    
    def eda(self):
        nulls = self.train_df.isna().sum()
        
        display(self.train_df[self.features['target']].describe())
        display(self.train_df[self.features['int']].describe())
        for column in self.features['cat']:
            values = self.train_df[column].unique()
            missing = nulls.loc[nulls.index==column].values[0]
            print(column)
            if missing>0:
                print('Missing values:',missing)
            print(len(values))
            print(values[:5])
            print('\n')
        display(self.train_df.groupby('MSZoning').mean().SalePrice)
        print('\n')
    
    def plot(self,version):
        if version == 'target':
            sns.histplot(self.train_df.Target)
        elif version == 'quality':
            data = self.train_df[['OverallQual','SalePrice']]
            plt.figure(figsize=(8,6))
            sns.boxplot(x='OverallQual',y='SalePrice',data=data)
        else:
            raise Exception('Unknown plot version')

In [None]:
analysis = Analysis(preprocessor.train_df,preprocessor.test_df,preprocessor.features)

In [None]:
analysis.plot('target')

In [None]:
analysis.eda()