In [91]:
# System
import os
import yaml
import datetime

# Data Analysis
import re
import matplotlib as mpl
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Machine Learning
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [92]:
class Preprocessor(object):
    """Load the Kaggle train/test CSVs and turn them into model-ready frames.

    Constructing an instance runs the whole preparation chain: read
    ``train.csv`` / ``test.csv`` and ``features.yml``, group the feature names
    by their declared ``dtype``, fill missing values with 0, one-hot encode the
    ``cat`` features, standardise the ``int`` features and join both parts.
    ``select`` and ``split`` are called explicitly afterwards.

    ``features.yml`` is read as a mapping ``{column_name: {'dtype': ...}}``;
    the groups ``cat``, ``int`` and ``target`` are the ones this class uses.
    """

    def __init__(self, data_dir=None, features_path=None):
        """Read the inputs and run cleaning, encoding, scaling and joining.

        Parameters
        ----------
        data_dir : str, optional
            Directory holding ``train.csv`` and ``test.csv``.  Defaults to
            ``<cwd>/kaggle``.
        features_path : str, optional
            Path of the YAML feature description.  Defaults to
            ``<cwd>/features.yml``.
        """
        cwd = os.getcwd()
        if data_dir is None:
            data_dir = os.path.join(cwd,'kaggle')
        if features_path is None:
            features_path = os.path.join(cwd,'features.yml')

        # The first CSV column is used as the row index of both frames.
        self.train_df = pd.read_csv(os.path.join(data_dir,'train.csv'),header=0,index_col=0)
        self.test_df = pd.read_csv(os.path.join(data_dir,'test.csv'),header=0,index_col=0)

        # NOTE(review): FullLoader can build more than plain data types; if
        # features.yml could ever come from an untrusted source, switch to
        # yaml.safe_load.
        with open(features_path,'r') as file:
            self.features_yml = yaml.load(file, Loader=yaml.FullLoader)
        self.features = None

        self._get_features()

        # Cleaned inputs (y_test is never populated by this class).
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

        # One-hot encoded categorical features.
        self.train_dummies = None
        self.test_dummies = None
        self.dummies = None

        # Standardised numeric features and the scaler fitted on train.
        self.scaler = None
        self.train_normalized = None
        self.test_normalized = None

        # Dummies joined with the standardised numeric features.
        self.train_combined = None
        self.test_combined = None

        # Filled by select().
        self.selector = None
        self.selected_features = None
        self.train_selected = None
        self.test_selected = None

        # Filled by split() (y_pred is never populated by this class).
        self.X_fit = None
        self.y_fit = None
        self.X_score = None
        self.y_score = None
        self.X_pred = None
        self.y_pred = None

        self._clean()
        self._dummy()
        self._normalize()
        self._combine()

    def _get_features(self):
        """Group the feature names from ``features_yml`` by their ``dtype``.

        Stores a dict in ``self.features`` with one list per dtype plus an
        ``'all'`` list holding every feature in file order.
        """
        features = {'all':[]}
        for feature,info in self.features_yml.items():
            features['all'].append(feature)
            features.setdefault(info['dtype'],[]).append(feature)
        self.features = features

    def _clean(self):
        """Split off the target column and replace missing values with 0."""
        target_columns = self.features['target']
        # The target is taken from the unfilled frame; only the first column
        # listed under 'target' is used as y.
        self.y_train = self.train_df.loc[:,target_columns[0]]
        self.X_train = self.train_df.fillna(0).drop(target_columns,axis=1)
        self.X_test = self.test_df.fillna(0)

    def _dummy(self):
        """One-hot encode the ``cat`` features with one shared column layout.

        The training columns define the layout.  The test frame is reindexed
        to it: categories missing from test become all-zero columns, and
        categories that only occur in test are dropped.  Adding those test-only
        categories to the training frame would create constant all-zero
        columns that carry no signal (presumably the source of the divide
        warning emitted by f_regression in ``select`` -- confirm).  Using the
        training column order also keeps the layout identical between runs.
        """
        cat_columns = self.features['cat']
        self.train_dummies = pd.get_dummies(self.X_train[cat_columns])
        test_dummies = pd.get_dummies(self.X_test[cat_columns])

        self.dummies = list(self.train_dummies.columns)
        self.test_dummies = test_dummies.reindex(columns=self.dummies,fill_value=0)

    def _normalize(self):
        """Standardise the ``int`` features using statistics of the train set.

        The scaler is fitted on the training data only and then applied to the
        test data, so both frames share the same mean/variance transformation.
        """
        numeric_columns = self.features['int']
        self.scaler = StandardScaler()
        self.train_normalized = pd.DataFrame(
            self.scaler.fit_transform(self.X_train[numeric_columns])
            ,columns=numeric_columns,index=self.X_train.index
        )
        # transform (not fit_transform): re-fitting on test would scale the
        # two sets differently.
        self.test_normalized = pd.DataFrame(
            self.scaler.transform(self.X_test[numeric_columns])
            ,columns=numeric_columns,index=self.X_test.index
        )

    def _combine(self):
        """Join the dummy columns and the standardised columns on the index."""
        self.train_combined = self.train_dummies.join(self.train_normalized,how='inner')
        self.test_combined = self.test_dummies.join(self.test_normalized,how='inner')

    def select(self,num_columns=120):
        """Keep the ``num_columns`` columns ranked best by ``f_regression``.

        The selector is fitted on the combined training frame against
        ``y_train``; the same columns are then taken from the test frame.
        """
        self.selector = SelectKBest(score_func=f_regression, k=num_columns)
        self.selector.fit(self.train_combined,self.y_train)
        self.selected_features = self.selector.get_feature_names_out(self.selector.feature_names_in_)

        self.train_selected = self.train_combined[self.selected_features]
        self.test_selected = self.test_combined[self.selected_features]

    def split(self,test_size=0.2,random_state=42):
        """Split the selected training data into a fit part and a score part.

        Parameters
        ----------
        test_size : float, optional
            Share of the training rows held out for scoring (default 0.2).
        random_state : int, optional
            Seed passed to ``train_test_split`` (default 42).

        ``X_pred`` is set to the selected test frame.  ``select`` must have
        been called first.
        """
        (self.X_fit,self.X_score,
        self.y_fit,self.y_score) = train_test_split(
            self.train_selected,self.y_train,
            test_size=test_size,random_state=random_state
        )
        self.X_pred = self.test_selected

In [93]:
# Build the prepared data set, keep the best-scoring columns and carve out
# the hold-out part that the models are scored on.
NUM_SELECTED_COLUMNS = 120

preprocessor = Preprocessor()
preprocessor.select(num_columns=NUM_SELECTED_COLUMNS)
preprocessor.split()

  correlation_coefficient /= X_norms


In [98]:
class Model(object):
    """Wrap one scikit-learn regressor and sweep a single hyper-parameter.

    ``model_type`` selects the estimator: ``'gbr'`` (gradient boosting),
    ``'knn'`` (k nearest neighbours) or ``'ridge'``.  ``tune`` fits one
    estimator per candidate setting on ``X_fit``/``y_fit``, scores it on
    ``X_score``/``y_score`` and keeps the best one in ``final``; ``predict``
    applies ``final`` to ``X_pred``.
    """

    # Step added to the swept hyper-parameter on each tuning iteration.
    _INCREMENTS = {'gbr': 0.01, 'knn': 1, 'ridge': 0.1}

    def __init__(self,model_type,X_fit,y_fit,X_score,y_score,X_pred):
        """Store the data and create the estimator for ``model_type``.

        Raises
        ------
        Exception
            If ``model_type`` is not one of ``'gbr'``, ``'knn'``, ``'ridge'``.
        """
        if model_type not in self._INCREMENTS:
            raise Exception('Unknown model type')

        self.model_type = model_type
        self.increment = self._INCREMENTS[model_type]
        self.model = self._make_estimator()

        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score
        self.X_pred = X_pred
        self.y_pred = None

        self.iterations = []
        self.best = None
        self.final = None
        # predict() stores its result under both names; `output` is the name
        # the method has always written, `prediction` the one declared here.
        self.prediction = None
        self.output = None

    class Iteration(object):
        """Record of one tuning step: fitted model, its scores and params."""
        def __init__(self,model,corr,mse,params):
            self.model = model
            self.corr = corr
            self.mse = mse
            self.params = params

    def _make_estimator(self):
        """Return a new, unfitted estimator for ``self.model_type``."""
        if self.model_type == 'gbr':
            return GradientBoostingRegressor()
        if self.model_type == 'knn':
            return KNeighborsRegressor()
        if self.model_type == 'ridge':
            return Ridge()
        raise Exception('Unknown model type')

    def _params_for(self,incr):
        """Return the parameter dict for one tuning step offset by ``incr``."""
        if self.model_type == 'gbr':
            return {
                'n_estimators': 1000,
                'max_depth': 3,
                'min_samples_split': 5,
                'learning_rate': 0.01+incr,
                'loss': 'squared_error',
            }
        if self.model_type == 'knn':
            return {'n_neighbors': 1+incr,}
        return {'alpha': 0.2+incr,}

    def _fit(self):
        """Fit the current estimator on the fit split."""
        self.model.fit(X=self.X_fit,y=self.y_fit)

    def _assess(self):
        """Score the current estimator on the score split.

        Returns ``(corr, mse)``, both rounded to 4 decimals.  ``corr`` is the
        value of the estimator's ``score`` method, which for scikit-learn
        regressors is the R^2 coefficient of determination rather than a
        correlation; the attribute name is kept for compatibility.
        """
        corr = round(self.model.score(X=self.X_score,y=self.y_score),4)
        mse = round(mean_squared_error(self.y_score, self.model.predict(self.X_score)),4)
        return corr,mse

    def tune(self,iterations=20):
        """Fit and score ``iterations`` candidate settings and keep the best.

        Results are appended to ``self.iterations`` (earlier calls are kept).
        ``self.best`` becomes the record with the highest ``corr`` (the first
        one on ties) and ``self.final`` its fitted model.

        Raises
        ------
        ValueError
            If no iteration has been recorded (``iterations`` < 1 on the
            first call).
        """
        for i in range(iterations):
            incr = round((i*self.increment),4)
            params = self._params_for(incr)

            # A fresh estimator per candidate: re-fitting one shared object
            # would leave every record pointing at the last fit, so `final`
            # would not be the best-scoring model.
            self.model = self._make_estimator()
            self.model.set_params(**params)

            self._fit()
            corr,mse = self._assess()
            self.iterations.append(self.Iteration(self.model,corr,mse,params))

        if not self.iterations:
            raise ValueError('tune() needs at least one iteration')

        self.best = max(self.iterations,key=lambda record: record.corr)
        self.final = self.best.model

    def predict(self):
        """Predict ``SalePrice`` for ``X_pred`` with the best tuned model.

        The frame is indexed like ``X_pred``, stored in ``self.prediction``
        and ``self.output`` and also returned.

        Raises
        ------
        RuntimeError
            If ``tune`` has not produced a final model yet.
        """
        if self.final is None:
            raise RuntimeError('tune() must be called before predict()')

        self.prediction = pd.DataFrame(
            self.final.predict(self.X_pred),
            columns=['SalePrice'],index=self.X_pred.index
        )
        self.output = self.prediction
        return self.prediction

    def refresh(self,X_fit,y_fit,X_score,y_score):
        """Replace the fit and score splits; ``X_pred`` is left unchanged."""
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score

In [99]:
runs = 50
data = dict(
    X_fit=preprocessor.X_fit,
    y_fit=preprocessor.y_fit,
    X_score=preprocessor.X_score,
    y_score=preprocessor.y_score,
    X_pred=preprocessor.X_pred,
)


def _tune_and_report(model_type, headline):
    """Tune one Model on the shared data, predict, and print its best result.

    `headline` is printed in front of the best score; the best parameter
    dict is printed on the following line.  Returns the tuned Model.
    """
    candidate = Model(model_type, **data)
    candidate.tune(iterations=runs)
    candidate.predict()
    print(headline, candidate.best.corr)
    print(candidate.best.params)
    return candidate


# gradient boosted regression
gbr = _tune_and_report('gbr', 'GBDT regression correlation:')

# k nearest neighbors regression
knn = _tune_and_report('knn', 'KNN regression correlation:')

# ridge regression
ridge = _tune_and_report('ridge', 'Ridge regression correlation:')

GBDT regression correlation: 0.9179
{'n_estimators': 1000, 'max_depth': 3, 'min_samples_split': 5, 'learning_rate': 0.08, 'loss': 'squared_error'}
KNN regression correlation: 0.8152
{'n_neighbors': 2}
Ridge regression correlation: 0.8796
{'alpha': 4.5}
