In [60]:
# System
import os
import datetime
from itertools import product

# Data Analysis
import re
import numpy as np
import pandas as pd

# Machine Learning
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [61]:
class Preprocessor(object):
    """
    Load the beer data set and turn it into a feature matrix and target.

    Each step reads ``self.X`` (``decompose`` reads ``self.X_standardized``)
    and overwrites ``self.X`` with its result, so the steps are order
    dependent: ``clean`` -> optional ``engineer`` -> ``standardize`` ->
    optional ``decompose`` -> ``select`` -> ``split``.

    Parameters
    ----------
    target : str
        Column of the input CSV to predict; it is renamed to ``'Target'``.
    stage : str
        One of ``'dev'``, ``'test'`` or ``'prod'``. It is validated and stored
        but not otherwise used by this class.

    Raises
    ------
    ValueError
        If ``stage`` is not one of the known stages.
    """

    STAGES = ('dev', 'test', 'prod')

    def __init__(self, target, stage):
        # Explicit check rather than `assert`: assertions are removed under
        # `python -O`, which would silently disable this validation.
        if stage not in self.STAGES:
            raise ValueError('Unknown stage')
        self.stage = stage

        self.target = target
        self.rename = {target: 'Target', 'key': 'Key'}
        self.fields = [
            'ABV', 'Ave Rating', 'Min IBU',
            'Astringency', 'Body', 'Alcohol', 'Bitter',
            'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy',
            'Spices', 'Malty',
        ]
        # The data file is resolved relative to the current working directory.
        self.input_df = pd.read_csv(os.path.join(os.getcwd(), 'beer_data_set.csv'))

        # Set by clean() / engineer().
        self.y = None
        self.X_base = None
        self.X = None
        self.features = None

        # Set by standardize().
        self.scaler = None
        self.X_standardized = None

        # Set by decompose().
        self.pca = None
        self.X_decomposed = None

        # Set by select().
        self.selector = None
        self.estimator = None
        self.selected_features = None
        self.X_selected = None

        # Set by split().
        self.X_fit = None
        self.y_fit = None
        self.X_score = None
        self.y_score = None

    def clean(self):
        """Rename, index by ``'Key'`` and subset the raw frame into ``y`` and ``X``."""
        df = (
            self.input_df
            .rename(self.rename, axis=1)
            .set_index('Key')
            .loc[:, ['Target'] + self.fields]
        )
        self.y = df['Target']
        X = df.drop('Target', axis=1)
        # X_base and X start out as the same object; later steps rebind X only.
        self.X_base = X
        self.X = X
        self.features = X.columns

    def engineer(self):
        """Append the product and the sum of every unordered pair of fields to ``X``.

        A field is also paired with itself, so ``'f * f'`` and ``'f + f'``
        columns are produced as well.
        """
        X = self.X

        # Sorting the de-duplicated pairs fixes the column order; iterating the
        # set directly would depend on string hashing and differ between runs.
        pairs = sorted({tuple(sorted(pair)) for pair in product(self.fields, repeat=2)})

        extra = []
        for left, right in pairs:
            extra.append((X[left] * X[right]).rename(left + ' * ' + right))
            extra.append((X[left] + X[right]).rename(left + ' + ' + right))

        X = pd.concat([X] + extra, axis=1)
        self.X = X
        self.features = X.columns

    def standardize(self):
        """Fit a ``StandardScaler`` on ``X`` and replace ``X`` with the scaled frame.

        NOTE(review): when this runs before ``split()`` the scaler is fitted on
        rows that later end up in the scoring set.
        """
        self.scaler = StandardScaler().fit(self.X)
        self.X_standardized = pd.DataFrame(
            self.scaler.transform(self.X),
            columns=self.features,
            index=self.X.index,
        )
        self.X = self.X_standardized

    def decompose(self):
        """Project the standardized features with ``PCA(0.95)`` and store the result in ``X``.

        The component columns are named ``'0'``, ``'1'``, ... as strings.

        Raises
        ------
        ValueError
            If ``standardize()`` has not been called yet.
        """
        if self.X_standardized is None:
            raise ValueError('standardize() must be called before decompose()')

        self.pca = PCA(0.95).fit(self.X_standardized)
        decomposed = pd.DataFrame(
            self.pca.transform(self.X_standardized),
            index=self.X_standardized.index,
        )
        decomposed.columns = [str(i) for i in decomposed.columns]
        self.X_decomposed = decomposed
        self.X = self.X_decomposed

    def select(self, num_columns, random_state=None):
        """Keep ``num_columns`` columns of ``X`` using RFE over a decision tree.

        Parameters
        ----------
        num_columns : int
            Passed to ``RFE`` as ``n_features_to_select``.
        random_state : int or None, optional
            Seed for the decision tree. The default ``None`` leaves the tree
            unseeded, as before; pass an int for a repeatable selection.
        """
        self.estimator = DecisionTreeClassifier(random_state=random_state)
        self.selector = RFE(self.estimator, n_features_to_select=num_columns, step=1)
        self.selector.fit(self.X, self.y)
        self.selected_features = self.selector.get_feature_names_out(
            self.selector.feature_names_in_
        )
        self.X_selected = self.X[self.selected_features]
        self.X = self.X_selected

    def split(self):
        """Split ``X`` / ``y`` into fit and score sets (10% scoring, ``random_state=42``)."""
        (self.X_fit, self.X_score,
         self.y_fit, self.y_score) = train_test_split(
            self.X, self.y,
            test_size=0.1, random_state=42,
        )

    def audit(self):
        """Placeholder; not implemented."""
        return None

In [62]:
class Model(object):
    """
    Pair a classifier with stage-dependent hyper-parameters and score it.

    In the ``'dev'`` stage ``parameters`` holds fixed values that are applied
    with ``set_params``; in ``'test'`` and ``'prod'`` it holds lists of
    candidate values that are searched with ``GridSearchCV``.

    Parameters
    ----------
    model_type : str
        ``'gbc'`` (gradient boosting), ``'knn'`` (k nearest neighbours),
        ``'xgb'`` (XGBoost) or ``'rfc'`` (random forest).
    stage : str
        ``'dev'``, ``'test'`` or ``'prod'``.
    X_fit, y_fit
        Data the model is fitted on.
    X_score, y_score
        Held-out data the fitted model is scored on.

    Raises
    ------
    ValueError
        If ``model_type`` or ``stage`` is not recognised.
    """

    MODEL_TYPES = ('gbc', 'knn', 'xgb', 'rfc')
    STAGES = ('dev', 'test', 'prod')

    def __init__(self, model_type, stage, X_fit, y_fit, X_score, y_score):
        # Explicit checks rather than `assert`: assertions are removed under
        # `python -O`, which would let an unknown type or stage through and
        # leave `self.model` / `self.parameters` undefined.
        if model_type not in self.MODEL_TYPES:
            raise ValueError('Unknown model type')
        self.model_type = model_type

        if stage not in self.STAGES:
            raise ValueError('Unknown stage')
        self.stage = stage

        self.model, self.parameters = self._configure(model_type, stage == 'dev')

        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score

        self.grid = None   # GridSearchCV instance, set by tune() outside 'dev'
        self.corr = None   # rounded model.score(...) on the scoring set, set by tune()
        self.mse = None

        self.final = None
        self.prediction = None

    @staticmethod
    def _configure(model_type, is_dev):
        """Return ``(estimator, parameters)`` for an already validated model type."""
        if model_type == 'gbc':
            estimator = GradientBoostingClassifier()
            fixed = {
                'n_estimators': 100,
                'learning_rate': 0.1,
            }
            grid = {
                'n_estimators': [40*i for i in range(1, 4)],
                'learning_rate': [0.05+(0.02*i) for i in range(1, 5)],
            }
        elif model_type == 'knn':
            estimator = KNeighborsClassifier()
            fixed = {
                'n_neighbors': 7,
                'leaf_size': 10,
            }
            grid = {
                'n_neighbors': [1+i for i in range(10)],
                'leaf_size': [5+i for i in range(10)],
            }
        elif model_type == 'xgb':
            # NOTE(review): 'gpu_hist' and 'use_label_encoder' look tied to older
            # XGBoost releases - confirm against the installed version.
            estimator = XGBClassifier()
            fixed = {
                'tree_method': 'gpu_hist',
                'enable_categorical': True,
                'use_label_encoder': True,
            }
            grid = {
                'tree_method': ['gpu_hist'],
                'enable_categorical': [True],
                'use_label_encoder': [True],
            }
        else:  # 'rfc'
            estimator = RandomForestClassifier()
            fixed = {}
            grid = {
                'n_estimators': [100],
                'criterion': ['gini'],
                'max_depth': [None],
            }
        return estimator, (fixed if is_dev else grid)

    def tune(self):
        """Fit the model on the fit set and store its score on the scoring set.

        In ``'dev'`` the fixed parameters are applied and the model is fitted
        once. Otherwise a ``GridSearchCV`` over ``parameters`` is fitted and
        ``self.model`` is replaced by its ``best_estimator_``.

        ``self.corr`` receives ``model.score(X_score, y_score)`` rounded to four
        decimals. For scikit-learn classifiers that value is the mean accuracy,
        not a correlation coefficient.
        """
        if self.stage == 'dev':
            self.model.set_params(**self.parameters)
            self.model.fit(X=self.X_fit, y=self.y_fit)
        else:
            self.grid = GridSearchCV(self.model, self.parameters)
            self.grid.fit(X=self.X_fit, y=self.y_fit)
            self.model = self.grid.best_estimator_

        self.corr = round(self.model.score(X=self.X_score, y=self.y_score), 4)

    def refresh(self, X_fit, y_fit, X_score, y_score):
        """Replace the stored fit and score data; the model itself is not refitted."""
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score

    def audit(self):
        """Placeholder; not implemented."""
        return None

In [63]:
# Run configuration: one of "dev", "test" or "prod".
stage = "dev"

In [64]:
# Feature pipeline for this run: clean -> standardize -> select -> split.
# The optional engineer() step (after clean) and decompose() step (after
# standardize) are left out of this run.
preprocessor = Preprocessor(stage=stage, target='Style')
preprocessor.clean()
preprocessor.standardize()
# 14 equals the number of base fields, so with the optional steps disabled
# the selector keeps every column.
preprocessor.select(14)
preprocessor.split()

In [65]:
# Fit/score data shared by every model below.
model_data = {
    name: getattr(preprocessor, name)
    for name in ('X_fit', 'y_fit', 'X_score', 'y_score')
}


def run_model(label, model_type):
    """Build, tune and report one model for the current ``stage``.

    Prints the model's score (``Model.corr``) and the wall-clock runtime in
    seconds, then returns the tuned ``Model`` so it can be inspected later.

    Parameters
    ----------
    label : str
        Human-readable name used in the printed report.
    model_type : str
        Model key passed to ``Model``.
    """
    started = datetime.datetime.now()
    model = Model(model_type, stage=stage, **model_data)
    model.tune()
    print(label + ' classifier correlation:', model.corr)
    print('Runtime:', round((datetime.datetime.now() - started).total_seconds(), 2))
    return model


# k nearest neighbors classifier
knn = run_model('KNN', 'knn')

# xg boost classifier (disabled for this run)
# xgb = run_model('XGBoost', 'xgb')

# random forest classifier
rfc = run_model('Random Forest', 'rfc')

# gradient boosted classifier
gbc = run_model('GBDT', 'gbc')

KNN classifier correlation: 0.5108
Runtime: 0.13
Random Forest classifier correlation: 0.696
Runtime: 1.49


KeyboardInterrupt: 