In [None]:
# System
import os
import yaml
import datetime

# Data Analysis
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Machine Learning
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier

In [None]:
class Preprocessor(object):
    """Load the beer data set and prepare predictors and target for modelling.

    The constructor reads the CSV, renames the target column to ``'Target'``
    and the ``'key'`` column to ``'Key'``, indexes rows by ``'Key'``, keeps
    only the columns listed in ``self.fields`` as predictors and standardises
    them with ``StandardScaler``. ``select`` and ``split`` must then be called
    explicitly, in that order.

    NOTE(review): the scaler (and the RFE selector in ``select``) are fitted
    on every row before ``split`` is called, so the held-out rows influence
    the preprocessing. Confirm this is acceptable for the analysis.

    Parameters
    ----------
    target : str
        Name of the CSV column to predict.
    data_path : str or path-like, optional
        Location of the CSV file. Defaults to ``beer_data_set.csv`` in the
        current working directory.
    """
    
    def __init__(self,target,data_path=None):
        # Keep the historical default (file next to the notebook's working
        # directory) while letting callers point at another location.
        if data_path is None:
            data_path = os.path.join(os.getcwd(),'beer_data_set.csv')
        
        self.target = target
        self.rename = {target: 'Target','key':'Key'}
        self.fields = [
            'ABV','Ave Rating','Min IBU',
            'Astringency','Body','Alcohol','Bitter',
            'Sweet','Sour','Salty','Fruits','Hoppy',
            'Spices','Malty'
        ]
        self.input_df = pd.read_csv(data_path)
        
        # Populated by _clean()
        self.y = None
        self.X = None
        
        # Populated by _normalize()
        self.scaler = None
        self.X_normalized = None
        
        # Populated by select()
        self.selector = None
        self.estimator= None
        self.selected_features = None
        self.X_selected = None
        
        # Populated by split()
        self.X_fit = None
        self.y_fit = None
        self.X_score = None
        self.y_score = None
        
        self._clean()
        self._normalize()
                    
    def _get_features(self):
        """Group the configured column names by role.

        Returns
        -------
        dict
            ``'all'`` lists every name in ``self.fields``; ``'target'`` holds
            any entry equal to ``'Target'``; ``'int'`` holds the remaining
            names.
        """
        features = {'all':[],'target':[],'int':[]}
        for field in self.fields:
            features['all'].append(field)
            if field == 'Target':
                features['target'].append(field)
            else:
                features['int'].append(field)
        # The grouping used to be built and then discarded; hand it back.
        return features
    
    def _clean(self):
        """Rename, index and subset the raw frame into ``self.X`` / ``self.y``.

        Raises
        ------
        KeyError
            If the key column, the target column or any predictor in
            ``self.fields`` is absent from the input data.
        """
        df = self.input_df.rename(self.rename,axis=1)
        
        # Fail early with the full list of absent columns instead of the
        # first one pandas happens to trip over.
        required = ['Key','Target']+self.fields
        missing = [name for name in required if name not in df.columns]
        if missing:
            raise KeyError(f'Input data is missing required column(s): {missing}')
        
        df = df.set_index('Key')
        df = df.loc[:,['Target']+self.fields]
        self.X = df.drop('Target',axis=1)
        self.y = df.Target
    
    def _normalize(self):
        """Fit a ``StandardScaler`` on ``self.X`` and store the scaled frame."""
        self.scaler = StandardScaler()
        self.scaler.fit(self.X)
        
        # transform() returns a bare array; restore the labels and index.
        self.X_normalized = pd.DataFrame(
            self.scaler.transform(self.X)
            ,columns=self.fields,index=self.X.index
        )
    
    def select(self,num_columns=14):
        """Keep ``num_columns`` predictors chosen by recursive feature elimination.

        A ``DecisionTreeClassifier`` ranks the standardised predictors and one
        feature is dropped per RFE step. The surviving column names are stored
        in ``self.selected_features`` and the reduced frame in
        ``self.X_selected``.
        """
        self.estimator = DecisionTreeClassifier()
        self.selector = RFE(self.estimator, n_features_to_select=num_columns, step=1)
        self.selector.fit(self.X_normalized,self.y)
        self.selected_features = self.selector.get_feature_names_out(self.selector.feature_names_in_)
        self.X_selected = self.X_normalized[self.selected_features]
        
    def split(self,test_size=0.1,random_state=42):
        """Split the selected predictors and target into fit / score subsets.

        Parameters
        ----------
        test_size : float or int, optional
            Passed to ``train_test_split``; defaults to ``0.1``.
        random_state : int, optional
            Seed passed to ``train_test_split``; defaults to ``42``.
        """
        (self.X_fit,self.X_score,
        self.y_fit,self.y_score) = train_test_split(
            self.X_selected,self.y,
            test_size=test_size,random_state=random_state
        )
    
    def audit(self):
        """Placeholder; performs no checks yet."""
        return None

In [None]:
class Model(object):
    """Grid-searched classifier together with its fit / score data.

    Parameters
    ----------
    model_type : str
        ``'gbc'`` for ``GradientBoostingClassifier`` or ``'knn'`` for
        ``KNeighborsClassifier``.
    X_fit, y_fit
        Predictors and labels used to fit the grid search.
    X_score, y_score
        Held-out predictors and labels used to score the tuned model.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported keys.
    """
    def __init__(self,model_type,X_fit,y_fit,X_score,y_score):
        self.model_type = model_type
        
        if self.model_type == 'gbc':
            self.model = GradientBoostingClassifier()
            # NOTE(review): this grid is 7*10*8*19*2 = 21,280 combinations,
            # each cross-validated, with up to 3,500 estimators - consider
            # trimming it or switching to a randomised search.
            self.parameters = {
                'n_estimators': [500*i for i in range(1,8)],
                'max_depth': [2+i for i in range(10)],
                'min_samples_split': [3+i for i in range(8)],
                'learning_rate': [0.2+(0.1*i) for i in range(1,20)],
                # 'loss' is deliberately not searched: 'squared_error',
                # 'absolute_error', 'huber' and 'quantile' are regressor
                # losses that GradientBoostingClassifier rejects, so every
                # candidate fit would fail. The estimator default is used.
                #
                # 'mse' / 'mae' are legacy criterion aliases that recent
                # scikit-learn releases no longer accept.
                'criterion': ['friedman_mse','squared_error'],
            }
        elif self.model_type == 'knn':
            self.model = KNeighborsClassifier()
            self.parameters = {
                'n_neighbors': [1+i for i in range(20)],
                'weights': ['uniform','distance'],
                'algorithm': ['auto','ball_tree','kd_tree'],
                'leaf_size': [10+i for i in range(20)],
                'p': [1,2],
                'metric': ['minkowski'],
            }
        else:
            # ValueError is still caught by callers using `except Exception`.
            raise ValueError('Unknown model type')
        
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score
        
        # Populated by tune()
        self.grid = None
        self.corr = None
        self.mse = None
        
        self.final = None
        self.prediction = None
    
    @staticmethod
    def _is_numeric(values):
        """Return True when ``values`` carries a numeric dtype."""
        dtype = getattr(values,'dtype',None)
        if dtype is None:
            dtype = np.asarray(values).dtype
        return bool(pd.api.types.is_numeric_dtype(dtype))
    
    def tune(self):
        """Run the grid search on the fit data and score it on the held-out data.

        ``self.corr`` receives ``GridSearchCV.score`` on the score split
        rounded to 4 decimals; with the classifiers built here and no custom
        ``scoring`` that is mean accuracy, despite the attribute name.
        ``self.mse`` is the rounded mean squared error between true and
        predicted labels when both are numeric, and ``None`` otherwise,
        because the metric cannot be computed on string class labels.
        """
        self.grid = GridSearchCV(self.model,self.parameters)
        self.grid.fit(X=self.X_fit,y=self.y_fit)
        self.corr = round(self.grid.score(X=self.X_score,y=self.y_score),4)
        
        predicted = self.grid.predict(self.X_score)
        if self._is_numeric(self.y_score) and self._is_numeric(predicted):
            self.mse = round(mean_squared_error(self.y_score, predicted),4)
        else:
            # Avoid crashing after an expensive search on categorical labels.
            self.mse = None

    def refresh(self,X_fit,y_fit,X_score,y_score):
        """Swap in new fit / score data; previous tuning results are kept as-is."""
        self.X_fit = X_fit
        self.y_fit = y_fit
        self.X_score = X_score
        self.y_score = y_score
    
    def audit(self):
        """Placeholder; performs no checks yet."""
        return None

In [None]:
# Load and standardise the data with 'Style' as the label, keep 14 predictors
# through feature selection, then carve out the held-out scoring split.
preprocessor = Preprocessor('Style')
preprocessor.select(14)
preprocessor.split()

In [None]:
# Fit / score splits shared by every model below.
data = {
    'X_fit': preprocessor.X_fit,
    'y_fit': preprocessor.y_fit,
    'X_score': preprocessor.X_score,
    'y_score': preprocessor.y_score,
}

# Gradient boosted classifier. Model.corr holds GridSearchCV.score on the
# held-out split, which for a classifier is mean accuracy, not a correlation.
gbc = Model('gbc',**data)
gbc.tune()
print('GBDT classifier accuracy:',gbc.corr)

# k nearest neighbours classifier
knn = Model('knn',**data)
knn.tune()
print('KNN classifier accuracy:',knn.corr)