In [1]:
import pandas as pd
import numpy as np
import joblib
from typing import List

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from catboost import CatBoostRegressor

In [2]:
data_train = pd.read_csv("../data/drinks_data_train.csv")
data_test = pd.read_csv("../data/drinks_data_valid.csv")

In [3]:
data_train = data_train[~data_train['ibu'].isnull()]
data_train

Unnamed: 0,id,name,abv,ibu,target_fg,target_og,ebc,srm,ph
2,266,Small Batch: Lemon Meringue Pie,6.5,5.0,1015.0,1066.0,7.0,4.0,4.0
3,31,Nanny State,0.5,55.0,1005.0,1007.0,30.0,15.0,4.4
4,89,Citra,7.5,70.0,1013.0,1068.0,30.0,15.0,4.4
7,60,Dogma,7.5,30.0,1023.0,1080.0,46.0,23.0,4.5
9,106,Punk IPA 2010 - Current,5.6,40.0,1011.0,1055.0,15.0,7.6,4.4
...,...,...,...,...,...,...,...,...,...
236,294,Opaque Jake,7.2,20.0,1009.0,1065.0,20.0,10.0,4.4
237,77,Hobo Pop,4.2,50.0,1010.0,1042.0,30.0,15.0,4.4
239,163,This. Is. Lager,4.7,37.0,1007.0,1043.0,10.0,6.0,4.2
242,191,Interstellar,6.5,55.0,1011.0,1059.0,50.0,25.0,4.4


In [4]:
to_drop_columns = [
    "id",
    "name"
]

no_standard_scaler_variables = [
    "ibu"
]

In [5]:
def fix_standard_scaler_variables(variables_list: List, target_dataframe: pd.DataFrame,
                                  original_dataframe: pd.DataFrame):
    
    fixed_dataframe = target_dataframe.copy()
    for var in variables_list:
        fixed_dataframe[var] = original_dataframe[var]
        
    return fixed_dataframe

In [6]:
def standard_scaler_dataframe(target_dataframe: pd.DataFrame,
                              original_dataframe: pd.DataFrame):
    transformed_dataframe = StandardScaler().fit_transform(target_dataframe)
    transformed_dataframe = pd.DataFrame(transformed_dataframe, index=original_dataframe.index,
                                        columns=original_dataframe.columns)
    
    return transformed_dataframe

In [7]:
class StandardScalerTransformers(BaseEstimator, TransformerMixin):
    """
    Transformer which creates the fake_var variable.
    """

    def __init__(self, variables_list: List):
        self.variables_list = variables_list

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        Y = X.copy()
        Y = standard_scaler_dataframe(Y, X)
        Y = fix_standard_scaler_variables(self.variables_list, Y, X)
        
        return Y

In [8]:
class TargetDiffTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer which creates the target_dg variable.
    """

    def __init__(self, final_var: str, original_var: str):
        self.final_var = final_var
        self.original_var = original_var

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        Y = X.copy()
        Y["target_dg"] = Y[self.final_var] - Y[self.original_var]

        return Y


class SelectVariablesTransformers(BaseEstimator, TransformerMixin):
    """
    Transformer which drops unwanted variables.
    """

    def __init__(self, variables_list: List):
        self.variables_list = variables_list

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        Y = X.copy()
        Y = Y.drop(self.variables_list, axis=1)
        
        return Y
    


In [9]:
example_pipeline = Pipeline([
    ('target_diff', TargetDiffTransformer("target_fg", "target_og")),
    ('drop_variables', SelectVariablesTransformers(to_drop_columns)),
    ('standard_scaler', StandardScalerTransformers(no_standard_scaler_variables))
])

model_dataframe = example_pipeline.fit_transform(data_train)

In [10]:
model_dataframe.isna().sum()

abv          0
ibu          0
target_fg    0
target_og    0
ebc          0
srm          0
ph           0
target_dg    0
dtype: int64

In [11]:
X = model_dataframe.drop("ibu", axis=1)
y = model_dataframe["ibu"]

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)


In [12]:
model = CatBoostRegressor(iterations=400,
                          learning_rate=0.1)


model.fit(X_train, y_train)

# Get predictions
y_pred = model.predict(X_test)

0:	learn: 22.0642229	total: 46.8ms	remaining: 18.7s
1:	learn: 21.2840504	total: 48.1ms	remaining: 9.58s
2:	learn: 20.6313829	total: 49.3ms	remaining: 6.53s
3:	learn: 19.9677285	total: 50.1ms	remaining: 4.96s
4:	learn: 19.3815146	total: 51.2ms	remaining: 4.05s
5:	learn: 18.8112902	total: 52.1ms	remaining: 3.42s
6:	learn: 18.2291368	total: 53.2ms	remaining: 2.98s
7:	learn: 17.7561970	total: 53.9ms	remaining: 2.64s
8:	learn: 17.2898273	total: 54.9ms	remaining: 2.38s
9:	learn: 16.9220689	total: 55.9ms	remaining: 2.18s
10:	learn: 16.6927694	total: 56.6ms	remaining: 2s
11:	learn: 16.4307766	total: 57.2ms	remaining: 1.85s
12:	learn: 16.2124832	total: 58.5ms	remaining: 1.74s
13:	learn: 15.8588319	total: 59.2ms	remaining: 1.63s
14:	learn: 15.5963287	total: 60.1ms	remaining: 1.54s
15:	learn: 15.2745647	total: 60.8ms	remaining: 1.46s
16:	learn: 15.0851010	total: 61.3ms	remaining: 1.38s
17:	learn: 14.8285776	total: 62ms	remaining: 1.31s
18:	learn: 14.6393184	total: 63ms	remaining: 1.26s
19:	learn:

317:	learn: 1.8488099	total: 212ms	remaining: 54.7ms
318:	learn: 1.8193468	total: 213ms	remaining: 54ms
319:	learn: 1.7993997	total: 213ms	remaining: 53.2ms
320:	learn: 1.7876353	total: 214ms	remaining: 52.7ms
321:	learn: 1.7846665	total: 214ms	remaining: 51.9ms
322:	learn: 1.7769827	total: 215ms	remaining: 51.2ms
323:	learn: 1.7613887	total: 215ms	remaining: 50.5ms
324:	learn: 1.7348401	total: 216ms	remaining: 49.8ms
325:	learn: 1.7107761	total: 216ms	remaining: 49ms
326:	learn: 1.7006237	total: 216ms	remaining: 48.3ms
327:	learn: 1.6952842	total: 217ms	remaining: 47.6ms
328:	learn: 1.6810622	total: 217ms	remaining: 46.8ms
329:	learn: 1.6698632	total: 217ms	remaining: 46.1ms
330:	learn: 1.6541762	total: 218ms	remaining: 45.4ms
331:	learn: 1.6455805	total: 218ms	remaining: 44.7ms
332:	learn: 1.6344699	total: 219ms	remaining: 44ms
333:	learn: 1.6161229	total: 219ms	remaining: 43.2ms
334:	learn: 1.6004791	total: 219ms	remaining: 42.5ms
335:	learn: 1.5892308	total: 219ms	remaining: 41.8ms

In [13]:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

print("Mean Absolute Error    :", mae)
print("Mean Squared Error     :", mse)
print("Root Mean Squared Error:", rmse)

Mean Absolute Error    : 11.419921907337839
Mean Squared Error     : 237.9741055305563
Root Mean Squared Error: 15.426409353137116


In [15]:
filename = 'catboost_model_latest.pkl'
joblib.dump(model, filename)

['catboost_model_latest.pkl']

In [17]:
loaded_model = joblib.load(filename)

0.49642170354663606


Unnamed: 0,id,name,abv,ibu,target_fg,target_og,ebc,srm,ph
0,5,Avery Brown Dredge,7.2,59.0,1027.0,1069.0,10.0,5.0,4.4
1,9,AB:07,12.5,30.0,1020.0,1106.0,84.0,42.0,5.6
2,15,Mixtape 8,14.5,50.0,1014.0,1093.0,40.0,20.0,4.4
3,16,Libertine Porter,6.1,45.0,1020.0,1067.0,219.0,109.5,4.4
4,24,The End Of History,55.0,,1000.0,1112.0,,,4.4
...,...,...,...,...,...,...,...,...,...
76,302,Hazy Jane Bourbon Barrel Aged,7.2,30.0,1009.0,1065.0,15.0,8.0,4.2
77,307,Kamikaze Knitting Club,7.5,30.0,1013.0,1071.0,130.0,66.0,4.6
78,315,Baltic Fleet,7.2,45.0,1016.0,1071.0,120.0,61.0,4.2
79,318,Neverland,5.0,40.0,1007.0,1048.0,12.0,6.0,4.2


In [20]:
prod_dataframe = example_pipeline.fit_transform(data_test)

In [21]:
loaded_model.predict(prod_dataframe)

array([43.86490696, 46.19651935, 56.51115621, 58.04892684, 52.48244687,
       34.78003786, 44.2514435 , 48.25515831, 53.58641364, 45.72273423,
       36.0740386 , 62.57459892, 57.06809994, 54.34037365, 49.12491996,
       52.16547502, 39.46518989, 61.23146132, 35.12804484, 48.25515831,
       37.99106137, 42.51649526, 52.27006818, 38.83186931, 37.50692386,
       49.83323731, 50.38074551, 56.97391232, 24.77151441, 47.56123159,
       48.25515831, 44.2514435 , 44.12214169, 51.20948923, 58.46872129,
       48.23911217, 44.75457871, 40.64686624, 60.83911833, 62.57459892,
       45.72873687, 45.69100646, 60.88576755, 42.88537633, 23.33916847,
       66.22623055, 65.43280745, 60.18632322, 68.81586588, 35.01214563,
       54.31625331, 51.49026924, 31.54904544, 47.60780808, 42.77038993,
       35.01214563, 64.54357175, 18.89933074, 58.05355963, 31.46193612,
       58.21492395, 39.95613147, 50.58546442, 54.71716928, 33.26593795,
       53.21707248, 43.45876339, 11.66411242, 33.58515403, 55.32