In [311]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns',None)

path = '../../data/final/train.csv'
data = pd.read_csv(path)

In [324]:
from malbecs import modeling as mm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, IsolationForest
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import category_encoders as ce
from xgboost import XGBRegressor
from typing import List, Tuple
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import GroupKFold, KFold
from sklearn.base import clone
import catboost as cb
from sklearn.decomposition import PCA
from sklearn.model_selection._split import _BaseKFold
from sklearn.utils.validation import _num_samples
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

seed = 42


def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


rmse_scorer = make_scorer(rmse_score, greater_is_better=False)


def eval_model(m, X, y, train_test_idx: List[Tuple[pd.Series, pd.Series]]):
    res = {
        "rmse": [],
        "mape": [],
    }
    for idxs in train_test_idx:
        m_ = clone(m)
        train_idx, test_idx = idxs[0], idxs[1]
        X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
        m_.fit(X_train, y_train)
        y_pred = m_.predict(X_test)
        mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
        rmse = rmse_score(y_true=y_test, y_pred=y_pred)
        res['mape'].append(mape)
        res['rmse'].append(rmse)
    return res


class YearKFold(_BaseKFold):
    def __init__(self, test_idxs):
        self.test_idxs = test_idxs
        self.n_splits = len(test_idxs)

    def _iter_test_indices(self, X: pd.DataFrame, y=None, groups=None):
        for idx in self.test_idxs:
            yield idx


In [292]:
data_train = data.copy()
data_train = data[data['campana'] > 15].copy()


X = data_train.drop(columns="produccion")
y = data_train['produccion']
print(X.shape, y.shape)

test_idxs = []
for i in range(19, 22):
    test_idxs.append(X['campana'] == i)

cv = YearKFold(test_idxs)

# test_idxs = []
# for i in range(19, 22):
#     test_idxs.append(X_tree['campana'] == i)

# cv_tree = YearKFold(test_idxs)


(6231, 141) (6231,)


In [293]:
wine_cols = [
    'campana', 'id_finca', 'id_zona',
    'id_estacion', 'altitud', 'variedad',
    'modo', 'tipo', 'color',
    'superficie',
    'sup_tot_camp_finca', 'superficie_total',
    'n_var_camp_finca', 'prod_shift_1', 'prod_shift_2',
    'prod_var_shift_1', 'prod_var_shift_2',
    'prod_finca_shift_1', 'prod_finca_shift_2',
    'prod_he_var_modo_mean_shift_1', 'prod_he_var_modo_std_shift_1'
]

cat_cols = [
    'id_finca', 'id_zona',
    'id_estacion', 'variedad',
    "modo", "tipo", "color"
]

wine_num_cols = [c for c in wine_cols if c not in cat_cols]

eto_cols = X.columns[X.columns.str.contains("Month")].to_list()
meteo_cols = X.columns[X.columns.str.contains("month")].to_list()

X[cat_cols] = X[cat_cols].astype("category")


In [294]:
# qs = [0, 0.25, 0.5, 0.75, 1]
qs = [0,0.5,1]
# X['superficie_cut'] = pd.qcut(X['superficie'], q=qs, labels=[i for i in range(len(qs)-1)])
X['altitud_cut'] = pd.qcut(X['altitud'], q=qs, labels=[i for i in range(len(qs)-1)])

In [295]:
from sklearn.tree import DecisionTreeRegressor

X[cat_cols] = X[cat_cols].astype("category")

m = make_pipeline(ce.OneHotEncoder(), DecisionTreeRegressor())

dt_cols = [
    # 'campana',
    'id_finca', 
    'variedad', 
    'modo',
    'tipo', 
    'id_zona',
    'altitud_cut',
]

cross_validate(
    estimator=m,
    X=X[dt_cols],
    y=y,
    cv=cv,
    n_jobs=-1,
    scoring=rmse_scorer,
    return_train_score=True,
)


{'fit_time': array([0.53979802, 0.53996372, 0.48484993]),
 'score_time': array([0.02082157, 0.02167225, 0.02169013]),
 'test_score': array([-4675.82574067, -6574.9302979 , -6124.67116162]),
 'train_score': array([-4801.35753963, -4452.30305511, -4590.36778029])}

In [317]:
from sklearn.pipeline import make_union
from sklearn.base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, PowerTransformer


class DecisionTreeTransformer(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):
    # __sklearn_is_fitted__ = True
    
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # self.dt = make_pipeline(ce.OneHotEncoder(), RandomForestRegressor())
        self.dt_ = make_pipeline(ce.OneHotEncoder(), DecisionTreeRegressor())
        self.dt_.fit(X, y)
        self.fitted=True
        return self

    def transform(self, X):
        return self.dt_.predict(X).reshape(-1, 1)


daylight_cols = [c for c in meteo_cols if "LightHours" in c]

# dt_feat = make_column_transformer(
#     (DecisionTreeTransformer(), dt_cols),
#     remainder='drop'
# )

prep = make_column_transformer(
    (ce.CatBoostEncoder(),cat_cols),
    (ce.OrdinalEncoder(), cat_cols),
    (PowerTransformer(), wine_num_cols),
    (make_pipeline(StandardScaler(), PCA(n_components=0.90)), meteo_cols+eto_cols),
    (StandardScaler(), daylight_cols),
    remainder='passthrough'
)

# fu = make_union(
#     dt_feat,
#     prep
# )

m = make_pipeline(
    prep,
    # XGBRegressor(random_state=seed, n_estimators=1000)
    RandomForestRegressor(
        n_estimators=1000,
        max_features='sqrt',
        criterion='poisson',
        max_depth=8,
        bootstrap=True,
        random_state=seed, n_jobs=-1),
    # cb.CatBoostRegressor(random_state=seed, cat_features=cat_cols)
)

cross_validate(
    estimator=m,
    X=X,
    y=y,
    cv=cv,
    n_jobs=-1,
    scoring=rmse_scorer,
    return_train_score=True,

)


{'fit_time': array([3.95543885, 3.95293999, 4.0299859 ]),
 'score_time': array([0.16356397, 0.14796185, 0.1490581 ]),
 'test_score': array([-5110.0087921 , -6750.05262789, -5662.79852898]),
 'train_score': array([-4485.03595892, -4230.5516769 , -4390.354746  ])}

In [329]:
train_idxs, test_idxs = list(cv.split(X, y))[-1]

X_train, X_test, y_train, y_test = X.iloc[train_idxs], X.iloc[test_idxs], y.iloc[train_idxs], y.iloc[test_idxs]


In [330]:
m.fit(X_train, y_train)
y_pred = m.predict(X_test)

In [331]:
rmse_score(y_test, y_pred)
np.sqrt(mean_squared_error(y_test, y_pred))

5662.798528984825

In [332]:
X_test['preds'] = y_preds
X_test['true'] = y_test.copy().values


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['preds'] = y_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['true'] = y_test.copy().values


In [333]:
X_test['err'] = np.abs(X_test['preds'] - X_test['true'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['err'] = np.abs(X_test['preds'] - X_test['true'])


In [334]:
X_test[cat_cols+['preds','true','err']].sort_values('err', ascending=False)


Unnamed: 0,id_finca,id_zona,id_estacion,variedad,modo,tipo,color,preds,true,err
7472,48877,506,16,32,2,0,1,78089.149793,124620.262,46531.112207
7555,37826,964,2,17,2,0,1,64581.250489,101750.000,37168.749511
8303,80627,845,16,9,2,0,0,27322.762248,61420.000,34097.237752
7948,93922,50,19,87,2,0,0,38949.810014,71550.000,32600.189986
8442,14843,845,16,62,2,0,0,21583.870198,51708.184,30124.313802
...,...,...,...,...,...,...,...,...,...,...
7691,74722,86,12,59,1,0,1,3370.576227,3380.000,9.423773
7689,9594,86,15,59,1,0,1,826.235195,819.000,7.235195
7716,42384,441,12,59,1,0,1,854.575895,847.500,7.075895
8086,66871,698,12,68,1,0,0,2695.680228,2698.800,3.119772


In [344]:
# X_test[X_test['id_finca'] == 84804][wine_cols+['preds', 'true', 'err']]
X_test[X_test['id_finca'] == 14843][wine_cols+['preds', 'true', 'err']]

# X_train[X_train['id_finca'] == 48877]
# [wine_cols+['preds', 'true', 'err']]
# 670.800017*9.6 

Unnamed: 0,campana,id_finca,id_zona,id_estacion,altitud,variedad,modo,tipo,color,superficie,sup_tot_camp_finca,superficie_total,n_var_camp_finca,prod_shift_1,prod_shift_2,prod_var_shift_1,prod_var_shift_2,prod_finca_shift_1,prod_finca_shift_2,prod_he_var_modo_mean_shift_1,prod_he_var_modo_std_shift_1,preds,true,err
7929,21,14843,845,16,647.5,87,2,0,0,6.28,18.6908,16.365015,3,86930.0,82940.0,86930.0,82940.0,101646.456,107322.296,7248.183443,3498.689763,68567.91348,66890.0,1677.91348
8399,21,14843,845,16,647.5,43,2,0,0,2.74,18.6908,16.365015,3,7854.056,6145.468,7854.056,6145.468,101646.456,107322.296,5019.885346,2564.797393,9373.196825,16899.064,7525.867175
8442,21,14843,845,16,647.5,62,2,0,0,9.6708,18.6908,16.365015,3,6862.4,18236.828,6862.4,18236.828,101646.456,107322.296,670.800017,54.87151,21583.870198,51708.184,30124.313802
