In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import optuna
import warnings

from category_encoders import TargetEncoder
from xgboost import XGBRegressor
from scipy.stats import randint, uniform
from lightgbm import LGBMRegressor


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, _search_successive_halving, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor 
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cleanData = pd.read_csv('./data csv/kijkcijfersWeer.csv')
cleanData.sample(5)

Unnamed: 0.1,Unnamed: 0,FullDate,date,hour,Kanaal,Programma,Lengte_sec,Kijkers,Temperatuur,Gevoelstemp,Regen,Sneeuw,Weercode,Bewolking,Windsnelheid,Zonnenschijn,isFeestdag,Weekdag,isWeekend,Seizoen
23702,23702,2020-01-06 20:40:42,2020-01-06,20,EEN,TUSSEN OORLOG EN LEVEN - IRAK,3024,753729,4.6,-0.1,0.0,0.0,1,40,19.2,0.0,0,0,0,winter
46531,46533,2023-03-12 18:59:26,2023-03-12,18,VTM,NIEUWS 19U VTM,2747,535050,11.8,7.4,0.0,0.0,3,100,24.5,687.66,0,6,1,winter
9964,9964,2018-02-16 18:59:50,2018-02-16,18,VTM,NIEUWS 19U VTM,3189,606518,6.7,4.3,0.0,0.0,0,0,3.9,3600.0,0,4,0,winter
55775,55778,2024-06-29 19:55:30,2024-06-29,19,VTM,DE 25,1556,256232,20.5,19.1,0.0,0.0,3,100,16.3,0.0,0,5,1,zomer
14025,14025,2018-09-07 20:45:14,2018-09-07,20,EEN,MIDSOMER MURDERS,5326,432373,15.0,12.1,0.0,0.0,2,51,13.8,1510.66,0,4,0,zomer


Feature engineering

In [3]:
def lagFeatures(df, n):
  for i in range(1,n+1):
    df[f'KijkersLag{i}'] = df.sort_values('FullDate').groupby('Programma')['Kijkers'].shift(i).fillna(df.groupby('Programma')['Kijkers'].transform('mean'))
  return df

In [4]:
cleanData = lagFeatures(cleanData, 3)
cleanData[cleanData['Programma'] == 'THUIS'][['FullDate', 'Kijkers', 'KijkersLag1', 'KijkersLag2', 'KijkersLag3']]


Unnamed: 0,FullDate,Kijkers,KijkersLag1,KijkersLag2,KijkersLag3
40,2016-10-03 20:14:23,1268561,1.059928e+06,1.059928e+06,1.059928e+06
60,2016-10-04 20:09:27,1169791,1.268561e+06,1.059928e+06,1.059928e+06
80,2016-10-05 20:13:52,1244502,1.169791e+06,1.268561e+06,1.059928e+06
121,2016-10-07 19:57:34,1156477,1.244502e+06,1.169791e+06,1.268561e+06
180,2016-10-10 19:59:00,1315826,1.156477e+06,1.244502e+06,1.169791e+06
...,...,...,...,...,...
61265,2025-04-04 20:24:13,779244,1.018053e+06,9.942260e+05,1.066599e+06
61325,2025-04-07 20:14:35,869193,7.792440e+05,1.018053e+06,9.942260e+05
61345,2025-04-08 20:13:09,880767,8.691930e+05,7.792440e+05,1.018053e+06
61366,2025-04-09 20:19:53,802132,8.807670e+05,8.691930e+05,7.792440e+05


Kardinaliteiten

In [5]:
#kardinaliteiten voor betere toepassingen
for kolom in cleanData.columns:
  print(f'{kolom}: {cleanData[kolom].nunique()}')

Unnamed: 0: 61405
FullDate: 61318
date: 3051
hour: 21
Kanaal: 29
Programma: 5963
Lengte_sec: 6774
Kijkers: 58860
Temperatuur: 414
Gevoelstemp: 478
Regen: 62
Sneeuw: 25
Weercode: 13
Bewolking: 101
Windsnelheid: 456
Zonnenschijn: 3555
isFeestdag: 2
Weekdag: 7
isWeekend: 2
Seizoen: 4
KijkersLag1: 58998
KijkersLag2: 56714
KijkersLag3: 55143


One Hot Encoding

In [6]:
#one hot encoding voor lage cardinaliteit
oneHotEnc = OneHotEncoder(handle_unknown="ignore")
lageKardinaliteit = cleanData[[ 'hour','Kanaal', 'isFeestdag', 'Weekdag', 'Seizoen']]
oneHot = oneHotEnc.fit_transform(lageKardinaliteit)
dfOneHot = pd.DataFrame(oneHot.toarray(), 
                            columns=oneHotEnc.get_feature_names_out(), 
                            index=lageKardinaliteit.index)

cleanData = cleanData.drop(columns=['hour', 'Kanaal', 'isFeestdag', 'Weekdag', 'Seizoen'])
cleanData = pd.concat([cleanData, dfOneHot], axis=1)

cleanData.sample(5)

Unnamed: 0.1,Unnamed: 0,FullDate,date,Programma,Lengte_sec,Kijkers,Temperatuur,Gevoelstemp,Regen,Sneeuw,...,Weekdag_1,Weekdag_2,Weekdag_3,Weekdag_4,Weekdag_5,Weekdag_6,Seizoen_herfst,Seizoen_lente,Seizoen_winter,Seizoen_zomer
24031,24031,2020-01-22 22:43:34,2020-01-22,HET JOURNAAL LAAT,1018,346843,3.4,1.1,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4468,4468,2017-05-16 20:40:23,2017-05-16,THE BEST OF HOLLAND'S GOT TALENT,3217,348031,21.3,23.1,0.1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
57784,57787,2024-10-09 23:08:57,2024-10-09,HET JOURNAAL LAAT,1349,236375,12.8,12.2,0.5,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
19400,19400,2019-06-03 19:42:39,2019-06-03,IEDEREEN BEROEMD,1271,850878,19.1,16.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
46850,46852,2023-03-28 18:29:14,2023-03-28,BLOKKEN,1634,647971,7.6,3.5,0.3,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
#opslaan bij models
modelsDir = os.getcwd()
modelFile = os.path.join(modelsDir, ".",
                          "models", "oneHotEncoder.pkl")
os.makedirs(os.path.dirname(modelFile), exist_ok=True)
with open(modelFile, 'wb') as m:
  pickle.dump(oneHotEnc, m)

Target Encoding

In [8]:
#target encoding voor medium kardinaliteiten
targetEncoding = TargetEncoder()
medKardinaliteit = cleanData[['date', 'Programma', 'Lengte_sec', 'Temperatuur', 'Gevoelstemp', 'Regen', 'Bewolking', 'Windsnelheid', 'Zonnenschijn']]
#verdere feature engineering op vorig model
target = targetEncoding.fit_transform(medKardinaliteit, cleanData['Kijkers'])
target.sample(10)

Unnamed: 0,date,Programma,Lengte_sec,Temperatuur,Gevoelstemp,Regen,Bewolking,Windsnelheid,Zonnenschijn
33550,431059.54896,344892.4,920,17.6,15.8,0.0,99,9.2,384.34
56679,384833.14896,657807.9,2082,27.2,29.3,0.0,100,5.8,3600.0
12,402732.14896,289674.2,994,12.1,9.7,0.0,20,15.3,0.0
52794,470522.04896,312908.5,7013,3.3,-3.3,0.0,100,27.5,2636.21
38927,505890.62396,337885.9,2911,7.7,3.4,0.0,39,18.5,0.0
10593,512370.82396,403932.9,3091,-0.3,-5.8,0.0,0,16.9,0.0
31316,489572.52396,409602.9,7665,9.5,6.7,0.0,100,13.8,3600.0
36118,464521.57396,1059928.0,1465,13.8,11.8,0.1,54,14.1,0.0
51656,447444.32396,327674.3,2833,5.4,-0.3,0.3,72,27.2,0.0
61377,417431.14896,289674.2,1069,6.8,3.1,0.0,30,14.7,0.0


In [9]:
cleanData = cleanData.drop(columns=['date', 'Programma', 'Lengte_sec', 'Temperatuur', 'Gevoelstemp', 'Regen', 'Bewolking', 'Windsnelheid', 'Zonnenschijn'])
cleanData = pd.concat([cleanData, target], axis=1)
cleanData.sample(5)

Unnamed: 0.1,Unnamed: 0,FullDate,Kijkers,Sneeuw,Weercode,isWeekend,KijkersLag1,KijkersLag2,KijkersLag3,hour_0,...,Seizoen_zomer,date,Programma,Lengte_sec,Temperatuur,Gevoelstemp,Regen,Bewolking,Windsnelheid,Zonnenschijn
6101,6101,2017-08-06 19:58:11,466963,0.0,1,1,463296.0,689373.0,549823.0,0.0,...,1.0,373473.59896,490284.6,1687,19.4,18.5,0.0,42,9.2,3600.0
27129,27129,2020-06-25 12:59:54,234916,0.0,0,0,182959.0,258276.0,258862.0,0.0,...,1.0,391060.24896,242060.7,1702,27.2,26.9,0.0,0,14.8,3600.0
8694,8694,2017-12-13 16:39:50,262427,0.0,53,0,246388.0,275351.0,206962.0,0.0,...,0.0,500474.54896,204258.0,2830,6.6,2.8,0.5,100,18.0,0.0
21011,21011,2019-08-24 16:41:56,176216,0.0,2,1,225934.0,229488.0,272438.0,0.0,...,1.0,333531.02396,311475.6,3234,29.4,29.4,0.0,54,7.4,3600.0
28798,28798,2020-10-02 20:43:06,1297212,0.0,61,0,1284929.0,1003222.0,1079362.0,0.0,...,0.0,473681.37396,1056828.0,5508,13.7,12.3,1.3,100,15.0,0.0


Verbanden

In [10]:
dataOneHotTarget = cleanData.select_dtypes(include=[np.number])

verband = dataOneHotTarget.corr()

verband = verband['Kijkers'].abs().sort_values(ascending=False)

print(f'{verband.head(10)}')
dataOneHotTarget.shape

Kijkers        1.000000
KijkersLag1    0.894582
KijkersLag2    0.893195
KijkersLag3    0.875979
Programma      0.827215
hour_19        0.386184
Kanaal_EEN     0.333420
date           0.333361
hour_22        0.184308
hour_17        0.176407
Name: Kijkers, dtype: float64


(61405, 80)

In [11]:
# Eerst omzetten naar strings, daarna komma's verwijderen, en uiteindelijk naar floats
for col in ['KijkersLag1', 'KijkersLag2', 'KijkersLag3', 'Programma']:
    cleanData[col] = cleanData[col].astype(str).str.replace(',', '').astype(float)

# Print het resultaat
print(cleanData[['Kijkers', 'KijkersLag1', 'KijkersLag2', 'KijkersLag3', 'Programma']].head(10))


   Kijkers    KijkersLag1    KijkersLag2    KijkersLag3      Programma
0   721850  892001.072096  892001.072096  892001.072096  892001.072096
1   709606  628487.089971  628487.089971  628487.089971  628487.089971
2   548239  478421.179487  478421.179487  478421.179487  474186.638952
3   523610  791862.682708  791862.682708  791862.682708  791862.682708
4   496216  491224.740741  491224.740741  491224.740741  491217.906057
5   447427  351454.000000  351454.000000  351454.000000  431290.191170
6   424041  585853.225914  585853.225914  585853.225914  585853.225914
7   369066  408691.165611  408691.165611  408691.165611  408691.165611
8   368549  336230.500000  336230.500000  336230.500000  424185.509591
9   360544  332870.588235  332870.588235  332870.588235  332901.616764


In [12]:
#opslaan bij models
targetModelDir = os.getcwd()
targetModelPath = os.path.join(targetModelDir, ".",
                          "models", "oneHotEncoder.pkl")
os.makedirs(os.path.dirname(targetModelPath), exist_ok=True)
with open(targetModelPath, 'wb') as m:
  pickle.dump(oneHotEnc, m)

Model tests

In [13]:
def test_models(models, X, y, cv=10, scoring='neg_mean_absolute_error'):
    res = {}
    for m in models:
        model = type(m).__name__
        scores = cross_val_score(m, X, y, cv=cv, scoring=scoring)
        res[model] = {
            'scores': scores,
            'mean_score': scores.mean()
        }
        print(f"Scores {model}\n{scores}\nMAE = {scores.mean()}")
    return res

Lasso Regression

In [15]:
models = [
    RandomForestRegressor(verbose=True), Lasso()
]

# Zorg ervoor dat X en y goed geschaald zijn
scaler = StandardScaler()
X = dataOneHotTarget.drop(columns=['Kijkers', 'Unnamed: 0'])
X_scaled = scaler.fit_transform(X)
y = dataOneHotTarget['Kijkers']

# Run de test
results = test_models(models, X_scaled, y)

KeyboardInterrupt: 

RandomForest heeft betere scores:
  
  RandomForest
  MAE = -52082.47210795064

  Lasso
  MAE = -64622.292775108515

Naar csv schrijven

In [16]:
dir = os.getcwd()
pathFile = os.path.join(dir, "./data csv/oneHotTarget.csv")

os.makedirs(os.path.dirname(pathFile), exist_ok=True)

dataOneHotTarget.to_csv(pathFile)


keuze maken voor model

In [16]:
#trainset en testset maken
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

def scoreModel(model,X,y, cv=5):
    print(f"Model: {model.__class__.__name__}")
    start = time.time()
        
    # MAE
    mae_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
    mae = -np.mean(mae_scores)

    # MAPE
    mape_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_percentage_error')
    mape = -np.mean(mape_scores) * 100

    duration = time.time() - start
    min, sec = divmod(duration, 60)

    print(f"MAE:  {mae:.4f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"Tijd: {int(min)}m {sec:.1f}s")

    return {'Model': model.__class__.__name__, 'MAE': mae, 'MAPE': mape, 'Tijd': duration}
def startModel(models, X, y, cv=5):
    results = []
    for model in models:
        result = scoreModel(model, X, y, cv=cv)
        results.append(result)
    return results

In [17]:
linear = LinearRegression()
decision = DecisionTreeRegressor()
randomForest = RandomForestRegressor()
gradientBoosting = GradientBoostingRegressor()
xgb = XGBRegressor()
extraTrees = ExtraTreesRegressor()
histGB = HistGradientBoostingRegressor()

startModel([linear, decision, randomForest, gradientBoosting, xgb, extraTrees, histGB], X, y)


Model: LinearRegression
MAE:  65143.6876
MAPE: 20.22%
Tijd: 0m 1.5s
Model: DecisionTreeRegressor


KeyboardInterrupt: 

# Best models:

## Model: ExtraTreesRegressor

MAE:  51912.1192

MAPE: 15.50%

Tijd: 11m 29.4s

## Model: HistGradientBoostingRegressor

MAE:  52753.3521

MAPE: 15.64%

Tijd: 0m 11.3s

## Model: XGBRegressor

MAE:  53561.6539

MAPE: 15.62%

Tijd: 0m 4.2s

## RandomForest er niet bij want 18m+ traintijd voor slechts 200 MAE verschil met XGB

LGBM zou ook een goede optie moetten zijn


ExtraTrees: Beste nauwkeurigheid maar lange training tijd

HistGradientBoosting en XGB: Veel sneller met minimale performance drop

Verbeteren van de top 3 modellen

In [18]:
warnings.filterwarnings("ignore")

#parameters
model_param = {
    'XGBoost': {
        'model': XGBRegressor(),
        'params': {
            'n_estimators': randint(100, 1000),
            'learning_rate': uniform(0.01, 0.3),
            'max_depth': randint(3, 15),
            'min_child_weight': randint(1, 10),
            'subsample': uniform(0.6, 0.4),
            'colsample_bytree': uniform(0.6, 0.4),
            'gamma': uniform(0, 0.5),
            'reg_lambda': uniform(0.1, 10),
            'reg_alpha': uniform(0, 10)
        }
    },
    'HistGradient': {
        'model': HistGradientBoostingRegressor(),
        'params': {
            'learning_rate': uniform(0.01, 0.3),
            'max_iter': randint(100, 1000),
            'max_depth': randint(3, 15),
            'l2_regularization': uniform(0.1, 10),
            'min_samples_leaf': randint(1, 10),
            'max_leaf_nodes': randint(20, 100)
        }
    },
    "LightGBM": {
        "model": LGBMRegressor(),
        "params": {
            "num_leaves": [31, 50, 70],
            "learning_rate": [0.01, 0.05, 0.1, 0.15],
            "n_estimators": [100, 200, 500],
            "max_depth": [-1, 5, 10, 20],
            "min_child_samples": [10, 20, 30],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
            "reg_alpha": [0, 0.1, 0.5],
            "reg_lambda": [0, 0.1, 0.5]
        }
    }
}

extra = ExtraTreesRegressor()
hist = HistGradientBoostingRegressor()
xgb = XGBRegressor()
lgbm = LGBMRegressor()


Random Search

In [19]:
for name, config in model_param.items():
    print(f"Model: {name}")
    model = config['model']
    param = config['params']
    
    rs = RandomizedSearchCV(
        estimator=model,
        param_distributions=param,
        n_iter=50,
        cv=2,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=2
    )
    
    start = time.time()
    rs.fit(X_train, y_train)
    duration = time.time() - start

    print(f"Tijd {name}: {duration:.2f}s")
    print(f"Beste param {name}: {rs.best_params_}")
    print(f"Beste MAE: {rs.best_score_:.4f}")

Model: XGBoost
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Tijd XGBoost: 196.60s
Beste param XGBoost: {'colsample_bytree': np.float64(0.9679342816678482), 'gamma': np.float64(0.28670341586778214), 'learning_rate': np.float64(0.012933802843496814), 'max_depth': 10, 'min_child_weight': 5, 'n_estimators': 681, 'reg_alpha': np.float64(4.851770514322081), 'reg_lambda': np.float64(2.64060184325017), 'subsample': np.float64(0.6209956539590782)}
Beste MAE: -48749.8867
Model: HistGradient
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Tijd HistGradient: 50.11s
Beste param HistGradient: {'l2_regularization': np.float64(1.4085589288330669), 'learning_rate': np.float64(0.07790634302676941), 'max_depth': 10, 'max_iter': 244, 'max_leaf_nodes': 89, 'min_samples_leaf': 1}
Beste MAE: -49538.1651
Model: LightGBM
Fitting 2 folds for each of 50 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001965 seco

In [20]:
best_params = rs.best_params_
tuned_model = rs.best_estimator_

y_pred = tuned_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 1 - mape

print(f'Mean absolute error: {mae}')
print(f'Mean absolute percentage error: {mape}')
print(f'Accuracy: {accuracy}')

Mean absolute error: 47585.56428634255
Mean absolute percentage error: 0.14202693123140966
Accuracy: 0.8579730687685904


In [None]:

rs = RandomizedSearchCV(
  estimator=extra,
  param_distributions=model_param['ExtraTrees']['params'],
  n_iter=150,
  cv=5,
  scoring='neg_mean_absolute_error',
  n_jobs=-1,
  verbose=2
)
    
start = time.time()
rs.fit(X_train, y_train)
duration = time.time() - start

print(f"Tijd: {duration:.2f}s")
print(f"Beste param: {rs.best_params_}")
print(f"Beste MAE: {rs.best_score_:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Tijd: 2857.67s
Beste param: {'max_depth': 9, 'max_features': np.float64(0.9124558702471834), 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 742}
Beste MAE: -54224.2000


### XGBoost
Tijd XGBoost: 1334.04s
Beste param XGBoost: {'colsample_bytree': np.float64(0.6550054872406584), 'gamma': np.float64(0.08367508771505777), 'learning_rate': np.float64(0.04567558428183022), 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 609, 'reg_alpha': np.float64(6.924415940372761), 'reg_lambda': np.float64(0.2620475254770348), 'subsample': np.float64(0.970829180420445)}
Beste MAE: -46309.9074
### HistGradient
Tijd HistGradient: 1145.61s
Beste param HistGradient: {'l2_regularization': np.float64(6.796320445820958), 'learning_rate': np.float64(0.02650163315092701), 'max_depth': 9, 'max_iter': 770, 'max_leaf_nodes': 96, 'min_samples_leaf': 4}
Beste MAE: -47550.1241
### ExtraTree
Tijd: 2857.67s
Beste param: {'max_depth': 9, 'max_features': np.float64(0.9124558702471834), 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 742}
Beste MAE: -54224.2000
### LightGBM
Tijd LightGBM: 448.79s
Beste param LightGBM: {'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0.5, 'num_leaves': 50, 'n_estimators': 500, 'min_child_samples': 10, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Beste MAE: -47344.7920

In [28]:
def run_optuna(model_name):
    def objective(trial):
        # if model_name == 'ExtraTrees':
        #     model = ExtraTreesRegressor(
        #         n_estimators=trial.suggest_int('n_estimators', 100, 500),
        #         max_depth=trial.suggest_int('max_depth', 5, 30),
        #         min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        #         min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        #         random_state=42
        #     )
        if model_name == 'HistGradient':
            model = HistGradientBoostingRegressor(
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
                max_iter=trial.suggest_int('max_iter', 200, 500),
                max_depth=trial.suggest_int('max_depth', 3, 15),
                max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 20, 100),
                random_state=42
            )
        elif model_name == 'XGBoost':
            model = XGBRegressor(
                n_estimators=trial.suggest_int('n_estimators', 200, 500),
                max_depth=trial.suggest_int('max_depth', 3, 10),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                subsample=trial.suggest_float('subsample', 0.5, 1.0),
                colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
                reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
                reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
                min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
                gamma=trial.suggest_float('gamma', 0, 5),
                random_state=42,
                n_jobs=-1
        )
        elif model_name == 'LightGBM':
            model = LGBMRegressor(
                n_estimators=trial.suggest_int('n_estimators', 200, 500),
                max_depth=trial.suggest_int('max_depth', -1, 15),
                num_leaves=trial.suggest_int('num_leaves', 20, 150),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                subsample=trial.suggest_float('subsample', 0.6, 1.0),
                colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
                reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
                reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
                min_child_samples=trial.suggest_int('min_child_samples', 5, 30),
                random_state=42,
                n_jobs=-1,
                verbosity=-1
            )
        model.fit(X_train, y_train)
        trial.set_user_attr("model", model)
        pred = model.predict(X_test)
        return mean_absolute_error(y_test, pred)
    return objective

In [29]:
for name, config in model_param.items():
  print(f"Optuna {name}")
  optres = optuna.create_study(direction="minimize")
  optres.optimize(run_optuna(name), n_trials=10)

  start = time.time()
  duration = time.time() - start

  print(f"Tijd: {duration:.2f}s")
  print(f"Beste params {name}: ", optres.best_params)
  print(f"Beste MAE {name}: ", optres.best_value)

[I 2025-04-14 23:43:12,752] A new study created in memory with name: no-name-431b0e87-c5dd-43b6-9a23-b79971cc7f42


Optuna XGBoost


[I 2025-04-14 23:43:15,497] Trial 0 finished with value: 47579.46484375 and parameters: {'n_estimators': 409, 'max_depth': 9, 'learning_rate': 0.11664255883867901, 'subsample': 0.8175933842132205, 'colsample_bytree': 0.5285758150351358, 'reg_alpha': 0.5005834852883168, 'reg_lambda': 0.9499033641556756, 'min_child_weight': 2, 'gamma': 3.089297811979732}. Best is trial 0 with value: 47579.46484375.
[I 2025-04-14 23:43:16,672] Trial 1 finished with value: 53273.6171875 and parameters: {'n_estimators': 285, 'max_depth': 4, 'learning_rate': 0.04253887066281115, 'subsample': 0.5859350105049617, 'colsample_bytree': 0.6132371014425155, 'reg_alpha': 0.6582560390011314, 'reg_lambda': 0.4549161418204557, 'min_child_weight': 9, 'gamma': 3.7397468130691545}. Best is trial 0 with value: 47579.46484375.
[I 2025-04-14 23:43:18,444] Trial 2 finished with value: 52882.33203125 and parameters: {'n_estimators': 397, 'max_depth': 6, 'learning_rate': 0.010707858208883113, 'subsample': 0.8193044349140403, 'c

Tijd: 0.00s
Beste params XGBoost:  {'n_estimators': 409, 'max_depth': 9, 'learning_rate': 0.11664255883867901, 'subsample': 0.8175933842132205, 'colsample_bytree': 0.5285758150351358, 'reg_alpha': 0.5005834852883168, 'reg_lambda': 0.9499033641556756, 'min_child_weight': 2, 'gamma': 3.089297811979732}
Beste MAE XGBoost:  47579.46484375
Optuna HistGradient


[I 2025-04-14 23:43:33,576] Trial 0 finished with value: 55942.28421866004 and parameters: {'learning_rate': 0.02525500427028235, 'max_iter': 363, 'max_depth': 3, 'max_leaf_nodes': 33}. Best is trial 0 with value: 55942.28421866004.
[I 2025-04-14 23:43:35,429] Trial 1 finished with value: 48353.94119463584 and parameters: {'learning_rate': 0.15197471832302206, 'max_iter': 396, 'max_depth': 14, 'max_leaf_nodes': 82}. Best is trial 1 with value: 48353.94119463584.
[I 2025-04-14 23:43:37,150] Trial 2 finished with value: 48663.99227148554 and parameters: {'learning_rate': 0.16714849466944837, 'max_iter': 459, 'max_depth': 14, 'max_leaf_nodes': 61}. Best is trial 1 with value: 48353.94119463584.
[I 2025-04-14 23:43:40,815] Trial 3 finished with value: 48863.81330174823 and parameters: {'learning_rate': 0.06876171594166239, 'max_iter': 493, 'max_depth': 6, 'max_leaf_nodes': 62}. Best is trial 1 with value: 48353.94119463584.
[I 2025-04-14 23:43:43,057] Trial 4 finished with value: 49436.792

Tijd: 0.00s
Beste params HistGradient:  {'learning_rate': 0.15197471832302206, 'max_iter': 396, 'max_depth': 14, 'max_leaf_nodes': 82}
Beste MAE HistGradient:  48353.94119463584
Optuna LightGBM


[I 2025-04-14 23:43:55,084] Trial 0 finished with value: 49603.006053967045 and parameters: {'n_estimators': 281, 'max_depth': 14, 'num_leaves': 72, 'learning_rate': 0.021457005502921508, 'subsample': 0.7549976866279142, 'colsample_bytree': 0.9000234948602006, 'reg_alpha': 0.43563686836622484, 'reg_lambda': 0.4146292665019804, 'min_child_samples': 27}. Best is trial 0 with value: 49603.006053967045.
[I 2025-04-14 23:43:57,811] Trial 1 finished with value: 48533.03542975565 and parameters: {'n_estimators': 357, 'max_depth': 10, 'num_leaves': 115, 'learning_rate': 0.019039726694784646, 'subsample': 0.9680127015378496, 'colsample_bytree': 0.6950237505153987, 'reg_alpha': 0.23910495925920883, 'reg_lambda': 0.3483843958836289, 'min_child_samples': 26}. Best is trial 1 with value: 48533.03542975565.
[I 2025-04-14 23:44:01,744] Trial 2 finished with value: 49032.655332216105 and parameters: {'n_estimators': 460, 'max_depth': 9, 'num_leaves': 145, 'learning_rate': 0.013157899131192636, 'subsam

Tijd: 0.00s
Beste params LightGBM:  {'n_estimators': 315, 'max_depth': 13, 'num_leaves': 135, 'learning_rate': 0.08388660215788596, 'subsample': 0.808052384666472, 'colsample_bytree': 0.7456553808547765, 'reg_alpha': 0.4807580824213282, 'reg_lambda': 0.41499289834746445, 'min_child_samples': 12}
Beste MAE LightGBM:  46606.30032491692


extratrees word verder genegeerd door de zeer lange trainingtijden en slechte finetuning resultaten, hierdoor heb ik dit vervamgen met LGBM doordat dit gelijkaardig zou moetten testen als XGB op mijn model

Optuna XGBoost

Tijd: 0.00s
Beste params XGBoost:  {'n_estimators': 463, 'max_depth': 10, 'learning_rate': 0.038646828538743884, 'subsample': 0.8457360904312544, 'colsample_bytree': 0.7006988296992306, 'reg_alpha': 0.4758175096059658, 'reg_lambda': 0.6127142309646054, 'min_child_weight': 5, 'gamma': 3.8183007954851766}
Beste MAE XGBoost:  46403.6054687

Optuna HistGradient

Tijd: 0.00s
Beste params HistGradient:  {'learning_rate': 0.09360779076213302, 'max_iter': 403, 'max_depth': 12, 'max_leaf_nodes': 83}
Beste MAE HistGradient:  47610.78728722957

Optuna ExtraTrees

Tijd: 0.00s
Beste params ExtraTrees:  {'n_estimators': 351, 'max_depth': 27, 'min_samples_split': 8, 'min_samples_leaf': 1}
Beste MAE ExtraTrees:  48229.70593565771

Optune LightGBM

Tijd: 0.00s
Beste params LightGBM:  {'n_estimators': 402, 'max_depth': 15, 'num_leaves': 147, 'learning_rate': 0.05432181624846083, 'subsample': 0.996840954866944, 'colsample_bytree': 0.6530974845266458, 'reg_alpha': 0.529415698140803, 'reg_lambda': 0.271552924298581, 'min_child_samples': 21}
Beste MAE LightGBM:  46530.63689665324

In [25]:
print(optres.best_trial.user_attrs)


{}


In [30]:
best_params = optres.best_params
tuned_model = optres.best_trial.user_attrs['model']

y_pred = tuned_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 1 - mape

print(f'Mean absolute error: {mae}')
print(f'Mean absolute percentage error: {mape}')
print(f'Accuracy: {accuracy}')

Mean absolute error: 46606.30032491692
Mean absolute percentage error: 0.13865293366566056
Accuracy: 0.8613470663343394


GridsearchCV

In [42]:
warnings.filterwarnings("ignore")

#parameters
model_param = {
    'XGBoost': {
        'model': XGBRegressor(),
        'params': {
            'n_estimators': [850],
            'learning_rate': [0.025, 0.03],
            'max_depth': [8, 9],
            'min_child_weight': [1,2],
            'subsample': [1.0],
            'colsample_bytree': [0.8],
            'gamma': [0.035, 0.04],
            'reg_lambda': [3],
            'reg_alpha': range(0, 5, 1)
        }
    },
    'HistGradient': {
        'model': HistGradientBoostingRegressor(),
        'params': {
            'n_estimators': [850],
            'learning_rate': [0.01, 0.1],
            'max_iter': [100, 200],
            'max_depth': [3, 5],
            'l2_regularization': [0.1, 1],
            'min_samples_leaf': [1, 2],
            'max_leaf_nodes': [20, 40]
        }
    },
    'LightGBM': {
        'model': LGBMRegressor(),
        'params': {
            'n_estimators': [850],
            'num_leaves': [31, 50],
            'learning_rate': [0.01, 0.1],
            'n_estimators': [100, 200],
            'max_depth': [-1, 5],
            'min_child_samples': [10, 20],
            'subsample': [0.6, 0.8],
            'colsample_bytree': [0.6, 0.8],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [0, 0.1]
        }
    }
}

# ExtraTrees model is not adjusted in the grid for now, as the focus is on the three models above
extra = ExtraTreesRegressor()
hist = HistGradientBoostingRegressor()
xgb = XGBRegressor()
lgbm = LGBMRegressor()


In [43]:
for name, config in model_param.items():
    print(f"Model: {name}")
    model = config['model']
    param = config['params']
    
    rs = GridSearchCV(
        estimator=model,
        param_grid=param,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=2
    )
    
    start = time.time()
    rs.fit(X_train, y_train)
    duration = time.time() - start

    print(f"Tijd {name}: {duration:.2f}s")
    print(f"Beste param {name}: {rs.best_params_}")
    print(f"Beste MAE: {rs.best_score_:.4f}")

Model: XGBoost
Fitting 5 folds for each of 80 candidates, totalling 400 fits


KeyboardInterrupt: 

Resultaten tuned model

In [None]:
best_params = rs.best_params_
tuned_model = rs.best_estimator_

y_pred = tuned_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 1 - mape

print(f'Mean absolute error: {mae}')
print(f'Mean absolute percentage error: {mape}')
print(f'Accuracy: {accuracy}')