In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import optuna
import warnings

from category_encoders import TargetEncoder
from xgboost import XGBRegressor
from scipy.stats import randint, uniform
from lightgbm import LGBMRegressor


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, _search_successive_halving, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor 
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cleanData = pd.read_csv('./data csv/kijkcijfersWeer.csv')
cleanData.sample(5)

Unnamed: 0.1,Unnamed: 0,FullDate,date,hour,Kanaal,Programma,Lengte_sec,Kijkers,Temperatuur,Gevoelstemp,Regen,Sneeuw,Weercode,Bewolking,Windsnelheid,Zonnenschijn,isFeestdag,Weekdag,isWeekend,Seizoen
29934,29934,2020-11-27 20:40:51,2020-11-27,20,Canvas,DE AFSPRAAK,2741,228862,6.8,5.0,0.0,0.0,1,47,6.4,0.0,0,4,0,herfst
486,486,2016-10-25 18:31:02,2016-10-25,18,EEN,BLOKKEN,1631,712801,12.4,11.0,0.0,0.0,3,100,6.3,0.0,0,1,0,herfst
26571,26571,2020-05-28 23:07:58,2020-05-28,23,EEN,HET JOURNAAL LAAT,1103,254602,15.0,11.3,0.0,0.0,0,3,18.7,0.0,0,3,0,lente
32710,32710,2021-04-16 22:12:26,2021-04-16,22,EEN,HET JOURNAAL LAAT,1019,250833,6.6,3.1,0.0,0.0,2,55,11.9,0.0,0,4,0,lente
46810,46812,2023-03-26 19:22:53,2023-03-26,19,EEN,SPORTWEEKEND,2074,838746,7.2,2.5,0.0,0.0,3,100,22.5,0.0,0,6,1,lente


Feature engineering

In [3]:
def lagFeatures(df, n):
  for i in range(1,n+1):
    df[f'KijkersLag{i}'] = df.sort_values('FullDate').groupby('Programma')['Kijkers'].shift(i).fillna(df.groupby('Programma')['Kijkers'].transform('mean'))
  return df

In [4]:
cleanData = lagFeatures(cleanData, 3)
cleanData[cleanData['Programma'] == 'THUIS'][['FullDate', 'Kijkers', 'KijkersLag1', 'KijkersLag2', 'KijkersLag3']]


Unnamed: 0,FullDate,Kijkers,KijkersLag1,KijkersLag2,KijkersLag3
40,2016-10-03 20:14:23,1268561,1.059928e+06,1.059928e+06,1.059928e+06
60,2016-10-04 20:09:27,1169791,1.268561e+06,1.059928e+06,1.059928e+06
80,2016-10-05 20:13:52,1244502,1.169791e+06,1.268561e+06,1.059928e+06
121,2016-10-07 19:57:34,1156477,1.244502e+06,1.169791e+06,1.268561e+06
180,2016-10-10 19:59:00,1315826,1.156477e+06,1.244502e+06,1.169791e+06
...,...,...,...,...,...
61265,2025-04-04 20:24:13,779244,1.018053e+06,9.942260e+05,1.066599e+06
61325,2025-04-07 20:14:35,869193,7.792440e+05,1.018053e+06,9.942260e+05
61345,2025-04-08 20:13:09,880767,8.691930e+05,7.792440e+05,1.018053e+06
61366,2025-04-09 20:19:53,802132,8.807670e+05,8.691930e+05,7.792440e+05


Kardinaliteiten

In [5]:
#kardinaliteiten voor betere toepassingen
for kolom in cleanData.columns:
  print(f'{kolom}: {cleanData[kolom].nunique()}')

Unnamed: 0: 61405
FullDate: 61318
date: 3051
hour: 21
Kanaal: 29
Programma: 5963
Lengte_sec: 6774
Kijkers: 58860
Temperatuur: 414
Gevoelstemp: 478
Regen: 62
Sneeuw: 25
Weercode: 13
Bewolking: 101
Windsnelheid: 456
Zonnenschijn: 3555
isFeestdag: 2
Weekdag: 7
isWeekend: 2
Seizoen: 4
KijkersLag1: 58998
KijkersLag2: 56714
KijkersLag3: 55143


One Hot Encoding

In [6]:
#one hot encoding voor lage cardinaliteit
oneHotEnc = OneHotEncoder(handle_unknown="ignore")
lageKardinaliteit = cleanData[[ 'hour','Kanaal', 'isFeestdag', 'Weekdag', 'Seizoen']]
oneHot = oneHotEnc.fit_transform(lageKardinaliteit)
dfOneHot = pd.DataFrame(oneHot.toarray(), 
                            columns=oneHotEnc.get_feature_names_out(), 
                            index=lageKardinaliteit.index)

cleanData = cleanData.drop(columns=['hour', 'Kanaal', 'isFeestdag', 'Weekdag', 'Seizoen'])
cleanData = pd.concat([cleanData, dfOneHot], axis=1)

cleanData.sample(5)

Unnamed: 0.1,Unnamed: 0,FullDate,date,Programma,Lengte_sec,Kijkers,Temperatuur,Gevoelstemp,Regen,Sneeuw,...,Weekdag_1,Weekdag_2,Weekdag_3,Weekdag_4,Weekdag_5,Weekdag_6,Seizoen_herfst,Seizoen_lente,Seizoen_winter,Seizoen_zomer
49668,49670,2023-08-17 19:48:08,2023-08-17,FC DE KAMPIOENEN,1952,751420,21.0,21.6,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
40912,40913,2022-05-26 15:35:04,2022-05-26,WIELRENNEN. CIRCUIT DE WALLONIE,9116,143059,19.6,16.6,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
36312,36312,2021-10-13 17:04:07,2021-10-13,WITSE,3356,287288,12.4,10.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9217,9217,2018-01-09 20:37:04,2018-01-09,PREMIUM RUSH,4845,182507,5.5,2.3,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
31882,31882,2021-03-06 19:58:28,2021-03-06,DE 25,1537,576908,4.4,0.9,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [7]:
#opslaan bij models
modelsDir = os.getcwd()
modelFile = os.path.join(modelsDir, ".",
                          "models", "oneHotEncoder.pkl")
os.makedirs(os.path.dirname(modelFile), exist_ok=True)
with open(modelFile, 'wb') as m:
  pickle.dump(oneHotEnc, m)

Target Encoding

In [8]:
#target encoding voor medium kardinaliteiten
targetEncoding = TargetEncoder()
medKardinaliteit = cleanData[['date', 'Programma', 'Lengte_sec', 'Temperatuur', 'Gevoelstemp', 'Regen', 'Bewolking', 'Windsnelheid', 'Zonnenschijn']]
#verdere feature engineering op vorig model
target = targetEncoding.fit_transform(medKardinaliteit, cleanData['Kijkers'])
target.sample(10)

Unnamed: 0,date,Programma,Lengte_sec,Temperatuur,Gevoelstemp,Regen,Bewolking,Windsnelheid,Zonnenschijn
60774,470501.34896,436902.358032,3407,11.7,10.0,0.0,49,8.5,0.0
60001,491305.24896,318608.152185,3320,7.0,3.7,0.0,23,12.2,3600.0
26043,458695.24896,590186.857005,1927,12.4,10.0,0.0,23,8.4,3600.0
9026,437754.331992,437257.74033,5924,9.1,4.2,0.6,100,28.9,0.0
50249,414494.14896,791862.682708,1389,22.2,22.2,0.0,100,7.7,2390.33
18907,484290.89896,454069.358761,2828,12.9,9.1,0.0,45,21.6,3600.0
20489,414203.02396,458660.405011,2742,23.3,23.9,0.1,100,12.8,2382.52
60895,435236.92396,428249.989673,1384,7.1,3.4,0.0,0,12.4,3600.0
32665,493583.64896,460775.345328,1391,4.4,0.0,0.0,37,14.5,0.0
21884,492844.92396,638643.551574,1636,14.0,12.4,0.0,95,11.4,0.0


In [9]:
cleanData = cleanData.drop(columns=['date', 'Programma', 'Lengte_sec', 'Temperatuur', 'Gevoelstemp', 'Regen', 'Bewolking', 'Windsnelheid', 'Zonnenschijn'])
cleanData = pd.concat([cleanData, target], axis=1)
cleanData.sample(5)

Unnamed: 0.1,Unnamed: 0,FullDate,Kijkers,Sneeuw,Weercode,isWeekend,KijkersLag1,KijkersLag2,KijkersLag3,hour_0,...,Seizoen_zomer,date,Programma,Lengte_sec,Temperatuur,Gevoelstemp,Regen,Bewolking,Windsnelheid,Zonnenschijn
53973,53976,2024-03-20 21:57:10,546977,0.0,1,0,544150.0,560115.0,565329.0,0.0,...,0.0,459076.62396,589142.350983,2764,10.5,8.9,0.0,46,8.8,0.0
51668,51670,2023-11-25 18:59:51,540019,0.0,2,1,620044.0,667906.0,707399.0,0.0,...,0.0,395335.52396,585853.225914,2640,6.3,2.4,0.0,64,16.6,0.0
21866,21866,2019-10-06 19:56:39,424988,0.0,53,1,390534.0,425063.0,460595.0,0.0,...,0.0,451825.47396,432173.471026,904,12.3,8.8,0.5,57,25.1,558.57
55082,55085,2024-05-18 21:15:03,126045,0.0,3,1,156787.0,161612.0,172686.0,0.0,...,0.0,368234.69896,382077.066817,3174,14.4,14.2,0.0,80,6.6,3410.46
57992,57995,2024-10-20 13:00:04,479335,0.0,3,1,317435.0,401978.0,364328.0,0.0,...,0.0,436244.42396,408691.165611,1772,16.4,12.6,0.0,100,29.7,0.0


Verbanden

In [10]:
dataOneHotTarget = cleanData.select_dtypes(include=[np.number])

verband = dataOneHotTarget.corr()

verband = verband['Kijkers'].abs().sort_values(ascending=False)

print(f'{verband.head(10)}')
dataOneHotTarget.shape

Kijkers        1.000000
KijkersLag1    0.894582
KijkersLag2    0.893195
KijkersLag3    0.875979
Programma      0.827215
hour_19        0.386184
Kanaal_EEN     0.333420
date           0.333361
hour_22        0.184308
hour_17        0.176407
Name: Kijkers, dtype: float64


(61405, 80)

In [11]:
# Eerst omzetten naar strings, daarna komma's verwijderen, en uiteindelijk naar floats
for col in ['KijkersLag1', 'KijkersLag2', 'KijkersLag3', 'Programma']:
    cleanData[col] = cleanData[col].astype(str).str.replace(',', '').astype(float)

# Print het resultaat
print(cleanData[['Kijkers', 'KijkersLag1', 'KijkersLag2', 'KijkersLag3', 'Programma']].head(10))


   Kijkers    KijkersLag1    KijkersLag2    KijkersLag3      Programma
0   721850  892001.072096  892001.072096  892001.072096  892001.072096
1   709606  628487.089971  628487.089971  628487.089971  628487.089971
2   548239  478421.179487  478421.179487  478421.179487  474186.638952
3   523610  791862.682708  791862.682708  791862.682708  791862.682708
4   496216  491224.740741  491224.740741  491224.740741  491217.906057
5   447427  351454.000000  351454.000000  351454.000000  431290.191170
6   424041  585853.225914  585853.225914  585853.225914  585853.225914
7   369066  408691.165611  408691.165611  408691.165611  408691.165611
8   368549  336230.500000  336230.500000  336230.500000  424185.509591
9   360544  332870.588235  332870.588235  332870.588235  332901.616764


In [12]:
#opslaan bij models
targetModelDir = os.getcwd()
targetModelPath = os.path.join(targetModelDir, ".",
                          "models", "oneHotEncoder.pkl")
os.makedirs(os.path.dirname(targetModelPath), exist_ok=True)
with open(targetModelPath, 'wb') as m:
  pickle.dump(oneHotEnc, m)

Model tests

In [13]:
def test_models(models, X, y, cv=10, scoring='neg_mean_absolute_error'):
    res = {}
    for m in models:
        model = type(m).__name__
        scores = cross_val_score(m, X, y, cv=cv, scoring=scoring)
        res[model] = {
            'scores': scores,
            'mean_score': scores.mean()
        }
        print(f"Scores {model}\n{scores}\nMAE = {scores.mean()}")
    return res

Lasso Regression

In [14]:
models = [
    RandomForestRegressor(verbose=True), Lasso()
]

# Zorg ervoor dat X en y goed geschaald zijn
scaler = StandardScaler()
X = dataOneHotTarget.drop(columns=['Kijkers', 'Unnamed: 0'])
X_scaled = scaler.fit_transform(X)
y = dataOneHotTarget['Kijkers']

# Run de test
results = test_models(models, X_scaled, y)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   57.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Do

Scores RandomForestRegressor
[-56435.34179612 -52905.7809917  -50810.55038919 -48055.78391304
 -60465.86495359 -51753.55967101 -50944.79969218 -48479.49968404
 -50266.85558469 -49917.59970195]
MAE = -52003.56363775172
Scores Lasso
[-70240.31538707 -67302.89346697 -64776.0347236  -62745.39174351
 -69298.51653763 -64885.35384331 -63986.35690782 -58469.83377457
 -62217.9159027  -62300.31546391]
MAE = -64622.292775108515


RandomForest heeft betere scores:
  
  RandomForest
  MAE = -52082.47210795064

  Lasso
  MAE = -64622.292775108515

Naar csv schrijven

In [15]:
dir = os.getcwd()
pathFile = os.path.join(dir, "./data csv/oneHotTarget.csv")

os.makedirs(os.path.dirname(pathFile), exist_ok=True)

dataOneHotTarget.to_csv(pathFile)


keuze maken voor model

In [16]:
#trainset en testset maken
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

def scoreModel(model,X,y, cv=5):
    print(f"Model: {model.__class__.__name__}")
    start = time.time()
        
    # MAE
    mae_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
    mae = -np.mean(mae_scores)

    # MAPE
    mape_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_percentage_error')
    mape = -np.mean(mape_scores) * 100

    duration = time.time() - start
    min, sec = divmod(duration, 60)

    print(f"MAE:  {mae:.4f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"Tijd: {int(min)}m {sec:.1f}s")

    return {'Model': model.__class__.__name__, 'MAE': mae, 'MAPE': mape, 'Tijd': duration}
def startModel(models, X, y, cv=5):
    results = []
    for model in models:
        result = scoreModel(model, X, y, cv=cv)
        results.append(result)
    return results

In [17]:
linear = LinearRegression()
decision = DecisionTreeRegressor()
randomForest = RandomForestRegressor()
gradientBoosting = GradientBoostingRegressor()
xgb = XGBRegressor()
extraTrees = ExtraTreesRegressor()
histGB = HistGradientBoostingRegressor()

startModel([linear, decision, randomForest, gradientBoosting, xgb, extraTrees, histGB], X, y)


Model: LinearRegression
MAE:  65143.6876
MAPE: 20.22%
Tijd: 0m 1.5s
Model: DecisionTreeRegressor


KeyboardInterrupt: 

# Best models:

## Model: ExtraTreesRegressor

MAE:  51912.1192

MAPE: 15.50%

Tijd: 11m 29.4s

## Model: HistGradientBoostingRegressor

MAE:  52753.3521

MAPE: 15.64%

Tijd: 0m 11.3s

## Model: XGBRegressor

MAE:  53561.6539

MAPE: 15.62%

Tijd: 0m 4.2s

## RandomForest er niet bij want 18m+ traintijd voor slechts 200 MAE verschil met XGB

LGBM zou ook een goede optie moetten zijn


ExtraTrees: Beste nauwkeurigheid maar lange training tijd

HistGradientBoosting en XGB: Veel sneller met minimale performance drop

Verbeteren van de top 3 modellen

In [17]:
warnings.filterwarnings("ignore")

#parameters
model_param = {
    'XGBoost': {
        'model': XGBRegressor(),
        'params': {
            'n_estimators': randint(100, 1000),
            'learning_rate': uniform(0.01, 0.3),
            'max_depth': randint(3, 15),
            'min_child_weight': randint(1, 10),
            'subsample': uniform(0.6, 0.4),
            'colsample_bytree': uniform(0.6, 0.4),
            'gamma': uniform(0, 0.5),
            'reg_lambda': uniform(0.1, 10),
            'reg_alpha': uniform(0, 10)
        }
    },
    # 'HistGradient': {
    #     'model': HistGradientBoostingRegressor(),
    #     'params': {
    #         'learning_rate': uniform(0.01, 0.3),
    #         'max_iter': randint(100, 1000),
    #         'max_depth': randint(3, 15),
    #         'l2_regularization': uniform(0.1, 10),
    #         'min_samples_leaf': randint(1, 10),
    #         'max_leaf_nodes': randint(20, 100)
    #     }
    # },
    "LightGBM": {
        "model": LGBMRegressor(),
        "params": {
            "num_leaves": [31, 50, 70],
            "learning_rate": [0.01, 0.05, 0.1, 0.15],
            "n_estimators": [100, 200, 500],
            "max_depth": [-1, 5, 10, 20],
            "min_child_samples": [10, 20, 30],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
            "reg_alpha": [0, 0.1, 0.5],
            "reg_lambda": [0, 0.1, 0.5]
        }
    }
}

extra = ExtraTreesRegressor()
hist = HistGradientBoostingRegressor()
xgb = XGBRegressor()
lgbm = LGBMRegressor()


Random Search

In [19]:
for name, config in model_param.items():
    print(f"Model: {name}")
    model = config['model']
    param = config['params']
    
    rsLoop = RandomizedSearchCV(
        estimator=model,
        param_distributions=param,
        n_iter=250,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=2
    )
    
    start = time.time()
    rsLoop.fit(X_train, y_train)
    duration = time.time() - start

    print(f"Tijd {name}: {duration:.2f}s")
    print(f"Beste param {name}: {rsLoop.best_params_}")
    print(f"Beste MAE: {rsLoop.best_score_:.4f}")

Model: XGBoost
Fitting 5 folds for each of 250 candidates, totalling 1250 fits


KeyboardInterrupt: 

In [None]:
best_params = rsLoop.best_params_
tuned_model = rsLoop.best_estimator_

y_pred = tuned_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 1 - mape

print(f'Mean absolute error: {mae}')
print(f'Mean absolute percentage error: {mape}')
print(f'Accuracy: {accuracy}')

Mean absolute error: 47585.56428634255
Mean absolute percentage error: 0.14202693123140966
Accuracy: 0.8579730687685904


In [None]:

rsExtra = RandomizedSearchCV(
  estimator=extra,
  param_distributions=model_param['ExtraTrees']['params'],
  n_iter=150,
  cv=5,
  scoring='neg_mean_absolute_error',
  n_jobs=-1,
  verbose=2
)
    
start = time.time()
rsExtra.fit(X_train, y_train)
duration = time.time() - start

print(f"Tijd: {duration:.2f}s")
print(f"Beste param: {rsExtra.best_params_}")
print(f"Beste MAE: {rsExtra.best_score_:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Tijd: 2857.67s
Beste param: {'max_depth': 9, 'max_features': np.float64(0.9124558702471834), 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 742}
Beste MAE: -54224.2000


### XGBoost
Tijd XGBoost: 1334.04s
Beste param XGBoost: {'colsample_bytree': np.float64(0.6550054872406584), 'gamma': np.float64(0.08367508771505777), 'learning_rate': np.float64(0.04567558428183022), 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 609, 'reg_alpha': np.float64(6.924415940372761), 'reg_lambda': np.float64(0.2620475254770348), 'subsample': np.float64(0.970829180420445)}
Beste MAE: -46309.9074
### HistGradient
Tijd HistGradient: 1145.61s
Beste param HistGradient: {'l2_regularization': np.float64(6.796320445820958), 'learning_rate': np.float64(0.02650163315092701), 'max_depth': 9, 'max_iter': 770, 'max_leaf_nodes': 96, 'min_samples_leaf': 4}
Beste MAE: -47550.1241
### ExtraTree
Tijd: 2857.67s
Beste param: {'max_depth': 9, 'max_features': np.float64(0.9124558702471834), 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 742}
Beste MAE: -54224.2000
### LightGBM
Tijd LightGBM: 448.79s
Beste param LightGBM: {'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0.5, 'num_leaves': 50, 'n_estimators': 500, 'min_child_samples': 10, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Beste MAE: -47344.7920

In [24]:
def run_optuna(model_name):
    def objective(trial):
        # if model_name == 'ExtraTrees':
        #     model = ExtraTreesRegressor(
        #         n_estimators=trial.suggest_int('n_estimators', 100, 500),
        #         max_depth=trial.suggest_int('max_depth', 5, 30),
        #         min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        #         min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        #         random_state=42
        #     )
        # if model_name == 'HistGradient':
        #     model = HistGradientBoostingRegressor(
        #         learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        #         max_iter=trial.suggest_int('max_iter', 200, 500),
        #         max_depth=trial.suggest_int('max_depth', 3, 15),
        #         max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 20, 100),
        #         random_state=42
        #     )
        if model_name == 'XGBoost':
            model = XGBRegressor(
                n_estimators=trial.suggest_int('n_estimators', 200, 800),
                max_depth=trial.suggest_int('max_depth', 10, 15),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
                subsample=trial.suggest_float('subsample', 0.5, 1.0),
                colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
                reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
                reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
                min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
                gamma=trial.suggest_float('gamma', 0, 5),
                random_state=42,
                n_jobs=-1
        )
        elif model_name == 'LightGBM':
            model = LGBMRegressor(
                n_estimators=trial.suggest_int('n_estimators', 200, 800),
                max_depth=trial.suggest_int('max_depth', -1, 15),
                num_leaves=trial.suggest_int('num_leaves', 20, 350),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                subsample=trial.suggest_float('subsample', 0.6, 1.0),
                colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
                reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
                reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
                min_child_samples=trial.suggest_int('min_child_samples', 5, 30),
                random_state=42,
                n_jobs=-1,
                verbosity=-1
            )
        model.fit(X_train, y_train)
        trial.set_user_attr("model", model)
        pred = model.predict(X_test)
        return mean_absolute_error(y_test, pred)
    return objective

In [25]:
for name, config in model_param.items():
  print(f"Optuna {name}")
  optres = optuna.create_study(direction="minimize")
  optres.optimize(run_optuna(name), n_trials=500)

  start = time.time()
  duration = time.time() - start

  print(f"Tijd: {duration:.2f}s")
  print(f"Beste params {name}: ", optres.best_params)
  print(f"Beste MAE {name}: ", optres.best_value)

[I 2025-04-16 17:47:52,172] A new study created in memory with name: no-name-7540c3cc-de12-4073-a47c-7d627ed1efce


Optuna XGBoost


[I 2025-04-16 17:48:06,388] Trial 0 finished with value: 48279.04296875 and parameters: {'n_estimators': 792, 'max_depth': 12, 'learning_rate': 0.10504238180433755, 'subsample': 0.7528360273035375, 'colsample_bytree': 0.8508795212484088, 'reg_alpha': 0.16665169025447124, 'reg_lambda': 0.18968621528871654, 'min_child_weight': 3, 'gamma': 4.964358230393764}. Best is trial 0 with value: 48279.04296875.
[I 2025-04-16 17:48:15,302] Trial 1 finished with value: 49524.47265625 and parameters: {'n_estimators': 358, 'max_depth': 14, 'learning_rate': 0.08516128632591645, 'subsample': 0.5722841771011843, 'colsample_bytree': 0.6152420456760939, 'reg_alpha': 0.8595127823435313, 'reg_lambda': 0.02432077566031965, 'min_child_weight': 3, 'gamma': 2.900215859857673}. Best is trial 0 with value: 48279.04296875.
[I 2025-04-16 17:48:19,074] Trial 2 finished with value: 47624.94921875 and parameters: {'n_estimators': 203, 'max_depth': 14, 'learning_rate': 0.05118957676361663, 'subsample': 0.824292972753120

Tijd: 0.00s
Beste params XGBoost:  {'n_estimators': 800, 'max_depth': 11, 'learning_rate': 0.024791795598051402, 'subsample': 0.9152182561342729, 'colsample_bytree': 0.628872599960794, 'reg_alpha': 0.4587526558642877, 'reg_lambda': 0.9054148915932084, 'min_child_weight': 9, 'gamma': 0.03130859570095774}


[I 2025-04-16 18:45:13,612] A new study created in memory with name: no-name-047d6f2c-8d88-466b-bb37-959ef292a0b9


Beste MAE XGBoost:  45756.13671875
Optuna LightGBM


[I 2025-04-16 18:46:49,163] Trial 0 finished with value: 49908.24635917908 and parameters: {'n_estimators': 256, 'max_depth': 7, 'num_leaves': 274, 'learning_rate': 0.027511781206980532, 'subsample': 0.7687690982356093, 'colsample_bytree': 0.8866450983070837, 'reg_alpha': 0.8348792868765971, 'reg_lambda': 0.1217297631605142, 'min_child_samples': 21}. Best is trial 0 with value: 49908.24635917908.
[I 2025-04-16 18:46:54,284] Trial 1 finished with value: 51204.468710700865 and parameters: {'n_estimators': 583, 'max_depth': 13, 'num_leaves': 156, 'learning_rate': 0.2909493183846242, 'subsample': 0.7194107273846557, 'colsample_bytree': 0.9835574124597473, 'reg_alpha': 0.3829772785216915, 'reg_lambda': 0.05094603076860704, 'min_child_samples': 10}. Best is trial 0 with value: 49908.24635917908.
[I 2025-04-16 18:46:55,145] Trial 2 finished with value: 57187.703001264934 and parameters: {'n_estimators': 727, 'max_depth': 2, 'num_leaves': 161, 'learning_rate': 0.040059737285213975, 'subsample'

Tijd: 0.00s
Beste params LightGBM:  {'n_estimators': 791, 'max_depth': 14, 'num_leaves': 334, 'learning_rate': 0.029456000685140836, 'subsample': 0.8303507682774592, 'colsample_bytree': 0.6505159652741844, 'reg_alpha': 0.20430689348688258, 'reg_lambda': 0.3218993752340186, 'min_child_samples': 13}
Beste MAE LightGBM:  45536.17331152257


extratrees word verder genegeerd door de zeer lange trainingtijden en slechte finetuning resultaten, hierdoor heb ik dit vervamgen met LGBM doordat dit gelijkaardig zou moetten testen als XGB op mijn model

Optuna XGBoost

Beste params XGBoost:  {'n_estimators': 800, 'max_depth': 11, 'learning_rate': 0.024791795598051402, 'subsample': 0.9152182561342729, 'colsample_bytree': 0.628872599960794, 'reg_alpha': 0.4587526558642877, 'reg_lambda': 0.9054148915932084, 'min_child_weight': 9, 'gamma': 0.03130859570095774}
[I 2025-04-16 18:45:13,612] A new study created in memory with name: no-name-047d6f2c-8d88-466b-bb37-959ef292a0b9
Beste MAE XGBoost:  45756.13671875

Optuna HistGradient

Beste params HistGradient:  {'learning_rate': 0.09360779076213302, 'max_iter': 403, 'max_depth': 12, 'max_leaf_nodes': 83}
Beste MAE HistGradient:  47610.78728722957

Optuna ExtraTrees

Beste params ExtraTrees:  {'n_estimators': 351, 'max_depth': 27, 'min_samples_split': 8, 'min_samples_leaf': 1}
Beste MAE ExtraTrees:  48229.70593565771

Optune LightGBM

Beste params LightGBM:  {'n_estimators': 791, 'max_depth': 14, 'num_leaves': 334, 'learning_rate': 0.029456000685140836, 'subsample': 0.8303507682774592, 'colsample_bytree': 0.6505159652741844, 'reg_alpha': 0.20430689348688258, 'reg_lambda': 0.3218993752340186, 'min_child_samples': 13}
Beste MAE LightGBM:  45536.17331152257

In [26]:
print(optres.best_trial.user_attrs)


{'model': LGBMRegressor(colsample_bytree=0.6505159652741844,
              learning_rate=0.029456000685140836, max_depth=14,
              min_child_samples=13, n_estimators=791, n_jobs=-1, num_leaves=334,
              random_state=42, reg_alpha=0.20430689348688258,
              reg_lambda=0.3218993752340186, subsample=0.8303507682774592,
              verbosity=-1)}


In [27]:
best_params = optres.best_params
tuned_model = optres.best_trial.user_attrs['model']

y_pred = tuned_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 1 - mape

print(f'Mean absolute error: {mae}')
print(f'Mean absolute percentage error: {mape}')
print(f'Accuracy: {accuracy}')

Mean absolute error: 45536.17331152257
Mean absolute percentage error: 0.13347361949802408
Accuracy: 0.866526380501976


## Best so far
Mean absolute error: 45536.17331152257
Mean absolute percentage error: 0.13347361949802408
Accuracy: 0.866526380501976

GridsearchCV

In [None]:

#parameters
model_param = {
    'XGBoost': {
        'model': XGBRegressor(),
        'params': {
            'n_estimators': [850],
            'learning_rate': [0.025, 0.03],
            'max_depth': [8, 9],
            'min_child_weight': [1,2],
            'subsample': [1.0],
            'colsample_bytree': [0.8],
            'gamma': [0.035, 0.04],
            'reg_lambda': [3],
            'reg_alpha': range(0, 5, 1)
        }
    },
    'HistGradient': {
        'model': HistGradientBoostingRegressor(),
        'params': {
            'n_estimators': [850],
            'learning_rate': [0.01, 0.1],
            'max_iter': [100, 200],
            'max_depth': [3, 5],
            'l2_regularization': [0.1, 1],
            'min_samples_leaf': [1, 2],
            'max_leaf_nodes': [20, 40]
        }
    },
    'LightGBM': {
        'model': LGBMRegressor(),
        'params': {
            'n_estimators': [850],
            'num_leaves': [31, 50],
            'learning_rate': [0.01, 0.1],
            'n_estimators': [100, 200],
            'max_depth': [-1, 5],
            'min_child_samples': [10, 20],
            'subsample': [0.6, 0.8],
            'colsample_bytree': [0.6, 0.8],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [0, 0.1]
        }
    }
}

hist = HistGradientBoostingRegressor()
xgb = XGBRegressor()
lgbm = LGBMRegressor()


In [None]:
for name, config in model_param.items():
    print(f"Model: {name}")
    model = config['model']
    param = config['params']
    
    gs = GridSearchCV(
        estimator=model,
        param_grid=param,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=2
    )
    
    start = time.time()
    gs.fit(X_train, y_train)
    duration = time.time() - start

    print(f"Tijd {name}: {duration:.2f}s")
    print(f"Beste param {name}: {gs.best_params_}")
    print(f"Beste MAE: {gs.best_score_:.4f}")

Model: XGBoost
Fitting 5 folds for each of 80 candidates, totalling 400 fits


KeyboardInterrupt: 

Resultaten tuned model

In [None]:
best_params = gs.best_params_
tuned_model = gs.best_estimator_

y_pred = tuned_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 1 - mape

print(f'Mean absolute error: {mae}')
print(f'Mean absolute percentage error: {mape}')
print(f'Accuracy: {accuracy}')