In [493]:
import pandas as pd 
df = pd.read_csv('recipe_site_traffic_2212.csv')
df.head()

Unnamed: 0,recipe,calories,carbohydrate,sugar,protein,category,servings,high_traffic
0,1,,,,,Pork,6,High
1,2,35.48,38.56,0.66,0.92,Potato,4,High
2,3,914.28,42.68,3.09,2.88,Breakfast,1,
3,4,97.03,30.56,38.63,0.02,Beverages,4,High
4,5,27.05,1.85,0.8,0.53,Beverages,4,


# 🛠️ Model Development

## Fitting a Comparaison Model : XGBoost Classifier Optimized with Optuna 

In [515]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
import optuna

# Define the objective function for Optuna optimization
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 1,
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Create a Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, show_progress_bar=True)

# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Create and train the XGBoost classifier with the best hyperparameters
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the accuracy on the test set
accuracy_test = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy_test)






[I 2023-07-21 14:01:42,026] A new study created in memory with name: no-name-0c8fa385-9ac3-48ae-9f7b-71be5c0039d4


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2023-07-21 14:01:42,257] Trial 0 finished with value: 0.7444444444444445 and parameters: {'lambda': 0.08690248753889739, 'alpha': 0.0012996441577692215, 'colsample_bytree': 0.9242916270249731, 'subsample': 0.42540013610517613, 'learning_rate': 0.012610379071365412, 'n_estimators': 171, 'max_depth': 3, 'min_child_weight': 9}. Best is trial 0 with value: 0.7444444444444445.
[I 2023-07-21 14:01:42,342] Trial 1 finished with value: 0.6666666666666666 and parameters: {'lambda': 0.002616723592831143, 'alpha': 0.11589379235262796, 'colsample_bytree': 0.7379537542877025, 'subsample': 0.4287476098445049, 'learning_rate': 0.0012694748533246315, 'n_estimators': 60, 'max_depth': 4, 'min_child_weight': 8}. Best is trial 0 with value: 0.7444444444444445.
[I 2023-07-21 14:01:42,701] Trial 2 finished with value: 0.6666666666666666 and parameters: {'lambda': 5.30165732390735, 'alpha': 0.09101702104637145, 'colsample_bytree': 0.6371258657268896, 'subsample': 0.5692966948844895, 'learning_rate': 0.077

## Evaluation of the XGBoost Classifier

In [519]:
# Create and train the XGBoost classifier with the best hyperparameters
params = {'lambda': 0.0024216262227453877, 'alpha': 0.006418687794179554, 'colsample_bytree': 0.9784013770010614, 'subsample': 0.9603519784327296, 'learning_rate': 0.003749893128483935, 'n_estimators': 254, 'max_depth': 5, 'min_child_weight': 1, 'random_state':1}
best_model = xgb.XGBClassifier(**params)
best_model.fit(X_train, y_train)


# Make predictions on the test set
y_train_pred = best_model.predict(X_train)
# Make predictions on the test set
y_test_pred = best_model.predict(X_test)

# Calculate metrics for training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_roc_auc = roc_auc_score(y_train, y_train_pred)

# Calculate metrics for test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred)

# Create a table to display the metrics
metrics_table = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'],
    'Train Set': [train_accuracy, train_precision, train_recall, train_f1, train_roc_auc],
    'Test Set': [test_accuracy, test_precision, test_recall, test_f1, test_roc_auc]
})

# Set the 'Metric' column as the index
metrics_table = metrics_table.set_index('Metric')

# Define a custom styling function to apply color and background to the cells
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: cyan; color: black; font-weight: bold' if v else '' for v in is_max]

# Apply the custom styling to the metrics table
styled_metrics_table = metrics_table.style.apply(highlight_max, axis=1)

# Display the styled metrics table
display(styled_metrics_table)

Unnamed: 0_level_0,Train Set,Test Set
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Accuracy,0.807453,0.811111
Precision,0.810964,0.793103
Recall,0.886364,0.901961
F1 Score,0.846989,0.844037
ROC AUC,0.787419,0.797134
