In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv('synthetic_engagement_dataset.csv')

# Video features
video_features = ['video_watched_percentage', 'pause_count', 'seek_count']

# Non-video features
non_video_features = ['reading_time', 'idle_time', 'total_scrolls', 'max_scroll_depth']

# Split video and non-video data
video_df = df[df['is_video'] == 1].copy()
non_video_df = df[df['is_video'] == 0].copy()


In [2]:
video_df.head(10)

Unnamed: 0,content_type,reading_time,idle_time,scroll_depth,max_scroll_depth,total_scrolls,tab_switches,tab_away_time,video_watched_percentage,video_duration,video_current_time,is_video_playing,pause_count,seek_count,seek_positions,engagement_level,is_video
0,video,553,38,0.22,0.31,1,2,47.34,45.0,1207,543,True,3,3,"[62.23, 65.12, 75.46]",0.36,1
1,video,450,50,0.05,0.1,8,1,7.16,33.7,1294,436,True,1,2,"[5.94, 13.1]",0.278,1
3,video,831,39,0.16,0.25,8,4,110.53,78.7,1027,808,False,4,1,[24.29],0.751,1
4,video,548,39,0.13,0.27,8,1,0.66,65.5,830,544,False,3,1,[39.84],0.721,1
7,video,913,58,0.29,0.47,8,4,146.8,51.8,1709,885,True,5,0,[],0.639,1
15,video,1004,77,0.26,0.4,2,3,53.13,65.6,1502,984,True,5,0,[],0.675,1
19,video,453,29,0.28,0.32,5,4,231.18,68.0,626,425,False,5,0,[],0.746,1
20,video,656,34,0.15,0.29,3,4,71.22,51.3,1237,635,False,3,1,[38.05],0.684,1
21,video,377,8,0.05,0.11,2,2,105.44,65.3,540,353,True,6,0,[],0.718,1
23,video,699,17,0.08,0.27,4,4,160.73,38.5,1784,686,True,1,2,"[46.92, 77.71]",0.539,1


In [3]:
non_video_df.head(10)

Unnamed: 0,content_type,reading_time,idle_time,scroll_depth,max_scroll_depth,total_scrolls,tab_switches,tab_away_time,video_watched_percentage,video_duration,video_current_time,is_video_playing,pause_count,seek_count,seek_positions,engagement_level,is_video
2,non_video,49,4,0.37,0.77,10,3,38.19,0.0,0,0,False,0,0,[],0.435,0
5,non_video,454,28,0.68,0.84,8,4,163.57,0.0,0,0,False,0,0,[],0.582,0
6,non_video,560,173,0.23,0.5,5,5,119.7,0.0,0,0,False,0,0,[],0.267,0
8,non_video,474,20,0.21,0.7,8,9,86.31,0.0,0,0,False,0,0,[],0.568,0
9,non_video,349,33,0.39,0.53,8,8,138.01,0.0,0,0,False,0,0,[],0.432,0
10,non_video,452,82,0.38,0.74,8,4,212.17,0.0,0,0,False,0,0,[],0.366,0
11,non_video,133,38,0.37,0.56,5,9,4.63,0.0,0,0,False,0,0,[],0.143,0
12,non_video,70,3,0.34,0.72,11,3,77.83,0.0,0,0,False,0,0,[],0.635,0
13,non_video,225,53,0.52,0.54,7,8,213.27,0.0,0,0,False,0,0,[],0.314,0
14,non_video,209,44,0.42,0.53,9,1,54.7,0.0,0,0,False,0,0,[],0.567,0


In [4]:
# Video
X_video = video_df[video_features]
y_video = video_df['engagement_level']

# Non-video
X_nonvideo = non_video_df[non_video_features]
y_nonvideo = non_video_df['engagement_level']

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

def kfold_evaluate(X, y, model, k=5, tolerance=0.05):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    mse_scores, r2_scores, acc_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse_scores.append(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    return {
        "MSE_mean": np.mean(mse_scores),
        "MSE_std": np.std(mse_scores),
        "R2_mean": np.mean(r2_scores),
        "R2_std": np.std(r2_scores),
    }

XGBoost with Randomized Search

In [6]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold
import numpy as np

# Parameter distributions for tuning
param_dist = {
    'n_estimators': np.arange(100, 801, 100),
    'max_depth': np.arange(3, 11),
    'learning_rate': np.linspace(0.01, 0.2, 20),
    'subsample': np.linspace(0.5, 1.0, 6),
    'colsample_bytree': np.linspace(0.5, 1.0, 6),
    'min_child_weight': np.arange(1, 11)
}

xgb_base = XGBRegressor(
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

cv_strategy = KFold(n_splits=3, shuffle=True, random_state=42)

xgb_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=30,                 # number of combinations to try
    scoring='r2',
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

xgb_search_video = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [7]:
#NON VIDEO
xgb_search.fit(X_nonvideo, y_nonvideo)

print("Best parameters (non-video):", xgb_search.best_params_)
print("Best R² score (CV):", xgb_search.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters (non-video): {'subsample': np.float64(0.5), 'n_estimators': np.int64(800), 'min_child_weight': np.int64(7), 'max_depth': np.int64(3), 'learning_rate': np.float64(0.13), 'colsample_bytree': np.float64(1.0)}
Best R² score (CV): 0.8725261455691622


In [8]:
xgb_search_video.fit(X_video, y_video)

print("Best parameters (video):", xgb_search_video.best_params_)
print("Best R² score (CV):", xgb_search_video.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters (video): {'subsample': np.float64(0.9), 'n_estimators': np.int64(100), 'min_child_weight': np.int64(2), 'max_depth': np.int64(4), 'learning_rate': np.float64(0.15000000000000002), 'colsample_bytree': np.float64(0.7)}
Best R² score (CV): 0.7263764460355219


In [9]:
best_xgb_nonvideo = XGBRegressor(
    **xgb_search.best_params_,
    tree_method='hist',
    random_state=42
)
best_xgb_video = XGBRegressor(
    **xgb_search_video.best_params_,
    tree_method='hist',
    random_state=42
)

In [10]:
nonvideo_results = kfold_evaluate(X_nonvideo, y_nonvideo, best_xgb_nonvideo, k=5, tolerance=0.05)
video_results = kfold_evaluate(X_video, y_video, best_xgb_video, k=5, tolerance=0.05)

print("\n📊 Non-video Sessions (Tuned XGBoost 5-Fold CV):")
print(nonvideo_results)

print("\n🎥 Video Sessions (Tuned XGBoost 5-Fold CV):")
print(video_results)

#Non-video = 0.8726518
#Video = 0.72639288


📊 Non-video Sessions (Tuned XGBoost 5-Fold CV):
{'MSE_mean': np.float64(0.0048419029934692204), 'MSE_std': np.float64(4.686732653873059e-05), 'R2_mean': np.float64(0.8728401305825628), 'R2_std': np.float64(0.002153803070100516)}

🎥 Video Sessions (Tuned XGBoost 5-Fold CV):
{'MSE_mean': np.float64(0.005924540213312827), 'MSE_std': np.float64(3.170074867800112e-05), 'R2_mean': np.float64(0.7265529085455444), 'R2_std': np.float64(0.0026330595545216366)}


In [11]:
#TEST PREDICTION FROM EXTENSION'S DATA
sample_data = {
    "content_type": "article",
    "reading_time": 82,
    "idle_time": 12,
    "total_scrolls": 7,
    "tab_switches": 0,
    "tab_away_time": 0,
    "max_scroll_depth": 1,
    "scroll_depth": 0.1,
    "video_watched_percentage": 0,
    "video_duration": 0,
    "video_current_time": 0,
    "is_video_playing": False,
    "pause_count": 0,
    "seek_count": 0,
    "seek_positions": []
}

real_testing_in = pd.DataFrame([sample_data])[non_video_features]
predicted_real_engagement = best_xgb_nonvideo.predict(real_testing_in)[0]
print(f"Predicted engagement level: {predicted_real_engagement:.4f}")

Predicted engagement level: 0.4553


In [13]:
import os

# Go up one level from "model" to project root, then into backend/ml
save_dir = os.path.join("..", "backend", "ml")
os.makedirs(save_dir, exist_ok=True)

# Save trained models
best_xgb_nonvideo.save_model(os.path.join(save_dir, "xgb_nonvideo.json"))
best_xgb_video.save_model(os.path.join(save_dir, "xgb_video.json"))

print("✅ Models saved to:", save_dir)


XGBoostError: [20:22:03] C:\actions-runner\_work\xgboost\xgboost\dmlc-core\src\io\local_filesys.cc:210: Check failed: allow_null:  LocalFileSystem::Open "./backend/ml/xgb_nonvideo.json": No such file or directory