In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv('synthetic_engagement_dataset.csv')

# Video features
video_features = ['video_watched_percentage', 'pause_count', 'seek_count']

# Non-video features
non_video_features = ['reading_time', 'idle_time', 'total_scrolls', 'max_scroll_depth']

# Split video and non-video data
video_df = df[df['is_video'] == 1].copy()
non_video_df = df[df['is_video'] == 0].copy()


In [31]:
video_df.head(10)

Unnamed: 0,content_type,reading_time,idle_time,scroll_depth,max_scroll_depth,total_scrolls,tab_switches,tab_away_time,video_watched_percentage,video_duration,video_current_time,is_video_playing,pause_count,seek_count,seek_positions,engagement_level,is_video
0,video,714,17,0.16,0.32,0,1,9.97,77.1,922,711,False,3,0,[],0.889,1
3,video,279,7,0.17,0.27,8,0,0.0,55.5,470,261,False,5,1,[36.8],0.552,1
5,video,455,19,0.13,0.31,5,2,72.73,70.1,643,451,True,3,1,[89.57],0.782,1
6,video,319,26,0.11,0.15,6,3,154.23,49.8,587,293,True,6,2,"[13.84, 98.97]",0.647,1
7,video,1052,0,0.08,0.17,4,1,0.09,72.8,1418,1032,True,7,0,[],0.99,1
8,video,464,0,0.17,0.18,9,0,0.0,81.4,539,438,True,7,0,[],1.0,1
10,video,158,9,0.27,0.36,5,2,87.13,27.4,537,147,False,3,0,[],0.564,1
11,video,861,33,0.04,0.05,2,4,146.5,54.9,1550,850,True,1,2,"[1.75, 95.99]",0.583,1
12,video,135,3,0.04,0.08,1,1,8.59,70.0,171,120,False,2,0,[],0.733,1
13,video,103,7,0.19,0.26,7,0,0.0,63.3,146,92,True,1,1,[82.03],0.698,1


In [32]:
non_video_df.head(10)

Unnamed: 0,content_type,reading_time,idle_time,scroll_depth,max_scroll_depth,total_scrolls,tab_switches,tab_away_time,video_watched_percentage,video_duration,video_current_time,is_video_playing,pause_count,seek_count,seek_positions,engagement_level,is_video
1,non_video,576,72,0.2,0.34,3,7,405.18,0.0,0,0,False,0,0,[],0.082,0
2,non_video,497,34,0.29,0.47,10,8,107.46,0.0,0,0,False,0,0,[],0.478,0
4,non_video,364,35,0.53,0.71,9,9,386.6,0.0,0,0,False,0,0,[],0.493,0
9,non_video,489,91,0.42,0.59,6,4,21.17,0.0,0,0,False,0,0,[],0.342,0
14,non_video,259,27,0.33,0.75,6,2,68.17,0.0,0,0,False,0,0,[],0.347,0
15,non_video,139,21,0.4,0.64,6,6,318.12,0.0,0,0,False,0,0,[],0.387,0
16,non_video,95,26,0.2,0.48,4,9,504.63,0.0,0,0,False,0,0,[],0.088,0
18,non_video,34,1,0.55,0.65,14,8,161.84,0.0,0,0,False,0,0,[],0.717,0
23,non_video,163,27,0.19,0.5,7,0,0.0,0.0,0,0,False,0,0,[],0.307,0
24,non_video,330,106,0.6,0.66,7,6,265.57,0.0,0,0,False,0,0,[],0.328,0


In [33]:
# Video
X_video = video_df[video_features]
y_video = video_df['engagement_level']

# Non-video
X_nonvideo = non_video_df[non_video_features]
y_nonvideo = non_video_df['engagement_level']

In [34]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

def kfold_evaluate(X, y, model, k=5, tolerance=0.05):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    mse_scores, r2_scores, acc_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse_scores.append(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    return {
        "MSE_mean": np.mean(mse_scores),
        "MSE_std": np.std(mse_scores),
        "R2_mean": np.mean(r2_scores),
        "R2_std": np.std(r2_scores),
    }

XGBoost with Randomized Search

In [35]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold
import numpy as np

# Parameter distributions for tuning
param_dist = {
    'n_estimators': np.arange(100, 801, 100),
    'max_depth': np.arange(3, 11),
    'learning_rate': np.linspace(0.01, 0.2, 20),
    'subsample': np.linspace(0.5, 1.0, 6),
    'colsample_bytree': np.linspace(0.5, 1.0, 6),
    'min_child_weight': np.arange(1, 11)
}

xgb_base = XGBRegressor(
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

cv_strategy = KFold(n_splits=3, shuffle=True, random_state=42)

xgb_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=30,                 # number of combinations to try
    scoring='r2',
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

xgb_search_video = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [36]:
#NON VIDEO
xgb_search.fit(X_nonvideo, y_nonvideo)

print("Best parameters (non-video):", xgb_search.best_params_)
print("Best R² score (CV):", xgb_search.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters (non-video): {'subsample': np.float64(0.5), 'n_estimators': np.int64(800), 'min_child_weight': np.int64(7), 'max_depth': np.int64(3), 'learning_rate': np.float64(0.13), 'colsample_bytree': np.float64(1.0)}
Best R² score (CV): 0.8743933028725528


In [37]:
#VIDEO
xgb_search_video.fit(X_video, y_video)

print("Best parameters (video):", xgb_search_video.best_params_)
print("Best R² score (CV):", xgb_search_video.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters (video): {'subsample': np.float64(0.9), 'n_estimators': np.int64(100), 'min_child_weight': np.int64(2), 'max_depth': np.int64(4), 'learning_rate': np.float64(0.15000000000000002), 'colsample_bytree': np.float64(0.7)}
Best R² score (CV): 0.7265735048424068


In [38]:
best_xgb_nonvideo = XGBRegressor(
    **xgb_search.best_params_,
    tree_method='hist',
    random_state=42
)
best_xgb_video = XGBRegressor(
    **xgb_search_video.best_params_,
    tree_method='hist',
    random_state=42
)

In [39]:
nonvideo_results = kfold_evaluate(X_nonvideo, y_nonvideo, best_xgb_nonvideo, k=5, tolerance=0.05)
video_results = kfold_evaluate(X_video, y_video, best_xgb_video, k=5, tolerance=0.05)

print("\n Non-video Sessions (Tuned XGBoost 5-Fold CV):")
print(nonvideo_results)

print("\n Video Sessions (Tuned XGBoost 5-Fold CV):")
print(video_results)

#Non-video = 0.8726518
#Video = 0.72639288


 Non-video Sessions (Tuned XGBoost 5-Fold CV):
{'MSE_mean': np.float64(0.004849188839350441), 'MSE_std': np.float64(3.466728793184497e-05), 'R2_mean': np.float64(0.8744409152981533), 'R2_std': np.float64(0.0008628221397924972)}

 Video Sessions (Tuned XGBoost 5-Fold CV):
{'MSE_mean': np.float64(0.005924909091938336), 'MSE_std': np.float64(5.0101437866692815e-05), 'R2_mean': np.float64(0.7267711084814041), 'R2_std': np.float64(0.0010516311854827712)}


In [40]:
#TEST PREDICTION FROM EXTENSION'S DATA
sample_data = {
    "content_type": "article",
    "reading_time": 82,
    "idle_time": 12,
    "total_scrolls": 7,
    "tab_switches": 0,
    "tab_away_time": 0,
    "max_scroll_depth": 1,
    "scroll_depth": 0.1,
    "video_watched_percentage": 0,
    "video_duration": 0,
    "video_current_time": 0,
    "is_video_playing": False,
    "pause_count": 0,
    "seek_count": 0,
    "seek_positions": []
}

real_testing_in = pd.DataFrame([sample_data])[non_video_features]
predicted_real_engagement = best_xgb_nonvideo.predict(real_testing_in)[0]
print(f"Predicted engagement level: {predicted_real_engagement:.4f}")

Predicted engagement level: 0.4475


In [41]:
import os

# Go up one level from "model" to project root, then into backend/ml
save_dir = os.path.join("..", "backend", "ml")
os.makedirs(save_dir, exist_ok=True)

# Save trained models
best_xgb_nonvideo.save_model(os.path.join(save_dir, "xgb_nonvideo.json"))
best_xgb_video.save_model(os.path.join(save_dir, "xgb_video.json"))

print("Models saved to:", save_dir)


Models saved to: ..\backend\ml
