## ML Models: SVC, Rain Forests, XGBoost

In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from google.cloud import bigquery
from google.cloud.bigquery_storage import BigQueryReadClient
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

### Load Datasets

In [None]:
train_df = pd.read_parquet("train_test_data/train_dataset.parquet")
test_df = pd.read_parquet("train_test_data/test_dataset.parquet")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
train_df.head()

Train shape: (37713, 26)
Test shape: (9429, 26)


Unnamed: 0,video_id,title,channelId,channelTitle,categoryId,tags,comments_disabled,ratings_disabled,description,likes_start,likes_end,view_count_start,view_count_end,comment_count_start,comment_count_end,trendingDuration,hoursToReachTrending,published_dayOfWeek,published_year,published_month,tagCount,engagement_rate,like_view_ratio,popularity_score,popularity_class,trend_cluster
0,lWVZOzSzhfE,Lil Tjay - Give You What You Want (Official Tr...,UCEB4a5o_6KfjxHwNMnmj54Q,Lil Tjay,Music,lil tjay|lil tjay new song|lil tjay new video|...,False,False,Lil Tjay Give You What You Want official video...,7596,8625,126556,159460,583,614,2,8.3,Thursday,2022,11,5,0.00385,0.054089,33.210523,1,0
1,wRwaVF-M0qo,Pranking My Sister with Bad Christmas Gifts (G...,UCuVHOs0H5hvAHGr8O4yIBNQ,Niki and Gabi,Howto & Style,niki and gabi|gift exchange|christmas gift exc...,False,False,Pranking My Sister with Bad Christmas Gifts (G...,10223,15767,158103,285080,316,404,9,24.0,Wednesday,2023,12,18,0.001417,0.055307,22.344142,1,1
2,BhC-QTs-kns,Amy Coney Barrett's Supreme Court confirmation...,UCXIJgqnII2ZOINSWNOGFThA,Fox News,News & Politics,Fox News|Judge Amy Coney Barrett|Trump SCOTUS|...,False,False,The Senate Judiciary Committee hearings for Su...,15081,15166,990042,998551,1263,1296,2,29.0,Thursday,2020,10,20,0.001298,0.015188,19.683658,1,1
3,Kq0M4_FDgT8,The Milwaukee Bucks Eliminate The Brooklyn Net...,UCU7iRrk3xfpUk0R6VdyC1Ow,NBA on TNT,Sports,NBA on TNT|NBA|Inside the NBA|Charles Barkley|...,False,False,The Inside crew reacts to Giannis and the Buck...,11726,12389,885580,958911,1952,1969,4,18.0,Sunday,2021,6,25,0.002053,0.01292,25.439213,1,1
4,gOA7M371Edg,Why Minecraft Removed These Controversial Spla...,UCHZ986wm_sJT6wntdDTIIcw,FitMC,Gaming,minecraft|history|update|mobs|creeper|raid|net...,False,False,Get exclusive NordVPN deal here ➵ https://nor...,23969,35337,354816,579895,2616,3670,4,10.2,Saturday,2022,7,63,0.006329,0.060937,223.6384,2,0


In [10]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, LabelEncoder

target_col = 'popularity_class' 

# Get the target
y_train = train_df[target_col]
y_test = test_df[target_col]

# Encode target if it's categorical
if y_train.dtype == 'object':
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

### Feature Transformations

In [5]:
# Feature Transformation: text
train_df['text'] = (
    train_df['title'].fillna('') + ' ' +
    train_df['channelTitle'].fillna('') + ' ' +
    train_df['tags'].fillna('') + ' ' +
    train_df['description'].fillna('')
)
test_df['text'] = (
    test_df['title'].fillna('') + ' ' +
    test_df['channelTitle'].fillna('') + ' ' +
    test_df['tags'].fillna('') + ' ' +
    test_df['description'].fillna('')
)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=500)
X_text = tfidf.fit_transform(train_df['text'])
X_text_test = tfidf.transform(test_df['text'])  



# Feature Transformation: categories
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
X_cat = ohe.fit_transform(train_df[['categoryId', 'published_dayOfWeek']]) 
X_cat_test = ohe.transform(test_df[['categoryId', 'published_dayOfWeek']])

# Feature Transformation: scaling numeric variables
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_num = scaler.fit_transform(train_df[['published_month', 'tagCount']])
X_num_test = scaler.transform(test_df[['published_month', 'tagCount']])

### SVC

In [6]:
from scipy.sparse import hstack

X_train = hstack([X_text, X_cat, X_num])
X_test = hstack([X_text_test, X_cat_test, X_num_test])


In [17]:
from sklearn.svm import SVC

clf = SVC(kernel='linear')  
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report

print("SVM Results:")
print(classification_report(y_test, y_pred))

SVM Results:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        34
         1.0       0.62      0.46      0.53      2302
         2.0       0.58      0.80      0.67      4774
         3.0       0.63      0.31      0.41      2319

    accuracy                           0.59      9429
   macro avg       0.46      0.39      0.40      9429
weighted avg       0.60      0.59      0.57      9429



### Random Forest

### XGBoost

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced', 
    random_state=42
)
rf_model.fit(X_train, y_train)

# Predict
rf_preds = rf_model.predict(X_test)

# Evaluate
print("Random Forest Results:")
print(classification_report(y_test, rf_preds))


Random Forest Results:
              precision    recall  f1-score   support

         0.0       0.62      0.15      0.24        34
         1.0       0.71      0.53      0.61      2302
         2.0       0.64      0.84      0.72      4774
         3.0       0.75      0.45      0.57      2319

    accuracy                           0.67      9429
   macro avg       0.68      0.49      0.53      9429
weighted avg       0.68      0.67      0.65      9429



In [9]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0


In [14]:
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))

sample_weights = np.array([class_weight_dict[label] for label in y_train])

xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(classes),
    eval_metric='mlogloss',
    use_label_encoder=False,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

xgb_model.fit(X_train, y_train, sample_weight=sample_weights)

# Predict and evaluate
xgb_preds = xgb_model.predict(X_test)
print("XGBoost Results:")
print(classification_report(y_test, xgb_preds))


XGBoost Results:
              precision    recall  f1-score   support

         0.0       0.08      0.21      0.11        34
         1.0       0.56      0.69      0.62      2302
         2.0       0.70      0.47      0.56      4774
         3.0       0.49      0.69      0.57      2319

    accuracy                           0.58      9429
   macro avg       0.46      0.51      0.47      9429
weighted avg       0.61      0.58      0.58      9429



### hyperparameter tuning - RF

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define the grid of parameters to test
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', None],
    'max_depth': [None, 10, 20]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'class_weight': None, 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}


In [20]:
# Use the best parameters
best_rf = RandomForestClassifier(random_state=42, **grid_search.best_params_)
best_rf.fit(X_train, y_train)

# Predict
best_rf_preds = best_rf.predict(X_test)
print("Random Forest (Tuned) Results:")
print(classification_report(y_test, best_rf_preds))


Random Forest (Tuned) Results:
              precision    recall  f1-score   support

         0.0       0.62      0.15      0.24        34
         1.0       0.72      0.53      0.61      2302
         2.0       0.64      0.85      0.73      4774
         3.0       0.78      0.45      0.57      2319

    accuracy                           0.67      9429
   macro avg       0.69      0.49      0.54      9429
weighted avg       0.69      0.67      0.66      9429



### hyperparameter tuning - XGBoost

In [None]:
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

grid_search = GridSearchCV(
    XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(y_train)),
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42
    ),
    param_grid,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train, sample_weight=sample_weights) 
print("Best Parameters:", grid_search.best_params_)

# Retrain using best parameters
best_xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_train)),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    **grid_search.best_params_
)
best_xgb.fit(X_train, y_train, sample_weight=sample_weights)

# Predict
xgb_preds = best_xgb.predict(X_test)
print("XGBoost (Tuned) Results:")
print(classification_report(y_test, xgb_preds))


Fitting 3 folds for each of 27 candidates, totalling 81 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best Parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
XGBoost (Tuned) Results:
              precision    recall  f1-score   support

         0.0       0.55      0.18      0.27        34
         1.0       0.59      0.72      0.65      2302
         2.0       0.70      0.57      0.63      4774
         3.0       0.56      0.66      0.61      2319

    accuracy                           0.63      9429
   macro avg       0.60      0.53      0.54      9429
weighted avg       0.64      0.63      0.63      9429

