# 🎯 YouTube Clickbait Detector - LightGBM Model v2

A streamlined machine learning pipeline using **LightGBM** for YouTube clickbait detection.

**Features:**
- Advanced text preprocessing with TF-IDF vectorization
- Comprehensive feature engineering (36 features)
- LightGBM classifier with optimized hyperparameters
- Post-training verification
- Model persistence for deployment

## 📦 Install Dependencies

In [None]:
!pip install lightgbm -q

## 📚 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import joblib
from typing import Tuple, Dict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_recall_curve
from scipy.sparse import hstack, csr_matrix
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')
print("‚úÖ All libraries imported successfully!")

## 📂 Load Dataset

Upload your `MASTER_DATASET.csv` file to Colab.

In [None]:
# Ask for the dataset CSV through Colab's browser upload widget.
from google.colab import files

uploaded = files.upload()  # keys are used below as the uploaded file names
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was dismissed): fail with an
    # actionable message instead of an IndexError from indexing an empty list.
    raise RuntimeError(
        "No file was uploaded; re-run this cell and select MASTER_DATASET.csv."
    )

# Only the first uploaded file is used as the dataset.
DATASET_PATH = next(iter(uploaded))
print(f"Loaded: {DATASET_PATH}")

## ⚙️ Configuration & Helper Functions

In [None]:
# Configuration
MAX_TFIDF_FEATURES = 5000  # passed to build_feature_matrix as the TF-IDF vocabulary cap
RANDOM_STATE = 42  # shared seed for the data splits and the LightGBM model

# Clickbait indicator keywords (matched as lower-case substrings of the title)
CLICKBAIT_KEYWORDS = [
    'shocking', 'exposed', 'truth', 'secret', 'viral', 'leaked',
    "you won't believe", 'must watch', 'watch till end', 'nobody tells',
    'miracle', 'guaranteed', 'speechless', 'exclusive', 'breaking',
    'urgent', 'warning', 'banned', 'deleted', 'hidden', 'revealed'
]

# Piracy indicator keywords (matched in both the title and the description)
PIRACY_KEYWORDS = [
    'download', 'telegram', 'camrip', 'dvdrip', 'hdrip', 'torrent',
    'leaked', 'bolly4u', 'filmyzilla', 'hdcam', 'pre-dvd', 'webrip'
]

# Emoji treated as emotional signals in titles. They are written as explicit
# code-point escapes so the list survives a mis-decoded copy of this file:
# mojibake entries can never match a real title and silently zero the feature.
EMOTIONAL_EMOJIS = [
    "\U0001F631",    # face screaming in fear
    "\U0001F525",    # fire
    "\u2620\ufe0f",  # skull and crossbones (emoji presentation)
    "\U0001F4A5",    # collision
    "\U0001F92F",    # exploding head
    "\U0001F636",    # face without mouth
    "\U0001F62D",    # loudly crying face
    "\U0001F621",    # pouting face
    "\U0001F480",    # skull
    "\u26a0\ufe0f",  # warning sign (emoji presentation)
]

def count_keywords(text, keywords):
    """Return how many entries of ``keywords`` occur in ``text``.

    ``text`` is converted with ``str()`` and lower-cased before matching; the
    keywords themselves are used as given. Matching is by substring, and each
    keyword contributes at most 1 no matter how often it appears.
    """
    haystack = str(text).lower()
    hits = 0
    for keyword in keywords:
        if keyword in haystack:
            hits += 1
    return hits

def clean_text(text):
    """Normalise ``text`` into the form fed to the TF-IDF vectorizer.

    The value is stringified and lower-cased, then URLs are removed, every
    character that is neither a word character nor whitespace becomes a space,
    whitespace runs collapse to one space, and the ends are trimmed.
    """
    cleaned = str(text).lower()
    substitutions = (
        (r'http\S+|www\.\S+', ''),  # strip URLs
        (r'[^\w\s]', ' '),          # punctuation and symbols become spaces
        (r'\s+', ' '),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Status message shown once the constants and helper functions of this cell are defined.
print("‚úÖ Configuration loaded!")

## 📂 Load and Prepare Data

In [None]:
def load_and_prepare_data(filepath: str) -> pd.DataFrame:
    """Load the dataset CSV and apply the initial cleaning.

    Steps: read the CSV, keep only rows with ``verified == 1`` when that
    column exists, normalise the text columns to strings (missing values and
    absent columns become ""), fill missing numeric values with 0, and build
    the combined ``text`` column from title, description and thumbnail text.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame, including the new ``text`` column.

    Raises:
        KeyError: If the CSV has no ``label`` column.
    """
    print("=" * 60)
    print("üìÇ LOADING DATA")
    print("=" * 60)

    df = pd.read_csv(filepath)
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Keep only verified rows
    if "verified" in df.columns:
        df = df[df["verified"] == 1].copy()
        print(f"After filtering verified: {len(df)} rows")

    # Normalise text columns. An absent column is created empty so the
    # concatenation below cannot raise KeyError, and values are cast to str so
    # a column pandas parsed as numeric can still be concatenated.
    text_cols = ["title", "description", "thumbnail_text_cleaned"]
    for col in text_cols:
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].fillna("").astype(str)

    # Fill missing numeric values
    num_cols = ["duration_min", "views", "likes", "thumbnail_text_valid"]
    for col in num_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Create combined text field
    df["text"] = df["title"] + " " + df["description"] + " " + df["thumbnail_text_cleaned"]

    print(f"\nüìä Class distribution:")
    print(df["label"].value_counts())
    print(f"Clickbait ratio: {df['label'].mean()*100:.1f}%")

    return df

# Load the uploaded CSV; `df` is the working DataFrame used by the following cells.
df = load_and_prepare_data(DATASET_PATH)

## 🔧 Feature Engineering

In [None]:
def extract_text_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add title- and description-derived feature columns to ``df``.

    Covers lengths and word counts, capitalisation, punctuation counts, emoji
    counts, clickbait/piracy keyword counts, description quality indicators
    and a few title patterns (full-movie claim, year, HD/4K).

    Missing or non-string titles/descriptions are treated as text (missing
    becomes ""), so the function never produces NaN features or fails on the
    integer casts. For clean string input the features are unchanged.

    Args:
        df: Frame with ``title`` and ``description`` columns.

    Returns:
        The same ``df`` object, with the feature columns added in place.
    """
    print("\n" + "=" * 60)
    print("üî§ FEATURE ENGINEERING - TEXT FEATURES")
    print("=" * 60)

    # Normalise once; every feature below is computed from these two series.
    title = df["title"].fillna("").astype(str)
    description = df["description"].fillna("").astype(str)
    title_lower = title.str.lower()

    # Basic length features
    df["title_length"] = title.str.len()
    df["desc_length"] = description.str.len()
    df["title_word_count"] = title.str.split().str.len().fillna(0)
    df["desc_word_count"] = description.str.split().str.len().fillna(0)

    # Title style features: share of upper-case characters, and number of
    # fully upper-case words longer than one character.
    df["caps_ratio"] = title.apply(
        lambda x: sum(1 for c in x if c.isupper()) / max(len(x), 1)
    )
    df["title_caps_words"] = title.apply(
        lambda x: sum(1 for w in x.split() if w.isupper() and len(w) > 1)
    )

    # Punctuation features
    df["question_count"] = title.str.count(r"\?")
    df["exclam_count"] = title.str.count(r"!")
    df["ellipsis_count"] = title.str.count(r"\.\.\.")
    df["pipe_count"] = title.str.count(r"\|")

    # Emoji features. emoji_count is a code-point heuristic: it counts
    # characters strictly above U+1F1E6 (127462), so symbols in lower blocks
    # such as U+2620 or U+26A0 are not counted.
    # NOTE(review): threshold kept as-is so the features stay comparable with
    # any code that already consumes them — confirm before changing.
    emoji_floor = 127462
    df["emoji_count"] = title.apply(
        lambda x: sum(1 for c in x if ord(c) > emoji_floor)
    )
    df["emotional_emoji_count"] = title.apply(
        lambda x: sum(1 for e in EMOTIONAL_EMOJIS if e in x)
    )

    # Clickbait keyword detection (piracy keywords are counted in both the
    # title and the description, and the two counts are summed)
    df["clickbait_keywords"] = title.apply(lambda x: count_keywords(x, CLICKBAIT_KEYWORDS))
    df["piracy_keywords"] = (
        title.apply(lambda x: count_keywords(x, PIRACY_KEYWORDS)) +
        description.apply(lambda x: count_keywords(x, PIRACY_KEYWORDS))
    )

    # Description quality indicators ("empty" means shorter than 20 characters)
    df["desc_is_empty"] = (df["desc_length"] < 20).astype(int)
    df["desc_hashtag_count"] = description.str.count(r"#")
    df["desc_hashtag_ratio"] = df["desc_hashtag_count"] / (df["desc_word_count"] + 1)
    df["desc_has_links"] = description.str.contains(r"http|https|www\.", regex=True).astype(int)

    # Special patterns
    df["has_full_movie_claim"] = title_lower.str.contains(
        r"full movie|full hindi movie|full hd movie|complete movie", regex=True
    ).astype(int)
    df["has_year_in_title"] = title.str.contains(r"\b20[0-2][0-9]\b", regex=True).astype(int)
    df["has_hd_4k"] = title_lower.str.contains(r"\bhd\b|\b4k\b|\b1080p\b", regex=True).astype(int)

    print("‚úÖ Text features extracted successfully!")
    return df

# Add the text-derived feature columns to the working DataFrame.
df = extract_text_features(df)

In [None]:
def extract_engagement_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add engagement- and duration-derived feature columns to ``df``.

    Reads ``likes``, ``views``, ``duration_min`` and ``has_full_movie_claim``;
    the frame is modified in place and the same object is returned.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("üìà FEATURE ENGINEERING - ENGAGEMENT FEATURES")
    print(banner)

    likes = df["likes"]
    views = df["views"]
    minutes = df["duration_min"]

    # Engagement ratios; the +1 / +0.1 offsets keep the denominators non-zero
    # for videos with zero views or zero duration.
    padded_minutes = minutes + 0.1
    df["likes_view_ratio"] = likes / (views + 1)
    df["likes_per_minute"] = likes / padded_minutes
    df["views_per_minute"] = views / padded_minutes

    # Log-transformed copies of the raw counts
    for raw_col, log_col in (
        ("views", "log_views"),
        ("likes", "log_likes"),
        ("duration_min", "log_duration"),
    ):
        df[log_col] = np.log1p(df[raw_col])

    # Duration flags: under 1 minute, over 60 minutes, and a "full movie"
    # title claim on a video shorter than 60 minutes.
    df["is_short_video"] = (minutes < 1).astype(int)
    df["is_very_long"] = (minutes > 60).astype(int)
    claims_full_movie = df["has_full_movie_claim"] == 1
    df["duration_mismatch"] = (claims_full_movie & (minutes < 60)).astype(int)

    # Composite engagement score, plus a flag for videos with more than 10000
    # views whose like ratio is below 0.001.
    like_ratio = df["likes_view_ratio"]
    df["engagement_score"] = like_ratio * 100 + np.log1p(views) / 10
    df["low_engagement"] = ((like_ratio < 0.001) & (views > 10000)).astype(int)

    print("‚úÖ Engagement features extracted successfully!")
    return df

# Add the engagement features; this reads `has_full_movie_claim`, so the text features must already be on `df`.
df = extract_engagement_features(df)

## 📊 Build Feature Matrix

In [None]:
def build_feature_matrix(df: pd.DataFrame, max_features: int = 5000) -> Tuple:
    """Build the model input matrix from text, numeric and category features.

    The matrix is the horizontal concatenation, in this order, of the TF-IDF
    features of the cleaned ``text`` column, the standard-scaled numeric
    features, and the one-hot encoded ``category`` column.

    NOTE(review): the vectorizer, scaler and encoder are fitted on every row
    of ``df``, i.e. before ``train_lightgbm`` splits off validation and test
    data, so statistics from held-out rows reach the fitted transformers and
    the reported metrics may be optimistic.

    Args:
        df: Frame holding ``text``, ``category``, ``label`` and all engineered
            numeric feature columns. A ``text_clean`` column is added in place.
        max_features: Upper bound on the TF-IDF vocabulary size.

    Returns:
        ``(X, y, tfidf, scaler, cat_encoder, num_features, df)`` where ``X`` is
        a CSR sparse matrix, ``y`` the label array, the next three items are
        the fitted transformers, ``num_features`` is the ordered list of
        numeric column names, and ``df`` is the input frame.
    """
    print("\n" + "=" * 60)
    print("üî® BUILDING FEATURE MATRIX")
    print("=" * 60)

    # IMPORTANT: Clean text for TF-IDF vectorization
    df["text_clean"] = df["text"].apply(clean_text)

    # TF-IDF Vectorization (uses cleaned text!)
    tfidf = TfidfVectorizer(
        max_features=max_features,
        stop_words="english",
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.95,
        sublinear_tf=True
    )
    X_text = tfidf.fit_transform(df["text_clean"])
    print(f"TF-IDF features: {X_text.shape[1]}")

    # Category encoding; categories unseen at fit time encode as all zeros
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    X_cat = cat_encoder.fit_transform(df[["category"]])
    print(f"Categories: {df['category'].nunique()}")

    # Numerical features (36 total). The order defines the column order of the
    # numeric part of X, so inference code must use this same list.
    num_features = [
        "duration_min", "views", "likes", "thumbnail_text_valid",
        "title_length", "desc_length", "title_word_count", "desc_word_count",
        "caps_ratio", "title_caps_words",
        "question_count", "exclam_count", "ellipsis_count", "pipe_count",
        "emoji_count", "emotional_emoji_count",
        "clickbait_keywords", "piracy_keywords",
        "desc_is_empty", "desc_hashtag_count", "desc_hashtag_ratio", "desc_has_links",
        "has_full_movie_claim", "has_year_in_title", "has_hd_4k",
        "likes_view_ratio", "likes_per_minute", "views_per_minute",
        "log_views", "log_likes", "log_duration",
        "is_short_video", "is_very_long", "duration_mismatch",
        "engagement_score", "low_engagement"
    ]

    X_num = df[num_features].values
    scaler = StandardScaler()
    X_num_scaled = scaler.fit_transform(X_num)

    # Combine all features. CSR is requested explicitly because hstack's
    # default output (COO) does not support row indexing, which the later
    # train/validation/test split relies on.
    X = hstack([X_text, csr_matrix(X_num_scaled), csr_matrix(X_cat)], format="csr")
    y = df["label"].values

    print(f"\nüìê Final feature matrix shape: {X.shape}")
    print(f"  - Text features: {X_text.shape[1]}")
    print(f"  - Numerical features: {len(num_features)}")
    print(f"  - Category features: {X_cat.shape[1]}")

    return X, y, tfidf, scaler, cat_encoder, num_features, df

# Build the model inputs and keep the fitted transformers for the verification and save cells.
X, y, tfidf, scaler, cat_encoder, num_features, df = build_feature_matrix(df, MAX_TFIDF_FEATURES)

## 🏆 Train LightGBM Model

In [None]:
def train_lightgbm(X, y) -> Dict:
    """Fit a LightGBM classifier and score it on a validation split.

    The data is split with stratification into train, validation and test
    parts; the model is fitted on the train part only, and F1 / ROC-AUC are
    reported on the validation part.

    Returns:
        A dict with the fitted ``model``, the three splits (``X_train``,
        ``X_val``, ``X_test``, ``y_train``, ``y_val``, ``y_test``) and the
        validation scores ``val_f1`` and ``val_auc``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("üèÜ TRAINING LIGHTGBM MODEL")
    print(banner)

    # Hold out 15% for testing, then take 17.6% of the remaining 85% (about
    # 15% of all rows) for validation, leaving roughly 70% for training.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.15, random_state=RANDOM_STATE, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.176, random_state=RANDOM_STATE, stratify=y_rest
    )

    print(f"üìä Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")

    hyperparams = dict(
        n_estimators=500,
        max_depth=10,
        learning_rate=0.1,
        num_leaves=31,
        class_weight="balanced",
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=-1,
    )
    lgb_model = LGBMClassifier(**hyperparams)

    print("\nüîÑ Training in progress...")
    lgb_model.fit(X_train, y_train)

    # F1 is computed from hard predictions; ROC-AUC from class-1 probabilities.
    val_labels = lgb_model.predict(X_val)
    val_scores = lgb_model.predict_proba(X_val)[:, 1]
    val_f1 = f1_score(y_val, val_labels)
    val_auc = roc_auc_score(y_val, val_scores)

    print(f"\n‚úÖ Validation Results:")
    print(f"   F1 Score: {val_f1:.4f}")
    print(f"   ROC-AUC: {val_auc:.4f}")

    outcome = {"model": lgb_model}
    outcome.update({"X_train": X_train, "X_val": X_val, "X_test": X_test})
    outcome.update({"y_train": y_train, "y_val": y_val, "y_test": y_test})
    outcome.update({"val_f1": val_f1, "val_auc": val_auc})
    return outcome

# Train once; `results` keeps the splits and validation scores, `model` is the fitted classifier.
results = train_lightgbm(X, y)
model = results["model"]

## 📋 Final Evaluation

In [None]:
# Score the trained model on the held-out test split stored in `results`.
print("\n" + "=" * 60)
print("üìã FINAL EVALUATION ON TEST SET")
print("=" * 60)

X_test, y_test = results["X_test"], results["y_test"]

# Hard labels feed the confusion matrix and report; class-1 probabilities feed ROC-AUC.
y_pred_test = model.predict(X_test)
y_prob_test = model.predict_proba(X_test)[:, 1]

test_confusion = confusion_matrix(y_test, y_pred_test)
print("\nüî≤ Confusion Matrix:")
print(test_confusion)

class_names = ["Non-Clickbait", "Clickbait"]
test_report = classification_report(y_test, y_pred_test, target_names=class_names)
print("\nüìä Classification Report:")
print(test_report)

print(f"üéØ ROC-AUC Score: {roc_auc_score(y_test, y_prob_test):.4f}")

## 🔍 Post-Training Verification

In [None]:
print("\n" + "=" * 60)
print("üîç POST-TRAINING VERIFICATION")
print("=" * 60)

# NOTE(review): the samples come from `df`, the same frame the feature matrix
# was built from, so most of them were probably seen during training. Treat
# this as a smoke test of the feature pipeline, not as an accuracy estimate.


def _verify_samples(samples, expected_label):
    """Predict every row of ``samples`` and print one result line per row.

    Features are rebuilt with the fitted transformers in the training column
    order: TF-IDF text, scaled numeric features, one-hot category.

    Returns:
        ``(n_correct, n_checked)``: rows predicted as ``expected_label`` and
        rows checked (which is 0 when the class has no rows).
    """
    n_checked = len(samples)
    if n_checked == 0:
        return 0, 0

    X_text_s = tfidf.transform(samples["text_clean"])
    X_num_s = scaler.transform(samples[num_features].values)
    X_cat_s = cat_encoder.transform(samples[["category"]])
    X_s = hstack([X_text_s, csr_matrix(X_num_s), csr_matrix(X_cat_s)], format="csr")

    n_correct = 0
    for pred, probs in zip(model.predict(X_s), model.predict_proba(X_s)):
        is_hit = pred == expected_label
        n_correct += int(is_hit)
        status = "‚úÖ" if is_hit else "‚ùå"
        print(f"  {status} Pred={pred}, Prob[0]={probs[0]:.3f}, Prob[1]={probs[1]:.3f}")
    return n_correct, n_checked


# Test on label=0 samples
print("\nüìó Testing on NON-CLICKBAIT samples (label=0):")
test_0 = df[df['label'] == 0].head(5)
correct_0, checked_0 = _verify_samples(test_0, 0)

# Test on label=1 samples
print("\nüìï Testing on CLICKBAIT samples (label=1):")
test_1 = df[df['label'] == 1].head(5)
correct_1, checked_1 = _verify_samples(test_1, 1)

# Use the real sample count: a class with fewer than 5 rows makes a fixed
# denominator of 10 wrong.
total_correct = correct_0 + correct_1
total_checked = checked_0 + checked_1
accuracy_pct = 100 * total_correct / total_checked if total_checked else 0.0
print(f"\nüéØ Verification Accuracy: {total_correct}/{total_checked} ({accuracy_pct:.0f}%)")

# Pass at 70% or better (7 of 10 with full samples); integer arithmetic avoids float rounding.
if total_checked and 10 * total_correct >= 7 * total_checked:
    print("‚úÖ Model verification PASSED!")
else:
    print("‚ùå Model verification FAILED!")

## 💾 Save Model

In [None]:
print("\n" + "=" * 60)
print("üíæ SAVING MODEL")
print("=" * 60)

# Persist the classifier, each fitted transformer and the numeric feature
# order to separate joblib files in the current working directory.
for artifact, filename in (
    (model, "clickbait_model.joblib"),
    (tfidf, "tfidf_vectorizer.joblib"),
    (scaler, "scaler.joblib"),
    (cat_encoder, "cat_encoder.joblib"),
    (num_features, "num_features.joblib"),
):
    joblib.dump(artifact, filename)

print("‚úÖ All model files saved!")

## 📥 Download Model Files

In [None]:
from google.colab import files

print("üì• Downloading model files...")
# Trigger one browser download per saved artefact, in the order they were written.
for saved_file in (
    'clickbait_model.joblib',
    'tfidf_vectorizer.joblib',
    'scaler.joblib',
    'cat_encoder.joblib',
    'num_features.joblib',
):
    files.download(saved_file)

print("\n‚úÖ All files downloaded!")
print("\nNext steps:")
for step in (
    "1. Move the 5 .joblib files to your backend/BEST_FINAL_MODEL/ folder",
    "2. Run: python app.py",
    "3. Reload your Chrome extension",
    "4. Test on YouTube videos!",
):
    print(step)

## ✅ Summary

In [None]:
# Recap of the run: validation scores from `results` and the verification tally.
summary_rule = "=" * 60
print("\n" + summary_rule)
print("üéâ TRAINING COMPLETE!")
print(summary_rule)
print(f"\nüèÜ Model: LightGBM")
for metric_line in (
    f"üìä Validation F1: {results['val_f1']:.4f}",
    f"üìä Validation AUC: {results['val_auc']:.4f}",
    f"üîç Verification: {total_correct}/10 correct",
):
    print(metric_line)
print("\n‚úÖ Model saved and ready for deployment!")