# 🎯 YouTube Clickbait Detector - LightGBM Model

A streamlined machine learning pipeline using **LightGBM** for YouTube clickbait detection.

**Features:**
- Advanced text preprocessing with TF-IDF vectorization
- Comprehensive feature engineering (30+ features)
- LightGBM classifier with optimized hyperparameters
- Model persistence for deployment

## 📦 Install Dependencies

In [None]:
# Install LightGBM into the notebook runtime; -q keeps pip's output quiet.
!pip install lightgbm -q

## 📚 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import joblib
from typing import Tuple, Dict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_recall_curve
from scipy.sparse import hstack, csr_matrix
from lightgbm import LGBMClassifier

# NOTE(review): this silences every warning category for the rest of the
# session, including any deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')
print("‚úÖ All libraries imported successfully!")

## 📂 Load Dataset

Upload your `MASTER_DATASET.csv` file to Colab or mount Google Drive.

In [None]:
# Option 1: Upload file directly
from google.colab import files

# The upload result is used as a mapping keyed by file name. It may be empty
# if the picker is dismissed without choosing a file, so fail with a clear
# message instead of an IndexError on the first-key lookup.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select MASTER_DATASET.csv, "
        "or use Option 2 (Google Drive) below."
    )
# The first uploaded file name is the dataset path used by the cells below.
DATASET_PATH = next(iter(uploaded))

# Option 2: Mount Google Drive (uncomment if needed)
# from google.colab import drive
# drive.mount('/content/drive')
# DATASET_PATH = '/content/drive/MyDrive/your_path/MASTER_DATASET.csv'

In [None]:
def load_and_prepare_data(filepath: str) -> pd.DataFrame:
    """Load the dataset CSV and apply the initial cleaning steps.

    The function reads the CSV, keeps only rows whose ``verified`` column
    equals 1, replaces missing text with ``""`` and missing numbers with 0,
    and adds a combined ``text`` column (title + description + thumbnail text,
    separated by single spaces).

    Args:
        filepath: Path to a CSV containing at least the columns ``verified``,
            ``label``, ``title``, ``description``, ``thumbnail_text_cleaned``,
            ``duration_min``, ``views``, ``likes`` and ``thumbnail_text_valid``.

    Returns:
        A new DataFrame holding the verified rows with the cleaned columns
        and the extra ``text`` column.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    print("=" * 60)
    print("üìÇ LOADING DATA")
    print("=" * 60)

    df = pd.read_csv(filepath)
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Keep only verified rows
    df = df[df["verified"] == 1].copy()

    # Fill missing text and force every value to str. A text column that
    # pandas parsed as numeric (e.g. thumbnail text made only of digits)
    # would otherwise make the string concatenation below raise TypeError.
    text_cols = ["title", "description", "thumbnail_text_cleaned"]
    for col in text_cols:
        df[col] = df[col].map(lambda value: "" if pd.isna(value) else str(value))

    # Fill missing numeric values
    num_cols = ["duration_min", "views", "likes", "thumbnail_text_valid"]
    for col in num_cols:
        df[col] = df[col].fillna(0)

    # Create combined text field
    df["text"] = df["title"] + " " + df["description"] + " " + df["thumbnail_text_cleaned"]

    print(f"\nüìä Class distribution:")
    print(df["label"].value_counts())
    print(f"Clickbait ratio: {df['label'].mean()*100:.1f}%")

    return df

# Load the dataset from DATASET_PATH; `df` is the working DataFrame that the
# feature-engineering cells below extend with new columns.
df = load_and_prepare_data(DATASET_PATH)

## 🔧 Feature Engineering

In [None]:
# Clickbait indicator keywords
CLICKBAIT_KEYWORDS = [
    'shocking', 'exposed', 'truth', 'secret', 'viral', 'leaked',
    "you won't believe", 'must watch', 'watch till end', 'nobody tells',
    'miracle', 'guaranteed', 'speechless', 'exclusive', 'breaking',
    'urgent', 'warning', 'banned', 'deleted', 'hidden', 'revealed'
]

PIRACY_KEYWORDS = [
    'download', 'telegram', 'camrip', 'dvdrip', 'hdrip', 'torrent',
    'leaked', 'bolly4u', 'filmyzilla', 'hdcam', 'pre-dvd', 'webrip'
]

EMOTIONAL_EMOJIS = ['üò±', 'üî•', '‚ò†Ô∏è', 'üí•', 'ü§Ø', 'üò∂', 'üò≠', 'üò°', 'üíÄ', '‚ö†Ô∏è']

In [None]:
def extract_text_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add title/description text features to ``df`` and return it.

    New columns are written onto the DataFrame that is passed in (no copy is
    made). They cover lengths and word counts, capitalisation, punctuation
    counts, emoji counts, keyword hits, description quality flags and a few
    title patterns (full-movie claim, year, HD/4K tag).

    Args:
        df: DataFrame with string columns ``title`` and ``description``.

    Returns:
        The same DataFrame object, extended with the feature columns.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("üî§ FEATURE ENGINEERING - TEXT FEATURES")
    print(rule)

    titles = df["title"]
    descriptions = df["description"]

    def uppercase_share(value):
        # Fraction of characters that are uppercase; max(..., 1) avoids 0/0.
        chars = str(value)
        return sum(1 for ch in chars if ch.isupper()) / max(len(chars), 1)

    def all_caps_words(value):
        # Words written fully in capitals, ignoring single-letter words.
        return sum(1 for word in str(value).split() if word.isupper() and len(word) > 1)

    def high_codepoints(value):
        # Characters whose code point is above 127462 (U+1F1E6).
        return sum(1 for ch in str(value) if ord(ch) > 127462)

    def emotional_hits(value):
        # Number of distinct EMOTIONAL_EMOJIS entries present (not occurrences).
        text = str(value)
        return sum(1 for emoji in EMOTIONAL_EMOJIS if emoji in text)

    def keyword_hits(value, vocabulary):
        # Number of vocabulary entries found as substrings of the lowercased text.
        lowered = str(value).lower()
        return sum(1 for term in vocabulary if term in lowered)

    # Lengths and word counts
    df["title_length"] = titles.str.len()
    df["desc_length"] = descriptions.str.len()
    df["title_word_count"] = titles.str.split().str.len().fillna(0)
    df["desc_word_count"] = descriptions.str.split().str.len().fillna(0)

    # Capitalisation style of the title
    df["caps_ratio"] = titles.apply(uppercase_share)
    df["title_caps_words"] = titles.apply(all_caps_words)

    # Punctuation counts in the title
    for column, pattern in (
        ("question_count", r"\?"),
        ("exclam_count", r"!"),
        ("ellipsis_count", r"\.\.\."),
        ("pipe_count", r"\|"),
    ):
        df[column] = titles.str.count(pattern)

    # Emoji usage in the title
    df["emoji_count"] = titles.apply(high_codepoints)
    df["emotional_emoji_count"] = titles.apply(emotional_hits)

    # Keyword hits: clickbait terms in the title, piracy terms in title + description
    df["clickbait_keywords"] = titles.apply(keyword_hits, args=(CLICKBAIT_KEYWORDS,))
    piracy_in_title = titles.apply(keyword_hits, args=(PIRACY_KEYWORDS,))
    piracy_in_description = descriptions.apply(keyword_hits, args=(PIRACY_KEYWORDS,))
    df["piracy_keywords"] = piracy_in_title + piracy_in_description

    # Description quality flags (a description under 20 characters counts as empty)
    df["desc_is_empty"] = (df["desc_length"] < 20).astype(int)
    df["desc_hashtag_count"] = descriptions.str.count(r"#")
    df["desc_hashtag_ratio"] = df["desc_hashtag_count"] / (df["desc_word_count"] + 1)
    df["desc_has_links"] = descriptions.str.contains(r"http|https|www\.", regex=True).astype(int)

    # Title patterns
    lowered_titles = titles.str.lower()
    df["has_full_movie_claim"] = lowered_titles.str.contains(
        r"full movie|full hindi movie|full hd movie|complete movie", regex=True
    ).astype(int)
    df["has_year_in_title"] = titles.str.contains(r"\b20[0-2][0-9]\b", regex=True).astype(int)
    df["has_hd_4k"] = lowered_titles.str.contains(r"\bhd\b|\b4k\b|\b1080p\b", regex=True).astype(int)

    print("‚úÖ Text features extracted successfully!")
    return df


# Add the text features to the working DataFrame.
df = extract_text_features(df)

In [None]:
def extract_engagement_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add engagement and duration features to ``df`` and return it.

    The columns are written onto the DataFrame that is passed in.

    Args:
        df: DataFrame with numeric ``views``, ``likes`` and ``duration_min``
            columns plus the ``has_full_movie_claim`` flag produced by
            ``extract_text_features``.

    Returns:
        The same DataFrame object, extended with ratio, log-scaled, duration
        and engagement-flag columns.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("üìà FEATURE ENGINEERING - ENGAGEMENT FEATURES")
    print(rule)

    views = df["views"]
    likes = df["likes"]
    minutes = df["duration_min"]

    # Ratios; the +1 / +0.1 offsets keep the denominators away from zero.
    padded_minutes = minutes + 0.1
    df["likes_view_ratio"] = likes / (views + 1)
    df["likes_per_minute"] = likes / padded_minutes
    df["views_per_minute"] = views / padded_minutes

    # log(1 + x) versions of the raw counters
    df["log_views"] = np.log1p(views)
    df["log_likes"] = np.log1p(likes)
    df["log_duration"] = np.log1p(minutes)

    # Duration flags: under one minute, over one hour, and a "full movie"
    # title on a video shorter than an hour.
    df["is_short_video"] = (minutes < 1).astype(int)
    df["is_very_long"] = (minutes > 60).astype(int)
    claims_full_movie = df["has_full_movie_claim"] == 1
    df["duration_mismatch"] = (claims_full_movie & (minutes < 60)).astype(int)

    # Combined score: like ratio (as a percentage) plus a tenth of log-views
    df["engagement_score"] = df["likes_view_ratio"] * 100 + df["log_views"] / 10

    # Many views (> 10000) but fewer than one like per thousand views
    rarely_liked = df["likes_view_ratio"] < 0.001
    widely_viewed = views > 10000
    df["low_engagement"] = (rarely_liked & widely_viewed).astype(int)

    print("‚úÖ Engagement features extracted successfully!")
    return df

# Add the engagement features; this reads `has_full_movie_claim`, so the text
# feature cell above must have run first.
df = extract_engagement_features(df)

## 📊 Build Feature Matrix

In [None]:
def build_feature_matrix(df: pd.DataFrame, max_features: int = 5000) -> Tuple:
    """Assemble the sparse model input from text, numeric and category data.

    A ``text_clean`` column is added to ``df`` as a side effect. The matrix
    columns are ordered as: TF-IDF terms, standardised numeric features,
    one-hot encoded ``category``.

    NOTE(review): the vectorizer, scaler and encoder are fitted on every row
    of ``df``. The notebook calls this before the train/validation/test
    split, so held-out rows influence the fitted vocabulary and scaling
    statistics.

    Args:
        df: DataFrame holding ``text``, ``category``, ``label`` and every
            column named in the numeric feature list below.
        max_features: Upper bound on the TF-IDF vocabulary size.

    Returns:
        Tuple ``(X, y, tfidf, scaler, cat_encoder, num_features)``: the
        combined sparse matrix, the label array, the three fitted
        transformers and the ordered list of numeric column names.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("üî® BUILDING FEATURE MATRIX")
    print(rule)

    def normalise(raw):
        # Lowercase, strip URLs, turn punctuation into spaces, collapse whitespace.
        lowered = str(raw).lower()
        without_urls = re.sub(r'http\S+|www\.\S+', '', lowered)
        without_punctuation = re.sub(r'[^\w\s]', ' ', without_urls)
        return re.sub(r'\s+', ' ', without_punctuation).strip()

    df["text_clean"] = df["text"].apply(normalise)

    # Text block: unigram + bigram TF-IDF
    tfidf_settings = {
        "max_features": max_features,
        "stop_words": "english",
        "ngram_range": (1, 2),
        "min_df": 3,
        "max_df": 0.95,
        "sublinear_tf": True,
    }
    tfidf = TfidfVectorizer(**tfidf_settings)
    X_text = tfidf.fit_transform(df["text_clean"])
    print(f"TF-IDF features: {X_text.shape[1]}")

    # Category block: dense one-hot; categories unseen at fit time encode as all zeros
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    X_cat = cat_encoder.fit_transform(df[["category"]])
    print(f"Categories: {df['category'].nunique()}")

    # Numeric block: the list order fixes the column order in the matrix
    num_features = [
        # raw metadata
        "duration_min", "views", "likes", "thumbnail_text_valid",
        # title / description size and style
        "title_length", "desc_length", "title_word_count", "desc_word_count",
        "caps_ratio", "title_caps_words",
        "question_count", "exclam_count", "ellipsis_count", "pipe_count",
        "emoji_count", "emotional_emoji_count",
        "clickbait_keywords", "piracy_keywords",
        "desc_is_empty", "desc_hashtag_count", "desc_hashtag_ratio", "desc_has_links",
        "has_full_movie_claim", "has_year_in_title", "has_hd_4k",
        # engagement
        "likes_view_ratio", "likes_per_minute", "views_per_minute",
        "log_views", "log_likes", "log_duration",
        "is_short_video", "is_very_long", "duration_mismatch",
        "engagement_score", "low_engagement"
    ]
    scaler = StandardScaler()
    X_num_scaled = scaler.fit_transform(df[num_features].values)

    # Stack the three blocks side by side and pull out the labels
    blocks = [X_text, csr_matrix(X_num_scaled), csr_matrix(X_cat)]
    X = hstack(blocks)
    y = df["label"].values

    print(f"\nüìê Final feature matrix shape: {X.shape}")
    print(f"  - Text features: {X_text.shape[1]}")
    print(f"  - Numerical features: {len(num_features)}")
    print(f"  - Category features: {X_cat.shape[1]}")

    return X, y, tfidf, scaler, cat_encoder, num_features


# Build the model input; the fitted transformers are kept for saving later.
X, y, tfidf, scaler, cat_encoder, num_features = build_feature_matrix(df)

## 🏆 Train LightGBM Model

In [None]:
def train_lightgbm(X, y) -> Dict:
    """Split the data, fit a LightGBM classifier and score it on validation.

    The validation split is used only for the reported metrics; the test
    split is returned untouched for later evaluation.

    Args:
        X: Feature matrix.
        y: Binary labels aligned with the rows of ``X``.

    Returns:
        Dict with the fitted ``model``, the three splits (``X_train``,
        ``X_val``, ``X_test``, ``y_train``, ``y_val``, ``y_test``) and the
        validation metrics ``val_f1`` and ``val_auc``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("üèÜ TRAINING LIGHTGBM MODEL")
    print(rule)

    # Hold out 15% for testing first. Taking 0.176 of the remaining 85% then
    # yields roughly another 15% of the whole for validation (~70/15/15).
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.176, random_state=42, stratify=y_temp
    )

    print(f"üìä Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")

    # Model configuration
    hyperparameters = dict(
        n_estimators=500,
        max_depth=10,
        learning_rate=0.1,
        num_leaves=31,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )
    lgb_model = LGBMClassifier(**hyperparameters)

    print("\nüîÑ Training in progress...")
    lgb_model.fit(X_train, y_train)

    # Validation metrics: AUC from positive-class probabilities, F1 from hard labels
    y_prob_val = lgb_model.predict_proba(X_val)[:, 1]
    y_pred_val = lgb_model.predict(X_val)
    val_auc = roc_auc_score(y_val, y_prob_val)
    val_f1 = f1_score(y_val, y_pred_val)

    print(f"\n‚úÖ Validation Results:")
    print(f"   F1 Score: {val_f1:.4f}")
    print(f"   ROC-AUC: {val_auc:.4f}")

    outcome = {"model": lgb_model}
    outcome.update({"X_train": X_train, "X_val": X_val, "X_test": X_test})
    outcome.update({"y_train": y_train, "y_val": y_val, "y_test": y_test})
    outcome.update({"val_f1": val_f1, "val_auc": val_auc})
    return outcome


# Train once and keep a handle on the fitted classifier for the cells below.
results = train_lightgbm(X, y)
model = results["model"]

## 📋 Final Evaluation

In [None]:
print("\n" + "=" * 60)
print("üìã FINAL EVALUATION ON TEST SET")
print("=" * 60)

# Held-out split returned by train_lightgbm; it was not used for fitting.
X_test, y_test = results["X_test"], results["y_test"]

# Positive-class probabilities for the AUC, hard labels for the reports.
y_prob_test = model.predict_proba(X_test)[:, 1]
y_pred_test = model.predict(X_test)

# Each heading is followed by its table on the next line.
print("\nüî≤ Confusion Matrix:", confusion_matrix(y_test, y_pred_test), sep="\n")

print(
    "\nüìä Classification Report:",
    classification_report(y_test, y_pred_test, target_names=["Non-Clickbait", "Clickbait"]),
    sep="\n",
)

print(f"üéØ ROC-AUC Score: {roc_auc_score(y_test, y_prob_test):.4f}")

## 🎚️ Threshold Optimization

In [None]:
def optimize_threshold(model, X_test, y_test):
    """Find the probability cut-off that maximises F1 on the given data.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix to score. Despite the parameter name, pass a
            validation split here so the threshold is not tuned on the data
            used for the final evaluation.
        y_test: Binary labels aligned with ``X_test``.

    Returns:
        float: The selected threshold. A sample counts as positive when its
        probability is greater than or equal to this value. Falls back to
        0.5 if no candidate threshold is available.
    """
    print("\n" + "=" * 60)
    print("üéöÔ∏è THRESHOLD OPTIMIZATION")
    print("=" * 60)

    y_prob = model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

    # precision/recall carry one extra trailing point (precision=1, recall=0)
    # that has no matching threshold; drop it so indices line up with
    # `thresholds`.
    precision, recall = precision[:-1], recall[:-1]
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)

    if f1_scores.size:
        optimal_idx = int(np.argmax(f1_scores))
        optimal_threshold = float(thresholds[optimal_idx])
        best_f1 = float(f1_scores[optimal_idx])
    else:
        optimal_threshold, best_f1 = 0.5, 0.0

    print(f"Default threshold (0.5) F1: {f1_score(y_test, (y_prob > 0.5).astype(int)):.4f}")
    print(f"Optimal threshold ({optimal_threshold:.3f}) F1: {best_f1:.4f}")

    # precision_recall_curve evaluates each threshold as `score >= threshold`.
    # Use the same comparison so the report below matches the F1 printed
    # above; a strict `>` drops the sample sitting exactly on the threshold.
    y_pred_optimal = (y_prob >= optimal_threshold).astype(int)

    print("\nüìä Results with Optimal Threshold:")
    print(classification_report(y_test, y_pred_optimal, target_names=["Non-Clickbait", "Clickbait"]))

    return optimal_threshold


# Tune the threshold on the validation split; the test split stays reserved
# for the final evaluation so its metrics are not optimistically biased.
optimal_threshold = optimize_threshold(model, results["X_val"], results["y_val"])

## 📊 Feature Importance

In [None]:
def show_feature_importance(model, num_features, tfidf, top_n=20, cat_encoder=None):
    """Print the ``top_n`` features with the highest model importance.

    Feature names are assembled in the column order used by
    ``build_feature_matrix``: TF-IDF terms, numeric features, then one-hot
    category columns. Columns without a known name (for example category
    columns when ``cat_encoder`` is not given) are labelled ``feature_<index>``
    rather than being skipped, so ``top_n`` rows are always printed when the
    model has that many features.

    Args:
        model: Fitted model exposing ``feature_importances_``.
        num_features: Ordered list of numeric feature names.
        tfidf: Fitted vectorizer exposing ``get_feature_names_out()``.
        top_n: Number of features to print.
        cat_encoder: Optional fitted encoder exposing
            ``get_feature_names_out()``; supplies names for the category
            columns.

    Returns:
        None. The ranking is printed.
    """
    print("\n" + "=" * 60)
    print("üìä TOP FEATURE IMPORTANCES")
    print("=" * 60)

    importances = np.asarray(model.feature_importances_)

    all_features = list(tfidf.get_feature_names_out()) + list(num_features)
    if cat_encoder is not None:
        all_features += list(cat_encoder.get_feature_names_out())
    # Positional labels for any remaining columns, so every index has a name.
    all_features += [f"feature_{pos}" for pos in range(len(all_features), len(importances))]

    # Sort by importance, highest first
    indices = np.argsort(importances)[::-1][:top_n]

    print(f"\nüîù Top {top_n} Most Important Features:")
    for rank, idx in enumerate(indices, start=1):
        print(f"{rank:2d}. {all_features[idx]:40s} : {importances[idx]:.4f}")

# Print the ranking for the trained model; names come from the TF-IDF
# vocabulary followed by the numeric feature list.
show_feature_importance(model, num_features, tfidf)

## 💾 Save Model

In [None]:
def save_model(model, tfidf, scaler, cat_encoder, num_features, output_dir="."):
    """Save the model and its preprocessors with joblib for deployment.

    Five files are written into ``output_dir``: ``clickbait_model.joblib``,
    ``tfidf_vectorizer.joblib``, ``scaler.joblib``, ``cat_encoder.joblib``
    and ``num_features.joblib``. The directory is created if it is missing.

    Args:
        model: Fitted classifier.
        tfidf: Fitted TF-IDF vectorizer.
        scaler: Fitted numeric scaler.
        cat_encoder: Fitted category encoder.
        num_features: Ordered list of numeric feature names.
        output_dir: Destination directory.
    """
    from pathlib import Path  # local import: only this function needs it

    print("\n" + "=" * 60)
    print("üíæ SAVING MODEL")
    print("=" * 60)

    # Create the destination first so dumping into a new folder does not fail.
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    artifacts = {
        "clickbait_model.joblib": model,
        "tfidf_vectorizer.joblib": tfidf,
        "scaler.joblib": scaler,
        "cat_encoder.joblib": cat_encoder,
        "num_features.joblib": num_features,
    }
    for filename, artifact in artifacts.items():
        joblib.dump(artifact, str(target_dir / filename))

    print(f"‚úÖ Model saved to: {output_dir}/clickbait_model.joblib")
    print("‚úÖ All preprocessors saved successfully!")


# Write the artifacts to the current working directory.
save_model(model, tfidf, scaler, cat_encoder, num_features)

In [None]:
# Download saved model files (Colab)
from google.colab import files

# The five artifacts written by save_model, fetched in the same order.
for _artifact_name in (
    'clickbait_model.joblib',
    'tfidf_vectorizer.joblib',
    'scaler.joblib',
    'cat_encoder.joblib',
    'num_features.joblib',
):
    files.download(_artifact_name)

## 🔮 Prediction Function

In [None]:
def predict_clickbait(title, description, thumbnail_text, category,
                      duration_min, views, likes, model_path=".", threshold=0.5):
    """
    Predict if a video is clickbait.

    The model and preprocessors are loaded from ``model_path`` on every call.
    The input goes through the same feature extraction and text cleaning as
    the training data before it is scored.

    Args:
        title: Video title; ``None`` is treated as an empty string.
        description: Video description; ``None`` is treated as empty.
        thumbnail_text: Text read from the thumbnail; ``None`` is treated as
            empty.
        category: Video category. Categories the encoder did not see at fit
            time are encoded as all zeros.
        duration_min: Duration in minutes; ``None`` is treated as 0.
        views: View count; ``None`` is treated as 0.
        likes: Like count; ``None`` is treated as 0.
        model_path: Directory holding the ``*.joblib`` files written by
            ``save_model``.
        threshold: Probability above which the video is labelled clickbait.
            Defaults to 0.5; pass a tuned threshold to use it at inference.

    Returns:
        probability (float): Probability of being clickbait (0-1)
        prediction (int): 0 = Not Clickbait, 1 = Clickbait
    """
    # Load model and preprocessors
    loaded_model = joblib.load(f"{model_path}/clickbait_model.joblib")
    loaded_tfidf = joblib.load(f"{model_path}/tfidf_vectorizer.joblib")
    loaded_scaler = joblib.load(f"{model_path}/scaler.joblib")
    loaded_cat_encoder = joblib.load(f"{model_path}/cat_encoder.joblib")
    loaded_num_features = joblib.load(f"{model_path}/num_features.joblib")

    # Training replaced missing text with "" and missing numbers with 0.
    # Do the same here so None inputs do not break the string operations.
    title, description, thumbnail_text = (
        "" if value is None else str(value)
        for value in (title, description, thumbnail_text)
    )
    duration_min, views, likes = (
        0 if value is None else value
        for value in (duration_min, views, likes)
    )

    # Create dataframe for prediction
    df = pd.DataFrame([{
        "title": title,
        "description": description,
        "thumbnail_text_cleaned": thumbnail_text,
        "category": category,
        "duration_min": duration_min,
        "views": views,
        "likes": likes,
        "thumbnail_text_valid": 1 if thumbnail_text else 0
    }])

    # Extract features
    df = extract_text_features(df)
    df = extract_engagement_features(df)

    def _clean_text(text):
        # Same normalisation that build_feature_matrix applies before fitting
        # TF-IDF: lowercase, strip URLs, punctuation to spaces, collapse
        # whitespace. Vectorising raw text here would skew it from training.
        text = str(text).lower()
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'[^\w\s]', ' ', text)
        return re.sub(r'\s+', ' ', text).strip()

    # Vectorize text
    df["text"] = df["title"] + " " + df["description"] + " " + df["thumbnail_text_cleaned"]
    X_text = loaded_tfidf.transform(df["text"].apply(_clean_text))

    # Get features
    X_num = loaded_scaler.transform(df[loaded_num_features].values)
    X_cat = loaded_cat_encoder.transform(df[["category"]])

    # Combine in training column order (text, numeric, category) and predict
    X = hstack([X_text, csr_matrix(X_num), csr_matrix(X_cat)])
    prob = float(loaded_model.predict_proba(X)[0, 1])
    pred = int(prob > threshold)

    return prob, pred

## 🧪 Test Prediction

In [None]:
# Example prediction
prob, pred = predict_clickbait(
    # The two trailing emojis (screaming face, fire) are written as Unicode
    # escapes so the emoji features are exercised regardless of file encoding.
    title="SHOCKING! You Won't Believe What Happened Next \U0001F631\U0001F525",
    description="Watch till end for secret reveal!",
    thumbnail_text="SHOCKING SECRET",
    category="Entertainment",
    duration_min=5.5,
    views=100000,
    likes=500
)

print(f"\nüéØ Prediction Results:")
print(f"   Probability: {prob:.2%}")
print(f"   Verdict: {'üö® CLICKBAIT' if pred == 1 else '‚úÖ NOT CLICKBAIT'}")

## ✅ Summary

In [None]:
# Closing recap: validation metrics from train_lightgbm and the tuned
# threshold, printed one item per line.
print(
    "\n" + "=" * 60,
    "üéâ TRAINING COMPLETE!",
    "=" * 60,
    f"\nüèÜ Model: LightGBM",
    f"üìä Validation F1 Score: {results['val_f1']:.4f}",
    f"üìä Validation ROC-AUC: {results['val_auc']:.4f}",
    f"üéöÔ∏è Optimal Threshold: {optimal_threshold:.3f}",
    "\n‚úÖ Model saved and ready for deployment!",
    sep="\n",
)