In [None]:
# Sarcasm Detection for Steam Game Reviews

This notebook implements a sarcasm detection system for Steam game reviews using:
1. Traditional ML (Naive Bayes with TF-IDF features)
2. BERT-based deep learning model
3. Gaming-specific feature engineering

The system analyzes review text, helpfulness votes, funny votes, and playtime to detect sarcastic reviews in the Steam gaming context. Features include:
- Text-based features (TF-IDF, n-grams)
- Gaming-specific patterns
- Sentiment analysis
- Behavioral signals (funny votes, helpful votes)
- Text style analysis (capitalization, punctuation)


In [13]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import os
import joblib
from tqdm.notebook import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Machine Learning imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# PyTorch and Transformers imports
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data quietly
for resource in ['stopwords', 'punkt', 'vader_lexicon']:
    try:
        nltk.download(resource, quiet=True)
    except Exception as e:
        print(f"⚠️ Could not download NLTK resource: {resource}. Error: {e}")

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Set random seeds for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# --- GPU Setup for Metal (M1/M2/M3), CUDA, or CPU ---
if torch.backends.mps.is_available():
    device = torch.device("mps")
    torch.mps.manual_seed(RANDOM_SEED)
    print("🚀 Using Apple Metal (MPS) for GPU acceleration.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(RANDOM_SEED)
    print("🚀 Using NVIDIA CUDA for GPU acceleration.")
else:
    device = torch.device("cpu")
    print("⚠️ GPU not available, using CPU. Training will be slower.")

print(f"Selected device: {device}")
print("✅ All packages imported and environment is set up!")


🚀 Using Apple Metal (MPS) for GPU acceleration.
Selected device: mps
✅ All packages imported and environment is set up!


In [18]:
def load_and_preprocess_data(file_path='steam_reviews.csv'):
    """
    Loads the Steam dataset and creates sentiment labels from the 'recommendation' column.
    """
    print(f"📥 Loading dataset from '{file_path}'...")
    
    if not os.path.exists(file_path):
        print(f"❌ Error: Dataset file not found at '{file_path}'.")
        return None
        
    try:
        df = pd.read_csv(file_path)
        
        # --- Preprocessing and Label Creation ---
        # 1: Positive (Recommended), 0: Negative (Not Recommended)
        df['sentiment_label'] = (df['recommendation'].str.strip() == 'Recommended').astype(int)
        df = df.dropna(subset=['review']) # Drop rows with no review text
        df['review'] = df['review'].astype(str)
        
        print(f"✅ Successfully loaded and preprocessed dataset: {df.shape[0]} reviews")
        print("\n📊 Sentiment Distribution:")
        print(df['sentiment_label'].value_counts())
        return df
        
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
        return None

# --- Execute Data Loading ---
df_labeled = load_and_preprocess_data()


📥 Loading dataset from 'steam_reviews.csv'...
✅ Successfully loaded and preprocessed dataset: 19931 reviews

📊 Sentiment Distribution:
sentiment_label
1    14005
0     5926
Name: count, dtype: int64


In [19]:
def prepare_dataset_for_training(df, target_size=10000, positive_ratio=0.5):
    """
    Cleans, samples, and balances the dataset for training.
    """
    if df is None: return None
    
    print(f"\n🧩 Preparing dataset from {len(df)} reviews...")
    
    df = df[df['review'].str.len() > 20] # Remove very short reviews
    
    positive_reviews = df[df['sentiment_label'] == 1]
    negative_reviews = df[df['sentiment_label'] == 0]
        
    # Balance the dataset by sampling
    positive_target = int(target_size * positive_ratio)
    negative_target = target_size - positive_target
    
    positive_sample = positive_reviews.sample(n=min(positive_target, len(positive_reviews)), random_state=RANDOM_SEED)
    negative_sample = negative_reviews.sample(n=min(negative_target, len(negative_reviews)), random_state=RANDOM_SEED)
    
    df_prepared = pd.concat([positive_sample, negative_sample]).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
    
    print(f"✅ Final prepared dataset size: {len(df_prepared)}")
    print(f"📈 Final distribution:\n{df_prepared['sentiment_label'].value_counts()}")
    
    return df_prepared

# --- Execute Data Preparation ---
if 'df_labeled' in locals():
    df_prepared = prepare_dataset_for_training(df_labeled)



🧩 Preparing dataset from 19931 reviews...
✅ Final prepared dataset size: 10000
📈 Final distribution:
sentiment_label
0    5000
1    5000
Name: count, dtype: int64


In [20]:
class SentimentDataset(Dataset):
    """
    Custom PyTorch Dataset for sentiment classification.
    """
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("✅ SentimentDataset class defined.")


✅ SentimentDataset class defined.


In [21]:
def train_and_evaluate_models(df):
    """
    Orchestrates the training and evaluation of both Logistic Regression and BERT models.
    """
    if df is None: return None
    print("\n🤖 Starting Model Training and Evaluation...")
    
    X = df['review'].values
    y = df['sentiment_label'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)
    
    results = {}
    
    # --- 1. Train Logistic Regression with TF-IDF ---
    print("\n📊 Training Logistic Regression model...")
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    lr_model = LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)
    lr_model.fit(X_train_tfidf, y_train)
    y_pred_lr = lr_model.predict(X_test_tfidf)
    
    results['Logistic Regression'] = {
        'Accuracy': accuracy_score(y_test, y_pred_lr),
        'F1-Score': f1_score(y_test, y_pred_lr),
        'Precision': precision_score(y_test, y_pred_lr),
        'Recall': recall_score(y_test, y_pred_lr)
    }
    print("✅ Logistic Regression training complete.")

    # --- 2. Train BERT ---
    print("\n🔥 Training BERT model...")
    bert_model, bert_tokenizer, bert_stats = train_bert_function(X_train, y_train, X_test, y_test)
    
    bert_model.eval()
    y_pred_bert = []
    with torch.no_grad():
        for text in tqdm(X_test, desc="Evaluating BERT"):
            inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
            outputs = bert_model(**inputs)
            y_pred_bert.append(torch.argmax(outputs.logits).item())
            
    results['BERT'] = {
        'Accuracy': accuracy_score(y_test, y_pred_bert),
        'F1-Score': f1_score(y_test, y_pred_bert),
        'Precision': precision_score(y_test, y_pred_bert),
        'Recall': recall_score(y_test, y_pred_bert)
    }

    # --- Display Final Results ---
    print("\n\n📊 FINAL RESULTS COMPARISON")
    results_df = pd.DataFrame(results).round(4)
    print(results_df)
    
    best_model_name = results_df.loc['F1-Score'].idxmax()
    print(f"\n🏆 Best Model based on F1-Score: {best_model_name}")
    
    return {
        'results_df': results_df,
        'bert_stats': bert_stats,
        'y_test': y_test,
        'y_pred_lr': y_pred_lr,
        'y_pred_bert': y_pred_bert,
        'best_model_name': best_model_name
    }

def train_bert_function(train_texts, train_labels, val_texts, val_labels):
    """A self-contained function to train the BERT model."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
    
    train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    print("Training BERT...")
    for epoch in range(2): # Keep training short for demonstration
        model.train()
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
    return model, tokenizer, [] # Return empty stats for simplicity here

# --- Execute the entire training pipeline ---
if 'df_prepared' in locals():
    final_results = train_and_evaluate_models(df_prepared)



🤖 Starting Model Training and Evaluation...

📊 Training Logistic Regression model...


NameError: name 'LogisticRegression' is not defined

In [8]:
def train_and_evaluate_models(df):
    """
    Train and evaluate Naive Bayes and BERT models for sarcasm detection
    """
    print("\n🤖 Training models...")
    
    # Prepare data
    X = df['review'].values
    y = (df['sarcasm_score'] > 2.0).astype(int)  # Binary classification: sarcastic vs non-sarcastic
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    results = {}
    
    # 1. Train Naive Bayes with TF-IDF
    print("\nTraining Naive Bayes model...")
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    nb_model = MultinomialNB(alpha=0.1)
    nb_model.fit(X_train_tfidf, y_train)
    nb_pred = nb_model.predict(X_test_tfidf)
    
    # Calculate Naive Bayes metrics
    nb_accuracy = accuracy_score(y_test, nb_pred)
    nb_f1 = f1_score(y_test, nb_pred)
    nb_precision = precision_score(y_test, nb_pred)
    nb_recall = recall_score(y_test, nb_pred)
    
    results['Naive Bayes'] = {
        'Accuracy': nb_accuracy,
        'F1-Score': nb_f1,
        'Precision': nb_precision,
        'Recall': nb_recall
    }
    
    print(f"✅ Naive Bayes Results:")
    print(f"   Accuracy: {nb_accuracy:.3f}")
    print(f"   F1-Score: {nb_f1:.3f}")
    print(f"   Precision: {nb_precision:.3f}")
    print(f"   Recall: {nb_recall:.3f}")
    
    # 2. Train BERT
    print("\nTraining BERT model...")
    bert_model, bert_tokenizer = train_bert_model(
        X_train, y_train,
        X_test, y_test
    )
    
    # Display final results
    print("\n📊 FINAL RESULTS:")
    print("=" * 50)
    results_df = pd.DataFrame(results).round(3)
    print(results_df)
    
    # Find best model
    best_model = results_df.loc['F1-Score'].idxmax()
    best_f1 = results_df.loc['F1-Score', best_model]
    
    print(f"\n🏆 Best Model: {best_model} (F1-Score: {best_f1:.3f})")
    
    return {
        'naive_bayes': {
            'model': nb_model,
            'vectorizer': vectorizer
        },
        'bert': {
            'model': bert_model,
            'tokenizer': bert_tokenizer
        }
    }


In [9]:
def prepare_dataset_for_training(df, target_size=1200):
    """
    Prepare the dataset for training with balanced sampling
    """
    if df is None or len(df) == 0:
        print("❌ No data available for training")
        return None
    
    print(f"📊 Preparing dataset from {len(df)} reviews...")
    
    # Clean the data
    df = df.dropna(subset=['review'])
    df = df[df['review'].str.len() > 10]  # Remove very short reviews
    
    # Sample data if we have too many reviews
    if len(df) > target_size:
        print(f"📉 Sampling {target_size} reviews from {len(df)} total...")
        
        # Try to maintain balance between sarcastic and genuine
        if 'label' in df.columns:
            sarcastic_target = int(target_size * 0.65)  # 65% sarcastic
            genuine_target = target_size - sarcastic_target
            
            sarcastic_reviews = df[df['label'] == 1]
            genuine_reviews = df[df['label'] == 0]
            
            # Sample from each group
            if len(sarcastic_reviews) >= sarcastic_target:
                sarcastic_sample = sarcastic_reviews.sample(n=sarcastic_target, random_state=42)
            else:
                sarcastic_sample = sarcastic_reviews
            
            if len(genuine_reviews) >= genuine_target:
                genuine_sample = genuine_reviews.sample(n=genuine_target, random_state=42)
            else:
                genuine_sample = genuine_reviews
            
            df = pd.concat([sarcastic_sample, genuine_sample]).sample(frac=1, random_state=42).reset_index(drop=True)
        else:
            df = df.sample(n=target_size, random_state=42).reset_index(drop=True)
    
    print(f"✅ Final dataset: {len(df)} reviews")
    if 'sentiment' in df.columns:
        print(f"📈 Final distribution: {df['sentiment'].value_counts().to_dict()}")
    
    return df


In [10]:
class SarcasmAwarePreprocessor:
    """
    Advanced text preprocessor specifically designed for Steam review sarcasm detection
    """
    
    def __init__(self):
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        # Preserve negation and intensifier words
        self.preserve_words = {
            'not', 'no', 'never', 'nothing', 'nobody', 'nowhere', 'neither', 'nor', 
            'barely', 'hardly', 'scarcely', 'very', 'so', 'extremely', 'totally', 
            'absolutely', 'completely', 'quite', 'rather', 'really', 'definitely',
            'certainly', 'obviously', 'clearly', 'surely'
        }
        self.stop_words = self.stop_words - self.preserve_words
        
        # Steam/Gaming specific terms to preserve
        self.gaming_terms = {
            'dlc', 'fps', 'gameplay', 'multiplayer', 'singleplayer', 'coop', 'pvp',
            'respawn', 'checkpoint', 'save', 'load', 'crash', 'bug', 'glitch', 
            'lag', 'ping', 'server', 'patch', 'update', 'nerf', 'buff', 'op',
            'ragequit', 'noob', 'pro', 'speedrun', 'achievements', 'trophies'
        }
    
    def extract_advanced_sarcasm_features(self, text):
        """
        Extract comprehensive sarcasm detection features
        """
        if not isinstance(text, str):
            text = str(text)
        
        features = {}
        text_lower = text.lower()
        words = text.split()
        
        # Basic text statistics
        features['word_count'] = len(words)
        features['char_count'] = len(text)
        features['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0
        features['sentence_count'] = len(re.split(r'[.!?]+', text))
        
        # Capitalization patterns (often used in sarcasm)
        features['caps_ratio'] = sum(1 for c in text if c.isupper()) / len(text) if text else 0
        features['caps_words'] = sum(1 for word in words if word.isupper())
        features['caps_sequences'] = len(re.findall(r'[A-Z]{3,}', text))
        
        # Punctuation patterns (excessive punctuation in sarcasm)
        features['exclamation_count'] = text.count('!')
        features['question_count'] = text.count('?')
        features['ellipsis_count'] = len(re.findall(r'\.{3,}', text))
        features['multiple_punct'] = len(re.findall(r'[!?]{2,}', text))
        features['punct_density'] = sum([features['exclamation_count'], features['question_count'], 
                                       features['ellipsis_count']]) / len(text) if text else 0
        
        # Sentiment analysis scores
        sentiment_scores = analyzer.polarity_scores(text)
        features.update({f'vader_{k}': v for k, v in sentiment_scores.items()})
        
        # Advanced sarcasm indicators
        features['intensifier_count'] = sum(1 for word in text_lower.split() 
                                          if word in ['very', 'so', 'extremely', 'totally', 'absolutely', 
                                                    'completely', 'quite', 'rather', 'really', 'definitely'])
        
        # Contradiction patterns
        positive_words = ['great', 'amazing', 'excellent', 'fantastic', 'wonderful', 'perfect', 
                         'brilliant', 'outstanding', 'superb', 'best', 'love', 'awesome']
        negative_words = ['terrible', 'awful', 'horrible', 'worst', 'hate', 'broken', 'crash', 
                         'bug', 'glitch', 'frustrating', 'annoying', 'impossible']
        
        features['positive_word_count'] = sum(1 for word in positive_words if word in text_lower)
        features['negative_word_count'] = sum(1 for word in negative_words if word in text_lower)
        features['pos_neg_cooccurrence'] = 1 if features['positive_word_count'] > 0 and features['negative_word_count'] > 0 else 0
        
        # Steam-specific sarcasm patterns
        steam_sarcasm_patterns = [
            r'10/10.*would.*(?:rage|quit|crash|never)',
            r'perfect.*(?:if you enjoy|for people who)',
            r'great.*(?:if you like|when you)',
            r'love.*how.*(?:crash|bug|glitch)',
            r'amazing.*(?:graphics|sound).*(?:from|like).*(?:199\d|200\d)',
            r'excellent.*(?:tutorial|help).*(?:hours|still|never)',
            r'who needs.*(?:tutorial|help|balance)',
            r'nothing like.*getting.*(?:owned|destroyed|rekt)',
            r'highly recommend.*(?:if you|for people who).*(?:enjoy|like).*(?:pain|suffering|frustration)'
        ]
        
        features['steam_sarcasm_count'] = sum(1 for pattern in steam_sarcasm_patterns 
                                            if re.search(pattern, text_lower))
        
        # Gaming terminology
        features['gaming_terms_count'] = sum(1 for term in self.gaming_terms if term in text_lower)
        
        # Quotation marks (often used sarcastically)
        features['quote_count'] = text.count('"') + text.count("'")
        
        # All caps words (emphasis in sarcasm)
        features['all_caps_words'] = len(re.findall(r'\b[A-Z]{2,}\b', text))
        
        return features
    
    def preprocess_text(self, text):
        """
        Comprehensive text preprocessing for sarcasm detection
        """
        if not isinstance(text, str):
            text = str(text)
        
        # Extract features before preprocessing
        features = self.extract_advanced_sarcasm_features(text)
        
        # Preserve important patterns before cleaning
        text = re.sub(r'(\d+)/(\d+)', r'RATING_\1_\2', text)  # Preserve ratings like 10/10
        text = re.sub(r'[.]{3,}', ' ELLIPSIS ', text)
        text = re.sub(r'[!]{2,}', ' MULTIPLE_EXCLAMATION ', text)
        text = re.sub(r'[?]{2,}', ' MULTIPLE_QUESTION ', text)
        
        # Clean URLs, mentions, etc.
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#\w+', '', text)
        
        # Preserve negation context
        negation_patterns = [
            (r'\b(not|no|never|nothing|nobody|nowhere|neither|nor)\s+(\w+)', r'\1_\2'),
            (r'\b(barely|hardly|scarcely)\s+(\w+)', r'\1_\2'),
            (r'\b(can\'t|cannot|won\'t|wouldn\'t|shouldn\'t|couldn\'t)\s+(\w+)', r'\1_\2'),
            (r'\b(don\'t|doesn\'t|didn\'t)\s+(\w+)', r'\1_\2')
        ]
        
        for pattern, replacement in negation_patterns:
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        
        # Tokenize and clean
        tokens = nltk.word_tokenize(text.lower())
        
        # Remove stopwords but preserve important words
        tokens = [token for token in tokens 
                 if token not in self.stop_words or token in self.preserve_words or token in self.gaming_terms]
        
        # Keep alphabetic tokens and preserved special tokens
        special_tokens = ['ELLIPSIS', 'MULTIPLE_EXCLAMATION', 'MULTIPLE_QUESTION'] + [f'RATING_{i}_{j}' for i in range(1, 11) for j in range(1, 11)]
        tokens = [token for token in tokens 
                 if token.isalpha() or token in special_tokens or '_' in token]
        
        clean_text = ' '.join(tokens)
        
        return clean_text, features


In [11]:
def main():
    """
    Main execution pipeline for real Steam reviews sarcasm detection
    """
    print("🚀 Starting Real Steam Reviews Sarcasm Detection Pipeline")
    print("=" * 60)
    
    # Step 1: Load real dataset
    df_raw = load_real_steam_dataset()
    
    if df_raw is None:
        print("❌ Could not load dataset. Please check your setup.")
        return
    
    print(f"\n📊 Raw dataset loaded: {df_raw.shape}")
    print(f"Columns: {list(df_raw.columns)}")
    
    # Step 2: Identify sarcastic reviews
    df_labeled = identify_sarcastic_reviews(df_raw)
    
    # Step 3: Prepare for training
    df = prepare_dataset_for_training(df_labeled, target_size=1200)
    
    if df is None:
        print("❌ Could not prepare dataset for training.")
        return
    
    # Step 4: Advanced preprocessing
    print("\n🔧 Applying advanced preprocessing...")
    preprocessor = SarcasmAwarePreprocessor()
    
    results = df['review'].apply(lambda x: preprocessor.preprocess_text(x))
    df['clean_review'] = [result[0] for result in results]
    features_list = [result[1] for result in results]
    
    # Convert features to DataFrame
    features_df = pd.DataFrame(features_list)
    df = pd.concat([df, features_df], axis=1)
    
    print(f"✅ Preprocessing completed!")
    print(f"Added {len(features_df.columns)} advanced features")
    
    # Step 5: Create feature matrix
    print("\n🎯 Creating feature matrices...")
    
    # TF-IDF features
    tfidf = TfidfVectorizer(
        max_features=3000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )
    
    X_tfidf = tfidf.fit_transform(df['clean_review'])
    
    # Numerical features
    feature_cols = [col for col in features_df.columns if isinstance(df[col].iloc[0], (int, float))]
    X_numerical = df[feature_cols].fillna(0)
    
    # Combine features
    from scipy.sparse import hstack
    import scipy.sparse as sp
    X_combined = hstack([X_tfidf, sp.csr_matrix(X_numerical.values)])
    
    y = df['label'].values
    
    print(f"📈 Feature matrix shape: {X_combined.shape}")
    print(f"🏷️ Label distribution: Sarcastic: {sum(y)}, Genuine: {len(y) - sum(y)}")
    
    # Step 6: Train models
    print("\n🤖 Training models...")
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.2, random_state=42, stratify=y
    )
    
    models = {
        'Naive Bayes': MultinomialNB(alpha=0.1)
    }
    
    results = {}
    
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        
        results[name] = {
            'Accuracy': accuracy,
            'F1-Score': f1,
            'Precision': precision,
            'Recall': recall
        }
        
        print(f"✅ {name} - Accuracy: {accuracy:.3f}, F1: {f1:.3f}")
    
    # Step 7: Display results
    print("\n📊 FINAL RESULTS:")
    print("=" * 50)
    results_df = pd.DataFrame(results).round(3)
    print(results_df)
    
    # Find best model
    best_model = results_df.loc['F1-Score'].idxmax()
    best_f1 = results_df.loc['F1-Score', best_model]
    
    print(f"\n🏆 Best Model: {best_model}")
    print(f"🎯 Best F1 Score: {best_f1}")
    
    # Step 8: Show example predictions
    print("\n🔍 Example Predictions:")
    print("-" * 40)
    
    best_model_obj = models[best_model]
    sample_indices = np.random.choice(len(X_test), 5, replace=False)
    
    for idx in sample_indices:
        prediction = best_model_obj.predict(X_test[idx])[0]
        actual = y_test[idx]
        review_text = df.iloc[X_test.indices[idx] if hasattr(X_test, 'indices') else idx]['review'][:100]
        
        print(f"Review: {review_text}...")
        print(f"Predicted: {'Sarcastic' if prediction == 1 else 'Genuine'}")
        print(f"Actual: {'Sarcastic' if actual == 1 else 'Genuine'}")
        print("-" * 40)
    
    print("\n🎉 Analysis completed successfully!")
    
    return df, results_df, models[best_model]


In [None]:
# Run the main pipeline
if __name__ == "__main__":
    print("🚀 Starting Sarcasm Detection Pipeline")
    print("=" * 50)
    
    # Step 1: Load and preprocess data
    df_raw = load_real_steam_dataset()
    if df_raw is None:
        print("❌ Failed to load dataset")
        sys.exit(1)
        
    # Step 2: Identify sarcastic reviews
    df_labeled = identify_sarcastic_reviews(df_raw)
    
    # Step 3: Prepare balanced dataset
    df_prepared = prepare_dataset_for_training(df_labeled)
    if df_prepared is None:
        print("❌ Failed to prepare dataset")
        sys.exit(1)
    
    # Step 4: Train and evaluate models
    results = train_and_evaluate_models(df_prepared, use_features=True)
    
    # Step 5: Save the best model
    best_model_name = results['results'].loc['F1-Score'].idxmax()
    if best_model_name == 'BERT':
        best_model = results['bert']['model']
        tokenizer = results['bert']['tokenizer']
        
        # Save BERT model and tokenizer
        save_dir = 'best_model'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
            
        best_model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"\n✅ Best model (BERT) saved to {save_dir}/")
    else:
        # Save Naive Bayes model and vectorizer
        import joblib
        joblib.dump(results['naive_bayes']['model'], 'best_model/naive_bayes_model.joblib')
        joblib.dump(results['naive_bayes']['vectorizer'], 'best_model/tfidf_vectorizer.joblib')
        print("\n✅ Best model (Naive Bayes) saved to best_model/")
    
    # Print final summary
    print("\n📊 Final Model Performance:")
    print(results['results'])
    
    print("\n🎉 Pipeline completed successfully!")
