In [None]:
# Sarcasm-Aware Sentiment Analysis Project
## Real Steam Reviews Dataset Implementation

This notebook implements a sarcasm detection system for Steam game reviews, combining traditional NLP techniques with gaming-specific features.


In [1]:
# Install required packages
!pip install datasets transformers wordcloud vaderSentiment textblob -q
!pip install -U scikit-learn pandas numpy matplotlib seaborn nltk -q

# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
import re
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                           roc_auc_score, f1_score, precision_score, recall_score)

# Transformer models
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# Initialize sentiment analyzers
analyzer = SentimentIntensityAnalyzer()

# Set random seeds for reproducibility
np.random.seed(42)

print("✅ All packages installed and imports completed!")


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
statsmodels 0.14.2 requires packaging>=21.3, but you have packaging 20.9 which is incompatible.
streamlit 1.32.0 requires protobuf<5,>=3.20, but you have protobuf 5.28.2 which is incompatible.
xarray 2023.6.0 requires packaging>=21.3, but you have packaging 20.9 which is incompatible.[0m[31m
[0m✅ All packages installed and imports completed!


In [2]:
def load_real_steam_dataset():
    """
    Load real Steam Reviews dataset from the local CSV file
    """
    print("📥 Loading Steam Reviews dataset...")
    
    try:
        # Load the dataset
        df = pd.read_csv('steam_reviews.csv')
        
        # Rename columns to match our expected format
        df = df.rename(columns={
            'review': 'review_text',
            'funny': 'votes_funny',
            'helpful': 'votes_helpful',
            'hour_played': 'playtime_forever',
            'recommendation': 'voted_up'
        })
        
        # Convert recommendation to boolean
        df['voted_up'] = df['voted_up'].apply(lambda x: True if x == 'Recommended' else False)
        
        # Convert playtime to float (in case it's string)
        df['playtime_forever'] = pd.to_numeric(df['playtime_forever'], errors='coerce')
        
        print(f"✅ Successfully loaded dataset: {df.shape[0]} reviews")
        print("\nSample review:")
        print(df[['review_text', 'voted_up', 'votes_funny', 'votes_helpful', 'playtime_forever']].iloc[0])
        
        return df
        
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        print("\n📁 Please ensure your CSV file is named 'steam_reviews.csv' and contains the following columns:")
        print("- date_posted")
        print("- funny")
        print("- helpful")
        print("- hour_played")
        print("- is_early_access_review")
        print("- recommendation")
        print("- review")
        return None


In [3]:
def identify_sarcastic_reviews(df):
    """
    Identify sarcastic reviews from real Steam data using heuristics
    """
    print("🎭 Identifying sarcastic reviews using heuristics...")
    
    # Ensure we have the right column names
    text_column = None
    for col in ['review', 'review_text', 'content', 'text']:
        if col in df.columns:
            text_column = col
            break
    
    if text_column is None:
        print("❌ Could not find review text column")
        return df
    
    # Rename to standard column name
    df = df.rename(columns={text_column: 'review'})
    
    # Initialize sarcasm probability scores
    df['sarcasm_score'] = 0.0
    df['sarcasm_indicators'] = ''
    
    # Sarcasm detection heuristics
    sarcasm_patterns = {
        'positive_negative_mismatch': [
            (r'(great|amazing|perfect|excellent|fantastic|wonderful|brilliant|outstanding|superb|best).*(?:crash|bug|glitch|broken|terrible|awful|horrible|worst|hate|never|not)', 2.0),
            (r'10/10.*(?:would.*rage|crash|bug|glitch|broken|never)', 2.5),
            (r'love.*(?:how.*crash|bug|glitch|broken|never)', 2.0)
        ],
        'excessive_praise_with_complaints': [
            (r'perfect.*(?:if you enjoy|for people who|when you)', 1.5),
            (r'amazing.*(?:graphics|sound).*(?:1995|2000|pixelated|terrible)', 2.0),
            (r'excellent.*(?:tutorial|controls).*(?:hours|still|never|don\'t)', 1.8)
        ],
        'contradictory_statements': [
            (r'stable.*(?:crashed|crash|freeze)', 2.0),
            (r'optimized.*(?:fps|framerate|lag|slow)', 1.8),
            (r'balanced.*(?:overpowered|useless|broken)', 1.5)
        ],
        'gaming_specific_sarcasm': [
            (r'who needs.*(?:tutorial|help|guide|instructions)', 1.5),
            (r'nothing like.*(?:getting|being|playing)', 1.3),
            (r'highly recommend.*(?:if you|for people who|when)', 1.2)
        ]
    }
    
    # Apply sarcasm detection patterns
    for category, patterns in sarcasm_patterns.items():
        for pattern, weight in patterns:
            matches = df['review'].str.contains(pattern, case=False, na=False, regex=True)
            df.loc[matches, 'sarcasm_score'] += weight
            df.loc[matches, 'sarcasm_indicators'] += f"{category}; "
    
    # Additional heuristics based on Steam-specific features
    if 'voted_up' in df.columns and 'votes_funny' in df.columns:
        # Negative reviews with high funny votes are likely sarcastic
        funny_negative = (df['voted_up'] == False) & (df['votes_funny'] > df['votes_funny'].quantile(0.7))
        df.loc[funny_negative, 'sarcasm_score'] += 1.0
        df.loc[funny_negative, 'sarcasm_indicators'] += 'funny_negative; '
    
    # Detect excessive punctuation and caps (sarcasm indicators)
    df['caps_ratio'] = df['review'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / len(str(x)) if len(str(x)) > 0 else 0)
    df['exclamation_count'] = df['review'].str.count('!')
    df['question_count'] = df['review'].str.count('\?')
    
    # High caps ratio with positive words might indicate sarcasm
    high_caps = df['caps_ratio'] > 0.3
    df.loc[high_caps, 'sarcasm_score'] += 0.5
    df.loc[high_caps, 'sarcasm_indicators'] += 'high_caps; '
    
    # Multiple exclamation marks with negative sentiment
    excessive_exclamation = df['exclamation_count'] > 2
    df.loc[excessive_exclamation, 'sarcasm_score'] += 0.3
    df.loc[excessive_exclamation, 'sarcasm_indicators'] += 'excessive_punct; '
    
    # Use VADER sentiment to detect polarity mismatches
    print("🔍 Analyzing sentiment for sarcasm detection...")
    sentiment_scores = df['review'].apply(lambda x: analyzer.polarity_scores(str(x)))
    df['vader_positive'] = [score['pos'] for score in sentiment_scores]
    df['vader_negative'] = [score['neg'] for score in sentiment_scores]
    df['vader_compound'] = [score['compound'] for score in sentiment_scores]
    
    # High positive words but negative overall sentiment (potential sarcasm)
    polarity_mismatch = (df['vader_positive'] > 0.3) & (df['vader_compound'] < -0.1)
    df.loc[polarity_mismatch, 'sarcasm_score'] += 1.2
    df.loc[polarity_mismatch, 'sarcasm_indicators'] += 'polarity_mismatch; '
    
    # Create binary sarcasm labels based on score threshold
    sarcasm_threshold = df['sarcasm_score'].quantile(0.35)  # Top 65% as sarcastic
    df['is_sarcastic'] = (df['sarcasm_score'] >= sarcasm_threshold).astype(int)
    df['label'] = df['is_sarcastic']
    df['sentiment'] = df['is_sarcastic'].map({1: 'sarcastic', 0: 'genuine'})
    
    print(f"📊 Sarcasm Detection Results:")
    print(f"Sarcasm threshold: {sarcasm_threshold:.2f}")
    print(f"Distribution: {df['sentiment'].value_counts().to_dict()}")
    print(f"Total reviews: {len(df)}")
    
    return df


In [4]:
def prepare_dataset_for_training(df, target_size=1200):
    """
    Prepare the dataset for training with balanced sampling
    """
    if df is None or len(df) == 0:
        print("❌ No data available for training")
        return None
    
    print(f"📊 Preparing dataset from {len(df)} reviews...")
    
    # Clean the data
    df = df.dropna(subset=['review'])
    df = df[df['review'].str.len() > 10]  # Remove very short reviews
    
    # Sample data if we have too many reviews
    if len(df) > target_size:
        print(f"📉 Sampling {target_size} reviews from {len(df)} total...")
        
        # Try to maintain balance between sarcastic and genuine
        if 'label' in df.columns:
            sarcastic_target = int(target_size * 0.65)  # 65% sarcastic
            genuine_target = target_size - sarcastic_target
            
            sarcastic_reviews = df[df['label'] == 1]
            genuine_reviews = df[df['label'] == 0]
            
            # Sample from each group
            if len(sarcastic_reviews) >= sarcastic_target:
                sarcastic_sample = sarcastic_reviews.sample(n=sarcastic_target, random_state=42)
            else:
                sarcastic_sample = sarcastic_reviews
            
            if len(genuine_reviews) >= genuine_target:
                genuine_sample = genuine_reviews.sample(n=genuine_target, random_state=42)
            else:
                genuine_sample = genuine_reviews
            
            df = pd.concat([sarcastic_sample, genuine_sample]).sample(frac=1, random_state=42).reset_index(drop=True)
        else:
            df = df.sample(n=target_size, random_state=42).reset_index(drop=True)
    
    print(f"✅ Final dataset: {len(df)} reviews")
    if 'sentiment' in df.columns:
        print(f"📈 Final distribution: {df['sentiment'].value_counts().to_dict()}")
    
    return df


In [5]:
class SarcasmAwarePreprocessor:
    """
    Advanced text preprocessor specifically designed for Steam review sarcasm detection
    """
    
    def __init__(self):
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        # Preserve negation and intensifier words
        self.preserve_words = {
            'not', 'no', 'never', 'nothing', 'nobody', 'nowhere', 'neither', 'nor', 
            'barely', 'hardly', 'scarcely', 'very', 'so', 'extremely', 'totally', 
            'absolutely', 'completely', 'quite', 'rather', 'really', 'definitely',
            'certainly', 'obviously', 'clearly', 'surely'
        }
        self.stop_words = self.stop_words - self.preserve_words
        
        # Steam/Gaming specific terms to preserve
        self.gaming_terms = {
            'dlc', 'fps', 'gameplay', 'multiplayer', 'singleplayer', 'coop', 'pvp',
            'respawn', 'checkpoint', 'save', 'load', 'crash', 'bug', 'glitch', 
            'lag', 'ping', 'server', 'patch', 'update', 'nerf', 'buff', 'op',
            'ragequit', 'noob', 'pro', 'speedrun', 'achievements', 'trophies'
        }
    
    def extract_advanced_sarcasm_features(self, text):
        """
        Extract comprehensive sarcasm detection features
        """
        if not isinstance(text, str):
            text = str(text)
        
        features = {}
        text_lower = text.lower()
        words = text.split()
        
        # Basic text statistics
        features['word_count'] = len(words)
        features['char_count'] = len(text)
        features['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0
        features['sentence_count'] = len(re.split(r'[.!?]+', text))
        
        # Capitalization patterns (often used in sarcasm)
        features['caps_ratio'] = sum(1 for c in text if c.isupper()) / len(text) if text else 0
        features['caps_words'] = sum(1 for word in words if word.isupper())
        features['caps_sequences'] = len(re.findall(r'[A-Z]{3,}', text))
        
        # Punctuation patterns (excessive punctuation in sarcasm)
        features['exclamation_count'] = text.count('!')
        features['question_count'] = text.count('?')
        features['ellipsis_count'] = len(re.findall(r'\.{3,}', text))
        features['multiple_punct'] = len(re.findall(r'[!?]{2,}', text))
        features['punct_density'] = sum([features['exclamation_count'], features['question_count'], 
                                       features['ellipsis_count']]) / len(text) if text else 0
        
        # Sentiment analysis scores
        sentiment_scores = analyzer.polarity_scores(text)
        features.update({f'vader_{k}': v for k, v in sentiment_scores.items()})
        
        # Advanced sarcasm indicators
        features['intensifier_count'] = sum(1 for word in text_lower.split() 
                                          if word in ['very', 'so', 'extremely', 'totally', 'absolutely', 
                                                    'completely', 'quite', 'rather', 'really', 'definitely'])
        
        # Contradiction patterns
        positive_words = ['great', 'amazing', 'excellent', 'fantastic', 'wonderful', 'perfect', 
                         'brilliant', 'outstanding', 'superb', 'best', 'love', 'awesome']
        negative_words = ['terrible', 'awful', 'horrible', 'worst', 'hate', 'broken', 'crash', 
                         'bug', 'glitch', 'frustrating', 'annoying', 'impossible']
        
        features['positive_word_count'] = sum(1 for word in positive_words if word in text_lower)
        features['negative_word_count'] = sum(1 for word in negative_words if word in text_lower)
        features['pos_neg_cooccurrence'] = 1 if features['positive_word_count'] > 0 and features['negative_word_count'] > 0 else 0
        
        # Steam-specific sarcasm patterns
        steam_sarcasm_patterns = [
            r'10/10.*would.*(?:rage|quit|crash|never)',
            r'perfect.*(?:if you enjoy|for people who)',
            r'great.*(?:if you like|when you)',
            r'love.*how.*(?:crash|bug|glitch)',
            r'amazing.*(?:graphics|sound).*(?:from|like).*(?:199\d|200\d)',
            r'excellent.*(?:tutorial|help).*(?:hours|still|never)',
            r'who needs.*(?:tutorial|help|balance)',
            r'nothing like.*getting.*(?:owned|destroyed|rekt)',
            r'highly recommend.*(?:if you|for people who).*(?:enjoy|like).*(?:pain|suffering|frustration)'
        ]
        
        features['steam_sarcasm_count'] = sum(1 for pattern in steam_sarcasm_patterns 
                                            if re.search(pattern, text_lower))
        
        # Gaming terminology
        features['gaming_terms_count'] = sum(1 for term in self.gaming_terms if term in text_lower)
        
        # Quotation marks (often used sarcastically)
        features['quote_count'] = text.count('"') + text.count("'")
        
        # All caps words (emphasis in sarcasm)
        features['all_caps_words'] = len(re.findall(r'\b[A-Z]{2,}\b', text))
        
        return features
    
    def preprocess_text(self, text):
        """
        Comprehensive text preprocessing for sarcasm detection
        """
        if not isinstance(text, str):
            text = str(text)
        
        # Extract features before preprocessing
        features = self.extract_advanced_sarcasm_features(text)
        
        # Preserve important patterns before cleaning
        text = re.sub(r'(\d+)/(\d+)', r'RATING_\1_\2', text)  # Preserve ratings like 10/10
        text = re.sub(r'[.]{3,}', ' ELLIPSIS ', text)
        text = re.sub(r'[!]{2,}', ' MULTIPLE_EXCLAMATION ', text)
        text = re.sub(r'[?]{2,}', ' MULTIPLE_QUESTION ', text)
        
        # Clean URLs, mentions, etc.
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#\w+', '', text)
        
        # Preserve negation context
        negation_patterns = [
            (r'\b(not|no|never|nothing|nobody|nowhere|neither|nor)\s+(\w+)', r'\1_\2'),
            (r'\b(barely|hardly|scarcely)\s+(\w+)', r'\1_\2'),
            (r'\b(can\'t|cannot|won\'t|wouldn\'t|shouldn\'t|couldn\'t)\s+(\w+)', r'\1_\2'),
            (r'\b(don\'t|doesn\'t|didn\'t)\s+(\w+)', r'\1_\2')
        ]
        
        for pattern, replacement in negation_patterns:
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        
        # Tokenize and clean
        tokens = nltk.word_tokenize(text.lower())
        
        # Remove stopwords but preserve important words
        tokens = [token for token in tokens 
                 if token not in self.stop_words or token in self.preserve_words or token in self.gaming_terms]
        
        # Keep alphabetic tokens and preserved special tokens
        special_tokens = ['ELLIPSIS', 'MULTIPLE_EXCLAMATION', 'MULTIPLE_QUESTION'] + [f'RATING_{i}_{j}' for i in range(1, 11) for j in range(1, 11)]
        tokens = [token for token in tokens 
                 if token.isalpha() or token in special_tokens or '_' in token]
        
        clean_text = ' '.join(tokens)
        
        return clean_text, features


In [6]:
def main():
    """
    Main execution pipeline for real Steam reviews sarcasm detection
    """
    print("🚀 Starting Real Steam Reviews Sarcasm Detection Pipeline")
    print("=" * 60)
    
    # Step 1: Load real dataset
    df_raw = load_real_steam_dataset()
    
    if df_raw is None:
        print("❌ Could not load dataset. Please check your setup.")
        return
    
    print(f"\n📊 Raw dataset loaded: {df_raw.shape}")
    print(f"Columns: {list(df_raw.columns)}")
    
    # Step 2: Identify sarcastic reviews
    df_labeled = identify_sarcastic_reviews(df_raw)
    
    # Step 3: Prepare for training
    df = prepare_dataset_for_training(df_labeled, target_size=1200)
    
    if df is None:
        print("❌ Could not prepare dataset for training.")
        return
    
    # Step 4: Advanced preprocessing
    print("\n🔧 Applying advanced preprocessing...")
    preprocessor = SarcasmAwarePreprocessor()
    
    results = df['review'].apply(lambda x: preprocessor.preprocess_text(x))
    df['clean_review'] = [result[0] for result in results]
    features_list = [result[1] for result in results]
    
    # Convert features to DataFrame
    features_df = pd.DataFrame(features_list)
    df = pd.concat([df, features_df], axis=1)
    
    print(f"✅ Preprocessing completed!")
    print(f"Added {len(features_df.columns)} advanced features")
    
    # Step 5: Create feature matrix
    print("\n🎯 Creating feature matrices...")
    
    # TF-IDF features
    tfidf = TfidfVectorizer(
        max_features=3000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )
    
    X_tfidf = tfidf.fit_transform(df['clean_review'])
    
    # Numerical features
    feature_cols = [col for col in features_df.columns if isinstance(df[col].iloc[0], (int, float))]
    X_numerical = df[feature_cols].fillna(0)
    
    # Combine features
    from scipy.sparse import hstack
    import scipy.sparse as sp
    X_combined = hstack([X_tfidf, sp.csr_matrix(X_numerical.values)])
    
    y = df['label'].values
    
    print(f"📈 Feature matrix shape: {X_combined.shape}")
    print(f"🏷️ Label distribution: Sarcastic: {sum(y)}, Genuine: {len(y) - sum(y)}")
    
    # Step 6: Train models
    print("\n🤖 Training models...")
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.2, random_state=42, stratify=y
    )
    
    models = {
        'Naive Bayes': MultinomialNB(alpha=0.1),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, C=1.0),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
    }
    
    results = {}
    
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        
        results[name] = {
            'Accuracy': accuracy,
            'F1-Score': f1,
            'Precision': precision,
            'Recall': recall
        }
        
        print(f"✅ {name} - Accuracy: {accuracy:.3f}, F1: {f1:.3f}")
    
    # Step 7: Display results
    print("\n📊 FINAL RESULTS:")
    print("=" * 50)
    results_df = pd.DataFrame(results).round(3)
    print(results_df)
    
    # Find best model
    best_model = results_df.loc['F1-Score'].idxmax()
    best_f1 = results_df.loc['F1-Score', best_model]
    
    print(f"\n🏆 Best Model: {best_model}")
    print(f"🎯 Best F1 Score: {best_f1}")
    
    # Step 8: Show example predictions
    print("\n🔍 Example Predictions:")
    print("-" * 40)
    
    best_model_obj = models[best_model]
    sample_indices = np.random.choice(len(X_test), 5, replace=False)
    
    for idx in sample_indices:
        prediction = best_model_obj.predict(X_test[idx])[0]
        actual = y_test[idx]
        review_text = df.iloc[X_test.indices[idx] if hasattr(X_test, 'indices') else idx]['review'][:100]
        
        print(f"Review: {review_text}...")
        print(f"Predicted: {'Sarcastic' if prediction == 1 else 'Genuine'}")
        print(f"Actual: {'Sarcastic' if actual == 1 else 'Genuine'}")
        print("-" * 40)
    
    print("\n🎉 Analysis completed successfully!")
    
    return df, results_df, models[best_model]


In [7]:
# Run the main pipeline
if __name__ == "__main__":
    df_final, results_final, best_model_final = main()


🚀 Starting Real Steam Reviews Sarcasm Detection Pipeline
📥 Loading real Steam Reviews dataset...
🔄 Trying kaggle dataset...
zsh:1: command not found: kaggle
❌ Failed to load from kaggle: [Errno 2] No such file or directory: 'steam_reviews.csv'
🔄 Trying kaggle dataset...
zsh:1: command not found: kaggle
❌ Failed to load from kaggle: [Errno 2] No such file or directory: 'steam_reviews.csv'
🔄 Trying kaggle dataset...
zsh:1: command not found: kaggle
❌ Failed to load from kaggle: [Errno 2] No such file or directory: 'steam_reviews_2021.csv'
📁 Please upload your Steam reviews CSV file manually
Expected columns: review_text, voted_up, votes_helpful, votes_funny, playtime_forever
❌ Could not load dataset. Please check your setup.


TypeError: cannot unpack non-iterable NoneType object