In [38]:
! pip install python-dotenv




[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import demoji
demoji.download_codes()

In [40]:
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional, GlobalMaxPool1D, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import pickle
import warnings
import google.generativeai as genai
from dotenv import load_dotenv
warnings.filterwarnings('ignore')

# Load environment variables
load_dotenv()

True

In [41]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Set random seeds for reproducibility

In [42]:
# Import Google Generative AI for Gemini
try:
    import google.generativeai as genai
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False
    print("google-generativeai not installed. Install with: pip install google-generativeai")

In [43]:
np.random.seed(42)
tf.random.set_seed(42)

Download NLTK resources

In [44]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinuk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vinuk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinuk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinuk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vinuk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [45]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [46]:
def preprocess_text_lstm(text):
    """Enhanced preprocessing optimized for LSTM model"""
    if pd.isna(text) or text == '':
        return ''
    text = str(text)
    
    # Remove emojis
    text = demoji.replace(text, '')
    
    # Remove URLs, mentions, hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#\w+', '', text)
    
    # Clean and normalize
    text = re.sub(r'\s+', ' ', text.strip().lower())
    text = re.sub(r'[^\w\s]', ' ', text)
    
    if not text.strip():
        return ''
    
    # Tokenization and lemmatization
    tokens = word_tokenize(text)
    processed_tokens = []
    
    for token in tokens:
        if token and len(token) > 1 and token not in stop_words:
            lemma = lemmatizer.lemmatize(token)
            processed_tokens.append(lemma)
    
    return " ".join(processed_tokens) if processed_tokens else ''

Enhanced Hate Speech Detector Class

In [47]:
class RobustLSTMHateSpeechDetector:
    def __init__(self, model_path='models/best_lstm_model.h5', 
                 tokenizer_path='models/tokenizer.pickle',
                 threshold=0.5, max_len=100, gemini=None):
        self.threshold = threshold
        self.max_len = max_len
        self.gemini = gemini  

        # Load model and tokenizer
        self.model = load_model(model_path)
        with open(tokenizer_path, 'rb') as f:
            self.tokenizer = pickle.load(f)

        # Initialize Gemini for category classification
        try:
            GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
            if GEMINI_API_KEY:
                genai.configure(api_key=GEMINI_API_KEY)
                self.gemini = genai.GenerativeModel('gemini-1.5-flash')
            else:
                self.gemini = None
                print("Gemini API key not found in environment variables")
        except Exception as e:
            self.gemini = None
            print(f"Gemini API not configured: {str(e)}")

    def preprocess(self, text):
        """Apply same preprocessing as training"""
        return preprocess_text_lstm(text)

    def predict_with_confidence(self, text):
        """Predict with confidence estimation"""
        cleaned_text = self.preprocess(text)

        if not cleaned_text:
            return False, 0.0, "Low"

        # Convert to sequence
        sequence = self.tokenizer.texts_to_sequences([cleaned_text])
        padded_sequence = pad_sequences(sequence, maxlen=self.max_len, padding='post', truncating='post')

        # Get prediction
        prob = self.model.predict(padded_sequence, verbose=0)[0][0]
        is_hate = prob > self.threshold

        # Calculate confidence based on distance from threshold
        distance_from_threshold = abs(prob - self.threshold)
        if distance_from_threshold > 0.3:
            confidence = "High"
        elif distance_from_threshold > 0.1:
            confidence = "Medium"
        else:
            confidence = "Low"

        return is_hate, float(prob), confidence

    def classify_hate_category(self, text):
        """Classify hate speech category using Gemini"""
        if not self.gemini:
            return {
                "category": "Classification not available",
                "confidence": 0.0,
                "explanation": "Gemini API not configured"
            }
        
        candidate_labels = [
            "Sexual harassment",
            "Religious hate",
            "Political hate", 
            "Racial discrimination",
            "Gender-based hate",
            "Other hate speech"
        ]
        
        prompt = f"""Classify the following text into exactly one of these categories:
        {", ".join(candidate_labels)}.
        Text: "{text}"
        Return your response in JSON format with these keys:
        - "category": the most appropriate category name
        - "confidence": your confidence score between 0-1
        - "explanation": brief explanation (1 sentence)
        """
        
        try:
            response = self.gemini.generate_content(prompt)
            json_str = response.text.replace('```json', '').replace('```', '').strip()
            result = eval(json_str)
            return result
        except Exception as e:
            return {
                "category": "Classification failed",
                "confidence": 0.0,
                "explanation": f"Error: {str(e)}"
            }

    def analyze_text(self, text):
        """Comprehensive text analysis with category classification"""
        is_hate, prob, confidence = self.predict_with_confidence(text)
        
        result = {
            "text": text,
            "cleaned_text": self.preprocess(text),
            "is_hate_speech": bool(is_hate),
            "hate_probability": prob,
            "confidence": confidence,
            "threshold_used": self.threshold
        }
        
        # Add category classification if hate speech is detected
        if is_hate:
            category_result = self.classify_hate_category(text)
            result["hate_category"] = category_result["category"]
            result["category_confidence"] = category_result["confidence"]
            result["category_explanation"] = category_result["explanation"]
        else:
            result["hate_category"] = None
            result["category_confidence"] = None
            result["category_explanation"] = None

        return result

Usage example

In [48]:
# Usage example
if __name__ == "__main__":
    
    detector = RobustLSTMHateSpeechDetector()
    
    # Test samples
    test_samples = [
        "I love this beautiful sunny day!",
        "This movie was terrible and boring",
        "Great work on your project, congratulations!",
        "I hate when people don't clean up after themselves",
        "That's so annoying when traffic is heavy",
        "You're such an idiot for doing that",
        "Men are just violent animals — can't trust any of them.",
        "Women are terrible drivers",
        "I can't stand people from that country",
        "This food tastes awful",
        "I'm frustrated with this situation",
        "Burn their holy books — that garbage shouldn't exist.",
        "You're such a worthless loser. Nobody wants you around.",
        "That whole group is a plague. You all should disappear.",
        "I can't believe they let people like you speak. Disgusting.",
        "People who vote for that party are brain-dead traitors.",
        "I'm having such a rough day, nothing's going right.",
        "Your religion is a joke — brainwashed freaks.",
        "She looks like a man. Gross.",
        "You gays are always shoving your agenda down our throats.",
        "Ugh, Monday again. Need more coffee!"
    ]

    print("\nTesting Enhanced LSTM Detector with Category Classification:")
    print("="*70)

    for text in test_samples:
        result = detector.analyze_text(text)
        print(f"\nText: {result['text']}")
        print(f"Is Hate Speech: {result['is_hate_speech']}")
        print(f"Probability: {result['hate_probability']:.3f}")
        print(f"Confidence: {result['confidence']}")
        
        if result['is_hate_speech']:
            print(f"Hate Category: {result['hate_category']}")
            print(f"Category Confidence: {result['category_confidence']}")
            print(f"Explanation: {result['category_explanation']}")
        
        print("-" * 50)




Testing Enhanced LSTM Detector with Category Classification:

Text: I love this beautiful sunny day!
Is Hate Speech: False
Probability: 0.019
Confidence: High
--------------------------------------------------

Text: This movie was terrible and boring
Is Hate Speech: False
Probability: 0.194
Confidence: High
--------------------------------------------------

Text: Great work on your project, congratulations!
Is Hate Speech: False
Probability: 0.001
Confidence: High
--------------------------------------------------

Text: I love this beautiful sunny day!
Is Hate Speech: False
Probability: 0.019
Confidence: High
--------------------------------------------------

Text: This movie was terrible and boring
Is Hate Speech: False
Probability: 0.194
Confidence: High
--------------------------------------------------

Text: Great work on your project, congratulations!
Is Hate Speech: False
Probability: 0.001
Confidence: High
--------------------------------------------------

Text: I hate wh