In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Fetch every NLTK resource requested by this script, without progress output.
for _resource in ('stopwords', 'vader_lexicon', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(_resource, quiet=True)


# Hand-picked sentiment lexicons (lower-case, alphabetical).
# Words that signal positive sentiment.
positive_words = {
    'amazing', 'awesome', 'beautiful', 'brilliant', 'cool', 'delightful',
    'enjoy', 'excellent', 'fabulous', 'fantastic', 'fine', 'good', 'great',
    'happy', 'like', 'love', 'magnificent', 'marvelous', 'nice',
    'outstanding', 'perfect', 'pleasant', 'pleased', 'satisfied', 'superb',
    'sweet', 'terrific', 'wonderful',
}

# Words that signal negative sentiment.
negative_words = {
    'absurd', 'angry', 'annoying', 'awful', 'bad', 'concerned', 'crazy',
    'cruel', 'depressed', 'difficult', 'disappointed', 'disgusting',
    'dislike', 'dumb', 'error', 'fail', 'failure', 'frustrating', 'furious',
    'harsh', 'hate', 'horrible', 'idiot', 'insane', 'irritating', 'issue',
    'mad', 'mean', 'nasty', 'problem', 'ridiculous', 'rude', 'sad', 'stupid',
    'suck', 'sucks', 'tempered', 'terrible', 'upset', 'worried', 'worst',
    'wrong',
}

# Connectives that introduce a contrasting clause ("good, but ...").
contrast_words = {
    'although', 'anyway', 'besides', 'but', 'despite', 'except', 'however',
    'nevertheless', 'nonetheless', 'regardless', 'still', 'though', 'yet',
}


# Load and preprocess data
def load_data(file_path):
    """Load a Sentiment140-style CSV and return it with binary targets.

    The file is expected to have no header row and six columns
    (target, id, date, flag, user, text), e.g.
    'training.1600000.processed.noemoticon.csv'.  Raw targets 0 and 4 are
    mapped to 0 (negative) and 1 (positive); rows with any other target are
    dropped.

    Fields are parsed with pandas' default quote handling so that quoted
    fields lose their surrounding quotes and commas inside a quoted tweet do
    not split the row.  (The previous ``quoting=3`` kept the quote characters,
    so a quoted target such as "0" never matched the 0/4 mapping.)

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with at least the columns 'target' (int 0/1) and
        'text'.  If the file cannot be opened or parsed, a small built-in
        demonstration data set of 6000 rows is returned instead.
    """
    col_names = ['target', 'id', 'date', 'flag', 'user', 'text']

    try:
        data = pd.read_csv(
            file_path,
            encoding='ISO-8859-1',
            header=None,
            names=col_names,
            usecols=[0, 1, 2, 3, 4, 5],
            on_bad_lines='skip',
        )
    except (OSError, ValueError) as e:
        # OSError covers a missing/unreadable file; pandas' parser and
        # decoding errors derive from ValueError.
        print(f"Error loading data: {e}")
        # Create sample data for testing if file doesn't exist
        sample_data = {
            'target': [0, 1, 0, 1, 0, 1] * 1000,  # 6000 samples
            'text': [
                "I am not happy today",
                "This is amazing",
                "I don't like this movie",
                "Great job everyone",
                "This is not good at all",
                "I love this product"
            ] * 1000
        }
        data = pd.DataFrame(sample_data)
        print("Using sample data for demonstration")
        return data

    print(f"Initial data shape: {data.shape}")
    print("Initial target distribution:")
    print(data['target'].value_counts())

    # Map targets: 0 -> 0 (negative), 4 -> 1 (positive).  Coerce to numbers
    # first so a stray non-numeric value becomes NaN instead of breaking the
    # mapping for the whole column.
    numeric_target = pd.to_numeric(data['target'], errors='coerce')
    data['target'] = numeric_target.map({0: 0, 4: 1})
    # .copy() so the assignment below works on an independent frame.
    data = data.dropna(subset=['target']).copy()
    data['target'] = data['target'].astype(int)

    print("After mapping - Target distribution:")
    print(data['target'].value_counts())

    return data

# Negation cues, written without apostrophes because the preprocessing step
# strips punctuation before tokens are compared against this set.
negation_words = {
    cue.replace("'", "")
    for cue in (
        'no', 'not', 'never', 'none', 'nobody', 'nothing', 'neither',
        'nowhere', 'hardly', 'scarcely', 'barely', 'cannot', 'cant', 'dont',
        'doesnt', 'didnt', 'isnt', 'wasnt', 'arent', 'werent', 'wont',
        'couldnt', 'shouldnt', 'mustnt', 'wouldnt', 'havent', 'hasnt',
        'hadnt', 'aint',
    )
}

stemmer = PorterStemmer()

# English stopwords minus the negation cues, so that negations survive
# stopword filtering.
stop_words = set(stopwords.words('english')) - negation_words

# Ordered (pattern, replacement) rules.  Irregular contractions must come
# before the generic "n't" rule, otherwise "can't" would already have been
# turned into "ca not" by the time its own rule runs.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"\bcan't\b", "cannot"),
        (r"\bwon't\b", "will not"),
        (r"\bshan't\b", "shall not"),
        (r"n't", " not"),
        (r"'re\b", " are"),
        (r"'ve\b", " have"),
        (r"'ll\b", " will"),
        (r"'d\b", " would"),
        (r"'m\b", " am"),
    )
)


def expand_contractions(text):
    """Expand common English contractions.

    Irregular forms are handled first ("can't" -> "cannot",
    "won't" -> "will not", "shan't" -> "shall not"); every remaining "n't"
    becomes " not", and the suffixes 're, 've, 'll, 'd and 'm are expanded to
    " are", " have", " will", " would" and " am".

    Args:
        text: Input string.

    Returns:
        The string with contractions expanded; text without contractions is
        returned unchanged.
    """
    # Treat the typographic apostrophe like the ASCII one.
    text = text.replace("\u2019", "'")
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

def enhanced_preprocess_text(text, neg_scope=2):
    """Turn raw text into a space-joined string of stemmed, tagged tokens.

    The text is lower-cased, contractions are expanded, and URLs, @mentions
    and every non-letter character are replaced by spaces.  Tokens shorter
    than two characters are dropped.  Each remaining token is handled by the
    first rule that matches:

    * contrast word  -> ``CONTRAST_<word>``, followed by
      ``NEGATIVE_AFTER_CONTRAST`` when the tokens after it contain more
      negative than positive lexicon words;
    * negation word  -> ``NEG_<stem>``; up to ``neg_scope`` following
      non-stopword tokens are emitted as ``NOT_<stem>``.  The scope ends
      early at a contrast word so that negation does not leak into the
      contrasting clause;
    * positive / negative lexicon word -> ``POSITIVE_<stem>`` /
      ``NEGATIVE_<stem>`` plus the bare stem;
    * any other token -> its stem, unless it is a stopword.

    Lexicon look-ups are whole-token comparisons throughout.

    Args:
        text: Raw input text.  NaN/None yields an empty string.
        neg_scope: Maximum number of tokens tagged as negated after a
            negation word.

    Returns:
        The processed tokens joined by single spaces (possibly "").
    """
    if pd.isna(text):
        return ""

    text = expand_contractions(str(text).lower())

    # Remove URLs and mentions, then blank out everything that is not a
    # letter (this also strips '#' from hashtags and leftover apostrophes).
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    tokens = text.split()
    processed = []

    i = 0
    while i < len(tokens):
        word = tokens[i]

        if len(word) < 2:
            i += 1
            continue

        # Contrast words (but, however, ...) mark a possible sentiment shift.
        if word in contrast_words:
            processed.append(f"CONTRAST_{word}")
            # Compare whole tokens, not substrings: "hate" must not be found
            # inside "whatever", nor "mad" inside "made".
            following = tokens[i + 1:]
            neg_words_after = sum(1 for t in following if t in negative_words)
            pos_words_after = sum(1 for t in following if t in positive_words)

            if neg_words_after > pos_words_after:
                processed.append("NEGATIVE_AFTER_CONTRAST")
            i += 1

        # Negation: tag the cue and the next few content words.
        elif word in negation_words:
            processed.append(f"NEG_{stemmer.stem(word)}")
            j = i + 1
            negated_count = 0
            while j < len(tokens) and negated_count < neg_scope:
                next_word = tokens[j]
                if next_word in contrast_words:
                    # Leave the contrast word for the main loop to tag.
                    break
                if len(next_word) > 1 and next_word not in stop_words:
                    processed.append(f"NOT_{stemmer.stem(next_word)}")
                    negated_count += 1
                j += 1
            # Skip what was consumed; if nothing was negated, just step past
            # the negation word and let the main loop handle what follows.
            i = j if negated_count > 0 else i + 1

        # Sentiment words get a polarity marker plus the bare stem.
        elif word in positive_words:
            stem = stemmer.stem(word)
            processed.append(f"POSITIVE_{stem}")
            processed.append(stem)
            i += 1

        elif word in negative_words:
            stem = stemmer.stem(word)
            processed.append(f"NEGATIVE_{stem}")
            processed.append(stem)
            i += 1

        # Regular words
        else:
            if word not in stop_words:
                processed.append(stemmer.stem(word))
            i += 1

    return " ".join(processed)

def create_model_pipeline():
    """Build the candidate classifiers to compare.

    Returns:
        dict mapping a display name to a new, unfitted scikit-learn
        estimator, in the order the candidates should be trained.
    """
    seed = 42
    candidates = [
        ('Logistic Regression', LogisticRegression(random_state=seed, max_iter=1000)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=seed)),
        ('Naive Bayes', MultinomialNB()),
        ('Linear SVM', LinearSVC(random_state=seed, max_iter=1000)),
        ('Gradient Boosting', GradientBoostingClassifier(random_state=seed, n_estimators=100)),
    ]
    return dict(candidates)


def calculate_sentiment_score(text):
    """Compute a lexicon-based sentiment score with contrast weighting.

    The text is lower-cased and split into alphabetic tokens, so attached
    punctuation ("terrible!", "but,") does not prevent a match.  Tokens are
    compared to the lexicons as whole words; inflected forms that are not
    listed in a lexicon are therefore not counted, and a word such as
    "dislike" is no longer counted as positive because it contains "like".

    The base score is ``positive - negative``.  If the text contains a
    contrast word, the sentiment words after the first one are counted again
    with weight 2, so the clause following "but"/"however"/... dominates.

    Args:
        text: Input string.

    Returns:
        Tuple ``(total_sentiment, has_contrast, pos_count, neg_count)`` where
        ``total_sentiment`` is the weighted score, ``has_contrast`` tells
        whether a contrast word occurs as a token, and the counts are the
        numbers of positive / negative tokens in the whole text.
    """
    words = re.findall(r"[a-z]+", text.lower())

    # Count sentiment words
    pos_count = sum(1 for w in words if w in positive_words)
    neg_count = sum(1 for w in words if w in negative_words)

    # Detect the contrast word and its position in one step so that the flag
    # and the position can never disagree.
    contrast_pos = next(
        (i for i, w in enumerate(words) if w in contrast_words), None
    )
    has_contrast = contrast_pos is not None

    total_sentiment = pos_count - neg_count
    if has_contrast:
        after_contrast = words[contrast_pos + 1:]
        pos_after = sum(1 for w in after_contrast if w in positive_words)
        neg_after = sum(1 for w in after_contrast if w in negative_words)

        # Weight post-contrast sentiment more heavily
        total_sentiment += 2 * (pos_after - neg_after)

    return total_sentiment, has_contrast, pos_count, neg_count



def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """Fit every candidate classifier and pick the most accurate one.

    Each model from ``create_model_pipeline()`` is fitted on the training
    split and scored by accuracy on the test split; progress is printed.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.

    Returns:
        Tuple ``(best_model, results)``: the fitted estimator with the
        highest accuracy (the earliest one on ties) and a dict mapping model
        name to accuracy.
    """
    rule = "-" * 50
    results = {}
    best_model, best_score = None, 0

    print("\nTraining and evaluating models...")
    print(rule)

    for name, model in create_model_pipeline().items():
        print(f"Training {name}...")

        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        results[name] = accuracy

        print(f"{name} Accuracy: {accuracy:.4f}")

        # Strict comparison: an equal score never displaces an earlier model.
        if accuracy > best_score:
            best_model, best_score = model, accuracy

    print(rule)
    best_name = max(results, key=results.get)
    print(f"Best Model: {best_name} with accuracy: {best_score:.4f}")

    return best_model, results


# Module-level TF-IDF configuration: unigrams up to 4-grams for extra context.
# Note that main() builds its own TfidfVectorizer in a local variable of the
# same name, so this instance is not the one fitted there.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    max_features=20000,
    sublinear_tf=True,
    min_df=2,
    max_df=0.90,
)

def _model_confidence(model, text_vector):
    """Return the model's confidence in its prediction for one sample.

    Uses ``predict_proba`` when the estimator provides it.  Margin-only
    classifiers such as LinearSVC expose ``decision_function`` instead; the
    absolute margin is squashed through a logistic function, which gives an
    uncalibrated value in [0.5, 1).  NaN is returned if neither is available.
    """
    if hasattr(model, "predict_proba"):
        return float(np.max(model.predict_proba(text_vector)[0]))
    if hasattr(model, "decision_function"):
        margin = float(np.ravel(model.decision_function(text_vector))[0])
        return float(1.0 / (1.0 + np.exp(-abs(margin))))
    return float("nan")


def predict_sentiment(text, model, vectorizer):
    """Classify one text, combining the model with a lexicon override.

    The text is preprocessed and vectorised, and the model's prediction is
    taken.  The lexicon score from ``calculate_sentiment_score`` then forces
    a negative result in two cases: a contrast word is present and the score
    is below -1 (confidence 0.8), or the score is below -2 (confidence 0.75).

    Args:
        text: Raw input text; None/NaN is treated as empty.
        model: Fitted classifier predicting 0 (negative) or 1 (positive).
        vectorizer: Fitted vectorizer matching the model's features.

    Returns:
        Tuple ``(sentiment, confidence)`` where sentiment is "positive",
        "negative", or "neutral" (confidence 0.5) when nothing is left after
        preprocessing.
    """
    raw_text = "" if pd.isna(text) else str(text)

    # Preprocess text
    cleaned_text = enhanced_preprocess_text(raw_text)
    if not cleaned_text.strip():
        return "neutral", 0.5

    sentiment_score, has_contrast, _, _ = calculate_sentiment_score(raw_text)

    # Get model prediction
    text_vector = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vector)[0]
    confidence = _model_confidence(model, text_vector)

    # Override model prediction for clear cases
    if has_contrast and sentiment_score < -1:
        # Strong negative after contrast
        prediction, confidence = 0, 0.8
    elif sentiment_score < -2:
        # Very negative overall
        prediction, confidence = 0, 0.75

    sentiment = "positive" if prediction == 1 else "negative"
    return sentiment, confidence

def main():
    """Train, evaluate, demonstrate and save the sentiment classifier.

    Steps: load the data set, sample at most 50,000 rows, preprocess the
    text, build TF-IDF features, train the candidate models on an 80/20
    stratified split, run the best model on a fixed list of demo sentences,
    and pickle the best model and the vectorizer to 'sentiment_model.pkl'
    and 'vectorizer.pkl' in the current directory.

    Returns:
        Tuple ``(best_model, vectorizer)``: the fitted classifier and the
        fitted TfidfVectorizer.
    """
    # Load data
    print("Loading data...")
    file_path = 'training.1600000.processed.noemoticon.csv'
    data = load_data(file_path)

    # Take a subset for faster processing (optional)
    # For full dataset, comment out the next line
    data = data.sample(n=min(50000, len(data)), random_state=42)

    print(f"\nUsing {len(data)} samples for training")

    # Preprocess text
    print("Preprocessing text data...")
    data['clean_text'] = data['text'].apply(enhanced_preprocess_text)

    # Remove empty texts
    data = data[data['clean_text'].str.strip() != '']
    print(f"After cleaning: {len(data)} samples")

    # Show some examples
    print("\nSample preprocessed texts:")
    preview = data.head(5)
    for original, cleaned, target in zip(
        preview['text'], preview['clean_text'], preview['target']
    ):
        print(f"Target: {target}")
        print(f"Original: {original}")
        print(f"Cleaned:  {cleaned}")
        print("-" * 40)

    # Vectorization
    print("Vectorizing text data...")
    vectorizer = TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95
    )

    X = vectorizer.fit_transform(data['clean_text'])
    y = data['target']

    print(f"Feature matrix shape: {X.shape}")
    print(f"Number of features: {X.shape[1]}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Training set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")

    # Train models
    best_model, results = train_and_evaluate_models(X_train, X_test, y_train, y_test)

    # Test negation handling
    print("\n" + "="*60)
    print("TESTING NEGATION HANDLING")
    print("="*60)

    # Demo sentences, each listed once: repeating the list only repeats the
    # same predictions and floods the output.
    sample_texts = [
        "I am not happy today", "This is amazing", "I don't like this movie",
        "Great job everyone", "This is not good at all", "I love this product",
        "he is a good guy but sometimes makes me mad",
        "she is nice but can be annoying",
        "the food was great but service was terrible",
        "he is short tempered", "she has anger issues",
        "this is frustrating and annoying",
        "I hate when people are rude", "the movie was boring and stupid",
        "excellent work but could be improved", "good food but expensive",
        "not bad at all", "couldn't be worse", "absolutely terrible experience"
    ]

    # Expected label for each demo sentence (0 = negative, 1 = positive).
    sample_targets = [
        0, 1, 0, 1, 0, 1,  # Basic cases
        0, 0, 0,           # Mixed sentiment (negative outcome)
        0, 0, 0,           # Clear negative
        0, 0,              # Clear negative
        0, 0,              # Mixed with negative lean
        1, 0, 0            # Various cases
    ]
    label_names = {0: "negative", 1: "positive"}

    matches = 0
    for sentence, expected in zip(sample_texts, sample_targets):
        sentiment, confidence = predict_sentiment(sentence, best_model, vectorizer)
        print(f"Text: '{sentence}'")
        print(f"Prediction: {sentiment} (confidence: {confidence:.3f})")
        print(f"Expected: {label_names[expected]}")
        print(f"Preprocessed: '{enhanced_preprocess_text(sentence)}'")
        print("-" * 50)
        if sentiment == label_names[expected]:
            matches += 1

    print(f"Demo sentences predicted as expected: {matches}/{len(sample_texts)}")

    # Save model and vectorizer
    print("\nSaving model and vectorizer...")
    with open('sentiment_model.pkl', 'wb') as f:
        pickle.dump(best_model, f)

    with open('vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

    print("Model and vectorizer saved successfully!")

    return best_model, vectorizer

# Interactive prediction function
def interactive_prediction():
    """Load the saved model and vectorizer and classify sentences typed in.

    Reads 'sentiment_model.pkl' and 'vectorizer.pkl' from the current
    directory, then prompts for sentences until the user enters 'quit' or
    ends the input (EOF / Ctrl-C).  For each sentence the original text, the
    preprocessed text, the predicted sentiment and the confidence are
    printed.  If either file is missing, a hint is printed and the function
    returns.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # this script wrote itself.
    try:
        with open('sentiment_model.pkl', 'rb') as f:
            model = pickle.load(f)

        with open('vectorizer.pkl', 'rb') as f:
            vectorizer = pickle.load(f)
    except FileNotFoundError:
        print("Model files not found. Please run the main training function first.")
        return

    print("Model loaded successfully!")
    print("Enter 'quit' to exit")

    while True:
        try:
            user_input = input("\nEnter a sentence for sentiment analysis: ")
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the loop without a traceback.
            print()
            break

        # Ignore surrounding whitespace when checking for the exit command.
        if user_input.strip().lower() == 'quit':
            break

        sentiment, confidence = predict_sentiment(user_input, model, vectorizer)
        preprocessed = enhanced_preprocess_text(user_input)

        print(f"Original: '{user_input}'")
        print(f"Preprocessed: '{preprocessed}'")
        print(f"Sentiment: {sentiment}")
        print(f"Confidence: {confidence:.3f}")

if __name__ == "__main__":
    # Run the full training pipeline; this rebinds the module-level
    # `vectorizer` to the one fitted inside main().
    model, vectorizer = main()

    # Then hand over to the console loop, which reloads the pickled files.
    banner = "=" * 60
    print("\n" + banner)
    print("Starting interactive prediction mode...")
    interactive_prediction()

Streaming output truncated to the last 5000 lines.
Preprocessed: 'movi bore NEGATIVE_stupid stupid'
--------------------------------------------------
Text: 'excellent work but could be improved'
Prediction: positive (confidence: 0.605)
Preprocessed: 'POSITIVE_excel excel work CONTRAST_but could improv'
--------------------------------------------------
Text: 'good food but expensive'
Prediction: positive (confidence: 0.605)
Preprocessed: 'POSITIVE_good good food CONTRAST_but expens'
--------------------------------------------------
Text: 'not bad at all'
Prediction: negative (confidence: 0.980)
Preprocessed: 'NEG_not NOT_bad'
--------------------------------------------------
Text: 'couldn't be worse'
Prediction: negative (confidence: 0.980)
Preprocessed: 'could NEG_not NOT_wors'
--------------------------------------------------
Text: 'absolutely terrible experience'
Prediction: positive (confidence: 0.605)
Preprocessed: 'absolut NEGATIVE_terribl terribl experi'
------