# Sentiment Analysis Model Training

This notebook trains a model to analyze sentiment in customer communications (emails, reviews, feedback, etc.).

## Approach
1. **Option A:** Fine-tuned Transformer (DistilBERT) - Better accuracy
2. **Option B:** Traditional ML (TF-IDF + SVM/Naive Bayes) - Faster, works on CPU

We'll implement both and compare.


In [None]:
# Install required dependencies (run this first)
!pip install pandas numpy scikit-learn matplotlib seaborn transformers torch datasets


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Project layout, expressed relative to the notebook's working directory
BASE_DIR = Path('../')
DATA_DIR = BASE_DIR.joinpath('data')
MODELS_DIR = BASE_DIR.joinpath('models', 'sentiment_analyzer')

# Create the output folder (and any missing parents) up front so later
# cells can write artifacts into it without checking.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations, one per line
print(
    f"Base directory: {BASE_DIR}",
    f"Data directory: {DATA_DIR}",
    f"Models directory: {MODELS_DIR}",
    sep="\n",
)


## 1. Load and Prepare Data


In [None]:
# Load training data
import sys
sys.path.insert(0, str(Path('../../')))  # make the project-level `ml` package importable

from ml.utils.data_loader import load_sentiment_data
import pandas as pd


def _build_sample_sentiment_data():
    """Return a small, class-balanced placeholder dataset.

    Used only when the real training data cannot be loaded. Each of the three
    labels gets five examples (15 rows in total). That is the smallest
    balanced size for which the stratified ``train_test_split`` in the next
    cell (``test_size=0.2``) gets a test split of at least three rows, i.e.
    one per class; with fewer rows scikit-learn raises ``ValueError``.

    Returns:
        pd.DataFrame: columns ``'text'`` and ``'label'``, where the label is
        one of ``'positive'``, ``'negative'`` or ``'neutral'``.
    """
    samples_by_label = {
        'positive': [
            'Thank you for the excellent service!',
            'Great product, very satisfied!',
            'Outstanding customer support!',
            'Highly recommend this product!',
            'Fast shipping and friendly staff, love it',
        ],
        'negative': [
            'I am very disappointed with the quality',
            'This is terrible, I want a refund',
            'Poor quality, not worth the money',
            'Worst experience ever',
            'The item arrived broken and nobody answered my emails',
        ],
        'neutral': [
            'The order was delivered on time',
            'Average product, nothing special',
            'I received the package yesterday',
            'The invoice was sent to my email address',
            'The product comes in three colors',
        ],
    }
    rows = [
        (text, label)
        for label, texts in samples_by_label.items()
        for text in texts
    ]
    return pd.DataFrame(rows, columns=['text', 'label'])


def _print_dataset_summary(frame):
    """Print the shape, label distribution and first rows of ``frame``."""
    print(f"  Shape: {frame.shape}")
    print(f"\n  Label distribution:")
    print(frame['label'].value_counts())
    print(f"\n  First few samples:")
    print(frame.head())


print("=" * 60)
print("Loading sentiment analysis training data...")
print("=" * 60)

try:
    df = load_sentiment_data()

    # Safety check: the loader may hand back nothing at all
    if df is None or len(df) == 0:
        print("⚠️ No data loaded. Creating sample data...")
        df = _build_sample_sentiment_data()

    print(f"\n✓ Dataset loaded successfully!")
    _print_dataset_summary(df)

except Exception as e:
    # Deliberate best-effort fallback: any failure while loading or while
    # summarising the data (e.g. a missing 'label' column) switches the
    # notebook to the placeholder data so the remaining cells can still run.
    print(f"❌ Error: {e}")
    print("Creating sample data...")
    df = _build_sample_sentiment_data()
    _print_dataset_summary(df)


In [None]:
# Split data, stratified so train and test keep the same label proportions
try:
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
    )
except ValueError as split_error:
    # Stratification needs at least two rows per class and a test split that
    # is at least as large as the number of classes. Very small datasets
    # cannot satisfy that, so fall back to a plain random split rather than
    # aborting the notebook.
    print(f"⚠️ Stratified split failed ({split_error}); using a plain random split instead.")
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=0.2, random_state=42
    )

# Vectorize text (unigrams + bigrams); the vocabulary is fitted on the
# training split only so nothing from the test split leaks into it
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes model
print("Training Naive Bayes classifier...")
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

# Evaluate
y_pred = nb_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)

# One fixed label order for the report, the matrix and the tick labels.
# Without it, confusion_matrix only covers labels that occur in
# y_test / y_pred, and its size can disagree with nb_model.classes_.
eval_labels = sorted(set(nb_model.classes_) | set(y_test))

print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:")
# zero_division=0 reports 0.0 (instead of emitting warnings) for classes
# that have no true or predicted samples in a small test split
print(classification_report(y_test, y_pred, labels=eval_labels, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=eval_labels)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=eval_labels, yticklabels=eval_labels, ax=ax)
ax.set_title('Confusion Matrix - Naive Bayes')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
fig.savefig(MODELS_DIR / 'nb_confusion_matrix.png')
plt.show()

# Save model
# Imported here because `ml` only becomes importable after the sys.path
# tweak in the data-loading cell.
from ml.utils.model_saver import save_model
save_model(nb_model, vectorizer, MODELS_DIR, accuracy)

print(f"\nModel saved to {MODELS_DIR}")


## 3. Test the Model


In [None]:
# Try the trained classifier on a few unseen sentences
test_texts = [
    "I love this product! It's amazing!",
    "This is the worst service I've ever experienced",
    "The product arrived on time and works as expected"
]

for text in test_texts:
    # Reuse the fitted vectorizer so the features match the training vocabulary
    test_vec = vectorizer.transform([text])
    pred = nb_model.predict(test_vec)[0]
    proba = nb_model.predict_proba(test_vec)[0]
    confidence = max(proba)  # probability of the most likely class

    # Assemble the whole report first, then emit it in one go
    report_lines = [
        f"\nText: {text}",
        f"Prediction: {pred}",
        f"Confidence: {confidence:.2%}",
        f"All probabilities:",
    ]
    # predict_proba columns follow the order of nb_model.classes_
    report_lines.extend(
        f"  {label}: {prob:.2%}" for label, prob in zip(nb_model.classes_, proba)
    )
    print("\n".join(report_lines))
