In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import google.generativeai as genai
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib
from dotenv import load_dotenv

# --- Configuration ---
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY not found in environment variables.")

genai.configure(api_key=GOOGLE_API_KEY)
EMBEDDING_MODEL = 'models/embedding-001'
DATASET_PATH = 'Hindi_Sentiment.csv'
EMBEDDINGS_CACHE_PATH = 'hindi_emotion_with_embeddings.csv' # New cache file
MODEL_PATH = 'svm_emotion_model.pkl' # New model file

In [2]:
df = pd.read_csv(DATASET_PATH)
unique_labels = df['label'].unique()
unique_labels

array(['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust',
       'anger'], dtype=object)

In [3]:
ID_TO_LABEL = {i: label for i, label in enumerate(unique_labels)}
LABEL_TO_ID = {label: i for i, label in enumerate(unique_labels)}

In [14]:
# --- Functions --- (get_gemini_embedding remains the same)
def get_gemini_embedding(text):
    """Generates an embedding for the given text using the Gemini API."""
    if not text or not isinstance(text, str):
        return [0] * 768
    try:
        result = genai.embed_content(model=EMBEDDING_MODEL, content=text)
        return result['embedding']
    except Exception as e:
        print(f"Error getting embedding for text: '{text}'. Error: {e}")
        return [0] * 768

def main():
    print("--- Starting Multi-Class Emotion Model Training ---")

    # --- 1. Load Data and Generate Embeddings ---
    if os.path.exists(EMBEDDINGS_CACHE_PATH):
        print(f"Loading pre-computed embeddings from {EMBEDDINGS_CACHE_PATH}...")
        df = pd.read_csv(EMBEDDINGS_CACHE_PATH)
        df['embedding'] = df['embedding'].apply(lambda x: np.array(eval(x)))
    else:
        print(f"Loading dataset from {DATASET_PATH}...")
        df = pd.read_csv(DATASET_PATH)
        
        # --- <<< UPDATED: Clean and filter for our target emotions >>> ---
        df.dropna(subset=['label', 'sentences'], inplace=True)
        df['label'] = df['label'].str.strip().str.lower() # Standardize labels
        df = df[df['label'].isin(unique_labels)] # Keep only the labels we want
        
        # Optional: Use a sample for faster training
        # df = df.sample(n=1000, random_state=42).reset_index(drop=True) 
        
        print(f"Generating embeddings for {len(df)} texts... (This may take a while)")
        df['embedding'] = df['sentences'].apply(get_gemini_embedding)
        time.sleep(1) 
        
        print(f"Saving embeddings to {EMBEDDINGS_CACHE_PATH}...")
        df.to_csv(EMBEDDINGS_CACHE_PATH, index=False)

    # --- 2. Prepare Data for SVM ---
    print("Preparing data for SVM training...")
    df['label_id'] = df['label'].map(LABEL_TO_ID) # Map labels to numeric IDs
    df.dropna(subset=['label_id'], inplace=True)
    df['label_id'] = df['label_id'].astype(int)

    X = np.array(df['embedding'].tolist())
    y = df['label_id'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # --- 3. Hyperparameter Tuning with GridSearchCV ---
    print("\nStarting Hyperparameter Tuning with GridSearchCV...")
    param_grid = [
        {'kernel': ['linear'], 'C': [0.1, 1, 10]},
        {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.1, 0.01, 'scale']},
    ]
    base_svm = SVC(probability=True, class_weight='balanced')
    grid_search = GridSearchCV(estimator=base_svm, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
    
    print("Fitting the grid search... (This will take a while)")
    grid_search.fit(X_train, y_train)
    
    print(f"\nBest parameters found: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
    svm_model = grid_search.best_estimator_

    # --- 4. Evaluate Model ---
    print("Evaluating model...")
    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    # --- <<< UPDATED: Use our label names in the report >>> ---
    print(classification_report(y_test, y_pred, target_names=unique_labels))

    # --- 5. Save the Trained Model AND the Label Mapping ---
    print(f"Saving trained model and labels to {MODEL_PATH}...")
    # It's best practice to save the model and its metadata (like labels) together
    model_data = {
        'model': svm_model,
        'id_to_label': ID_TO_LABEL
    }
    joblib.dump(model_data, MODEL_PATH)
    print("Generating and saving confusion matrix...")
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=EMOTION_LABELS, yticklabels=EMOTION_LABELS)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix for Tuned SVM')
    plt.savefig('confusion_matrix_tuned.png')
    
    print("--- Training Complete ---")

if __name__ == '__main__':
    main()

--- Starting Multi-Class Emotion Model Training ---
Loading pre-computed embeddings from hindi_emotion_with_embeddings.csv...
Preparing data for SVM training...

Starting Hyperparameter Tuning with GridSearchCV...
Fitting the grid search... (This will take a while)
Fitting 3 folds for each of 12 candidates, totalling 36 fits

Best parameters found: {'C': 10, 'kernel': 'linear'}
Best cross-validation accuracy: 0.3975
Evaluating model...
Model Accuracy: 0.4144

Classification Report:
              precision    recall  f1-score   support

     neutral       0.67      0.54      0.60       744
    surprise       0.25      0.43      0.32       180
        fear       0.09      0.12      0.10        49
     sadness       0.14      0.23      0.17       112
         joy       0.40      0.30      0.34       282
     disgust       0.07      0.06      0.07        47
       anger       0.33      0.34      0.33       186

    accuracy                           0.41      1600
   macro avg       0.28  

NameError: name 'confusion_matrix' is not defined

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Generating and saving confusion matrix...")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fm='d', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Tuned SVM')
plt.savefig('confusion_matrix_tuned.png')
    
print("--- Training Complete ---")

Generating and saving confusion matrix...


NameError: name 'y_test' is not defined