## 1. Load Libraries & Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load dataset
df = pd.read_csv("/Users/tdf/Downloads/voice_emotion_project/voice_emotion_features_max.csv")

print(df.shape)
df.head()

(7035, 68)


Unnamed: 0,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,...,tonnetz_0,tonnetz_1,tonnetz_2,tonnetz_3,tonnetz_4,tonnetz_5,zcr,rms,spec_bw,label
0,-329.59491,86.756355,36.017418,24.669662,-3.993649,-0.238354,-1.785476,-11.477121,1.851376,-4.34765,...,-0.038315,-0.02792,-0.15552,-0.022064,0.051455,-0.005912,0.07907,0.03714,1909.07929,happy
1,-284.672699,71.960381,30.055868,27.643963,11.352015,-3.636509,0.140571,-16.349529,-11.72587,-0.844399,...,-0.00178,-0.033866,-0.070969,-0.088409,0.01901,0.034722,0.099757,0.080096,1960.19773,happy
2,-320.321411,94.481728,39.490005,21.114134,0.376732,0.086866,-3.697634,-5.30229,-1.843154,-10.104664,...,-0.015244,-0.005136,-0.079195,-0.109682,0.027029,0.033616,0.071734,0.047575,1960.795204,happy
3,-321.519836,113.328003,31.563755,17.429359,1.344368,-7.852514,-4.849203,-4.934086,-0.255196,-7.510925,...,0.031885,0.002903,-0.022905,-0.082021,0.0326,0.007381,0.072727,0.035213,1773.361819,happy
4,-374.524078,113.507393,36.383156,27.486307,7.868597,2.612098,5.879338,-13.007985,-10.430767,-1.531215,...,-0.030364,-0.0424,-0.108914,-0.134162,0.026602,-0.003448,0.065014,0.020297,1796.575801,happy


## 2. Separate Features & Labels

In [4]:
X = df.drop(columns=["label"])
y = df["label"]

print("Classes:", y.unique())

Classes: ['happy' 'sad' 'fear' 'neutral' 'angry']


## 3. Train / Validation Split (Stratified)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

## 4. Prediction Models

### Model A: Random Forest (Strong Baseline)

In [6]:
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.5394456289978679
              precision    recall  f1-score   support

       angry       0.68      0.72      0.70       293
        fear       0.54      0.25      0.34       292
       happy       0.44      0.47      0.46       293
     neutral       0.48      0.56      0.51       236
         sad       0.55      0.70      0.62       293

    accuracy                           0.54      1407
   macro avg       0.54      0.54      0.53      1407
weighted avg       0.54      0.54      0.53      1407



#### Hyperparameter Tuning for Random Forest Using GridSearch 

In [11]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [300, 500, 800],
    "max_depth": [None, 20, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"]
}

rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

grid = GridSearchCV(
    rf,
    param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print("Best RF:", grid.best_score_)
print("Best params:", grid.best_params_)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best RF: 0.5302061122956645
Best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}


### Model B: SVM (Often Best for Emotion Features)

In [7]:
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(
        kernel="rbf",
        C=10,
        gamma="scale",
        class_weight="balanced"
    ))
])

svm_pipeline.fit(X_train, y_train)

y_pred_svm = svm_pipeline.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.5472636815920398
              precision    recall  f1-score   support

       angry       0.66      0.73      0.70       293
        fear       0.47      0.45      0.46       292
       happy       0.50      0.43      0.46       293
     neutral       0.48      0.54      0.51       236
         sad       0.59      0.59      0.59       293

    accuracy                           0.55      1407
   macro avg       0.54      0.55      0.54      1407
weighted avg       0.54      0.55      0.54      1407



#### SVM Tuning 

In [12]:
param_grid = {
    "svm__C": [1, 5, 10, 20],
    "svm__gamma": ["scale", 0.01, 0.001]
}

svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(kernel="rbf", class_weight="balanced"))
])

grid = GridSearchCV(
    svm_pipeline,
    param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print("Best SVM score:", grid.best_score_)
print("Best params:", grid.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best SVM score: 0.5445984363894811
Best params: {'svm__C': 5, 'svm__gamma': 0.01}
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  31.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  16.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=  48.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=  25.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  16.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=  45.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=  42.0s
[C

[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  32.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=  28.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  17.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=  43.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  16.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  28.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=  25.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  16.0s
[CV] END max_depth=No

[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  21.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  16.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=  46.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=  43.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=  29.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=  16.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=  41.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=800; total time=  37.5s
[CV] END max_depth=No

[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=  48.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=  47.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  14.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=  45.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=  15.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=  24.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=500; total time=  25.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=  13.7s
[CV] END max_depth=Non

### Model C: Logistic Regression (Control Baseline)

In [8]:
logreg = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(
        max_iter=3000,
        class_weight="balanced"
    ))
])

logreg.fit(X_train, y_train)

y_pred_lr = logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.5031982942430704


## 5. Cross-validation for Reliable Accuracy

In [9]:
from sklearn.model_selection import cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    rf,
    X,
    y,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1
)

print("CV Accuracy:", scores.mean(), "+/-", scores.std())

CV Accuracy: 0.5408670931058991 +/- 0.007641609129049415


## 6. Classical ML Model Performance

| Model                 | Accuracy | Precision (macro avg) | Recall (macro avg) | F1-score (macro avg) | Notes / Hyperparameters |
|-----------------------|----------|---------------------|------------------|---------------------|------------------------|
| Random Forest         | 0.54     | 0.54                | 0.54             | 0.53                | n_estimators=500, max_depth=None, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', class_weight='balanced' |
| SVM (RBF Kernel)      | 0.55     | 0.54                | 0.55             | 0.54                | C=5, gamma=0.01, class_weight='balanced' |
| Logistic Regression   | 0.50     | 0.50                | 0.50             | 0.50                | max_iter=3000, class_weight='balanced' |

**Cross-Validation (Random Forest 5-fold):** 0.541 ± 0.008  

## 7. Conclusion
Classical ML models provide a baseline, but their performance is limited by handcrafted features. This low accuracy highlights the need for deep learning approaches (eg. Wav2Vec2) to better capture complex audio patterns for emotion recognition.
