In [1]:
import pandas as pd
from pathlib import Path

# Single place to point the notebook at the statistic-feature CSVs; the
# original repeated this absolute directory in every read_csv call.
FEATURES_DIR = Path(r'D:\data_analysis\speech_emotion_recognition\data\EnglishDataset\features\statistic_features')

# Columns removed from each table before it is used as a feature matrix.
NON_FEATURE_COLUMNS = ['label', 'file_path']


def split_features_labels(frame):
    """Split one loaded table into model inputs and targets.

    Args:
        frame: DataFrame containing a 'label' column and a 'file_path'
            column in addition to the feature columns.

    Returns:
        A ``(X, y)`` pair: ``X`` is ``frame`` without NON_FEATURE_COLUMNS
        and ``y`` is the ``frame['label']`` Series.
    """
    return frame.drop(columns=NON_FEATURE_COLUMNS), frame['label']


# Raw tables are kept under their own names so later cells can re-derive
# features from them without re-reading the files.
train_data = pd.read_csv(FEATURES_DIR / 'train.csv')
val_data = pd.read_csv(FEATURES_DIR / 'validation.csv')
test_data = pd.read_csv(FEATURES_DIR / 'test.csv')

X_train, y_train = split_features_labels(train_data)
X_val, y_val = split_features_labels(val_data)
X_test, y_test = split_features_labels(test_data)

In [2]:
# Sanity check: display the shape of every feature matrix and label vector
# (train, validation, test) as one tuple.
tuple(part.shape for part in (X_train, y_train, X_val, y_val, X_test, y_test))

((26921, 310), (26921,), (3369, 310), (3369,), (3336, 310), (3336,))

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from loguru import logger

import joblib

# Hyper-parameters are chosen by cross-validation below, so the separate
# validation split is folded into the training data.
X_train_combined = pd.concat([X_train, X_val])
y_train_combined = pd.concat([y_train, y_val])

# Standardise features using statistics from the training data only.
# NOTE(review): the scaler is fitted before cross-validation, so every CV fold
# shares the same scaling statistics; wrap scaler + model in a Pipeline if
# strict per-fold isolation is required.
scaler = StandardScaler()
X_train_combined = scaler.fit_transform(X_train_combined)
# Transform from the untouched test_data frame instead of overwriting X_test
# with a function of itself: re-running this cell would otherwise standardise
# already-standardised values and silently change the test metrics.
X_test = scaler.transform(test_data.drop(columns=['label', 'file_path']))

# Exhaustive search over 3 * 4 * 3 * 3 = 108 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train_combined, y_train_combined)

best_params = grid_search.best_params_
logger.info(f"Best parameters found: {best_params}")

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on the full combined training set with the winning parameters.
# Fitting it again here would only repeat that work (same data, same seed).
best_rf = grid_search.best_estimator_

# Save the best_rf model to a file.
# NOTE(review): the fitted scaler is not saved alongside it, so anyone loading
# this file must reproduce the same standardisation before calling predict.
joblib.dump(best_rf, 'best_random_forest_model.pkl')

# Final evaluation on the held-out test split.
y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

logger.info(f"Test Accuracy: {test_accuracy}")
logger.info(f"Test Classification Report:\n{test_report}")

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Classification Report:\n{test_report}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[32m2024-06-01 17:28:32.759[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mBest parameters found: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}[0m
[32m2024-06-01 17:30:46.652[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m42[0m - [1mTest Accuracy: 0.6432853717026379[0m
[32m2024-06-01 17:30:46.654[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m43[0m - [1mTest Classification Report:
              precision    recall  f1-score   support

       Angry       0.73      0.82      0.77       555
   Disgusted       0.60      0.51      0.55       518
     Fearful       0.74      0.47      0.58       550
       Happy       0.58      0.54      0.56       542
     Neutral       0.56      0.72      0.63       470
         Sad       0.61      0.76      0.68       521
    Suprised       0.88      0.79      0.84       180

    accuracy                           0.64      3336
   macr

Test Accuracy: 0.6432853717026379
Test Classification Report:
              precision    recall  f1-score   support

       Angry       0.73      0.82      0.77       555
   Disgusted       0.60      0.51      0.55       518
     Fearful       0.74      0.47      0.58       550
       Happy       0.58      0.54      0.56       542
     Neutral       0.56      0.72      0.63       470
         Sad       0.61      0.76      0.68       521
    Suprised       0.88      0.79      0.84       180

    accuracy                           0.64      3336
   macro avg       0.67      0.66      0.66      3336
weighted avg       0.65      0.64      0.64      3336

