In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load data
data_emotion = pd.read_csv("/kaggle/input/gender-age-and-emotion-detection-from-voice/cleaned_emotion.csv")
# now we are going with cleaned_age data

In [3]:
# Preprocess data
X = data_emotion.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'X', 'label'])  # Drop unnecessary columns
y = data_emotion['label']


In [4]:
# Encoding labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [5]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [6]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Define and train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [8]:
# Predictions and evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [10]:
print("Emotion Detection Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
# the model accuracy of random forest is 62% this is possioble becacuse the data misclassification

Emotion Detection Model Accuracy: 0.6208791208791209
Classification Report:
               precision    recall  f1-score   support

       angry       0.59      0.70      0.64        56
       happy       0.62      0.48      0.54        63
         sad       0.65      0.70      0.67        63

    accuracy                           0.62       182
   macro avg       0.62      0.62      0.62       182
weighted avg       0.62      0.62      0.62       182



In [11]:
# now we are going with age detection model

# Load required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load data
data_age = pd.read_csv("/kaggle/input/gender-age-and-emotion-detection-from-voice/cleaned_age.csv")


In [12]:
data_age.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,66,67,0.133338,0.069304,0.107668,0.089192,0.195267,0.106075,3.043456,13.694173,...,0.133338,0.121968,0.047337,0.277457,0.822656,0.0,4.6875,4.6875,0.076296,young
1,84,85,0.137433,0.058518,0.112037,0.092841,0.200079,0.107238,2.807995,12.77665,...,0.137433,0.111204,0.047151,0.277457,1.313384,0.0,6.046875,6.046875,0.135811,young
2,85,86,0.142227,0.065447,0.112242,0.093455,0.202909,0.109455,2.380899,9.942833,...,0.142227,0.118711,0.047013,0.275862,0.59375,0.0,6.539062,6.539062,0.096102,matured
3,87,88,0.133325,0.072849,0.11336,0.082861,0.203753,0.120892,1.904123,7.799218,...,0.133325,0.1162,0.047105,0.27907,0.424922,0.0,5.8125,5.8125,0.08188,young
4,88,89,0.130487,0.070407,0.113418,0.076098,0.196188,0.120089,1.820873,8.561101,...,0.130487,0.114802,0.047151,0.27907,0.19807,0.0,1.078125,1.078125,0.131579,matured


In [13]:
data_age.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'meanfreq', 'sd', 'median', 'Q25', 'Q75',
       'IQR', 'skew', 'kurt', 'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun',
       'minfun', 'maxfun', 'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx',
       'label'],
      dtype='object')

In [14]:
# Preprocess data
X = data_age.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'label'])  # Drop unnecessary columns
y = data_age['label']


In [15]:
# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [17]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [18]:
# Define and train model
age_model = RandomForestClassifier(random_state=42)
age_model.fit(X_train, y_train)


In [19]:
# Predictions and evaluation
y_pred = age_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [20]:
print("Age Detection Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
# the accuracy of this model is 61%

Age Detection Model Accuracy: 0.6139747995418099
Classification Report:
               precision    recall  f1-score   support

     matured       0.60      0.70      0.65       398
         old       0.76      0.35      0.48       105
       young       0.61      0.59      0.60       370

    accuracy                           0.61       873
   macro avg       0.66      0.55      0.58       873
weighted avg       0.62      0.61      0.61       873



In [None]:
# Explanation
#Label Encoding: Encodes the age categories into numeric values for model training.
#Scaling: Standardizes features for better model performance.
#Random Forest Model: A simple but powerful classifier, effective for many feature-based classification tasks.
#Classification Report: Provides accuracy, precision, recall, and F1-score for each age category.

In [21]:
# the accuracy of both models is very low 
# now we are with going with try optimization methods like hyper parameter tunnig

In [22]:
# importing library
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


In [23]:
# Define parameter grid
param_dist = {
    "n_estimators": randint(50, 200),
    "max_depth": randint(3, 15),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10)
}

In [24]:
# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    age_model, param_distributions=param_dist, n_iter=20, cv=5, random_state=42, n_jobs=-1
)

In [25]:
# Fit to find the best parameters
random_search.fit(X_train, y_train)

In [26]:
# Best model
best_model = random_search.best_estimator_

In [31]:
# Predictions and evaluation
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [28]:
print("Optimized Age Detection Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Best Parameters:", random_search.best_params_)

# but with the applying of hyper parameter tuniing the accuracy isnt improved is its because of data misclassification

Optimized Age Detection Model Accuracy: 0.6048109965635738
Classification Report:
               precision    recall  f1-score   support

     matured       0.59      0.69      0.63       398
         old       0.81      0.29      0.42       105
       young       0.61      0.60      0.61       370

    accuracy                           0.60       873
   macro avg       0.67      0.53      0.55       873
weighted avg       0.62      0.60      0.60       873

Best Parameters: {'max_depth': 12, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 104}


In [32]:
import joblib

# Save the model
model_filename = "optimized_age_model.pkl"
joblib.dump(best_model, model_filename)

# Code to download the model file
from IPython.display import FileLink
FileLink(model_filename)
