In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   filename            1000 non-null   object 
 1   tempo               1000 non-null   float64
 2   beats               1000 non-null   int64  
 3   chroma_stft         1000 non-null   float64
 4   rmse                1000 non-null   float64
 5   spectral_centroid   1000 non-null   float64
 6   spectral_bandwidth  1000 non-null   float64
 7   rolloff             1000 non-null   float64
 8   zero_crossing_rate  1000 non-null   float64
 9   mfcc1               1000 non-null   float64
 10  mfcc2               1000 non-null   float64
 11  mfcc3               1000 non-null   float64
 12  mfcc4               1000 non-null   float64
 13  mfcc5               1000 non-null   float64
 14  mfcc6               1000 non-null   float64
 15  mfcc7               1000 non-null   float64
 16  mfcc8  

In [4]:
df.isna().sum()

filename              0
tempo                 0
beats                 0
chroma_stft           0
rmse                  0
spectral_centroid     0
spectral_bandwidth    0
rolloff               0
zero_crossing_rate    0
mfcc1                 0
mfcc2                 0
mfcc3                 0
mfcc4                 0
mfcc5                 0
mfcc6                 0
mfcc7                 0
mfcc8                 0
mfcc9                 0
mfcc10                0
mfcc11                0
mfcc12                0
mfcc13                0
mfcc14                0
mfcc15                0
mfcc16                0
mfcc17                0
mfcc18                0
mfcc19                0
mfcc20                0
label                 0
dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tempo,1000.0,119.601702,28.297367,54.978391,99.384014,117.453835,135.999178,234.90767
beats,1000.0,57.138,14.225728,18.0,47.0,56.0,65.25,117.0
chroma_stft,1000.0,0.378656,0.081689,0.171782,0.319641,0.383075,0.435974,0.663573
rmse,1000.0,0.130929,0.065685,0.005276,0.086625,0.122448,0.175793,0.398012
spectral_centroid,1000.0,2201.834226,715.961347,569.930721,1627.793931,2209.46878,2691.969702,4434.439444
spectral_bandwidth,1000.0,2242.559613,526.337663,897.994319,1907.136505,2221.408983,2578.474352,3509.578677
rolloff,1000.0,4571.702159,1574.770035,749.062137,3380.956639,4658.67183,5534.197785,8676.405868
zero_crossing_rate,1000.0,0.103637,0.041834,0.021701,0.070281,0.099539,0.132007,0.274829
mfcc1,1000.0,-144.479173,100.235661,-552.064063,-200.695133,-120.206072,-73.895019,42.034587
mfcc2,1000.0,99.552199,31.331904,-1.527148,76.81148,98.452551,119.893638,193.096518


In [7]:
df.drop('filename',axis=1,inplace=True)

In [8]:
X = df.drop('label',axis=1)

In [9]:
y = df['label']

In [10]:
encoder = LabelEncoder()

In [11]:
y = encoder.fit_transform(y)

In [12]:
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
scaler = StandardScaler()

In [15]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [17]:
# Train multiple classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [18]:

# Initialize the classifiers
logreg = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()

# Train the classifiers
logreg.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
svm.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Evaluate the models
from sklearn.metrics import accuracy_score

# Make predictions on the testing set
y_pred_logreg = logreg.predict(X_test)
y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_svm = svm.predict(X_test)

# Calculate accuracy scores
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print("Accuracy Scores:")
print("Logistic Regression:", accuracy_logreg)
print("Decision Tree:", accuracy_dt)
print("Random Forest:", accuracy_rf)
print("SVM:", accuracy_svm)

Accuracy Scores:
Logistic Regression: 0.605
Decision Tree: 0.415
Random Forest: 0.58
SVM: 0.585


In [20]:
# Select the best-performing model
best_model = max([(accuracy_logreg, "Logistic Regression"),
                  (accuracy_dt, "Decision Tree"),
                  (accuracy_rf, "Random Forest"),
                  (accuracy_svm, "SVM")])

selected_model = best_model[1]
print("Selected Model:", selected_model)

Selected Model: Logistic Regression


In [21]:
# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_classifier = RandomForestClassifier()
# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and best score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

# Evaluate the model on the test set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: ", accuracy)
best_model = grid_search.best_estimator_

Best Hyperparameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score:  0.66125
Test Accuracy:  0.575


In [22]:
# Save the selected model
joblib.dump(best_model, 'genre_model.pkl')

['genre_model.pkl']