In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('music_genre.csv')

In [6]:
df.shape

(1000, 28)

In [7]:
df.columns

Index(['filename', 'chroma_stft', 'rmse', 'spectral_centroid',
       'spectral_bandwidth', 'rolloff', 'zero_crossing_rate', 'mfcc1', 'mfcc2',
       'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10',
       'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14', 'mfcc15', 'mfcc16', 'mfcc17',
       'mfcc18', 'mfcc19', 'mfcc20', 'label'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,blues.00000.wav,0.349943,0.130225,1784.420446,2002.650192,3806.485316,0.083066,-113.596748,121.557297,-19.158825,...,8.810669,-3.667368,5.751691,-5.162763,0.750948,-1.691938,-0.409953,-2.300209,1.219929,blues
1,blues.00001.wav,0.340983,0.095918,1529.835316,2038.617579,3548.820207,0.056044,-207.556793,124.006721,8.93056,...,5.376803,-2.23912,4.216963,-6.012273,0.93611,-0.716537,0.293876,-0.287431,0.531573,blues
2,blues.00002.wav,0.363603,0.175573,1552.481958,1747.165985,3040.514948,0.076301,-90.754387,140.4599,-29.109968,...,5.789265,-8.905224,-1.08372,-9.218359,2.455806,-7.726901,-1.815723,-3.433434,-2.226821,blues
3,blues.00003.wav,0.404779,0.141191,1070.119953,1596.333948,2185.028454,0.033309,-199.431152,150.099213,5.647593,...,6.087677,-2.476421,-1.07389,-2.874778,0.780977,-3.316932,0.637982,-0.61969,-3.408233,blues
4,blues.00004.wav,0.30859,0.091563,1835.494603,1748.362448,3580.945013,0.1015,-160.266037,126.198807,-35.60545,...,-2.806384,-6.934123,-7.558618,-9.173553,-4.512165,-5.453538,-0.924161,-4.409333,-11.703781,blues


In [9]:
df.drop('filename', axis=1, inplace=True)

In [10]:
df.isnull().sum()

Unnamed: 0,0
chroma_stft,0
rmse,0
spectral_centroid,0
spectral_bandwidth,0
rolloff,0
zero_crossing_rate,0
mfcc1,0
mfcc2,0
mfcc3,0
mfcc4,0


In [11]:
df.duplicated().sum()

13

In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 987 entries, 0 to 999
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   chroma_stft         987 non-null    float64
 1   rmse                987 non-null    float64
 2   spectral_centroid   987 non-null    float64
 3   spectral_bandwidth  987 non-null    float64
 4   rolloff             987 non-null    float64
 5   zero_crossing_rate  987 non-null    float64
 6   mfcc1               987 non-null    float64
 7   mfcc2               987 non-null    float64
 8   mfcc3               987 non-null    float64
 9   mfcc4               987 non-null    float64
 10  mfcc5               987 non-null    float64
 11  mfcc6               987 non-null    float64
 12  mfcc7               987 non-null    float64
 13  mfcc8               987 non-null    float64
 14  mfcc9               987 non-null    float64
 15  mfcc10              987 non-null    float64
 16  mfcc11       

In [14]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
blues,100
classical,100
country,100
jazz,100
rock,100
disco,99
reggae,99
hiphop,98
pop,98
metal,93


In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [16]:
# 0 blues
# 1 classical
# 2 country
# 3 disco
# 4 hiphop
# 5 jazz
# 6 metal
# 7 pop
# 8 reggae
# 9 rock

In [17]:
df.head()

Unnamed: 0,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,0.349943,0.130225,1784.420446,2002.650192,3806.485316,0.083066,-113.596748,121.557297,-19.158825,42.351028,...,8.810669,-3.667368,5.751691,-5.162763,0.750948,-1.691938,-0.409953,-2.300209,1.219929,0
1,0.340983,0.095918,1529.835316,2038.617579,3548.820207,0.056044,-207.556793,124.006721,8.93056,35.874683,...,5.376803,-2.23912,4.216963,-6.012273,0.93611,-0.716537,0.293876,-0.287431,0.531573,0
2,0.363603,0.175573,1552.481958,1747.165985,3040.514948,0.076301,-90.754387,140.4599,-29.109968,31.689013,...,5.789265,-8.905224,-1.08372,-9.218359,2.455806,-7.726901,-1.815723,-3.433434,-2.226821,0
3,0.404779,0.141191,1070.119953,1596.333948,2185.028454,0.033309,-199.431152,150.099213,5.647593,26.871927,...,6.087677,-2.476421,-1.07389,-2.874778,0.780977,-3.316932,0.637982,-0.61969,-3.408233,0
4,0.30859,0.091563,1835.494603,1748.362448,3580.945013,0.1015,-160.266037,126.198807,-35.60545,22.153301,...,-2.806384,-6.934123,-7.558618,-9.173553,-4.512165,-5.453538,-0.924161,-4.409333,-11.703781,0


In [18]:
X = df.drop('label', axis=1)
y = df['label']

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [20]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [21]:
from xgboost import XGBClassifier

In [22]:
model = XGBClassifier()
model.fit(X_train, y_train)

In [23]:
y_pred = model.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6212121212121212


In [25]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

In [26]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

In [27]:
grid_search.fit(X_train, y_train)

In [28]:
grid_search.best_params_

{'learning_rate': 0.2, 'n_estimators': 300}

In [29]:
grid_search.best_score_

0.6196645972748529

In [30]:
y_pred_grid = grid_search.predict(X_test)

In [31]:
print(accuracy_score(y_test, y_pred_grid))

0.6111111111111112
