In [1]:
!pip install duckdb --no-index --find-links=file:///kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg
!pip install polars --no-index --find-links=/kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/polars_pkg

Looking in links: file:///kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg
Processing /kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg/duckdb-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: duckdb
Successfully installed duckdb-0.8.1
Looking in links: /kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/polars_pkg


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import duckdb as dd
import polars as pl
import pyarrow
import os
import glob
import shutil
import zipfile
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
import librosa
from IPython.display import Audio
import pickle
from joblib import dump, load
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [3]:
# Path to the directory containing your audio dataset
dataset_dir = '/kaggle/input/birdclef-2024/train_audio'
# Initialize an empty dictionary to store the mapping between audio files and labels
label_mapping = {}
# Iterate over subdirectories (classes) in the dataset directory
for label in os.listdir(dataset_dir):
    label_dir = os.path.join(dataset_dir, label)
    # Check if the item in the dataset directory is a directory
    if os.path.isdir(label_dir):
        # Iterate over audio files in the subdirectory (class)
        for audio_file in os.listdir(label_dir):
            # Add the mapping between audio file path and label to the dictionary
            audio_file_path = os.path.join(label_dir, audio_file)
            label_mapping[audio_file_path] = label
            
# label_mapping

# Create a list of tuples containing the audio file paths and labels
data = [(audio_file_path, label) for audio_file_path, label in label_mapping.items()]
# Create a Pandas DataFrame from the list of tuples
annotated_data = pd.DataFrame(data, columns=['audio_file_path', 'label'])

label_encoder = LabelEncoder()
annotated_data['encoded_label'] = label_encoder.fit_transform(annotated_data['label'])

print(annotated_data.head(5))
print(annotated_data.shape)

                                     audio_file_path    label  encoded_label
0  /kaggle/input/birdclef-2024/train_audio/redspu...  redspu1            137
1  /kaggle/input/birdclef-2024/train_audio/redspu...  redspu1            137
2  /kaggle/input/birdclef-2024/train_audio/redspu...  redspu1            137
3  /kaggle/input/birdclef-2024/train_audio/redspu...  redspu1            137
4  /kaggle/input/birdclef-2024/train_audio/redspu...  redspu1            137
(24459, 3)


In [5]:
dd.sql("select label, count(distinct(audio_file_path)) as files from annotated_data group by label order by 2").pl()

label,files
str,i64
"""niwpig1""",5
"""integr""",5
"""asiope1""",5
"""blaeag1""",6
"""wynlau1""",6
…,…
"""grywag""",500
"""graher1""",500
"""grnsan""",500
"""comsan""",500


In [6]:
# Path to the directory containing your audio dataset
dataset_dir = '/kaggle/input/birdclef-2024/unlabeled_soundscapes'
# Initialize an empty dictionary to store the mapping between audio files and labels
label_mapping = {}

for audio_file in os.listdir(dataset_dir):
    # Add the mapping between audio file path and label to the dictionary
    audio_file_path = os.path.join(dataset_dir, audio_file)
    label_mapping[audio_file_path] = 'unlabelled'
            
# label_mapping

# Create a list of tuples containing the audio file paths and labels
data = [(audio_file_path, label) for audio_file_path, label in label_mapping.items()]
# Create a Pandas DataFrame from the list of tuples
unannotated_data = pd.DataFrame(data, columns=['audio_file_path', 'label'])

# label_encoder = LabelEncoder()
unannotated_data['encoded_label'] = 999

print(unannotated_data.head(5))
print(unannotated_data.shape)

                                     audio_file_path       label  \
0  /kaggle/input/birdclef-2024/unlabeled_soundsca...  unlabelled   
1  /kaggle/input/birdclef-2024/unlabeled_soundsca...  unlabelled   
2  /kaggle/input/birdclef-2024/unlabeled_soundsca...  unlabelled   
3  /kaggle/input/birdclef-2024/unlabeled_soundsca...  unlabelled   
4  /kaggle/input/birdclef-2024/unlabeled_soundsca...  unlabelled   

   encoded_label  
0            999  
1            999  
2            999  
3            999  
4            999  
(8444, 3)


In [7]:
combined_data = dd.sql("select * from annotated_data where label = 'niwpig1' union select * from unannotated_data").pl()
combined_data.shape

(8449, 3)

In [8]:
def get_file_duration(full_file_path):
    duration = librosa.get_duration(path=full_file_path, sr=32000)
    return round(duration,2)

vect_func = np.vectorize(get_file_duration)

combined_data_pd = combined_data.to_pandas()

combined_data_pd['file_duration'] = vect_func(combined_data_pd['audio_file_path'])
print(combined_data_pd.shape)

(8449, 4)


In [9]:
list1 = dd.sql(" select distinct label from combined_data_pd ad where ad.file_duration >= 5 and ad.file_duration <= 240 ")\
.pl().to_series().to_list()
print(len(list1))

list2 = dd.sql(" select distinct label from combined_data_pd ad ").pl().to_series().to_list()
print(len(list2))

set(list1) ^ set(list2)

183
183


set()

In [21]:
files_per_species_w_rnk = dd.sql(" select *, row_number()over(partition by label, encoded_label order by file_duration desc) as rn \
from combined_data_pd where file_duration <= 240").pl().sort(by=['encoded_label','rn'])

len(files_per_species_w_rnk.filter(pl.col('rn')<=10).select(pl.col('label')).unique().to_series().to_list())

2

In [22]:
files_per_species_final = files_per_species_w_rnk.filter(pl.col('rn')<=35)
# files_per_species_final

file_count_avg_dur = dd.sql("select label, count(distinct(audio_file_path)) as files, avg(file_duration) as avg_file_duration \
from files_per_species_final group by label").pl()

In [23]:
pl.Config(fmt_str_lengths=100)
files_per_species_final.tail(10)

audio_file_path,label,encoded_label,file_duration,rn
str,str,i64,f64,i64
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/2147319176.ogg""","""unlabelled""",999,240.0,26
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1174305357.ogg""","""unlabelled""",999,240.0,27
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/780943208.ogg""","""unlabelled""",999,240.0,28
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/939222520.ogg""","""unlabelled""",999,240.0,29
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1532260202.ogg""","""unlabelled""",999,240.0,30
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1885530991.ogg""","""unlabelled""",999,240.0,31
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1664258004.ogg""","""unlabelled""",999,240.0,32
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1829475960.ogg""","""unlabelled""",999,240.0,33
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1349598595.ogg""","""unlabelled""",999,240.0,34
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1810983255.ogg""","""unlabelled""",999,240.0,35


In [24]:
dd.sql("select label, count(distinct(audio_file_path)) as files, avg(file_duration) as avg_file_duration \
from files_per_species_final group by label order by 2 desc").pl()

label,files,avg_file_duration
str,i64,f64
"""unlabelled""",35,240.0
"""niwpig1""",5,9.856


In [14]:
fig_file_count_avg_dur1 = px.histogram(file_count_avg_dur, x="files")
fig_file_count_avg_dur1.show()

In [21]:
fig_file_count_avg_dur2 = px.histogram(file_count_avg_dur, x="avg_file_duration")
fig_file_count_avg_dur2.show()

In [25]:
from tqdm import tqdm

#labels = []
features = []
#feature_dict = {}

# Loop through each audio file in the dataset directory
for i in tqdm(range(files_per_species_final.shape[0]), desc = 'Outer Loop'):
    # labels.append(annotated_data['label'].iloc[i])
    file_path = files_per_species_final.item(i,0)
    # lat = files_per_species_final.item(i,3)
    # lon = files_per_species_final.item(i,4)
    audio, sample_rate = librosa.load(file_path, sr=32000)
    samples_per_segment = sample_rate * 5
    if len(audio) > 7680000:
        total_samples = 7680000
    else:
        total_samples = len(audio)

    for j in range(0, total_samples+160000, samples_per_segment):
        if j + samples_per_segment <= total_samples:
            segment = audio[j:j + samples_per_segment]
            # chunk = cur_file+'_'+str(int((j/32000)+5))
            # mfccs = librosa.feature.mfcc(y=segment, sr=32000, n_mfcc=40)
            # flattened_features = (np.mean(mfccs.T, axis=0))
            # melspec = librosa.feature.melspectrogram(y=segment, sr=32000, n_fft=500, hop_length = 50)
            mfccs = librosa.feature.mfcc(y=segment, sr=32000, n_mfcc=40)
            # melspec = librosa.feature.melspectrogram(y=segment, sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_length, fmin=fmin, fmax=fmax)
            # mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_length, fmin=fmin, fmax=fmax)
            # flattened_melspec_features = (np.mean(melspec.T, axis=0))
            """flattened_mfcc_features = (np.mean(mfccs.T, axis=0))
            flattened_features = np.append(flattened_melspec_features, flattened_mfcc_features)"""
            # features.append(np.append(flattened_features, np.array([lat,lon])))
            flattened_features = (np.mean(mfccs.T, axis=0))
            #features.append(flattened_features)
            #labels.append(files_per_species_final.item(i,2))
            features.append({'file_path': file_path, 'label': files_per_species_final.item(i,2), 'feature_vector': flattened_features})
            

Outer Loop: 100%|██████████| 40/40 [00:50<00:00,  1.26s/it]


In [26]:
features_df = pd.DataFrame.from_dict(features)
features_df.head(5)

Unnamed: 0,file_path,label,feature_vector
0,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-352.7747, 58.093998, -34.373913, 6.123985, -..."
1,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-349.19034, 54.61164, -31.270224, 5.5404563, ..."
2,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-356.98993, 65.46216, -36.910046, 6.490638, -..."
3,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-360.84445, 71.26596, -41.32735, 7.428374, -1..."
4,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-341.08212, 87.44293, 14.027007, 16.87994, 17..."


In [27]:
features_df.tail(5)

Unnamed: 0,file_path,label,feature_vector
1683,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-407.1456, 50.568966, 7.1867356, 16.862848, 4..."
1684,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-413.33792, 45.748333, 4.3280754, 17.645176, ..."
1685,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-405.50815, 50.267776, 4.8828516, 14.646171, ..."
1686,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-404.0319, 57.001934, 5.019319, 11.485949, -3..."
1687,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-400.8654, 56.807384, 7.2469954, 9.848623, -2..."


In [28]:
feature_matrix = np.vstack(features_df.feature_vector.values)

In [29]:
feature_matrix.shape

(1688, 40)

In [35]:
from sklearn.metrics import silhouette_score

n_clusters = [10, 35, 45, 55, 75, 95, 100, 105, 110, 125, 135]
#[100, 182, 210, 250, 300, 350, 400, 410, 430, 450]
metrics_tracking = {}
metrics_tracking['n_clusters'] = n_clusters
list_inertia = []
list_sh_score = []

In [36]:
for cluster in n_clusters:
    print("starting kmeans with {0} clusters".format(cluster))
    kmeans = KMeans(n_clusters = cluster, init='k-means++', random_state=42)
    kmeans.fit(feature_matrix)
    
    print("inertia = {0}".format(kmeans.inertia_))
    list_inertia.append(kmeans.inertia_)
    
    sh_score = silhouette_score(feature_matrix, kmeans.labels_)
    print("silhouette score = {0}".format(sh_score))
    list_sh_score.append(sh_score)
    
metrics_tracking['inertia'] = list_inertia
metrics_tracking['silhouette_score'] = list_sh_score

test_df = pd.DataFrame(metrics_tracking)

starting kmeans with 10 clusters






inertia = 2452696.75
silhouette score = 0.3468366265296936
starting kmeans with 35 clusters






inertia = 638042.9375
silhouette score = 0.4861394464969635
starting kmeans with 45 clusters






inertia = 528967.625
silhouette score = 0.4693186283111572
starting kmeans with 55 clusters






inertia = 447455.40625
silhouette score = 0.44389939308166504
starting kmeans with 75 clusters






inertia = 354928.625
silhouette score = 0.4051316976547241
starting kmeans with 95 clusters






inertia = 307858.03125
silhouette score = 0.34270042181015015
starting kmeans with 100 clusters






inertia = 301020.375
silhouette score = 0.3327166438102722
starting kmeans with 105 clusters






inertia = 288558.9375
silhouette score = 0.3341989517211914
starting kmeans with 110 clusters






inertia = 282312.875
silhouette score = 0.3275374174118042
starting kmeans with 125 clusters






inertia = 254370.796875
silhouette score = 0.32146450877189636
starting kmeans with 135 clusters






inertia = 242829.484375
silhouette score = 0.3117636442184448


In [37]:
test_df

Unnamed: 0,n_clusters,inertia,silhouette_score
0,10,2452697.0,0.346837
1,35,638042.9,0.486139
2,45,528967.6,0.469319
3,55,447455.4,0.443899
4,75,354928.6,0.405132
5,95,307858.0,0.3427
6,100,301020.4,0.332717
7,105,288558.9,0.334199
8,110,282312.9,0.327537
9,125,254370.8,0.321465


In [38]:
fig = px.line(test_df, x='n_clusters', y='silhouette_score', markers=True)
fig.show()

In [39]:
fig = px.line(test_df, x='n_clusters', y='inertia', markers=True)
fig.show()

In [40]:
kmeans = KMeans(n_clusters = 35, init='k-means++', random_state=42)
kmeans.fit(feature_matrix)





In [41]:
features_df['cluster'] = kmeans.labels_

In [42]:
features_df.head(10)

Unnamed: 0,file_path,label,feature_vector,cluster
0,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-352.7747, 58.093998, -34.373913, 6.123985, -...",4
1,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-349.19034, 54.61164, -31.270224, 5.5404563, ...",4
2,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-356.98993, 65.46216, -36.910046, 6.490638, -...",4
3,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-360.84445, 71.26596, -41.32735, 7.428374, -1...",17
4,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-341.08212, 87.44293, 14.027007, 16.87994, 17...",25
5,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-229.25327, 43.339508, -25.134214, 29.236397,...",24
6,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-348.63962, 35.000065, 6.173913, 17.852373, 1...",9
7,/kaggle/input/birdclef-2024/train_audio/niwpig...,119,"[-314.33997, 85.71277, -10.978396, 13.328247, ...",25
8,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-346.0044, 81.405876, -1.5168529, 12.904893, ...",4
9,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-353.9868, 79.329414, -0.95603395, 12.37933, ...",4


In [58]:
list_of_clusters = dd.sql(" select distinct cluster from ( select *, row_number()over(order by files desc) as rn from ( select cluster, count(distinct(label)) as data_count, count(distinct(file_path)) as files \
from features_df group by cluster )t1 )t2 where rn <= 5").pl().to_series().to_list()

print(list_of_clusters)

[4, 25, 31, 11, 24]


In [59]:
features_df['label'] = np.where(features_df['cluster'].isin(list_of_clusters), 119, features_df['label'])

In [62]:
final_list_of_files = dd.sql("select distinct file_path from features_df where label = 119 ").pl().to_series().to_list()

print(final_list_of_files)

['/kaggle/input/birdclef-2024/train_audio/niwpig1/XC122770.ogg', '/kaggle/input/birdclef-2024/train_audio/niwpig1/XC123013.ogg', '/kaggle/input/birdclef-2024/train_audio/niwpig1/XC538454.ogg', '/kaggle/input/birdclef-2024/train_audio/niwpig1/XC178211.ogg', '/kaggle/input/birdclef-2024/train_audio/niwpig1/XC123012.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/16130626.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/1276289015.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/1647563459.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/1850745759.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/1924554622.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/1767111649.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/1259366822.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/80551754.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/1715079772.ogg', '/kaggle/input/birdclef-2024/unlabeled_soundscapes/195220054.ogg', '/ka

In [44]:
def get_cluster_centroids(cluster_id):
    return kmeans.cluster_centers_[cluster_id]

features_df['centroid'] = features_df['cluster'].apply(get_cluster_centroids)

def get_ftr_w_centroid(feature, cluster_id):
    list_of_ftr_n_cc = [feature, kmeans.cluster_centers_[cluster_id]]
    return list_of_ftr_n_cc

features_df['ftr_w_centroid'] = features_df.apply(lambda x: get_ftr_w_centroid(x['feature_vector'], x['cluster']), axis=1)

In [45]:
def calc_cos_sim(ftr_w_centroid):
    v = ftr_w_centroid[0]
    w = ftr_w_centroid[1]
    
    cos_before_norm = np.dot(v, w) / (np.linalg.norm(v) * np.linalg.norm(w))
    rounded_cos_sim = round(cos_before_norm, 2)
    return rounded_cos_sim

features_df['cosine_similarity'] = features_df['ftr_w_centroid'].apply(calc_cos_sim)

In [46]:
fig_cluster = px.histogram(features_df, x="cosine_similarity")
fig_cluster.show()

In [48]:
dd.sql("select cluster, count(1) as records, count(distinct(file_path)) as files from features_df where label = 119 group by cluster").pl()

cluster,records,files
i32,i64,i64
4,3,1
17,1,1
25,2,2
24,1,1
9,1,1


In [49]:
features_df_pl = pl.from_pandas(features_df)

list_56 = features_df_pl.filter(pl.col('cluster')==4).select(pl.col('label')).unique().to_series().to_list()

list_56

[999, 119]

In [50]:
features_df_pl = pl.from_pandas(features_df)

list_101 = features_df_pl.filter(pl.col('cluster')==25).select(pl.col('label')).unique().to_series().to_list()

list_101

[119, 999]

In [51]:
dd.sql("select distinct encoded_label, label from files_per_species_final where encoded_label in \
(119, 999) ").pl()

encoded_label,label
i64,str
119,"""niwpig1"""
999,"""unlabelled"""


In [42]:
features_df['label'] = np.where(features_df['label'] == 999, 24, features_df['label'])

In [44]:
#features_df[features_df['cluster']==56]

dd.sql("select distinct label, feature_vector from features_df where cluster in (56,101) ").pl()

label
i64
179
24
91


In [45]:
extracted_training_features_five_sec = features_df['feature_vector'].to_numpy()
labels_five_sec = features_df['label'].to_numpy()

In [46]:
#extracted_train_feat_five_sec_mix_match = np.array(features)

with open("extracted_train_feat_five_sec_mix_match", "wb") as file:   #Pickling
    pickle.dump(extracted_training_features_five_sec, file)
    
with open("labels_five_sec_mix_match", "wb") as file:   #Pickling
    pickle.dump(labels_five_sec, file)

In [3]:
feature_file_path = '/kaggle/input/mix-n-match-v1/extracted_train_feat_five_sec_mix_match'
label_file_path = '/kaggle/input/mix-n-match-v1/labels_five_sec_mix_match'

with open(feature_file_path, "rb") as file:
    pickled_extracted_features_five_sec = pickle.load(file)
    
with open(label_file_path, "rb") as file:
    labels_five_sec = pickle.load(file)

In [4]:
x_five_sec = np.vstack(pickled_extracted_features_five_sec)
y_five_sec = labels_five_sec

print(x_five_sec.shape)
print(y_five_sec.shape)

(25705, 40)
(25705,)


In [5]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)

In [50]:
for i in range(len(C_range)):
    print(C_range[i])
    
for i in range(len(gamma_range)):
    print(gamma_range[i])

0.01
0.1
1.0
10.0
100.0
1000.0
10000.0
100000.0
1000000.0
10000000.0
100000000.0
1000000000.0
10000000000.0
1e-09
1e-08
1e-07
1e-06
1e-05
0.0001
0.001
0.01
0.1
1.0
10.0
100.0
1000.0


In [6]:
from sklearn.preprocessing import LabelBinarizer

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

rbf_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", C=C_range[4], gamma=gamma_range[4], probability=True, class_weight="balanced"))
])

sample_labels = [i for i in range(0, 182, 1)]
y_onehot_tests, y_scores = dict(), dict()

for fold, (train, test) in enumerate(skf.split(x_five_sec, y_five_sec)):
    print("fold = {0}".format(fold))
    x_train = np.ascontiguousarray(x_five_sec[train])
    y_train = np.ascontiguousarray(y_five_sec[train])
    x_test = np.ascontiguousarray(x_five_sec[test])
    y_test = np.ascontiguousarray(y_five_sec[test])
    print("train feature shape & label shape = {0} & {1}".format(x_train.shape, y_train.shape))
    print("test feature shape & label shape = {0} & {1}".format(x_test.shape, y_test.shape))
    rbf_kernel_svm_model = rbf_kernel_svm_clf.fit(x_train, y_train)
    y_predict = rbf_kernel_svm_model.predict_proba(x_test)
    print("predictions shape = {0}".format(y_predict.shape))
    ras_ovr = roc_auc_score(y_test, y_predict, multi_class='ovr', average='macro', labels=sample_labels)
    ras_ovo = roc_auc_score(y_test, y_predict, multi_class='ovo', average='macro', labels=sample_labels)
    print(f"Macro-averaged One-vs-Rest ROC AUC score: ", round(ras_ovr,2))
    print(f"Macro-averaged One-vs-One ROC AUC score: ", round(ras_ovo,2))
    y_scores[fold] = y_predict
    label_binarizer = LabelBinarizer().fit(y_train)
    y_onehot_test = label_binarizer.transform(y_test)
    y_onehot_tests[fold] = y_onehot_test

fold = 0
train feature shape & label shape = (20564, 40) & (20564,)
test feature shape & label shape = (5141, 40) & (5141,)
predictions shape = (5141, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.87
Macro-averaged One-vs-One ROC AUC score:  0.87
fold = 1
train feature shape & label shape = (20564, 40) & (20564,)
test feature shape & label shape = (5141, 40) & (5141,)
predictions shape = (5141, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.87
Macro-averaged One-vs-One ROC AUC score:  0.87
fold = 2
train feature shape & label shape = (20564, 40) & (20564,)
test feature shape & label shape = (5141, 40) & (5141,)
predictions shape = (5141, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.88
Macro-averaged One-vs-One ROC AUC score:  0.88
fold = 3
train feature shape & label shape = (20564, 40) & (20564,)
test feature shape & label shape = (5141, 40) & (5141,)
predictions shape = (5141, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.87
Macro-averaged One-vs-One ROC AUC score:  0

In [51]:
rbf_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", C=C_range[5], gamma=gamma_range[4], probability=True, class_weight="balanced"))
])

rbf_kernel_svm_model = rbf_kernel_svm_clf.fit(x_five_sec, y_five_sec)

In [7]:
dump(rbf_kernel_svm_model, 'rbf_kernel_svm_model_mixmatch_c100_1e_05.joblib')

['rbf_kernel_svm_model_mixmatch_c100_1e_05.joblib']