# Import all required packages

In [None]:
!pip install pandas
!pip install librosa
!pip install numpy
!pip install pickle
!pip install scikit-learn

In [2]:
import pandas as pd
from tqdm import tqdm
import librosa
import numpy as np
import pickle
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Data preparation

## Reading a markup file and audio recordings

In [3]:
df_opochka = pd.read_excel('/content/enm1930_ner.xlsx')

In [4]:
df_opochka_data = df_opochka[['path', 'dialect']]

In [5]:
def load_audio(df_data):
    audios = []
    labels = []
    for index, row in tqdm(df_data.iterrows()):
        audio, sample_rate = librosa.load(row['path'], sr=16000)
        mfccs = librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=40)
        mfccs_s = np.mean(mfccs.T, axis=0)
        audios.append(mfccs_s)
        labels.append(row['dialect'])
    return audios, labels

In [6]:
audios, labels = load_audio(df_opochka_data)
df_audio = pd.DataFrame(audios)
df_audio['label'] = labels

468it [00:30, 15.52it/s]


# Evaluation on already trained models

## Loading pretrained models

In [8]:
loaded_model = pickle.load(open('clf.pkl', 'rb'))
loaded_model_xgb = pickle.load(open('xgb_cl.pkl', 'rb'))
loaded_model_gaus = pickle.load(open('gaus.pkl', 'rb'))

## Evaluation on LGBMClassifier

In [24]:
X_test = df_audio.drop(['label'], axis=1)
y_pred = list(loaded_model.predict(X_test))
labels = np.array(df_audio['label'])
print(f1_score(labels, y_pred))
print(precision_score(labels, y_pred))
print(recall_score(labels, y_pred))

0.01993355481727575
0.75
0.010101010101010102


## Evaluation on XGBClassifier

In [27]:
X_test = df_audio.drop(['label'], axis=1)
y_pred = list(loaded_model_xgb.predict(X_test))
labels = np.array(df_audio['label'])
print(f1_score(labels, y_pred))
print(precision_score(labels, y_pred))
print(recall_score(labels, y_pred))

0.006711409395973153
1.0
0.003367003367003367


## Evaluation on GaussianNB

In [28]:
X_test = df_audio.drop(['label'], axis=1)
y_pred = list(loaded_model_gaus.predict(X_test))
labels = np.array(df_audio['label'])
print(f1_score(labels, y_pred))
print(precision_score(labels, y_pred))
print(recall_score(labels, y_pred))

0.0
0.0
0.0


  _warn_prf(average, modifier, msg_start, len(result))


# Additional training on new data

## Split on test and train

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df_audio.drop(['label'], axis=1), df_audio['label'], test_size=0.5, random_state=22)

## Additional training of the model LGBMClassifier

In [12]:
loaded_model.fit(X_train, y_train)
y_pred = loaded_model.predict(X_test)
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.7018633540372671
0.6348314606741573
0.7847222222222222


## Additional training of the model XGBClassifier

In [13]:
loaded_model_xgb.fit(X_train, y_train)
y_pred = loaded_model_xgb.predict(X_test)
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.7321428571428571
0.640625
0.8541666666666666


## Additional training of the model GaussianNB

In [14]:
loaded_model_gaus.fit(X_train, y_train)
y_pred = loaded_model_gaus.predict(X_test)
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.7349397590361446
0.648936170212766
0.8472222222222222


# Save models

In [15]:
with open('LGBMClassifier_with_opochka.pkl', 'wb') as f:
    pickle.dump(loaded_model, f)
with open('XGBClassifier_with_opochka.pkl', 'wb') as f:
    pickle.dump(loaded_model_xgb, f)
with open('GaussianNB_with_opochka.pkl', 'wb') as f:
    pickle.dump(loaded_model_gaus, f)