# CatBoost

In [47]:
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score, confusion_matrix
import sklearn.metrics as metrics
import librosa

from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_sample_weight
import optuna

In [48]:
def get_labelname(Tr):
    """List the label (class) sub-directories of the selected data directory.

    The data directory is resolved relative to the current working directory
    and its path is printed before it is scanned.

    Args:
        Tr (bool): Directory selector. A falsy value selects ``<cwd>/train/``
            and a truthy value selects ``<cwd>/test/``.
            NOTE(review): the flag name reads like "training", yet ``True``
            maps to ``test``; the mapping is kept unchanged -- confirm that
            it is intended.

    Returns:
        tuple[list[str], str]: The names of the immediate sub-directories
        (in ``os.listdir`` order) and the directory path, which ends with
        ``/`` so that a label name can be appended directly.
    """
    # Decide by truthiness so that `filepath` is always bound; the previous
    # pair of `== False` / `== True` checks left it undefined for any other
    # value (e.g. None) and failed later with UnboundLocalError.
    subdir = 'test' if Tr else 'train'
    filepath = os.getcwd() + '/' + subdir + '/'
    print(filepath)

    # Only directories are labels; plain files next to them are ignored.
    labellist = [
        entry for entry in os.listdir(filepath)
        if os.path.isdir(os.path.join(filepath, entry))
    ]

    return labellist, filepath

def wav2list(p):
    """Collect every ``.wav`` file located directly inside a directory.

    Args:
        p: Directory to search (anything accepted by ``pathlib.Path``).

    Returns:
        list: ``pathlib.Path`` objects matching ``*.wav`` inside ``p``.

    The interpreter is terminated through ``sys.exit`` (with a message naming
    the directory) when no file matches.
    """
    directory = Path(p)
    found = [*directory.glob('*.wav')]

    # Nothing to process: abort instead of returning an empty list.
    if not found:
        sys.exit('Not found in {}'.format(directory))

    return found

def get_mfcc_librosa(p):
    """Extract one averaged 24-coefficient MFCC vector per ``.wav`` file.

    Every ``.wav`` file found in ``p`` by ``wav2list`` is processed in sorted
    path order. Each file is loaded with librosa at a 44100 Hz sampling rate,
    MFCCs are computed with ``n_mfcc=24``, ``hop_length=10`` and
    ``win_length=100``, and the result is averaged along axis 1.

    Args:
        p (str): Directory that contains the ``.wav`` files.

    Returns:
        tuple: ``(names, features)`` where ``names`` is the list of file
        stems and ``features`` is the list of averaged MFCC arrays, in the
        same order.
    """
    names, features = [], []

    for wav_path in sorted(wav2list(p)):
        signal, _ = librosa.core.load(wav_path, sr=44100)
        coefficients = librosa.feature.mfcc(y=signal, sr=44100, hop_length=10, win_length=100, n_mfcc=24)
        names.append(wav_path.stem)
        # Collapse axis 1 so each file contributes a single feature vector.
        features.append(coefficients.mean(axis=1))

    return names, features

def make_df(Tr):
    """Build a DataFrame of averaged MFCC features and labels for one data split.

    The label directories are obtained from ``get_labelname(Tr)``; for every
    label the ``.wav`` files in its directory are converted to feature
    vectors by ``get_mfcc_librosa``.

    Args:
        Tr (bool): Directory selector forwarded unchanged to
            ``get_labelname``.

    Returns:
        pandas.DataFrame: One row per audio file, indexed by file stem, with
        feature columns ``0`` .. ``23`` followed by a ``'label'`` column that
        holds the name of the label directory. When no label directory
        exists, an empty frame with those columns is returned.
    """
    labellist, filepath = get_labelname(Tr)
    print(labellist)
    cols = list(range(24))
    print(cols)
    cols.append('label')
    print(cols)

    # Collect the per-label frames and concatenate once at the end. Growing
    # the result with pd.concat inside the loop copies the data on every
    # iteration, and seeding it with an empty frame relies on pandas ignoring
    # empty entries when it picks the result dtypes (deprecated behaviour).
    frames = []
    for label in labellist:
        filename, tmp = get_mfcc_librosa(filepath + label)
        frames.append(pd.DataFrame(tmp, index=filename).assign(label=label))

    if not frames:
        return pd.DataFrame(index=[], columns=cols)

    return pd.concat(frames, axis=0)

In [49]:
# Build the data frame used for training.
# NOTE(review): get_labelname maps Tr=True to the 'test' directory -- confirm
# that this is the intended source of the training data.
df = make_df(Tr=True)
# x: the 24 MFCC feature columns, y: the label column
x, y = df.iloc[:, :24], df.iloc[:, 24]

In [50]:
# Encode the label names as integer ids, numbered in sorted label order.
label = set(y)
label_list = sorted(label)

# Translate through a lookup table instead of assigning into `y` with boolean
# masks: `y` is a column taken from `df`, so masked assignment triggers
# chained-assignment warnings and leaves a mixed str/int column mid-loop.
label_to_index = {name: index for index, name in enumerate(label_list)}
y = np.array(pd.Series(y).map(label_to_index), dtype="int")
y

In [51]:
# Build the data frame used for evaluation (Tr=False selects the 'train'
# directory in get_labelname -- NOTE(review): confirm this is intended).
df_test = make_df(Tr=False)
# x_t: the 24 MFCC feature columns, y_t: the label column
x_t, y_t = df_test.iloc[:, :24], df_test.iloc[:, 24]

In [52]:
# Encode the evaluation label names as integer ids, numbered in sorted order.
# NOTE(review): the ids are derived from the labels present in this split
# only, so they match the training ids only when both splits contain exactly
# the same set of label names -- confirm.
label_test = set(y_t)
label_list_test = sorted(label_test)

# Translate through a lookup table instead of assigning into `y_t` with
# boolean masks (avoids chained-assignment warnings on a column of df_test).
label_to_index_test = {name: index for index, name in enumerate(label_list_test)}
y_t = np.array(pd.Series(y_t).map(label_to_index_test), dtype="int")
y_t

## CatBoostが扱えるようなデータにする

In [53]:
# Columns whose dtype is not floating point are handed to CatBoost as
# categorical features. `np.float` (an alias of the builtin float) was
# deprecated in NumPy 1.20 and removed in 1.24, so the dtype kind is tested
# instead; this also keeps float32 columns numeric.
is_float_column = np.array(
    [pd.api.types.is_float_dtype(dtype) for dtype in x.dtypes], dtype=bool
)
categorical_features_indices = np.flatnonzero(~is_float_column)

train_pool = Pool(x, y, cat_features=categorical_features_indices)
validate_pool = Pool(x_t, y_t, cat_features=categorical_features_indices)

## モデルのインスタンス作成 → 学習

In [54]:
# Per-class weights keyed by the encoded class id.
# (Alternative that was tried: equal weights {0:1, 1:1, 2:1, 3:1}.)
balanced = {0: 3.5, 1: 3.5, 2: 1, 3: 3}
model = CatBoostClassifier(
    class_weights=balanced,
    random_seed=42,
    custom_loss=['F1'],
)

In [507]:
fit_options = dict(
    eval_set=validate_pool,     # data used for validation during training
    early_stopping_rounds=10,   # stop when there is no improvement for 10 rounds
    use_best_model=True,        # keep the best-scoring iteration as the final model
    plot=True,
)
model.fit(train_pool, **fit_options)

In [30]:
# Hand-picked CatBoost settings.
params = dict(
    depth=1,                   # tree depth
    learning_rate=0.09,        # learning rate
    early_stopping_rounds=10,
    iterations=300,
    custom_loss=['F1'],
    random_seed=42,
    use_best_model=True,
)

In [31]:
# Train a classifier with the hand-picked settings and the class weights,
# evaluating on validate_pool during training.
model = CatBoostClassifier(class_weights=balanced, **params)
model.fit(train_pool, eval_set=validate_pool)

In [55]:
def objective(trial):
    """Optuna objective: train a CatBoost model and score it by mean F1.

    A ``CatBoostClassifier`` is built from hyper-parameters sampled from
    ``trial`` (plus the module-level ``balanced`` class weights and a fixed
    seed), fitted on ``train_pool`` and used to predict ``validate_pool``.

    NOTE(review): the trial is scored on ``validate_pool`` / ``y_t``, the same
    data the notebook reports its final metrics on -- confirm whether a
    separate validation split is wanted.

    Args:
        trial (optuna.trial.Trial): Trial used to sample the hyper-parameters.

    Returns:
        float: ``1 - mean(per-class F1)``; lower is better.
    """
    # Search space. `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform` and samples from the same log-uniform range.
    params = {
        'iterations': trial.suggest_int('iterations', 50, 250),
        'depth': trial.suggest_int('depth', 1, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
    }

    # Train
    model = CatBoostClassifier(**params, class_weights=balanced, random_seed=42)
    model.fit(train_pool)

    # Predict and score; f1_score expects (y_true, y_pred), matching the
    # evaluation cells of this notebook.
    preds = model.predict(validate_pool)
    return 1.0 - f1_score(y_t, preds, average=None).mean()

In [56]:
# Search for the hyper-parameters that minimise the objective over 100 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

In [57]:
# Display the hyper-parameter values of the best trial found by the study.
study.best_params

In [58]:
# Re-train with the tuned hyper-parameters, evaluating on validate_pool and
# keeping the best iteration.
best = study.best_params
tuned = {
    key: best[key]
    for key in (
        "iterations",
        "depth",
        "learning_rate",
        "random_strength",
        "bagging_temperature",
    )
}
model = CatBoostClassifier(
    **tuned,
    use_best_model=True,
    custom_loss="F1",
    random_seed=42,
    class_weights=balanced,
)
model.fit(train_pool, eval_set=validate_pool)

In [59]:
%%time
# Predict the class of every row of the evaluation features with the fitted
# model (the cell magic above reports how long the cell takes).
y_pred = model.predict(x_t)

## 学習ecopa, テストarena(混同行列はvoices, buzzer, whistle, base)

In [513]:
# Accuracy and per-class F1 of the predictions on the evaluation labels.
f1_test = f1_score(y_t, y_pred, average=None)
acc = accuracy_score(y_t, y_pred)

print(
    'accuracy : {}'.format(acc),
    'F1 : {}'.format(f1_test),
    'F1_average : {}'.format(f1_test.mean()),
    sep='\n',
)

In [45]:
# Accuracy and per-class F1 of the predictions on the evaluation labels.
f1_test = f1_score(y_t, y_pred, average=None)
acc = accuracy_score(y_t, y_pred)

print(
    'accuracy : {}'.format(acc),
    'F1 : {}'.format(f1_test),
    'F1_average : {}'.format(f1_test.mean()),
    sep='\n',
)

In [46]:
# Confusion matrix of the true labels against the predicted labels.
confusion = metrics.confusion_matrix(y_t, y_pred)
confusion

## 学習arena, テストecopa(混同行列はvoices, buzzer, whistle, base)

In [505]:
# Accuracy and per-class F1 of the predictions on the evaluation labels.
f1_test = f1_score(y_t, y_pred, average=None)
acc = accuracy_score(y_t, y_pred)

print(
    'accuracy : {}'.format(acc),
    'F1 : {}'.format(f1_test),
    'F1_average : {}'.format(f1_test.mean()),
    sep='\n',
)

In [468]:
# Accuracy and per-class F1 of the predictions on the evaluation labels.
f1_test = f1_score(y_t, y_pred, average=None)
acc = accuracy_score(y_t, y_pred)

print(
    'accuracy : {}'.format(acc),
    'F1 : {}'.format(f1_test),
    'F1_average : {}'.format(f1_test.mean()),
    sep='\n',
)

In [530]:
# Confusion matrix of the true labels against the predicted labels.
confusion = metrics.confusion_matrix(y_t, y_pred)
confusion

## 学習arena, テストecopa

In [60]:
# Accuracy and per-class F1 of the predictions on the evaluation labels.
f1_test = f1_score(y_t, y_pred, average=None)
acc = accuracy_score(y_t, y_pred)

print(
    'accuracy : {}'.format(acc),
    'F1 : {}'.format(f1_test),
    'F1_average : {}'.format(f1_test.mean()),
    sep='\n',
)

In [61]:
# Confusion matrix of the true labels against the predicted labels.
confusion = metrics.confusion_matrix(y_t, y_pred)
confusion