# XGBoostの実装

## 必要そうなライブラリをインポート

In [99]:
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import sklearn.metrics as metrics
import librosa

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
import optuna

from functools import partial

In [100]:
def get_labelname(Tr):
    """Return the class-label directory names under ``train/`` or ``test/``.

    The data root is resolved relative to the current working directory and
    is printed before it is scanned.

    Args:
        Tr: Truthy selects ``<cwd>/train/``; falsy selects ``<cwd>/test/``.

    Returns:
        tuple: ``(labellist, filepath)`` where ``labellist`` is the sorted
        list of sub-directory names found directly under ``filepath`` and
        ``filepath`` is the data root as a string ending in ``/``.
    """
    # Plain truthiness instead of `== True` / `== False`: the equality chain
    # left `filepath` unbound for any argument equal to neither.
    subdir = 'train' if Tr else 'test'
    filepath = os.getcwd() + '/' + subdir + '/'
    print(filepath)
    # Sorted so the label order does not depend on os.listdir's ordering.
    labellist = sorted(
        entry for entry in os.listdir(filepath)
        if os.path.isdir(os.path.join(filepath, entry))
    )

    return labellist, filepath

def wav2list(p):
    """Collect every ``.wav`` file located directly inside a directory.

    Args:
        p: Directory to search (string or path-like).

    Returns:
        list: ``pathlib.Path`` objects, one per ``*.wav`` file found.

    Raises:
        SystemExit: If the directory contains no ``.wav`` file.
    """
    directory = Path(p)
    found = list(directory.glob('*.wav'))

    if not found:
        # Nothing to process: terminate with a message naming the directory.
        sys.exit('Not found in {}'.format(directory))

    return found

def get_mfcc_librosa(p):
    """Extract one averaged 24-coefficient MFCC vector per ``.wav`` file.

    Every ``.wav`` file found directly in ``p`` is loaded with librosa at
    ``sr=44100``, its MFCCs are computed with ``n_mfcc=24``, and the result
    is averaged along axis 1. Files are processed in sorted path order.
    The data is expected to live in one folder per label.

    Args:
        p (str): Directory that holds the ``.wav`` files.

    Returns:
        tuple: ``(names, features)`` - the file stems and the per-file mean
        MFCC vectors, in matching order.
    """
    names = []
    features = []

    for wav_path in sorted(wav2list(p)):
        # The sample rate returned by load() is not needed: it is fixed above.
        signal, _ = librosa.core.load(wav_path, sr=44100)
        coeffs = librosa.feature.mfcc(
            y=signal, sr=44100, hop_length=10, win_length=100, n_mfcc=24
        )
        names.append(wav_path.stem)
        features.append(coeffs.mean(axis=1))

    return names, features

## 学習データのデータフレーム

In [101]:
def make_df(Tr):
    """Build a DataFrame of MFCC features and labels from train/ or test/.

    For every label directory reported by ``get_labelname(Tr)``, the MFCC
    vectors of its ``.wav`` files are extracted with ``get_mfcc_librosa`` and
    tagged with the directory name in a ``label`` column.

    Args:
        Tr: Passed through to ``get_labelname`` to pick the train or test
            directory.

    Returns:
        pandas.DataFrame: One row per audio file, indexed by file stem, with
        feature columns ``0..23`` followed by ``label``. If there is no label
        directory, an empty frame with those columns is returned.
    """
    labellist, filepath = get_labelname(Tr)
    print(labellist)
    cols = list(range(24))
    print(cols)
    cols.append('label')
    print(cols)

    # Collect the per-label frames and concatenate once. Growing a frame with
    # pd.concat inside the loop copies all earlier rows on every iteration,
    # and starting from an empty object-dtype frame relies on pandas ignoring
    # empty entries when it picks the result dtypes.
    frames = []
    for label in labellist:
        names, mfccs = get_mfcc_librosa(os.path.join(filepath, label))
        frames.append(pd.DataFrame(mfccs, index=names).assign(label=label))

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame(index=[], columns=cols)

    return pd.concat(frames, axis=0)

In [102]:
# Build the DataFrame for the training data.
df = make_df(Tr=True)
# x: the 24 MFCC feature columns, y: the label column (column index 24).
x = df.iloc[:, 0:24]
y = df.iloc[:, 24]

In [103]:
# Encode the string labels as integers, numbered in sorted label order.
label = set(y)
label_list = sorted(label)

# Map through a lookup table instead of assigning into `y` with a boolean
# mask: `y` was sliced out of `df`, so masked assignment is chained assignment
# on the frame.
label_to_index = {name: i for i, name in enumerate(label_list)}
y = np.array(y.map(label_to_index), dtype = "int")
print(len(y))
y

## テストデータのデータフレーム作成

In [104]:
# Build the DataFrame for the test data.
df_test = make_df(Tr=False)
# x_t: the 24 MFCC feature columns, y_t: the label column (column index 24).
x_t = df_test.iloc[:, 0:24]
y_t = df_test.iloc[:, 24]

In [105]:
label_test = set(y_t)
label_list_test = sorted(label_test)

# Encode the test labels with the TRAINING label order (`label_list`).
# Numbering them from the test set's own sorted labels gives different codes
# as soon as the two label sets differ, which silently corrupts every metric.
unknown_labels = label_test - set(label_list)
if unknown_labels:
    raise ValueError(
        'test labels not present in training data: {}'.format(sorted(unknown_labels))
    )
label_to_index = {name: i for i, name in enumerate(label_list)}
y_t = np.array(y_t.map(label_to_index), dtype = "int")
print(len(y_t))
y_t

## XGBoostが扱うデータ形式にする

In [106]:
# Wrap the features and integer labels in xgboost DMatrix objects;
# these are what the xgb.train / Booster.predict cells consume.
xgb_train = xgb.DMatrix(x, label=y)
xgb_test = xgb.DMatrix(x_t, label=y_t)

In [107]:
# The bare string below is an expression statement used as a block comment.
# In English: "Decide the XGBoost parameters. objective: softmax, because the
# class is chosen from per-class probabilities in a multi-class problem.
# num_class: 4-class classification."
'''
XGBoostのパラメータを決める
 - objective : 多クラス分類でクラス別の確率をもとにクラスタリングを行うのでsoftmax
 - num_class : 4クラス分類 
'''

# Parameter dict for the native xgboost training API.
# NOTE(review): `param` is assigned again in a later cell, so which dict is
# live depends on notebook execution order.
param = {
    'objective': 'multi:softmax',
    'num_class': 4,
}

## 学習と評価(パラメータチューニングなし)

In [54]:
# NOTE(review): this is a regressor, while the labels are integer class codes
# and the other models in this notebook are classifiers. Presumably it is only
# used to time training; confirm XGBRegressor is intended.
model_time = xgb.XGBRegressor()

In [55]:
%%time
# Fit with default hyperparameters on the training features and labels.
model_time.fit(x,y) 

In [176]:
# NOTE(review): `model` is not assigned in any cell above this one in file
# order; it is only bound in later cells. Presumably this relies on notebook
# execution order - verify which model is being evaluated here.
y_predict = model.predict(xgb_test)

In [177]:
# Score the predictions against the encoded test labels.
acc = accuracy_score(y_t, y_predict)
f1_test = f1_score(y_t, y_predict, average='weighted')

print('accuracy : {}'.format(acc))
# Label corrected: this line reports the weighted F1 score, not the accuracy.
print('f1 : {}'.format(f1_test))

## 学習と評価(グリッドサーチ)

In [36]:
# Search grid: only `eta` and `gamma` vary; every other entry is pinned to a
# single value.
params = {'eta': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
          'gamma': [0.001, 0.01, 0.1, 1],
          'n_estimators': [100], 'max_depth':[2],
          # Key corrected from 'min_child_weigh', which is not an XGBoost
          # parameter name, so the intended setting was never applied.
          'min_child_weight': [1], 'nthread': [2] }

model = xgb.XGBClassifier(objective='multi:softmax')

# Stratified 20-fold cross-validation with a fixed shuffle seed; the grid
# search scores each candidate by accuracy.
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=1)
clf = GridSearchCV(estimator=model, param_grid=params,
                    cv=skf, scoring="accuracy", n_jobs=1, verbose=3)
clf.fit(x, y)

In [37]:
# Report the best score found by the grid search and the parameters behind it.
print("Best score: {:.4f}".format(clf.best_score_))
print(clf.best_params_)

In [38]:
# Parameters for the native xgboost training API, using the values picked
# from the grid search.
param = {
    'objective': 'multi:softmax',
    # Derived from the encoded training labels instead of a hard-coded 3:
    # the labels run 0..len(label_list)-1, and the rest of the notebook
    # treats this as a 4-class problem.
    'num_class': len(label_list),
    'eta': 0.5,
    'gamma': 0.01,
    'max_depth': 2,
    'nthread': 2,
}

In [51]:
%%time
# Train a booster with the native API for 100 boosting rounds on the training
# DMatrix. This rebinds `model`, replacing whatever it referred to before.
model = xgb.train(param,
                  xgb_train,
                  num_boost_round=100,
                  )

In [40]:
# Predict on the test DMatrix with whatever `model` is currently bound to.
y_predict = model.predict(xgb_test)

## 学習と評価(Optunaによるハイパーパラメータ最適化)

In [108]:
# Earlier hand-written per-class weight candidates, kept for reference:
# balanced = {0:3.5, 1:3.5, 2:1, 3:3}
# balanced = {0:1, 1:1, 2:1, 3:1}
# Baseline classifier with a fixed seed.
# NOTE(review): the Optuna objective constructs its own classifier for every
# trial, so this module-level instance is not used in the cells visible here.
xgboost_tuna = XGBClassifier(random_state=42)

### Objective Functionの作成

In [109]:
# colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1)
def opt(trial):
    """Optuna objective: fit one XGBClassifier and return ``1 - mean F1``.

    Hyperparameters are drawn from ``trial``. The model is fitted on the
    module-level training data ``x``/``y`` with "balanced" sample weights and
    scored on the module-level test data ``x_t``/``y_t``.

    NOTE(review): the score is computed on the same test data the final
    model is evaluated on later, so the reported test metrics are
    optimistically biased. A cross-validated score on the training data was
    sketched in earlier commented-out code and would avoid this.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: ``1.0`` minus the unweighted mean of the per-class F1 scores;
        lower is better.
    """
    # At least one boosting round: a lower bound of 0 lets a trial build a
    # model with no trees at all.
    n_estimators = trial.suggest_int('n_estimators', 1, 500)
    max_depth = trial.suggest_int('max_depth', 1, 5)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 50)
    # suggest_float replaces the deprecated suggest_loguniform and
    # suggest_discrete_uniform; names, ranges and step sizes are unchanged.
    eta = trial.suggest_float('eta', 0.01, 3.0, log=True)
    subsample = trial.suggest_float('subsample', 0.5, 0.9, step=0.1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.01, 0.3, step=0.01)

    # Local name chosen so it does not shadow the module-level `xgboost_tuna`.
    classifier = XGBClassifier(
        random_state=42,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        eta=eta,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
    )

    classifier.fit(x, y, sample_weight=compute_sample_weight("balanced", y))
    predicted = classifier.predict(x_t)

    # average=None yields one F1 per class; their plain mean is the score.
    f1 = f1_score(y_t, predicted, average=None)
    return (1.0 - f1.mean())

In [110]:
# Create a study with default settings and run 100 trials of the objective.
# NOTE(review): no direction or sampler seed is given; presumably the default
# direction is minimisation, which matches the 1 - F1 objective - confirm.
# Without a seed the search is not reproducible between runs.
study = optuna.create_study()
study.optimize(opt, n_trials=100)

In [111]:
# Show the best hyperparameters, the objective value they reached, and the
# full record of the best trial.
print(study.best_params)
print(study.best_value)
print(study.best_trial)

In [112]:
#     colsample_bytree=study.best_params["colsample_bytree"],
# Final classifier: same fixed seed as during the search, with the six tuned
# hyperparameters copied from the best Optuna trial.
fin_xgboost = XGBClassifier(
    random_state=42,
    **{
        key: study.best_params[key]
        for key in (
            "n_estimators",
            "max_depth",
            "min_child_weight",
            "eta",
            "subsample",
            "colsample_bytree",
        )
    },
)

In [113]:
# Fit the final model on the training data, using "balanced" sample weights
# computed from the training labels.
fin_xgboost.fit(x, y, sample_weight=compute_sample_weight("balanced", y))

In [114]:
%%time
# Compute predictions for the test data.
y_predict = fin_xgboost.predict(x_t)

In [121]:
%%timeit
# Benchmark prediction on the test features; the result is discarded.
fin_xgboost.predict(x_t)

## 評価

## 学習ecopa, テストarena(混同行列はvoices, buzzer, whistle, base)

In [17]:
# Accuracy plus per-class F1, precision and recall (average=None returns one
# value per class) for the current predictions.
acc = accuracy_score(y_t, y_predict)
f1_test = f1_score(y_t, y_predict, average=None)
precision_rbf_test = precision_score(y_t, y_predict, average=None)
recall_rbf_test = recall_score(y_t, y_predict, average=None)

print(f'accuracy : {acc}')
print(f"Precision : {precision_rbf_test}")
print(f"Recall : {recall_rbf_test}")
# F1 is reported as the plain mean over the classes.
print(f'f1: {f1_test.mean()}')

In [116]:
# Accuracy plus per-class F1, precision and recall (average=None returns one
# value per class) for the current predictions.
acc = accuracy_score(y_t, y_predict)
f1_test = f1_score(y_t, y_predict, average=None)
precision_rbf_test = precision_score(y_t, y_predict, average=None)
recall_rbf_test = recall_score(y_t, y_predict, average=None)

print(f'accuracy : {acc}')
print(f"Precision : {precision_rbf_test}")
print(f"Recall : {recall_rbf_test}")
# F1 is reported as the plain mean over the classes.
print(f'f1: {f1_test.mean()}')

## 学習arena, テストecopa(混同行列はvoices, buzzer, whistle, base)

In [98]:
# Accuracy plus per-class F1, precision and recall (average=None returns one
# value per class) for the current predictions.
acc = accuracy_score(y_t, y_predict)
f1_test = f1_score(y_t, y_predict, average=None)
precision_rbf_test = precision_score(y_t, y_predict, average=None)
recall_rbf_test = recall_score(y_t, y_predict, average=None)

print(f'accuracy : {acc}')
print(f"Precision : {precision_rbf_test}")
print(f"Recall : {recall_rbf_test}")
# Per-class F1 first, then its plain mean over the classes.
print(f'f1: {f1_test}')
print(f'f1_mean: {f1_test.mean()}')