# xgbのためのデータセットの準備

In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from Mydataset import MyDataset
import os
import glob

root_dir = "./data_temp"

train_indices = [] # 学習データのindex
val_indices = [] # validationデータのindex

mydataset = MyDataset(root_pth=root_dir, test=False)
n_samples = len(mydataset) # サンプル(データ)数 -> 31769
folder_count = np.load(os.path.join(root_dir, 'audio', 'folder_count.npy')).tolist()

total_num = 0
for num in folder_count:
    # folder_count : [6134, 4534, 4386, 3994, 4342, 5382, 1100, 1001, 923]
    fol_indices = list(range(total_num, total_num+num))
    fol_indices = random.sample(fol_indices, num)
    # train : validation = 80 : 20
    train_size = int(num * 0.8)

    train_indices += fol_indices[0:train_size]
    val_indices += fol_indices[train_size:]

    total_num += num

In [2]:
print(len(train_indices))
print(len(val_indices))
print(len(train_indices) + len(val_indices))

24683
6176
30859


In [3]:
import glob

X_train = []
X_valid = []
y_train = []
y_valid = []

filling_type = np.load("./data_temp/audio/filling_type.npy")
pouring_or_shaking = np.load("./data_temp/audio/pouring_or_shaking.npy")
label = filling_type * pouring_or_shaking

files = glob.glob("./data_temp/audio/mfcc/*.npy")
print("files : ", len(files))

for i, file in enumerate(files):
    if i in train_indices:
        X_train.append(np.load(file))
        y_train.append(label[i])
    if i in val_indices:
        X_valid.append(np.load(file))
        y_valid.append(label[i])

print("X_train : ", len(X_train), "   y_train : ", len(y_train))
print("X_valid : ", len(X_valid), "   y_valid : ", len(y_valid))


files :  30859
X_train :  24683    y_train :  24683
X_valid :  6176    y_valid :  6176


In [None]:
X_train[0].shape

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

params = {
    "max_depth" : [1,2,3,4,5],
    "min_sample_leaf" : [1,2,3,4,5,6,7,8,9,10],
    "min_sample_split" : [2,3,4,5]
}

# データ型の変換
X_train, y_train = np.asarray(X_train), np.asarray(y_train)
X_valid, y_valid = np.asarray(X_valid), np.asarray(y_valid)
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

forest = GridSearchCV()

# 勘違いしてた。別にxgbostはaudioでOK

In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from Mydataset import MyDataset
import os
import glob
import scipy.io.wavfile

In [2]:
sample_rate, audio = scipy.io.wavfile.read("data_temp/1/audio/s0_fi0_fu0_b0_l0_audio.wav")
print(audio.shape)
audio = audio.reshape(-1)
audio.shape

(116940, 8)


(935520,)

In [3]:
sample_rate, audio1 = scipy.io.wavfile.read("data/1/audio/s0_fi0_fu0_b0_l1_audio.wav")
print(audio1.shape)
audio1 = audio1.reshape(-1)
audio1.shape

(102387, 8)


(819096,)

In [4]:
max = audio.shape[0]
padding_len = max - audio1.shape[0]
padding_len

116424

In [5]:
np.pad(audio1, (0, padding_len), "constant").shape

(935520,)

In [6]:
# data path -> root_pth
root_pth = "./data"

# df <- annotations.csv
df = pd.read_csv('annotations_sort.csv', header = 0)
df_len=len(df) # データ数

# 以下xgboost用の変数
audio_filling_type = [] # audio用のlabel
audio_pour_shake = []
audio_max = 0

for fileidx in range(df_len):
    # pandas : df.iatの説明
    # https://note.nkmk.me/python-pandas-at-iat-loc-iloc/
    file_name = df.iat[fileidx, 2]
    folder_num = df.iat[fileidx, 0] # container_id
    start_time =  df.iat[fileidx, 9] # start
    end_time = df.iat[fileidx, 10] # end
    filling_type = df.iat[fileidx, 4] # filling_type:0~3(none pasta rice water)
    
    # python : rsplitの説明
    # https://note.nkmk.me/python-split-rsplit-splitlines-re/
    # s0_fi0_fu0_b0_l0_c2 -> s0_fi0_fu0_b0_l0_audio.wav
    audio_filename = file_name.rsplit("_", 1)[0] + '_audio.wav'

    audio_path = os.path.join(root_pth, str(folder_num), 'audio', audio_filename)
    # 377番目の音声データは飛ばす
    if audio_path == "./data/1/audio/s2_fi1_fu2_b1_l0_audio.wav" :
        continue

    # wavファイルの読み取り : scipy.io.wavfile ↓公式サイト
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.read.html
    # 返り値 : sample_rate==int signal==numpy array (N_samples, N_channels)
    sample_rate, signal = scipy.io.wavfile.read(audio_path)
    # sample_rate:44100, signal:(N, 8)(numpy.ndarray)

    # numpyのcast 
    # https://note.nkmk.me/python-numpy-dtype-astype/
    signal = signal.astype("float32")
    # トリミング手法
    # https://librosa.org/doc/main/generated/librosa.effects.trim.html
    signal /= np.abs(signal).max() # 正規化

    # xgboost 用のデータ処理
    audio_filling_type.append(filling_type)
    # container_id(folder_num)が1~6ならpouring, 7~9:shaking
    pouring = [1,2,3,4,5,6]
    shaking = [7,8,9]
    if folder_num in pouring :
        audio_pour_shake.append(1)
    elif folder_num in shaking:
        audio_pour_shake.append(0)
    else :
        print("no container id")
    
    # 次元を揃えるために一番次元の長いやつを求めておく
    if audio_max < signal.reshape(-1).shape[0]:
        audio_max = signal.reshape(-1).shape[0]


In [7]:
sample_rate, audio1 = scipy.io.wavfile.read("data/1/audio/s0_fi0_fu0_b0_l0_audio.wav")
sample_rate, audio2 = scipy.io.wavfile.read("data/1/audio/s0_fi0_fu0_b0_l1_audio.wav")

X_train = []
X_train.append(audio1)
X_train.append(audio2)
print(len(X_train))

2


In [8]:
audio1.reshape(-1).shape[0]
audio1.shape
audio1[:, 0].shape

(116940,)

In [9]:
audio_x = []
audio_y = []

filling_type = np.load("./data/audio/audio_filling_type.npy")
pour_or_shake = np.load("./data/audio/audio_pour_shake.npy")
label = filling_type * pour_or_shake

print("filling_type : ", len(filling_type))
print("pour_or_shake : ", len(pour_or_shake))
print("label : ", len(label))
print("label-kind : ", np.unique(np.array(label)))

# チャンネル1だけでやってみる
total_idx = 0
max_length = 0
for i in range(1, 10):
    files = glob.glob(f"./data/{i}/audio/*.wav")
    for idx, file in enumerate(files):
        sample_rate, audio = scipy.io.wavfile.read(file)
        if max_length < audio[:,0].shape[0]:
            max_length = audio[:,0].shape[0]
        # print(type(audio))
        # break
        # audio_x.append(audio)
        # audio_y.append(label[total_idx + idx])

print('max_length : ', max_length)
total_idx = 0       
for i in range(1, 10):
    files = glob.glob(f"./data/{i}/audio/*.wav")
    # print('files ; ', len(files))
    count = 0
    for idx, file in enumerate(files):
        sample_rate, audio = scipy.io.wavfile.read(file)
        pad_len = max_length - audio[:,0].shape[0]
        audio_x.append(np.pad(audio[:,0], (0, pad_len), "constant"))
        audio_y.append(label[total_idx+idx])
        count += 1
    total_idx += count

print("audio_x : ", len(audio_x))
print("audio_y : ", len(audio_y))

filling_type = np.array(filling_type)
print(np.unique(filling_type))
audio_x = np.array(audio_x)
audio_y = np.array(audio_y)
print(np.unique(audio_y))
print("audio_y==0 : ", np.count_nonzero(audio_y==0))
print("audio_y==1 : ", np.count_nonzero(audio_y==1))
print("audio_y==2 : ", np.count_nonzero(audio_y==2))
print("audio_y==3 : ", np.count_nonzero(audio_y==3))
print(252 + 143 + 144 + 144)

filling_type :  683
pour_or_shake :  683
label :  683
label-kind :  [0 1 2 3]
max_length :  1587410
audio_x :  683
audio_y :  683
[0 1 2 3]
[0 1 2 3]
audio_y==0 :  252
audio_y==1 :  143
audio_y==2 :  144
audio_y==3 :  144
683


In [10]:
print(audio_x.shape)
print(audio_x[0].shape)

(683, 1587410)
(1587410,)


In [16]:
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss

X_train, X_valid, y_train, y_valid = train_test_split(audio_x, audio_y,
                                                      test_size=0.2,
                                                      shuffle=True,
                                                      random_state=0)

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
watchlist = [(dtrain, "train"), (dvalid, "eval")]
# main parameter
params = {
    "objective" : 'multi:softprob',
    "num_class" : 4,
    "eval_metric" : "mlogloss"
}
num_round = 50

# params_space = {
#     "eta" : [0.01, 1.0, 1.0],
#     "gamma" : [0, 0.1],
#     "n_estimators" : [10, 100],
#     "max_depth" : [2,3,4],
#     "min_child_weight" : [1,2]um
# }
model = xgb.train(params, dtrain, num_round, evals=watchlist, early_stopping_rounds=10)

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# forest = GridSearchCV(model ,params_space, cv=skf, scoring="accuracy", n_jobs=1, verbose=3)
# forest.fit(X_train, y_train)
# best_params = forest.best_params_
# print(best_params)
# best_score = forest.best_score_
# print("best score : ", best_score)

va_pred = model.predict(dvalid)
va_pred = np.argmax(va_pred, axis=1)
score_acc = accuracy_score(y_valid, va_pred)
score_logloss = log_loss(y_valid, va_pred)
print("acc : ", score_acc)
print("log-loss : ", score_logloss)



[0]	train-mlogloss:1.01945	eval-mlogloss:1.34267
[1]	train-mlogloss:0.77658	eval-mlogloss:1.33808
[2]	train-mlogloss:0.59060	eval-mlogloss:1.34531
[3]	train-mlogloss:0.44595	eval-mlogloss:1.33524
[4]	train-mlogloss:0.34250	eval-mlogloss:1.34560
[5]	train-mlogloss:0.27169	eval-mlogloss:1.34890
[6]	train-mlogloss:0.21600	eval-mlogloss:1.35028
[7]	train-mlogloss:0.17119	eval-mlogloss:1.35117
[8]	train-mlogloss:0.13845	eval-mlogloss:1.35061
[9]	train-mlogloss:0.11447	eval-mlogloss:1.33090
[10]	train-mlogloss:0.09425	eval-mlogloss:1.32172
[11]	train-mlogloss:0.07820	eval-mlogloss:1.32419
[12]	train-mlogloss:0.06617	eval-mlogloss:1.32934
[13]	train-mlogloss:0.05670	eval-mlogloss:1.32047
[14]	train-mlogloss:0.04898	eval-mlogloss:1.32559
[15]	train-mlogloss:0.04286	eval-mlogloss:1.31748
[16]	train-mlogloss:0.03798	eval-mlogloss:1.32680
[17]	train-mlogloss:0.03377	eval-mlogloss:1.31745
[18]	train-mlogloss:0.03045	eval-mlogloss:1.32019
[19]	train-mlogloss:0.02767	eval-mlogloss:1.32303
[20]	train

ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [28]:
pred = np.argmax(va_pred, axis=1)
y_valid.shape
va_pred.shape
pred.shape

(137,)

In [39]:
print(np.unique(pred))
print(np.unique(y_valid))

[0 1 2 3]
[0 1 2 3]


In [38]:
pred

array([2, 0, 2, 0, 3, 3, 2, 3, 0, 0, 1, 2, 3, 0, 2, 1, 2, 3, 3, 2, 1, 0,
       3, 0, 2, 2, 0, 0, 3, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 1, 1, 3, 0, 0,
       3, 0, 2, 1, 2, 0, 0, 2, 1, 0, 1, 0, 1, 2, 2, 2, 0, 2, 0, 0, 0, 0,
       3, 0, 0, 3, 0, 0, 0, 1, 0, 1, 3, 2, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0,
       2, 3, 3, 0, 3, 3, 0, 1, 2, 2, 2, 1, 0, 2, 1, 3, 0, 1, 1, 2, 1, 0,
       0, 1, 2, 1, 3, 0, 0, 1, 1, 0, 0, 0, 2, 1, 0, 2, 0, 1, 3, 0, 0, 0,
       0, 1, 0, 0, 2])

In [33]:
y_valid

array([0, 2, 3, 0, 3, 2, 3, 2, 3, 0, 2, 2, 2, 0, 0, 1, 1, 3, 3, 2, 3, 3,
       0, 0, 1, 1, 0, 1, 3, 0, 0, 0, 1, 3, 0, 3, 0, 2, 1, 1, 3, 2, 2, 0,
       2, 0, 2, 3, 0, 0, 1, 0, 3, 0, 2, 0, 3, 0, 2, 0, 3, 1, 2, 0, 2, 3,
       3, 3, 1, 3, 0, 0, 2, 2, 3, 1, 3, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 2, 1, 1, 3, 3, 0, 0, 2, 3, 0, 0, 2, 1, 1, 2, 1, 1, 3, 1, 0,
       0, 0, 3, 2, 1, 0, 0, 3, 1, 1, 2, 2, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0,
       3, 1, 0, 3, 3])

In [40]:
score_acc = accuracy_score(y_valid, pred)
# score_logloss = log_loss(y_valid, pred)
print("acc : ", score_acc)
# print("log-loss : ", score_logloss)

acc :  0.43795620437956206
