<a href="https://colab.research.google.com/github/ttogle918/AI_practice/blob/main/dacon_%EC%9D%8C%EC%84%B1_%EB%B6%84%EB%A5%98_%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/cnn2d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DACON sample 코드

[[Baseline] 1. 데이터 분석 입문자를 위한 데이터 분석 & 예측](https://dacon.io/competitions/official/235905/codeshare/5137?page=1&dtype=recent&ptype&fType)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# dataset

In [2]:
# audio 전처리를 위한 lib
import librosa
import librosa.display as dsp
from IPython.display import Audio

In [3]:
# 데이터 전처리를 위한 lib
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [4]:
train_path = '/content/drive/MyDrive/Colab Notebooks/dataset/dacon/user_data/train/'
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/dataset/dacon/user_data/train.csv'

In [5]:
train = pd.read_csv(train_csv_path)
train.head(3)

Unnamed: 0,file_name,label
0,001.wav,9
1,002.wav,0
2,004.wav,1


In [6]:
train.shape

(400, 2)

In [7]:
# 같은 결과
train.label.groupby(train.label).count()
# train.label.value_counts()

label
0    40
1    40
2    40
3    40
4    40
5    40
6    40
7    40
8    40
9    40
Name: label, dtype: int64

In [8]:
import random

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(929)

In [9]:
data, sample_rate = librosa.load(f'{train_path}001.wav', sr = 16000)
print('sample_rate:', sample_rate, ', audio shape:', data.shape)
print('length:', data.shape[0]/float(sample_rate), 'secs')

sample_rate: 16000 , audio shape: (10192,)
length: 0.637 secs


- sample_rate: 16000
  - 초당 16000개(16000Hz 주파수)의 샘플을 가지고 있는 데이터, 1초에 음성 신호를 16000번 샘플링
  - default : 22050Hz
  - 16000Hz로 설정한 이유 : 사람의 목소리는 대부분 16000Hz안에 포함되기 때문이다.
audio shape: (10192,)
length: 0.637 secs = sample_rate * audio_shape

## dataframe 생성

In [10]:
def train_dataset():
    dataset = []
    for file in tqdm(os.listdir(train_path),colour='green'):
        if 'wav' in file:
            abs_file_path = os.path.join(train_path,file)
            data, sr = librosa.load(abs_file_path, sr = 16000)
            class_label = int(train[train.file_name == file].label)
            dataset.append([data, sr, class_label])
    
    print("Dataset 생성 완료")
    return pd.DataFrame(dataset,columns=['data','sample_rate', 'label'])

In [11]:
train_wav = train_dataset()

100%|[32m██████████[0m| 400/400 [00:12<00:00, 33.30it/s]

Dataset 생성 완료





In [12]:
train_wav.head(3)

Unnamed: 0,data,sample_rate,label
0,"[0.00027186112, 0.00052218814, 0.00040256415, ...",16000,1
1,"[0.00011985076, 0.00016174652, 0.00017246709, ...",16000,9
2,"[3.6655838e-05, -3.7366447e-06, 3.4776433e-05,...",16000,5


In [13]:
train_wav['len'] = train_wav.data.apply(len)
train_wav.head()

Unnamed: 0,data,sample_rate,label,len
0,"[0.00027186112, 0.00052218814, 0.00040256415, ...",16000,1,10328
1,"[0.00011985076, 0.00016174652, 0.00017246709, ...",16000,9,10192
2,"[3.6655838e-05, -3.7366447e-06, 3.4776433e-05,...",16000,5,8642
3,"[-0.00010774565, -0.0001280595, -0.00013393736...",16000,0,14058
4,"[9.2610695e-05, 0.00018442213, 0.00021447388, ...",16000,7,11326


In [14]:
train_wav.len.describe()

count      400.000000
mean     10326.187500
std       1811.356277
min       5711.000000
25%       8934.750000
50%      10250.500000
75%      11593.000000
max      15573.000000
Name: len, dtype: float64

In [19]:
maxlen = max(train_wav.len)
train_wav['data'] = train_wav['data'].apply(lambda x : librosa.util.fix_length(x, size=maxlen))
train_wav.head(3)

Unnamed: 0,data,sample_rate,label,len
0,"[0.00027186112, 0.00052218814, 0.00040256415, ...",16000,1,10328
1,"[0.00011985076, 0.00016174652, 0.00017246709, ...",16000,9,10192
2,"[3.6655838e-05, -3.7366447e-06, 3.4776433e-05,...",16000,5,8642


In [25]:
train_wav['llen'] = train_wav.data.apply(len)
train_wav['llen'].unique()
train_wav.head(3)

Unnamed: 0,data,sample_rate,label,len,llen
0,"[0.00027186112, 0.00052218814, 0.00040256415, ...",16000,1,10328,15573
1,"[0.00011985076, 0.00016174652, 0.00017246709, ...",16000,9,10192,15573
2,"[3.6655838e-05, -3.7366447e-06, 3.4776433e-05,...",16000,5,8642,15573


In [26]:
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(train_wav.data, train_wav.label, test_size=0.2, stratify=train_wav.label)
print('학습시킬 train 셋 : ', train_X.shape, train_X.iloc[0].shape, train_y.shape)
print('검증할 val 셋 : ', val_X.shape, val_X.iloc[0].shape, val_y.shape)

학습시킬 train 셋 :  (320,) (15573,) (320,)
검증할 val 셋 :  (80,) (15573,) (80,)


In [27]:
val_y.value_counts()

7    8
6    8
4    8
0    8
3    8
5    8
2    8
1    8
9    8
8    8
Name: label, dtype: int64

## 음성 데이터 특징 추출


In [34]:
def preprocess_train_dataset(data, y_data, maxlen):
    mfccs = []
    y_data_list = []
    for d, y_ in zip(data, y_data) :
        d = librosa.util.fix_length(d, size=maxlen,)
        for r in (0, 1e-4, 1e-3):
          extracted_features = librosa.feature.mfcc(y= d + ((np.random.rand(*d.shape) - 0.5) * r), sr=16000, n_mfcc=64)
          mfccs.append(extracted_features)
          y_data_list.append(y_)

    mfccs = np.array(mfccs)
    y_data_list = np.array(y_data_list)
    return mfccs, y_data_list

def preprocess_val_dataset(data, y_data, maxlen):
    mfccs = []
    y_data_list = []
    for d, y_ in zip(data, y_data) :
        d = librosa.util.fix_length(d, size=maxlen)
        extracted_features = librosa.feature.mfcc(y= d, sr=16000, n_mfcc=64)
        mfccs.append(extracted_features)
        y_data_list.append(y_)
    mfccs = np.array(mfccs)
    y_data_list = np.array(y_data_list)
    return mfccs, y_data_list

In [35]:
train_X, train_y = preprocess_train_dataset(train_X, train_y, maxlen)
val_X, val_y = preprocess_val_dataset(val_X, val_y, maxlen)

In [36]:
train_X.shape, val_X.shape

((960, 64, 31), (80, 64, 31))

In [37]:
train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], train_X.shape[2], -1)
val_X = val_X.reshape(val_X.shape[0], val_X.shape[1], val_X.shape[2], -1)
train_X.shape, val_X.shape

((960, 64, 31, 1), (80, 64, 31, 1))

# 모델 학습 : CNN


In [38]:
from keras import Sequential
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

from keras.layers import Dense, GlobalAveragePooling2D, MaxPooling2D, Conv2D, Flatten, Dropout
from keras.models import Model

In [39]:
input_shape = (train_X.shape[1],train_X.shape[2], 1)
input_shape

(64, 31, 1)

In [98]:
# conv_256_2.csv
def create_Model() :
  model = Sequential()
  model.add(Conv2D(32, kernel_size=2, padding='same', activation='relu', input_shape=input_shape))
  model.add(Conv2D(64, kernel_size=2, padding='same', activation='relu'))
  model.add(MaxPooling2D())
  model.add(Dropout(0.5))

  model.add(Conv2D(128, kernel_size=3, strides=2, padding='same', activation='relu'))
  model.add(Conv2D(256, kernel_size=3, strides=2, padding='same', activation='relu'))
  model.add(GlobalAveragePooling2D())
  model.add(Dropout(0.5))

  model.add(Flatten())

  model.add(Dense(256, activation="relu"))
  model.add(Dense(64, activation="relu"))
  model.add(Dropout(0.1))

  model.add(Dense(10, activation="softmax"))
  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [70]:
# test : 0.98, cnn_256.csv
# conv_256.csv
def create_Model() :
    
  model = Sequential()
  model.add(Conv2D(32, kernel_size=2, padding='same', activation='relu', input_shape=input_shape))
  model.add(Conv2D(64, kernel_size=2, padding='same', activation='relu'))
  model.add(MaxPooling2D())
  model.add(Dropout(0.5))

  model.add(Conv2D(128, kernel_size=3, strides=2, padding='same', activation='relu'))
  model.add(Conv2D(256, kernel_size=3, strides=2, padding='same', activation='relu'))
  model.add(GlobalAveragePooling2D())
  model.add(Dropout(0.5))

  model.add(Flatten())

  model.add(Dense(64, activation="relu"))
  model.add(Dropout(0.1))

  model.add(Dense(10, activation="softmax"))
  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [99]:
model = create_Model()
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_44 (Conv2D)          (None, 64, 31, 32)        160       
                                                                 
 conv2d_45 (Conv2D)          (None, 64, 31, 64)        8256      
                                                                 
 max_pooling2d_15 (MaxPoolin  (None, 32, 15, 64)       0         
 g2D)                                                            
                                                                 
 dropout_30 (Dropout)        (None, 32, 15, 64)        0         
                                                                 
 conv2d_46 (Conv2D)          (None, 16, 8, 128)        73856     
                                                                 
 conv2d_47 (Conv2D)          (None, 8, 4, 256)         295168    
                                                     

In [100]:
from keras.models import load_model
from keras.callbacks import Callback

class save(Callback):
 def __init__(self, model):
  self.count = 0
  self.model = model

 def on_epoch_end(self, bath, logs={}):
  self.count = self.count + 1
  if self.count % 10 == 0:
   model.save("mymodel_epoch_{}.h5".format(self.count))

In [101]:
s = save(model)
es = EarlyStopping(monitor='val_accuracy', patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True)

history = model.fit(train_X, train_y, epochs=100, validation_data=(val_X, val_y), callbacks=[es, mc])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [102]:
model.evaluate(val_X, val_y)



[0.0005299928598105907, 1.0]

In [103]:
train_X, train_y  = train_wav.data, train_wav.label
train_X, train_y = preprocess_train_dataset(train_X, train_y, maxlen)
train_X.shape, train_y.shape

((1200, 64, 31), (1200,))

In [119]:
model = create_Model()
s = save(model)
es = EarlyStopping(monitor='accuracy', patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='accuracy', save_best_only=True)
history = model.fit(train_X, train_y, epochs=18,  callbacks=[es, mc])

Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


# test

In [120]:
test_path = '/content/drive/MyDrive/Colab Notebooks/dataset/dacon/user_data/test/'
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/dataset/dacon/user_data/test.csv'

In [121]:
test = pd.read_csv(test_csv_path)
test.head()

Unnamed: 0,file_name
0,003.wav
1,008.wav
2,010.wav
3,015.wav
4,024.wav


In [122]:
def test_dataset():
    dataset = []
    for file in tqdm(os.listdir(test_path),colour='green'):
        if 'wav' in file:
            abs_file_path = os.path.join(test_path,file)
            data, sr = librosa.load(abs_file_path, sr = 16000)
            dataset.append([data, file])
    
    print("Dataset 생성 완료")
    return pd.DataFrame(dataset,columns=['data', 'file_name'])

In [123]:
test_wav = test_dataset()

100%|[32m██████████[0m| 200/200 [00:05<00:00, 34.86it/s]

Dataset 생성 완료





In [124]:
test_wav.head(3)

Unnamed: 0,data,file_name
0,"[0.00027645202, 0.00047594117, 0.0004510493, 0...",010.wav
1,"[0.00048952625, 0.00085201755, 0.00076409266, ...",008.wav
2,"[-0.00054350717, -0.0008907122, -0.00071864494...",081.wav


In [125]:
def preprocess_test_dataset(data, maxlen):
    mfccs = []
    for d in data :
        d = librosa.util.fix_length(d, size=maxlen)
        extracted_features = librosa.feature.mfcc(y= d, sr=16000, n_mfcc=64)
        mfccs.append(extracted_features)
    mfccs = np.array(mfccs)
    return mfccs

In [126]:
mfccs_2 = preprocess_test_dataset(test_wav.data, maxlen)

In [127]:
# 모델의 예측과 실제 정답값을 비교합니다.
prediction = model.predict(mfccs_2)
prediction = np.argmax(prediction, axis=1)
prediction.shape

(200,)

In [128]:
test_wav['pred_label'] = prediction
test_wav.head(2)

Unnamed: 0,data,file_name,pred_label
0,"[0.00027645202, 0.00047594117, 0.0004510493, 0...",010.wav,8
1,"[0.00048952625, 0.00085201755, 0.00076409266, ...",008.wav,9


In [129]:
submission = pd.read_csv(f'{test_path[:-5]}sample_submission.csv')
submission['label'] = -1
submission.head(2)

Unnamed: 0,file_name,label
0,003.wav,-1
1,008.wav,-1


In [130]:
def get_pred(test_wav, submission) :
  for i, subm in enumerate(submission.file_name) :
    label = test_wav[test_wav['file_name'] == subm].pred_label
    submission['label'].iloc[i] = label
  return submission

In [131]:
submission = get_pred(test_wav, submission)
submission.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,file_name,label
0,003.wav,0
1,008.wav,9
2,010.wav,8
3,015.wav,8
4,024.wav,2


In [132]:
submission.label.value_counts()

8    23
0    21
1    21
9    20
5    20
6    20
4    20
2    19
7    19
3    17
Name: label, dtype: int64

In [133]:
submission.to_csv(f'cnn_512_2.csv', index=False)