In [2]:
'''
Audio covid-19  AI

We modified the [Baseline] code in the following ways:

1. Preprocessing with librosa.effects.trim() - removal of the silent parts of each recording.
2. The training data is split into train and validation sets, and the best MLPClassifier hyperparameters are chosen by cross-validation.

'''
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt

from PIL import Image
import matplotlib as mpl
import librosa.display
import warnings
warnings.filterwarnings(action='ignore') 

# Global settings shared by the feature-extraction and modelling cells.
CFG = dict(
    SR=16000,    # sampling rate passed to librosa.load
    N_MFCC=32,   # number of MFCC coefficients to extract
    SEED=133,    # random_state used for the final MLPClassifier
)

# def seed_everything(seed):
#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)

# seed_everything(CFG['SEED']) # Seed 고정

# Load the train and test metadata CSVs.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train_data.csv', 'test_data.csv')
)

In [3]:
def _trim_and_pad(y, sr, top_db, pad_seconds, chunk_seconds):
    """Cut leading/trailing silence and force the clip to a fixed length.

    The non-silent interval reported by ``librosa.effects.trim`` is widened by
    ``pad_seconds`` on both sides (clamped to the signal bounds), then the
    result is truncated or zero-padded to exactly ``chunk_seconds`` seconds.
    Reference: https://github.com/virufy/virufy-covid/virufy_cdf_quickstart.ipynb
    """
    _, (first, last) = librosa.effects.trim(y, top_db=top_db)
    pad = pad_seconds * sr
    start = int(max(first - pad, 0))
    end = int(min(last + pad, len(y)))
    voiced = y[start:end]

    n_samples = int(chunk_seconds * sr)
    clip = np.zeros(n_samples)
    n_copy = min(n_samples, len(voiced))
    clip[:n_copy] = voiced[:n_copy]
    return clip


def get_mfcc_feature(df, data_type, save_path, root_folder='./',
                     top_db=40, pad_seconds=0.25, chunk_seconds=4.09):
    """Extract mean MFCC features for every recording in ``df`` and save them.

    For each value in ``df['id']`` the file
    ``<root_folder>/<data_type>/<id zero-padded to 5 digits>.wav`` is loaded at
    ``CFG['SR']``, silence-trimmed, forced to a fixed length, and converted to
    ``CFG['N_MFCC']`` MFCC coefficients; the mean of each coefficient becomes
    one feature column (``mfcc_1`` .. ``mfcc_N``). The features are appended to
    ``df`` column-wise and written to ``save_path`` as CSV.

    If ``save_path`` already exists nothing is computed (it acts as a cache).

    Args:
        df: DataFrame with an ``id`` column.
        data_type: Sub-folder of ``root_folder`` holding the wav files
            (e.g. ``'train'`` or ``'test'``).
        save_path: Destination CSV path.
        root_folder: Folder that contains the ``data_type`` sub-folder.
        top_db: Silence threshold (dB) passed to ``librosa.effects.trim``.
        pad_seconds: Margin kept around the detected non-silent interval.
        chunk_seconds: Fixed clip length used for MFCC extraction.

    Returns:
        None. The result is written to ``save_path``.
    """
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    audio_dir = os.path.join(root_folder, data_type)
    features = []
    # tqdm already reports progress, so no per-file printing is done here.
    for uid in tqdm(df['id']):
        path = os.path.join(audio_dir, str(uid).zfill(5) + '.wav')

        # Load the wav file at the configured sampling rate.
        y, sr = librosa.load(path, sr=CFG['SR'])

        clip = _trim_and_pad(y, sr, top_db, pad_seconds, chunk_seconds)
        mfcc = librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC coefficient as the feature.
        features.append([np.mean(coeff) for coeff in mfcc])

    # Append the audio features to the existing metadata columns.
    columns = ['mfcc_' + str(i) for i in range(1, CFG['N_MFCC'] + 1)]
    mfcc_df = pd.DataFrame(features, columns=columns)
    # concat(axis=1) aligns on index; reset it so rows are matched by position
    # even when ``df`` was filtered or shuffled by the caller.
    out_df = pd.concat([df.reset_index(drop=True), mfcc_df], axis=1)

    # Make sure the target folder exists before writing.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(save_path, index=False)
    print('Done.')

In [5]:
# Build (or reuse, if the CSV already exists) the MFCC feature table of each split.
for split_df, split_name, feature_csv in (
    (train_df, 'train', 'open/train_mfcc_data3.csv'),
    (test_df, 'test', 'open/test_mfcc_data3.csv'),
):
    get_mfcc_feature(split_df, split_name, feature_csv)

open/train_mfcc_data3.csv is exist.
open/test_mfcc_data3.csv is exist.


In [8]:

# Load the training table that combines the per-file MFCC features with the
# status information columns.
train_df = pd.read_csv('open/train_mfcc_data3.csv')

# Separate the label (y) from the model inputs (x); 'id' is not used as an input.
train_y = train_df.loc[:, 'covid19']
train_x = train_df.drop(columns=['covid19', 'id'])


In [9]:

def onehot_encoding(ohe, x):
    """Replace the 'gender' column of ``x`` with its one-hot encoded columns.

    Args:
        ohe: An already fitted one-hot encoder exposing ``transform`` and
            ``categories_`` (fit it on the training data only).
        x: DataFrame containing a ``gender`` column.

    Returns:
        A new DataFrame with ``gender`` dropped and one column per encoder
        category appended; the index of ``x`` is preserved.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1, 1))
    # Encoders configured for sparse output return a sparse matrix; densify it.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    # Reuse x's index: pd.concat(axis=1) aligns on index, so a default
    # RangeIndex here would misalign rows (and create NaN rows) whenever x was
    # filtered or shuffled, e.g. after train_test_split.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)


In [10]:

# The 'gender' column is categorical and needs extra preprocessing -> apply OneHotEncoder.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)


In [11]:
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import f1_score
import os, sys, time

## Split the training data 70/30 into train/test parts.
## Note: the random_state value affects the reported test score.
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.3, random_state=1200)

# Parameter combinations that were explored (kept for reference only):
# tuned_parameters = {
#     'activation': (['identity', 'logistic', 'tanh', 'relu']),
#     'hidden_layer_sizes': ([[40], [45], [50], [55], [60], [65], [70], [75], [80], [85], [90], [95], [100], [105], [110], [115], [120], [125], [130], [135], [140]]),
#     'alpha':     ([0.01,0.001, 0.0001]),
#     'batch_size':         ['auto'],
#     'learning_rate_init':    [0.01, 0.001],
#     'solver': (['lbfgs', 'sgd', 'adam'])
# }

# Best parameter combination found with GridSearchCV / RandomizedSearchCV.
tuned_parameters = {
    'activation': ['relu'],
    'hidden_layer_sizes': [65],
    'alpha':     [0.01],
    'batch_size':  ['auto'],
    'learning_rate_init':    [0.01],
    'solver': ["adam"]
}

# Seed the estimator used inside the search as well; otherwise every run
# initialises the network differently and the CV scores are not reproducible.
clf = GridSearchCV(MLPClassifier(random_state=CFG['SEED']), tuned_parameters,
                   cv=5, n_jobs=1, scoring='f1_macro', verbose=1)

st = time.time()
clf.fit(X_train, y_train)
print('elapsed... ', time.time()-st)

print(clf.best_params_)
a = clf.best_params_  # short name kept in case later cells refer to it

# Re-create the model from the selected parameters with a fixed seed.
clf2 = MLPClassifier(**a, random_state=CFG['SEED'])

# fit() returns the estimator itself, so clf_nn_opt and clf2 are the same object.
clf_nn_opt = clf2.fit(X_train, y_train)

## Evaluate on the held-out 30% of the training data.
y_pred = clf2.predict(X_test)

res = f1_score(y_test, y_pred, average='macro')

print(f'test of train data: f1_macro scores = {res}')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
elapsed...  1.4346873760223389
{'activation': 'relu', 'alpha': 0.01, 'batch_size': 'auto', 'hidden_layer_sizes': 65, 'learning_rate_init': 0.01, 'solver': 'adam'}
test of train data: f1_macro scores = 0.6056301130071329


In [14]:

#
# Apply the same preprocessing used for the training data to the test data,
# then predict and write the submission file.
#

# Model trained on the training data with the selected parameters.
model = clf_nn_opt

# To avoid data leakage, reuse the encoder that was fitted on the training data only.
test_x = onehot_encoding(
    ohe,
    pd.read_csv('open/test_mfcc_data3.csv').drop(columns=['id']),
)

# Model inference.
preds = model.predict(test_x)

# Fill the sample submission with the predictions and save it.
submission = pd.read_csv('sample_submission.csv').assign(covid19=preds)
submission.to_csv('./submit6.csv', index=False)

