# Code

## Import

In [None]:
!pip install librosa

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore') 

## Hyperparameter Setting

In [2]:
CFG = {
    'SR':16000,
    'N_MFCC':16, # MFCC 벡터를 추출할 개수
    'SEED':42
}

In [None]:
CFG2 = {
    'SR':16000,
    'N_MFCC':32, # MFCC 벡터를 추출할 개수
    'SEED':42
}

In [None]:
CFG3 = {
    'SR':16000,
    'N_MFCC':64, # MFCC 벡터를 추출할 개수
    'SEED':42
}

## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-Processing 1
- MFCC 벡터의 개수를 16개,32개,64개로 변경해가며 데이터를 생성하였습니다.
- 제공된 unlabeled 데이터를 이용하여 Pseudo-labeling 기법을 사용하였습니다.

In [None]:
train_df = pd.read_csv('./train_data.csv')
unlabeled_df = pd.read_csv('./unlabeled_data.csv')
test_df = pd.read_csv('./test_data.csv')

In [None]:
#mfcc_16data 생성

def get_mfcc_feature(df, data_type, save_path):
    # Data Folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features_mfcc = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])
        
        y_feature_mfcc = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature_mfcc.append(np.mean(e))
        features_mfcc.append(y_feature_mfcc)
         
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features_mfcc, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [None]:
get_mfcc_feature(train_df, 'train', './train_mfcc_16data.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_16data.csv')
get_mfcc_feature(unlabeled_df, 'unlabeled', './unlabeled_mfcc_16data.csv')

In [None]:
#mfcc_32data 생성

def get_mfcc_feature(df, data_type, save_path):
    # Data Folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features_mfcc = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG2['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG2['N_MFCC'])
        
        y_feature_mfcc = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature_mfcc.append(np.mean(e))
        features_mfcc.append(y_feature_mfcc)
         
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features_mfcc, columns=['mfcc_'+str(x) for x in range(1,CFG2['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [None]:
get_mfcc_feature(train_df, 'train', './train_mfcc_32data.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_32data.csv')
get_mfcc_feature(unlabeled_df, 'unlabeled', './unlabeled_mfcc_32data.csv')

In [None]:
#mfcc_64data 생성

def get_mfcc_feature(df, data_type, save_path):
    # Data Folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features_mfcc = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG3['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG3['N_MFCC'])
        
        y_feature_mfcc = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature_mfcc.append(np.mean(e))
        features_mfcc.append(y_feature_mfcc)
         
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features_mfcc, columns=['mfcc_'+str(x) for x in range(1,CFG3['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [None]:
get_mfcc_feature(train_df, 'train', './train_mfcc_64data.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_64data.csv')
get_mfcc_feature(unlabeled_df, 'unlabeled', './unlabeled_mfcc_64data.csv')

## Data Pre-Processing 2
- Audio data Augmentation 방법중 shifting 방법을 이용하였습니다.
- Augmentation을 피처의 증강으로 이용하였습니다.

In [4]:
# wav 파일의 MFCC Feature와 상태정보를 합친 학습데이터를 불러옵니다.
train_df_1 = pd.read_csv('./train_mfcc_16data.csv')
train_df_2 = pd.read_csv('./train_mfcc_32data.csv')
train_df_3 = pd.read_csv('./train_mfcc_64data.csv')

unlabeled_df_1 = pd.read_csv('./unlabeled_mfcc_16data.csv')
unlabeled_df_2 = pd.read_csv('./unlabeled_mfcc_32data.csv')
unlabeled_df_3 = pd.read_csv('./unlabeled_mfcc_64data.csv')

# 학습데이터를 모델의 input으로 들어갈 x와 label로 사용할 y로 분할
train_x_1 = train_df_1.drop(columns=['id', 'covid19'])
train_x_2 = train_df_2.drop(columns=['id', 'covid19'])
train_x_3 = train_df_3.drop(columns=['id', 'covid19'])

unlabeled_x_1 = unlabeled_df_1.drop(columns=['id'])
unlabeled_x_2 = unlabeled_df_2.drop(columns=['id'])
unlabeled_x_3 = unlabeled_df_3.drop(columns=['id'])

train_y = train_df_1['covid19']

In [5]:
wav_raw_1 = train_x_1.iloc[:,4:]
wav_raw_2 = train_x_2.iloc[:,4:]
wav_raw_3 = train_x_3.iloc[:,4:]

In [6]:
def shifting_sound(data, sr=16000, roll_rate=0.1):
    # [1, 2, 3, 4] 를 [4, 1, 2, 3]으로 만들어주는 방법입니다.
    aug_shift = np.roll(data, int(data.shape[1] * roll_rate))
    return aug_shift

aug_shift_1 = shifting_sound(wav_raw_1)
aug_shift_2 = shifting_sound(wav_raw_2)
aug_shift_3 = shifting_sound(wav_raw_3)

display(aug_shift_1)
display(aug_shift_2)
display(aug_shift_3)

array([[  -2.23654938, -274.9347229 ,   29.34542465, ...,    2.88732243,
           4.51717758,   -8.18710232],
       [  -0.45959276, -311.56317139,   52.47814941, ...,  -10.30236626,
           1.40450609,  -10.50794125],
       [   2.69900846, -438.29000854,   46.58890915, ...,   -1.47819424,
          -3.56525421,   -3.86844754],
       ...,
       [  -2.36556435, -347.20593262,   58.54412842, ...,   -5.46239138,
          -8.02673149,   -8.0779047 ],
       [  -5.49954224, -179.11195374,   70.69786072, ...,  -16.36296082,
           0.70131713,   -4.22028732],
       [  -0.87974226, -308.60488892,   85.64264679, ...,   -2.31646729,
           8.49924278,   -1.39309096]])

array([[ 1.19278967,  0.21075779,  4.18634939, ..., -0.19273011,
         2.41778421, -4.73664999],
       [ 1.23778832, -1.60042548, -1.46241903, ..., -6.74335337,
         0.25510457, -0.96699423],
       [-2.11305428, -2.43355489,  0.8811782 , ..., -0.07402468,
        -0.96413034, -0.73573095],
       ...,
       [ 3.84709215,  2.54003477,  5.93859673, ..., -3.18389273,
        -1.44622397, -0.02181726],
       [-1.60147119, -1.63030052, -1.38229728, ..., -1.20514739,
        -0.13417681, -4.72931862],
       [ 0.55795479, -5.24990654, -0.27840546, ...,  1.21095073,
         3.16694379,  1.4702338 ]])

array([[ 0.84166545,  0.41382128, -0.49755898, ...,  0.4329302 ,
        -0.86410016, -0.60264784],
       [-0.46152139,  0.66825175, -0.58335084, ...,  2.02786255,
         0.73700804,  1.50391042],
       [ 1.01127827,  1.08652115,  1.286461  , ..., -0.96496171,
        -1.21297586, -0.30587834],
       ...,
       [ 1.67024875, -0.52223587,  1.7132796 , ...,  0.41118056,
         0.48267183,  0.18056367],
       [ 0.09386925, -0.19366808, -0.28223333, ...,  1.64802313,
         0.16493782,  0.52008176],
       [-1.79950297, -0.68733519, -0.69162065, ...,  1.45486069,
        -1.14849317, -0.34716225]])

In [7]:
shift_df_1 = pd.DataFrame(aug_shift_1, columns=['shift_'+str(x) for x in range(1,17)])
shift_df_2 = pd.DataFrame(aug_shift_2, columns=['shift_'+str(x) for x in range(1,33)])
shift_df_3 = pd.DataFrame(aug_shift_3, columns=['shift_'+str(x) for x in range(1,65)])

In [8]:
display(train_x_1)
display(train_x_2)
display(train_x_3)

Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16
0,24,female,0,1,-274.934723,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,-32.653938,-12.341261,-5.151807,2.669801,-7.865372,10.570987,2.887322,4.517178,-8.187102,-0.459593
1,51,male,0,0,-311.563171,52.478149,-0.098956,-11.070889,5.932185,-1.739854,-15.766101,-1.017933,-3.985818,7.574249,-10.625893,3.174112,-10.302366,1.404506,-10.507941,2.699008
2,22,male,0,0,-438.290009,46.588909,-22.689060,-3.607528,-13.873103,0.270997,-9.013165,-9.624514,-7.756683,2.647656,-7.490925,1.030777,-1.478194,-3.565254,-3.868448,-2.445375
3,29,female,1,0,-368.426086,46.939358,-7.443123,-3.694382,-20.511757,-9.271688,-10.894087,-11.857489,2.555442,-2.403031,-17.141569,4.961744,-8.069848,0.244983,-3.172915,-2.251981
4,23,male,0,0,-535.194458,7.165524,-7.422007,2.231187,-5.300425,-0.644981,-6.101685,-0.874724,0.401199,0.912838,-5.177626,-1.340888,-3.866654,-1.082266,-2.545630,-1.411004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,male,0,0,-328.487671,68.190376,-40.383747,2.297682,0.355717,-30.347511,-7.288190,-22.665266,-9.879593,-8.907995,-9.181956,-1.602759,-13.798248,11.850845,-9.357815,1.985346
3801,25,male,0,0,-386.257324,73.931343,-12.114974,-0.500644,-16.141827,-15.249634,-19.629404,-21.388702,-9.878449,-3.746335,-15.705230,2.397489,-10.983771,3.842001,-11.904511,-2.365564
3802,26,female,0,0,-347.205933,58.544128,12.969809,27.973339,20.632843,10.184925,6.837224,0.177250,2.810400,2.416439,-13.125198,-2.336237,-5.462391,-8.026731,-8.077905,-5.499542
3803,27,female,0,0,-179.111954,70.697861,-14.571251,-10.143574,-25.649059,2.032261,-11.038777,-10.497705,-12.233163,11.066545,-10.677567,2.922090,-16.362961,0.701317,-4.220287,-0.879742


Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,24,female,0,1,-274.934723,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,...,-2.320942,2.150005,-0.925417,2.116030,-0.192730,2.417784,-4.736650,1.237788,-1.600425,-1.462419
1,51,male,0,0,-311.563171,52.478149,-0.098956,-11.070889,5.932185,-1.739854,...,-6.494778,0.545812,-6.261986,-2.384403,-6.743353,0.255105,-0.966994,-2.113054,-2.433555,0.881178
2,22,male,0,0,-438.290009,46.588909,-22.689060,-3.607528,-13.873103,0.270997,...,-0.156510,-1.682014,2.618637,1.244486,-0.074025,-0.964130,-0.735731,-0.420304,0.795621,0.411339
3,29,female,1,0,-368.426086,46.939358,-7.443123,-3.694382,-20.511757,-9.271688,...,-0.155855,3.839285,-2.503367,2.750743,1.758510,2.094587,0.295868,1.737648,-0.654136,1.847976
4,23,male,0,0,-535.194458,7.165524,-7.422007,2.231187,-5.300425,-0.644981,...,-0.144311,-0.413284,-1.452623,0.235582,-0.973687,0.777570,-0.735323,1.141641,-0.497988,1.190930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,male,0,0,-328.487671,68.190376,-40.383747,2.297682,0.355717,-30.347511,...,-0.168663,1.255295,-4.972386,-0.053485,1.204618,3.961083,-4.195477,2.746365,-4.138545,-2.572868
3801,25,male,0,0,-386.257324,73.931343,-12.114974,-0.500644,-16.141827,-15.249634,...,-2.914732,4.907058,1.142662,6.096552,2.885290,3.611389,-0.631856,3.847092,2.540035,5.938597
3802,26,female,0,0,-347.205933,58.544128,12.969809,27.973339,20.632843,10.184925,...,-2.396356,-2.741529,-2.629161,-1.906816,-3.183893,-1.446224,-0.021817,-1.601471,-1.630301,-1.382297
3803,27,female,0,0,-179.111954,70.697861,-14.571251,-10.143574,-25.649059,2.032261,...,-7.733274,5.680011,-2.057645,1.684632,-1.205147,-0.134177,-4.729319,0.557955,-5.249907,-0.278405


Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,...,mfcc_55,mfcc_56,mfcc_57,mfcc_58,mfcc_59,mfcc_60,mfcc_61,mfcc_62,mfcc_63,mfcc_64
0,24,female,0,1,-274.934723,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,...,0.093353,0.432930,-0.864100,-0.602648,-0.461521,0.668252,-0.583351,0.287641,-0.681759,0.603734
1,51,male,0,0,-311.563171,52.478149,-0.098956,-11.070889,5.932185,-1.739854,...,0.504636,2.027863,0.737008,1.503910,1.011278,1.086521,1.286461,1.617640,1.334173,1.472357
2,22,male,0,0,-438.290009,46.588909,-22.689060,-3.607528,-13.873103,0.270997,...,-1.325226,-0.964962,-1.212976,-0.305878,0.241152,-0.408523,0.236640,-0.560149,-0.272871,0.320133
3,29,female,1,0,-368.426086,46.939358,-7.443123,-3.694382,-20.511757,-9.271688,...,-0.153836,1.330828,-1.098836,0.810187,0.287573,0.238690,-0.322925,-0.009798,-0.211682,0.867691
4,23,male,0,0,-535.194458,7.165524,-7.422007,2.231187,-5.300425,-0.644981,...,-0.154846,0.132687,-0.022355,0.527574,0.520241,0.363083,0.235858,0.248603,0.227507,0.120870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,male,0,0,-328.487671,68.190376,-40.383747,2.297682,0.355717,-30.347511,...,0.299990,0.856173,1.214331,2.336195,1.159138,-0.270297,-1.853823,0.351842,0.456829,0.604778
3801,25,male,0,0,-386.257324,73.931343,-12.114974,-0.500644,-16.141827,-15.249634,...,0.224516,0.162466,1.620834,1.827752,1.670249,-0.522236,1.713280,2.704759,4.021746,1.803691
3802,26,female,0,0,-347.205933,58.544128,12.969809,27.973339,20.632843,10.184925,...,-0.594465,0.411181,0.482672,0.180564,0.093869,-0.193668,-0.282233,0.268567,-0.226053,-0.097798
3803,27,female,0,0,-179.111954,70.697861,-14.571251,-10.143574,-25.649059,2.032261,...,0.488653,1.648023,0.164938,0.520082,-1.799503,-0.687335,-0.691621,2.027461,1.323672,1.376801


In [9]:
train_x_1 = pd.concat([train_x_1, shift_df_1], axis=1)
train_x_2 = pd.concat([train_x_2, shift_df_2], axis=1)
train_x_3 = pd.concat([train_x_3, shift_df_3], axis=1)

display(train_x_1)
display(train_x_2)
display(train_x_3)

Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,...,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16
0,24,female,0,1,-274.934723,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,...,-8.553542,-32.653938,-12.341261,-5.151807,2.669801,-7.865372,10.570987,2.887322,4.517178,-8.187102
1,51,male,0,0,-311.563171,52.478149,-0.098956,-11.070889,5.932185,-1.739854,...,-1.739854,-15.766101,-1.017933,-3.985818,7.574249,-10.625893,3.174112,-10.302366,1.404506,-10.507941
2,22,male,0,0,-438.290009,46.588909,-22.689060,-3.607528,-13.873103,0.270997,...,0.270997,-9.013165,-9.624514,-7.756683,2.647656,-7.490925,1.030777,-1.478194,-3.565254,-3.868448
3,29,female,1,0,-368.426086,46.939358,-7.443123,-3.694382,-20.511757,-9.271688,...,-9.271688,-10.894087,-11.857489,2.555442,-2.403031,-17.141569,4.961744,-8.069848,0.244983,-3.172915
4,23,male,0,0,-535.194458,7.165524,-7.422007,2.231187,-5.300425,-0.644981,...,-0.644981,-6.101685,-0.874724,0.401199,0.912838,-5.177626,-1.340888,-3.866654,-1.082266,-2.545630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,male,0,0,-328.487671,68.190376,-40.383747,2.297682,0.355717,-30.347511,...,-30.347511,-7.288190,-22.665266,-9.879593,-8.907995,-9.181956,-1.602759,-13.798248,11.850845,-9.357815
3801,25,male,0,0,-386.257324,73.931343,-12.114974,-0.500644,-16.141827,-15.249634,...,-15.249634,-19.629404,-21.388702,-9.878449,-3.746335,-15.705230,2.397489,-10.983771,3.842001,-11.904511
3802,26,female,0,0,-347.205933,58.544128,12.969809,27.973339,20.632843,10.184925,...,10.184925,6.837224,0.177250,2.810400,2.416439,-13.125198,-2.336237,-5.462391,-8.026731,-8.077905
3803,27,female,0,0,-179.111954,70.697861,-14.571251,-10.143574,-25.649059,2.032261,...,2.032261,-11.038777,-10.497705,-12.233163,11.066545,-10.677567,2.922090,-16.362961,0.701317,-4.220287


Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,...,shift_23,shift_24,shift_25,shift_26,shift_27,shift_28,shift_29,shift_30,shift_31,shift_32
0,24,female,0,1,-274.934723,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,...,-1.137396,-0.307009,2.126430,-2.320942,2.150005,-0.925417,2.116030,-0.192730,2.417784,-4.736650
1,51,male,0,0,-311.563171,52.478149,-0.098956,-11.070889,5.932185,-1.739854,...,-0.561427,-2.600846,1.611449,-6.494778,0.545812,-6.261986,-2.384403,-6.743353,0.255105,-0.966994
2,22,male,0,0,-438.290009,46.588909,-22.689060,-3.607528,-13.873103,0.270997,...,0.254581,-2.406795,-0.479780,-0.156510,-1.682014,2.618637,1.244486,-0.074025,-0.964130,-0.735731
3,29,female,1,0,-368.426086,46.939358,-7.443123,-3.694382,-20.511757,-9.271688,...,-0.390333,-3.497266,-0.065157,-0.155855,3.839285,-2.503367,2.750743,1.758510,2.094587,0.295868
4,23,male,0,0,-535.194458,7.165524,-7.422007,2.231187,-5.300425,-0.644981,...,2.080978,-2.039872,0.233488,-0.144311,-0.413284,-1.452623,0.235582,-0.973687,0.777570,-0.735323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,male,0,0,-328.487671,68.190376,-40.383747,2.297682,0.355717,-30.347511,...,4.054575,-10.544050,2.293817,-0.168663,1.255295,-4.972386,-0.053485,1.204618,3.961083,-4.195477
3801,25,male,0,0,-386.257324,73.931343,-12.114974,-0.500644,-16.141827,-15.249634,...,-7.077020,-9.290753,-1.092900,-2.914732,4.907058,1.142662,6.096552,2.885290,3.611389,-0.631856
3802,26,female,0,0,-347.205933,58.544128,12.969809,27.973339,20.632843,10.184925,...,-2.824714,-5.576842,-2.217162,-2.396356,-2.741529,-2.629161,-1.906816,-3.183893,-1.446224,-0.021817
3803,27,female,0,0,-179.111954,70.697861,-14.571251,-10.143574,-25.649059,2.032261,...,-0.683911,-5.094481,-0.611145,-7.733274,5.680011,-2.057645,1.684632,-1.205147,-0.134177,-4.729319


Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,...,shift_55,shift_56,shift_57,shift_58,shift_59,shift_60,shift_61,shift_62,shift_63,shift_64
0,24,female,0,1,-274.934723,29.345425,-19.152718,-7.836880,-9.094099,-8.553542,...,-1.001481,0.148684,0.027494,1.168359,-0.769299,0.813916,0.093353,0.432930,-0.864100,-0.602648
1,51,male,0,0,-311.563171,52.478149,-0.098956,-11.070889,5.932185,-1.739854,...,-2.870737,-1.186830,-0.699097,-0.136317,-1.364793,0.764080,0.504636,2.027863,0.737008,1.503910
2,22,male,0,0,-438.290009,46.588909,-22.689060,-3.607528,-13.873103,0.270997,...,-0.135131,0.116626,0.275591,0.290342,0.190022,0.551670,-1.325226,-0.964962,-1.212976,-0.305878
3,29,female,1,0,-368.426086,46.939358,-7.443123,-3.694382,-20.511757,-9.271688,...,0.131393,-0.096934,0.016830,0.659916,0.173445,0.780549,-0.153836,1.330828,-1.098836,0.810187
4,23,male,0,0,-535.194458,7.165524,-7.422007,2.231187,-5.300425,-0.644981,...,-0.332720,0.149945,-0.050679,0.258125,-0.029052,0.210105,-0.154846,0.132687,-0.022355,0.527574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,male,0,0,-328.487671,68.190376,-40.383747,2.297682,0.355717,-30.347511,...,0.757229,1.808339,1.007579,2.139623,0.212016,0.295380,0.299990,0.856173,1.214331,2.336195
3801,25,male,0,0,-386.257324,73.931343,-12.114974,-0.500644,-16.141827,-15.249634,...,-0.803737,-2.212960,-2.281625,-1.359232,-0.016992,-0.777442,0.224516,0.162466,1.620834,1.827752
3802,26,female,0,0,-347.205933,58.544128,12.969809,27.973339,20.632843,10.184925,...,-2.436469,-1.392836,-1.343862,-1.160944,-1.828461,-1.617238,-0.594465,0.411181,0.482672,0.180564
3803,27,female,0,0,-179.111954,70.697861,-14.571251,-10.143574,-25.649059,2.032261,...,-1.382611,-1.075960,-1.040983,0.843531,-0.774418,1.546442,0.488653,1.648023,0.164938,0.520082


In [10]:
wav_raw_1 = unlabeled_x_1.iloc[:,4:]
wav_raw_2 = unlabeled_x_2.iloc[:,4:]
wav_raw_3 = unlabeled_x_3.iloc[:,4:]

In [11]:
aug_shift_1 = shifting_sound(wav_raw_1)
aug_shift_2 = shifting_sound(wav_raw_2)
aug_shift_3 = shifting_sound(wav_raw_3)

In [12]:
shift_df_1 = pd.DataFrame(aug_shift_1, columns=['shift_'+str(x) for x in range(1,17)])
shift_df_2 = pd.DataFrame(aug_shift_2, columns=['shift_'+str(x) for x in range(1,33)])
shift_df_3 = pd.DataFrame(aug_shift_3, columns=['shift_'+str(x) for x in range(1,65)])

In [13]:
unlabeled_x_1 = pd.concat([unlabeled_x_1, shift_df_1], axis=1)
unlabeled_x_2 = pd.concat([unlabeled_x_2, shift_df_2], axis=1)
unlabeled_x_3 = pd.concat([unlabeled_x_3, shift_df_3], axis=1)

## Data Pre-Processing 3

In [14]:
def onehot_encoding(ohe, x):
    # 학습데이터로 부터 fit된 one-hot encoder (ohe)를 받아 transform 시켜주는 함수
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0])
    x = pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)
    return x

In [15]:
# 'gender' column의 경우 추가 전처리가 필요 -> OneHotEncoder 적용
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x_1['gender'].values.reshape(-1,1))

train_x_1 = onehot_encoding(ohe, train_x_1)
train_x_2 = onehot_encoding(ohe, train_x_2)
train_x_3 = onehot_encoding(ohe, train_x_3)

unlabeled_x_1 = onehot_encoding(ohe, unlabeled_x_1)
unlabeled_x_2 = onehot_encoding(ohe, unlabeled_x_2)
unlabeled_x_3 = onehot_encoding(ohe, unlabeled_x_3)

In [16]:
#MFCC 벡터값에 절대값을 씌워줬을 때 의미있는 정보를 얻을 수 있을 것 같아 절대값을 씌워주었습니다.
#실제로 public leaderboard에서 성능향상이 있었습니다.

train_x_1 = train_x_1.abs()
train_x_2 = train_x_2.abs()
train_x_3 = train_x_3.abs()

unlabeled_x_1 = unlabeled_x_1.abs()
unlabeled_x_2 = unlabeled_x_2.abs()
unlabeled_x_3 = unlabeled_x_3.abs()

display(train_x_1)
display(unlabeled_x_1)

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,female,male,other
0,24,0,1,274.934723,29.345425,19.152718,7.836880,9.094099,8.553542,32.653938,...,5.151807,2.669801,7.865372,10.570987,2.887322,4.517178,8.187102,1.0,0.0,0.0
1,51,0,0,311.563171,52.478149,0.098956,11.070889,5.932185,1.739854,15.766101,...,3.985818,7.574249,10.625893,3.174112,10.302366,1.404506,10.507941,0.0,1.0,0.0
2,22,0,0,438.290009,46.588909,22.689060,3.607528,13.873103,0.270997,9.013165,...,7.756683,2.647656,7.490925,1.030777,1.478194,3.565254,3.868448,0.0,1.0,0.0
3,29,1,0,368.426086,46.939358,7.443123,3.694382,20.511757,9.271688,10.894087,...,2.555442,2.403031,17.141569,4.961744,8.069848,0.244983,3.172915,1.0,0.0,0.0
4,23,0,0,535.194458,7.165524,7.422007,2.231187,5.300425,0.644981,6.101685,...,0.401199,0.912838,5.177626,1.340888,3.866654,1.082266,2.545630,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,0,0,328.487671,68.190376,40.383747,2.297682,0.355717,30.347511,7.288190,...,9.879593,8.907995,9.181956,1.602759,13.798248,11.850845,9.357815,0.0,1.0,0.0
3801,25,0,0,386.257324,73.931343,12.114974,0.500644,16.141827,15.249634,19.629404,...,9.878449,3.746335,15.705230,2.397489,10.983771,3.842001,11.904511,0.0,1.0,0.0
3802,26,0,0,347.205933,58.544128,12.969809,27.973339,20.632843,10.184925,6.837224,...,2.810400,2.416439,13.125198,2.336237,5.462391,8.026731,8.077905,1.0,0.0,0.0
3803,27,0,0,179.111954,70.697861,14.571251,10.143574,25.649059,2.032261,11.038777,...,12.233163,11.066545,10.677567,2.922090,16.362961,0.701317,4.220287,1.0,0.0,0.0


Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,female,male,other
0,35,1,0,1120.774292,3.262080,1.125641,0.140864,1.190534,0.121197,0.768138,...,0.635220,0.389337,0.431899,0.030479,0.466748,0.239471,0.219655,0.0,1.0,0.0
1,40,0,1,563.662659,40.990429,11.358990,10.386800,0.678367,3.211374,2.927805,...,4.099243,3.943284,3.821737,1.099119,1.816008,2.376451,3.652292,1.0,0.0,0.0
2,33,0,0,471.127167,14.603410,2.081593,0.689829,2.041967,4.464318,8.590043,...,4.839125,3.035390,2.582682,0.257422,3.092693,1.962798,3.283542,0.0,1.0,0.0
3,35,0,0,479.808258,9.709412,4.041464,1.983426,4.197571,1.436662,4.900989,...,5.852075,0.559872,3.433453,0.292010,3.905158,0.817956,2.846402,0.0,1.0,0.0
4,54,0,0,735.220825,34.609222,9.149350,14.897414,1.717604,8.525716,6.998227,...,4.237062,4.491402,5.985547,5.298206,8.921616,4.610049,7.845514,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1862,8,0,0,460.734985,22.145943,5.924560,6.891038,4.565688,5.322806,17.261806,...,9.555286,1.777077,0.332384,2.992923,2.914103,1.097831,2.742417,1.0,0.0,0.0
1863,29,0,1,331.947937,34.791584,6.310560,7.249038,11.100346,5.172902,0.035717,...,4.951110,2.869026,11.379020,4.896920,17.078371,0.194625,4.672140,0.0,1.0,0.0
1864,17,0,0,429.886780,26.009357,0.204722,1.174492,4.136636,1.897107,7.577222,...,5.878228,4.985948,6.863891,2.100668,6.555089,1.732090,9.760283,0.0,1.0,0.0
1865,22,0,0,647.736206,27.062975,7.233325,16.181942,4.624072,1.537723,6.056340,...,4.315625,6.540668,1.004547,4.144324,0.962796,1.375306,0.268144,0.0,1.0,0.0


In [17]:
# 위의 학습데이터를 전처리한 과정과 동일하게 test data에도 적용
test_x_1 = pd.read_csv('./test_mfcc_16data.csv')
test_x_2 = pd.read_csv('./test_mfcc_32data.csv')
test_x_3 = pd.read_csv('./test_mfcc_64data.csv')

test_x_1 = test_x_1.drop(columns=['id'])
test_x_2 = test_x_2.drop(columns=['id'])
test_x_3 = test_x_3.drop(columns=['id'])

In [18]:
wav_raw_1 = test_x_1.iloc[:,4:]
wav_raw_2 = test_x_2.iloc[:,4:]
wav_raw_3 = test_x_3.iloc[:,4:]

In [19]:
aug_shift_1 = shifting_sound(wav_raw_1)
aug_shift_2 = shifting_sound(wav_raw_2)
aug_shift_3 = shifting_sound(wav_raw_3)

In [20]:
shift_df_1 = pd.DataFrame(aug_shift_1, columns=['shift_'+str(x) for x in range(1,17)])
shift_df_2 = pd.DataFrame(aug_shift_2, columns=['shift_'+str(x) for x in range(1,33)])
shift_df_3 = pd.DataFrame(aug_shift_3, columns=['shift_'+str(x) for x in range(1,65)])

In [21]:
test_x_1 = pd.concat([test_x_1, shift_df_1], axis=1)
test_x_2 = pd.concat([test_x_2, shift_df_2], axis=1)
test_x_3 = pd.concat([test_x_3, shift_df_3], axis=1)

In [22]:
# Data Leakage에 유의하여 train data로만 학습된 ohe를 사용
test_x_1 = onehot_encoding(ohe, test_x_1)
test_x_1 = test_x_1.abs()

test_x_2 = onehot_encoding(ohe, test_x_2)
test_x_2 = test_x_2.abs()

test_x_3 = onehot_encoding(ohe, test_x_3)
test_x_3 = test_x_3.abs()

## Train_1(MFCC_16)
- baseline model인 MLPClassifier보다 트리생성에 랜덤성이 추가된 ExtraTreesClassifier를 사용했을 때 성능향상이 있어 base model로 ExtraTreesClassifier를 사용하였습니다.
- model tuning은 모델의 일반화 성능을 위해 진행하지 않았습니다.

In [23]:
def score_function(real, pred):
    score = f1_score(real, pred, average="macro")
    return score

In [24]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [25]:
models = []
scores = []

for tri, vai in cv.split(train_x_1):
    print("="*50)
    
    preds = []
    model = ExtraTreesClassifier(random_state=CFG['SEED'])
    #model = MLPClassifier(random_state=CFG['SEED'])
                              
    model.fit(train_x_1.iloc[tri], train_y[tri])
    
    pred = model.predict_proba(train_x_1.iloc[vai])[:,1]
    pred = np.where(pred>=0.2 ,1 ,0)
    score = score_function(train_y[vai],pred)
    
    models.append(model)
    scores.append(score)



In [26]:
display(scores)
display(np.mean(scores))

[0.6143724696356275,
 0.5543859649122808,
 0.6447703000508561,
 0.622882096069869,
 0.5786697122967314,
 0.5783941605839416,
 0.5314200053192776,
 0.5589941972920697,
 0.5433958776227008,
 0.6017815037987949]

0.582906628758215

## Inference_1(unlabeled_data_1)

In [27]:
# Model 추론

preds = []
for i,(tri, vai) in enumerate( cv.split(train_x_1) ):
    pred = models[i].predict_proba(unlabeled_x_1)[:,1]
    preds.append(pred)

In [28]:
preds = np.mean(preds , axis = 0 )

In [29]:
preds_unlabeled = np.where(preds >=0.25,1,0)

In [30]:
preds_unlabeled.sum()

210

In [31]:
unlabeled_y_1 = preds_unlabeled

## Train_2(MFCC_32)

In [32]:
models = []
scores = []

for tri, vai in cv.split(train_x_2):
    print("="*50)
    
    preds = []
    model = ExtraTreesClassifier(random_state=CFG['SEED'])
    #model = MLPClassifier(random_state=CFG['SEED'])
                              
    model.fit(train_x_2.iloc[tri], train_y[tri])
    
    pred = model.predict_proba(train_x_2.iloc[vai])[:,1]
    pred = np.where(pred>=0.2 ,1 ,0)
    score = score_function(train_y[vai],pred)
    
    models.append(model)
    scores.append(score)



In [33]:
display(scores)
display(np.mean(scores))

[0.6381915772089182,
 0.5477207977207977,
 0.6047456130669904,
 0.6539223781998349,
 0.577702490745969,
 0.6010981912144703,
 0.5586527293844368,
 0.5681713774510989,
 0.5258341806081893,
 0.6223602484472051]

0.589839958404791

## Inference_2(unlabeled_data_2)

In [34]:
# Model 추론

preds = []
for i,(tri, vai) in enumerate( cv.split(train_x_2) ):
    pred = models[i].predict_proba(unlabeled_x_2)[:,1]
    preds.append(pred)

In [35]:
preds = np.mean(preds , axis = 0 )

In [36]:
preds_unlabeled = np.where(preds >=0.25,1,0)

In [37]:
preds_unlabeled.sum()

171

In [38]:
unlabeled_y_2 = preds_unlabeled

## Train_3(MFCC_64)

In [39]:
models = []
scores = []

for tri, vai in cv.split(train_x_3):
    print("="*50)
    
    preds = []
    model = ExtraTreesClassifier(random_state=CFG['SEED'])
    #model = MLPClassifier(random_state=CFG['SEED'])
                              
    model.fit(train_x_3.iloc[tri], train_y[tri])
    
    pred = model.predict_proba(train_x_3.iloc[vai])[:,1]
    pred = np.where(pred>=0.2 ,1 ,0)
    score = score_function(train_y[vai],pred)
    
    models.append(model)
    scores.append(score)



In [40]:
display(scores)
display(np.mean(scores))

[0.6346904889969833,
 0.605010005545242,
 0.6076737106017192,
 0.6350574712643678,
 0.6247313753581661,
 0.628721261789396,
 0.5458496305953933,
 0.5353247984826932,
 0.5314200053192776,
 0.6223602484472051]

0.5970838996400444

## Inference_3(unlabeled_data_3)

In [41]:
# Model 추론

preds = []
for i,(tri, vai) in enumerate( cv.split(train_x_3) ):
    pred = models[i].predict_proba(unlabeled_x_3)[:,1]
    preds.append(pred)

In [42]:
preds = np.mean(preds , axis = 0 )

In [43]:
preds_unlabeled = np.where(preds >=0.25,1,0)

In [44]:
preds_unlabeled.sum()

100

In [45]:
unlabeled_y_3 = preds_unlabeled

## Data merging(train + unlabaled)

In [46]:
train_x_1 = pd.concat([train_x_1,unlabeled_x_1],axis=0, ignore_index=True)
train_x_2 = pd.concat([train_x_2,unlabeled_x_2],axis=0, ignore_index=True)
train_x_3 = pd.concat([train_x_3,unlabeled_x_3],axis=0, ignore_index=True)

display(train_x_1)
display(train_x_2)
display(train_x_3)

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,female,male,other
0,24,0,1,274.934723,29.345425,19.152718,7.836880,9.094099,8.553542,32.653938,...,5.151807,2.669801,7.865372,10.570987,2.887322,4.517178,8.187102,1.0,0.0,0.0
1,51,0,0,311.563171,52.478149,0.098956,11.070889,5.932185,1.739854,15.766101,...,3.985818,7.574249,10.625893,3.174112,10.302366,1.404506,10.507941,0.0,1.0,0.0
2,22,0,0,438.290009,46.588909,22.689060,3.607528,13.873103,0.270997,9.013165,...,7.756683,2.647656,7.490925,1.030777,1.478194,3.565254,3.868448,0.0,1.0,0.0
3,29,1,0,368.426086,46.939358,7.443123,3.694382,20.511757,9.271688,10.894087,...,2.555442,2.403031,17.141569,4.961744,8.069848,0.244983,3.172915,1.0,0.0,0.0
4,23,0,0,535.194458,7.165524,7.422007,2.231187,5.300425,0.644981,6.101685,...,0.401199,0.912838,5.177626,1.340888,3.866654,1.082266,2.545630,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667,8,0,0,460.734985,22.145943,5.924560,6.891038,4.565688,5.322806,17.261806,...,9.555286,1.777077,0.332384,2.992923,2.914103,1.097831,2.742417,1.0,0.0,0.0
5668,29,0,1,331.947937,34.791584,6.310560,7.249038,11.100346,5.172902,0.035717,...,4.951110,2.869026,11.379020,4.896920,17.078371,0.194625,4.672140,0.0,1.0,0.0
5669,17,0,0,429.886780,26.009357,0.204722,1.174492,4.136636,1.897107,7.577222,...,5.878228,4.985948,6.863891,2.100668,6.555089,1.732090,9.760283,0.0,1.0,0.0
5670,22,0,0,647.736206,27.062975,7.233325,16.181942,4.624072,1.537723,6.056340,...,4.315625,6.540668,1.004547,4.144324,0.962796,1.375306,0.268144,0.0,1.0,0.0


Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,shift_26,shift_27,shift_28,shift_29,shift_30,shift_31,shift_32,female,male,other
0,24,0,1,274.934723,29.345425,19.152718,7.836880,9.094099,8.553542,32.653938,...,2.320942,2.150005,0.925417,2.116030,0.192730,2.417784,4.736650,1.0,0.0,0.0
1,51,0,0,311.563171,52.478149,0.098956,11.070889,5.932185,1.739854,15.766101,...,6.494778,0.545812,6.261986,2.384403,6.743353,0.255105,0.966994,0.0,1.0,0.0
2,22,0,0,438.290009,46.588909,22.689060,3.607528,13.873103,0.270997,9.013165,...,0.156510,1.682014,2.618637,1.244486,0.074025,0.964130,0.735731,0.0,1.0,0.0
3,29,1,0,368.426086,46.939358,7.443123,3.694382,20.511757,9.271688,10.894087,...,0.155855,3.839285,2.503367,2.750743,1.758510,2.094587,0.295868,1.0,0.0,0.0
4,23,0,0,535.194458,7.165524,7.422007,2.231187,5.300425,0.644981,6.101685,...,0.144311,0.413284,1.452623,0.235582,0.973687,0.777570,0.735323,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667,8,0,0,460.734985,22.145943,5.924560,6.891038,4.565688,5.322806,17.261806,...,1.168158,0.281287,0.027616,0.804502,0.053456,1.071405,1.741596,1.0,0.0,0.0
5668,29,0,1,331.947937,34.791584,6.310560,7.249038,11.100346,5.172902,0.035717,...,6.816239,1.239252,7.371759,4.854304,5.922822,2.728520,0.768779,0.0,1.0,0.0
5669,17,0,0,429.886780,26.009357,0.204722,1.174492,4.136636,1.897107,7.577222,...,3.359710,1.737853,1.403238,3.027201,2.623329,1.607557,2.409688,0.0,1.0,0.0
5670,22,0,0,647.736206,27.062975,7.233325,16.181942,4.624072,1.537723,6.056340,...,1.979098,0.542786,1.097919,0.407303,0.257606,0.489647,1.584274,0.0,1.0,0.0


Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,shift_58,shift_59,shift_60,shift_61,shift_62,shift_63,shift_64,female,male,other
0,24,0,1,274.934723,29.345425,19.152718,7.836880,9.094099,8.553542,32.653938,...,1.168359,0.769299,0.813916,0.093353,0.432930,0.864100,0.602648,1.0,0.0,0.0
1,51,0,0,311.563171,52.478149,0.098956,11.070889,5.932185,1.739854,15.766101,...,0.136317,1.364793,0.764080,0.504636,2.027863,0.737008,1.503910,0.0,1.0,0.0
2,22,0,0,438.290009,46.588909,22.689060,3.607528,13.873103,0.270997,9.013165,...,0.290342,0.190022,0.551670,1.325226,0.964962,1.212976,0.305878,0.0,1.0,0.0
3,29,1,0,368.426086,46.939358,7.443123,3.694382,20.511757,9.271688,10.894087,...,0.659916,0.173445,0.780549,0.153836,1.330828,1.098836,0.810187,1.0,0.0,0.0
4,23,0,0,535.194458,7.165524,7.422007,2.231187,5.300425,0.644981,6.101685,...,0.258125,0.029052,0.210105,0.154846,0.132687,0.022355,0.527574,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5667,8,0,0,460.734985,22.145943,5.924560,6.891038,4.565688,5.322806,17.261806,...,0.853799,0.466124,0.285205,0.093071,0.352478,0.259504,0.593708,1.0,0.0,0.0
5668,29,0,1,331.947937,34.791584,6.310560,7.249038,11.100346,5.172902,0.035717,...,3.599545,1.319965,4.543694,2.035554,2.938769,0.805406,1.591742,0.0,1.0,0.0
5669,17,0,0,429.886780,26.009357,0.204722,1.174492,4.136636,1.897107,7.577222,...,0.299142,0.691836,0.307424,0.308616,0.927968,0.434896,0.134586,0.0,1.0,0.0
5670,22,0,0,647.736206,27.062975,7.233325,16.181942,4.624072,1.537723,6.056340,...,2.027822,1.386646,2.259917,1.988593,1.303187,0.816345,1.098722,0.0,1.0,0.0


In [47]:
unlabeled_y_1 = pd.Series(unlabeled_y_1)
train_y_1 = pd.concat([train_y,unlabeled_y_1],axis=0,ignore_index=True)

unlabeled_y_2 = pd.Series(unlabeled_y_2)
train_y_2 = pd.concat([train_y,unlabeled_y_2],axis=0,ignore_index=True)

unlabeled_y_3 = pd.Series(unlabeled_y_3)
train_y_3 = pd.concat([train_y,unlabeled_y_3],axis=0,ignore_index=True)

## Re_training_1(MFCC_16)

In [48]:
#MFCC_16

models = []
scores = []

for tri, vai in cv.split(train_x_1):
    print("="*50)
    
    preds = []
    model = ExtraTreesClassifier(random_state=CFG['SEED'])
#    model = MLPClassifier(random_state=CFG['SEED'])

                              
    model.fit(train_x_1.iloc[tri], train_y_1[tri])
    
    pred = model.predict_proba(train_x_1.iloc[vai])[:,1]
    pred = np.where(pred>=0.2 ,1 ,0)
    score = score_function(train_y_1[vai],pred)
    
    models.append(model)
    scores.append(score)



In [49]:
display(scores)
display(np.mean(scores))

[0.7012622720897616,
 0.721915846843102,
 0.75,
 0.7219736438859945,
 0.7665078284547312,
 0.698798810703667,
 0.718856364874064,
 0.6671491640732432,
 0.743559329265691,
 0.7032163742690059]

0.7193239634459261

## Inference_1(test_data_1)

In [50]:
# Model 추론

preds_1 = []
for i,(tri, vai) in enumerate( cv.split(train_x_1) ):
    pred = models[i].predict_proba(test_x_1)[:,1]
    preds_1.append(pred)

## Re_training_2(MFCC_32)

In [51]:
#MFCC_32

models = []
scores = []

for tri, vai in cv.split(train_x_2):
    print("="*50)
    
    preds = []
    model = ExtraTreesClassifier(random_state=CFG['SEED'])
#    model = MLPClassifier(random_state=CFG['SEED'])

                              
    model.fit(train_x_2.iloc[tri], train_y_2[tri])
    
    pred = model.predict_proba(train_x_2.iloc[vai])[:,1]
    pred = np.where(pred>=0.2 ,1 ,0)
    score = score_function(train_y_2[vai],pred)
    
    models.append(model)
    scores.append(score)



In [52]:
display(scores)
display(np.mean(scores))

[0.7464895114498575,
 0.7181640971114656,
 0.7093260721579304,
 0.7368526422116863,
 0.7230368009272674,
 0.6829745596868885,
 0.7012496396226228,
 0.6149511783275825,
 0.6890812321918176,
 0.6929206963249517]

0.701504643001207

## Inference_2(test_data_2)

In [53]:
# Model 추론

preds_2 = []
for i,(tri, vai) in enumerate( cv.split(train_x_2) ):
    pred = models[i].predict_proba(test_x_2)[:,1]
    preds_2.append(pred)

## Re_training_3(MFCC_64)

In [54]:
#MFCC_64

models = []
scores = []

for tri, vai in cv.split(train_x_3):
    print("="*50)
    
    preds = []
    model = ExtraTreesClassifier(random_state=CFG['SEED'])
#    model = MLPClassifier(random_state=CFG['SEED'])

                              
    model.fit(train_x_3.iloc[tri], train_y_3[tri])
    
    pred = model.predict_proba(train_x_3.iloc[vai])[:,1]
    pred = np.where(pred>=0.2 ,1 ,0)
    score = score_function(train_y_3[vai],pred)
    
    models.append(model)
    scores.append(score)



In [55]:
display(scores)
display(np.mean(scores))

[0.7371222289072985,
 0.6714864083285136,
 0.6700918881533445,
 0.6906783681214421,
 0.6813583815028901,
 0.660283152849281,
 0.6865055474326998,
 0.5722084146990947,
 0.6333200676549597,
 0.6420454545454546]

0.6645099912194979

## Inference_3(test_data_3)

In [56]:
# Model 추론

preds_3 = []
for i,(tri, vai) in enumerate( cv.split(train_x_3) ):
    pred = models[i].predict_proba(test_x_3)[:,1]
    preds_3.append(pred)

## Submission
- ExtraTreesClassifier를 이용한 3개의 모델의 예측값을 앙상블하였습니다.

In [57]:
preds_1 = np.mean(preds_1 , axis = 0 )
preds_2 = np.mean(preds_2 , axis = 0 )
preds_3 = np.mean(preds_3 , axis = 0 )

In [58]:
display(preds_1)
display(preds_2)
display(preds_3)

array([0.073, 0.058, 0.021, ..., 0.099, 0.012, 0.426])

array([0.066, 0.044, 0.047, ..., 0.061, 0.018, 0.41 ])

array([0.071, 0.034, 0.03 , ..., 0.077, 0.035, 0.273])

In [59]:
preds_final = [x + y + z for x, y, z in zip(preds_1,preds_2,preds_3)]
preds_final = np.array(preds_final)
preds_final = preds_final/3

In [60]:
preds_final

array([0.07      , 0.04533333, 0.03266667, ..., 0.079     , 0.02166667,
       0.36966667])

In [61]:
preds_final = np.where(preds_final >=0.2 ,1,0)

In [62]:
preds_final.sum()

424

In [63]:
submission = pd.read_csv('./sample_submission.csv')
submission['covid19'] = preds_final
submission.to_csv('./ET_Kfold_ensemble1.csv', index=False)