In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import

In [2]:
import random
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import librosa

from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings(action='ignore') 

## Hyperparameter Setting

In [3]:
CFG = {
    'SR':16000,
    'N_MFCC':32, # Melspectrogram 벡터를 추출할 개수
    'SEED':42
}

## Fixed Random-Seed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-Processing

In [5]:
train_df = pd.read_csv('/content/drive/MyDrive/음성 분류/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/음성 분류/test.csv')

In [6]:
def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['path']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])
        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)

    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    return mfcc_df

In [7]:
%cd /content/drive/MyDrive/음성 분류

/content/drive/MyDrive/음성 분류


In [8]:
train_df

Unnamed: 0,id,path,label
0,TRAIN_0000,./train/TRAIN_0000.wav,1
1,TRAIN_0001,./train/TRAIN_0001.wav,2
2,TRAIN_0002,./train/TRAIN_0002.wav,4
3,TRAIN_0003,./train/TRAIN_0003.wav,5
4,TRAIN_0004,./train/TRAIN_0004.wav,4
...,...,...,...
4996,TRAIN_4996,./train/TRAIN_4996.wav,5
4997,TRAIN_4997,./train/TRAIN_4997.wav,0
4998,TRAIN_4998,./train/TRAIN_4998.wav,1
4999,TRAIN_4999,./train/TRAIN_4999.wav,1


In [9]:
train_x = get_mfcc_feature(train_df)
test_x = get_mfcc_feature(test_df)

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/1881 [00:00<?, ?it/s]

In [15]:
train_y = train_df['label']

## Grid Search 통한 Best Parameters 찾기

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [44]:
train_features=train_df.iloc[:, :2]
train_labels=train_df[["label"]]

In [18]:
# sklearn.ensemble에서 RandomForestRegressor 가져오기
rf = RandomForestRegressor(random_state = 42)

# 현재 포리스트에서 사용하는 매개변수 보기 
print('현재 사용 중인 매개변수:\n') 
print(rf.get_params())

현재 사용 중인 매개변수:

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [50]:
# 랜덤 포레스트의 트리 수 
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] 
# 모든 분할에서 고려할 기능 수 
max_features = ['auto', 'sqrt']  
# 트리의 최대 수준 수 
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] 
max_depth.append(None) 
# 노드를 분할하는 데 필요한 최소 샘플 수 
min_samples_split = [2, 5, 10] 
# 각 리프 노드에 필요한 최소 샘플 수 
min_samples_leaf = [1, 2, 4] 
# 각 트리 학습을 위한 샘플 선택 방법 
bootstrap = [True, False]
parameters = {'n_estimators': n_estimators, 
               'max_features': max_features, 
               'max_depth': max_depth, 
               'min_samples_split': min_samples_split, 
               'min_samples_leaf': min_samples_leaf, 
               'bootstrap': bootstrap}

In [51]:
parameters

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
rt=RandomForestClassifier()

In [55]:
grid_rt=GridSearchCV(rt, param_grid=parameters, cv=3, refit=True)

In [None]:
# 아래부터 에러뜬다. not convert to float.

In [58]:
# grid_rt.fit(train_features, train_labels)

In [59]:
# score_df=pd.DataFrame(grid_rt.cv_results_)

## Classification Model Fit

In [12]:
model = RandomForestClassifier(random_state=CFG['SEED'])
model.fit(train_x, train_y)

## Inference

In [13]:
preds = model.predict(test_x)

## Submission

In [14]:
submission = pd.read_csv('./sample_submission.csv')
submission['label'] = preds
submission.to_csv('./baseline_submission.csv', index=False)