In [1]:
# Mount Google Drive at /content/drive so the files under it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import

In [2]:
import random
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import librosa

import warnings
warnings.filterwarnings(action='ignore') 

In [3]:
# data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# Silence all warnings.
import warnings
warnings.filterwarnings("ignore") 

# Keep the minus sign on plot axes from rendering as a broken glyph.
plt.rcParams['axes.unicode_minus'] = False

# Use a font that can render Korean (Hangul) text in plots.
# NOTE(review): 'Malgun Gothic' is a Windows font name - confirm it is
# installed in the runtime this notebook executes in.
import matplotlib.font_manager as fm
plt.rc('font', family='Malgun Gothic')

# Show every column when displaying a DataFrame.
pd.options.display.max_columns = None 

# visualization
import seaborn as sns
%matplotlib inline

# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# model
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift, estimate_bandwidth

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

# grid search
from sklearn.model_selection import GridSearchCV

# evaluation
from sklearn.metrics.cluster import silhouette_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn import metrics
from sklearn.metrics import *

from sklearn.tree import export_graphviz
import graphviz

## Hyperparameter Setting

In [4]:
# Notebook-wide hyperparameters.
CFG = dict(
    SR=16000,    # sampling rate handed to librosa.load
    N_MFCC=32,   # number of MFCC coefficients requested from librosa.feature.mfcc
    SEED=42,     # seed for the random number generators
)

## Fixed Random-Seed

In [5]:
def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Sets the PYTHONHASHSEED environment variable to ``str(seed)`` and seeds
    both the standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED']) # Fix the random seeds.

## Data Pre-Processing

In [6]:
# Load the train and test metadata CSVs from the mounted Drive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/음성 분류/train.csv',
        '/content/drive/MyDrive/음성 분류/test.csv',
    )
)

In [7]:
def get_mfcc_feature(df):
    """Build a table of averaged MFCC features, one row per audio file.

    Args:
        df: DataFrame with a 'path' column holding audio file paths that
            ``librosa.load`` can open.

    Returns:
        pandas.DataFrame with CFG['N_MFCC'] columns named 'mfcc_1' ...
        'mfcc_<N_MFCC>'; rows follow the order of ``df['path']``.
    """
    column_names = [f'mfcc_{k}' for k in range(1, CFG['N_MFCC'] + 1)]

    def _mean_mfcc(wav_path):
        # Load the wav file at the configured sampling rate.
        signal, rate = librosa.load(wav_path, sr=CFG['SR'])
        # Extract the MFCC matrix, then reduce every row to its mean so each
        # file is described by a fixed-length vector.
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=CFG['N_MFCC'])
        return [np.mean(row) for row in coeffs]

    rows = [_mean_mfcc(wav_path) for wav_path in tqdm(df['path'])]
    return pd.DataFrame(rows, columns=column_names)

In [8]:
# Work from the dataset folder so the relative './train/...' paths stored in train.csv resolve.
%cd /content/drive/MyDrive/음성 분류

/content/drive/MyDrive/음성 분류


In [9]:
# Display the training metadata table (id, path, label).
train_df

Unnamed: 0,id,path,label
0,TRAIN_0000,./train/TRAIN_0000.wav,1
1,TRAIN_0001,./train/TRAIN_0001.wav,2
2,TRAIN_0002,./train/TRAIN_0002.wav,4
3,TRAIN_0003,./train/TRAIN_0003.wav,5
4,TRAIN_0004,./train/TRAIN_0004.wav,4
...,...,...,...
4996,TRAIN_4996,./train/TRAIN_4996.wav,5
4997,TRAIN_4997,./train/TRAIN_4997.wav,0
4998,TRAIN_4998,./train/TRAIN_4998.wav,1
4999,TRAIN_4999,./train/TRAIN_4999.wav,1


In [10]:
# Extract the averaged-MFCC feature tables: training files first, then test files.
train_x, test_x = map(get_mfcc_feature, (train_df, test_df))

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/1881 [00:00<?, ?it/s]

In [11]:
# Target vector: the 'label' column of the training metadata.
train_y = train_df['label']

## 훈련 데이터와 테스트 데이터 분리하기.

In [13]:
# Stratified, shuffled 80/20 hold-out split of the training features.
# NOTE(review): none of the cells visible below read X_train / X_test /
# y_train / y_test - confirm whether this split is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    shuffle=True,
    random_state=1004,
    stratify=train_y,
)

In [14]:
# Compare five baseline classifiers with stratified 10-fold cross-validation.
# Every row appended to `results` is
#   [fold index, method name, train accuracy, validation accuracy, fit seconds].
results = []
random_state = 1000
skf = StratifiedKFold(n_splits=10)
# skf = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

# Factories rather than instances, so each fold trains a fresh estimator.
# The estimators whose fitting draws random numbers (decision tree, SGD) now
# receive `random_state` as well, so re-running the cell reproduces the table.
model_factories = {
    'LogisticRegression': lambda: LogisticRegression(max_iter=1000, random_state=random_state),
    'DecisionTree': lambda: DecisionTreeClassifier(random_state=random_state),
    'SupportVectorMachine': lambda: SVC(),
    'KNearestNeighbor': lambda: KNeighborsClassifier(),
    'SGDClassifier': lambda: SGDClassifier(random_state=random_state),
}

for i, (train_idx, validation_idx) in enumerate(skf.split(train_x, train_y)):
    x_train_ms = train_x.iloc[train_idx, :]
    y_train_ms = train_y.iloc[train_idx]
    x_validation_ms = train_x.iloc[validation_idx, :]
    y_validation_ms = train_y.iloc[validation_idx]

    for method, make_model in model_factories.items():
        model = make_model()

        # Time only the fit; perf_counter is a monotonic clock meant for
        # measuring short durations.
        start_time = time.perf_counter()
        model.fit(x_train_ms, y_train_ms)
        training_time = time.perf_counter() - start_time

        accuracy_train = accuracy_score(y_train_ms, model.predict(x_train_ms))
        accuracy_validation = accuracy_score(y_validation_ms, model.predict(x_validation_ms))
        results.append([i, method, accuracy_train, accuracy_validation, training_time])

In [15]:
# Tabulate the cross-validation records (one row per fold and method) and
# show the last 20 rows.
result_columns = ['iter', 'method', 'accuracy_train', 'accuracy_validation', 'training_time']
df_results = pd.DataFrame(results, columns=result_columns)
df_results.tail(20)

Unnamed: 0,iter,method,accuracy_train,accuracy_validation,training_time
30,6,LogisticRegression,0.419018,0.412,2.952351
31,6,DecisionTree,1.0,0.354,0.351235
32,6,SupportVectorMachine,0.386136,0.37,2.020076
33,6,KNearestNeighbor,0.574761,0.392,0.00259
34,6,SGDClassifier,0.26994,0.266,0.39674
35,7,LogisticRegression,0.420795,0.372,2.576931
36,7,DecisionTree,1.0,0.334,0.243937
37,7,SupportVectorMachine,0.383026,0.384,1.138818
38,7,KNearestNeighbor,0.573872,0.408,0.002672
39,7,SGDClassifier,0.342368,0.338,0.40308


In [16]:
# Validation accuracy per method (rows) and fold (columns), plus the mean
# over folds, sorted from worst to best mean.
# Only 'accuracy_validation' is pivoted, and the mean is taken while the frame
# still holds nothing but the numeric fold columns; averaging after
# reset_index() would include the string 'method' column in a row-wise mean.
fold_accuracy = df_results.pivot_table(index='method', columns='iter', values='accuracy_validation')
fold_accuracy['mean'] = fold_accuracy.mean(axis=1)
df_acc_test = fold_accuracy.reset_index().sort_values('mean')
df_acc_test

iter,method,0,1,2,3,4,5,6,7,8,9,mean
3,SGDClassifier,0.337325,0.318,0.33,0.346,0.26,0.252,0.266,0.338,0.344,0.278,0.306933
0,DecisionTree,0.329341,0.314,0.294,0.338,0.322,0.3,0.354,0.334,0.312,0.286,0.318334
4,SupportVectorMachine,0.38523,0.378,0.39,0.378,0.382,0.348,0.37,0.384,0.398,0.392,0.380523
1,KNearestNeighbor,0.41517,0.37,0.354,0.414,0.358,0.372,0.392,0.408,0.384,0.374,0.384117
2,LogisticRegression,0.421158,0.43,0.412,0.38,0.394,0.398,0.412,0.372,0.404,0.388,0.401116


In [26]:
# fig, ax=plt.subplots(1, 4, figsize=(20, 5), dpi=200)
# ax=ax.flatten()

# idx=0
# for y_name in ["accuracy_train", "accuracy_validation", "training_time", "training_time"]:
#     _=sns.barplot(x="method", y=y_name, data=df_results, capsize=.2, 
#                  order=df_results.groupby('method').mean().sort_values(y_name).index, ax=ax[idx])
#     _=ax[idx].set(ylim=(0, 0.5))
#     _=ax[idx].set_xticklabels(ax[idx].get_xticklabels(),rotation=30);
#     idx+=1
# _=ax[2].set_ylim(0, 0.5) # Show 0 s to 0.5 s.
# _=ax[3].set_ylim(0, 15) # Show 0 s to 15 s.

## Classification Model Fit

In [18]:
# model = RandomForestClassifier(random_state=CFG['SEED'])
# model.fit(train_x, train_y)

### 1. SVM

In [40]:
# Fit a support vector classifier on the full training features.
# NOTE(review): `model` is reassigned by the model cells that follow, so
# whichever of them ran last is the one the inference cell uses.
model = SVC(random_state=CFG['SEED'])
model.fit(train_x, train_y)

### 2. KNN

In [43]:
# Fit a k-nearest-neighbours classifier (default settings) on the full training features.
# NOTE(review): this overwrites any `model` fitted by an earlier cell.
model=KNeighborsClassifier()
model.fit(train_x, train_y)

### 3. LR

In [45]:
# Fit the logistic regression on the full training features.
# max_iter=1000 matches the setting used for this model in the
# cross-validation comparison, so the submitted model is the configuration
# that was actually evaluated. Warnings are silenced in this notebook, so a
# solver stopping at a lower iteration cap would not be reported.
model = LogisticRegression(max_iter=1000, random_state=CFG['SEED'])
model.fit(train_x, train_y)

## Inference

In [46]:
# Predict labels for the test features with the most recently fitted `model`.
preds = model.predict(test_x)

## Submission

In [47]:
# Put the predictions into the sample submission's 'label' column and save
# the result to the current working directory.
# NOTE(review): predictions are assigned by position - this assumes the sample
# submission rows are in the same order as test.csv; confirm.
submission = pd.read_csv('/content/drive/MyDrive/음성 분류/sample_submission.csv').assign(label=preds)
submission.to_csv('./lr_submission.csv', index=False)