**Notes on this Notebook file**

Bacause of my lack of computing skill, I bulit my computational process via `.ipynb` format rather than to build executable Python script `run.med.sh`.

This notebook was written under the conditions below:
- Cloning GitHub repository: `git clone 'http://github.com/11775website/11775-hws'` was done on `/home/ubuntu` directory
- Downloading source data: was done on `/home/ubuntu` directory
- Raw feature generation (MFCC / ASRS) : was done on `/home/ubuntu` directory

## Configuration

In [2]:
import numpy as np
import pandas as pd
import lightgbm
import time
import os

from glob import glob
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import chi2_kernel, laplacian_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import normalize
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## MFCC feature extraction

**Path setting for MFCC raw data**

In [3]:
PATH = "/home/ubuntu/mfcc"
EXT = "*.csv"
filelist = np.array([file for file in glob(os.path.join(PATH, EXT))])

In [4]:
filelist_simple = np.array([file[37:44] for file in glob(os.path.join(PATH, EXT))])

**K-means clustering for feature extraction**

In [None]:
train_sample = pd.read_csv('select.mfcc.csv', header=None, sep=';')
start_time = time.time()
n_clusters = 70
n_init = 5
kmeans = KMeans(n_clusters = n_clusters, random_state = 0, n_init = n_init, n_jobs = -1).fit(train_sample)

def get_features(k, model, path_list):
    loaded_model= model
    start_time = time.time()
    features_dict = dict()
    filelist = path_list
    for i in range(len(filelist)):
        data = pd.read_csv(filelist[i], sep = ';', header = None)
        pred_centers = loaded_model.predict(data)
        num_clusters = k
        bow_preds = np.zeros((1, num_clusters))

        for ind in pred_centers:
            bow_preds[0, ind] += 1
        norm_feat = (1.0 * bow_preds)/np.sum(bow_preds)
        features_dict[i] = pd.DataFrame(norm_feat)

    features_total = features_dict[0].copy()
    for i in range(1, len(features_dict)):
        foo = features_dict[i].copy()
        features_total = pd.concat([features_total, foo], axis = 0)
        features_total = features_total.reset_index().drop('index', axis = 1)
        
    return features_total

total_features = get_features(70, model = kmeans, path_list = filelist)
total_features.to_csv(r'features_kmeans.csv')

In [5]:
features = pd.read_csv('features_kmeans.csv').drop(columns = ['Unnamed: 0']).to_numpy()

**Splitting dataset (train / val / test)**

In [6]:
TRAIN_PATH = '/home/ubuntu/11775-hws/all_trn.lst'
VAL_PATH = '/home/ubuntu/11775-hws/all_val.lst'
TEST_PATH = '/home/ubuntu/11775-hws/all_test_fake.lst'

In [7]:
train_name, train_label = [], []
val_name, val_label = [], []
test_name = []

In [8]:
with open(TRAIN_PATH, 'r') as f:
    for line in f:
        tmp = line.split(' ')
        train_name.append(tmp[0])
        train_label.append(tmp[1][:-1])
train_name = np.array(train_name)
train_label = np.array(train_label)
f.close()

In [9]:
with open(VAL_PATH, 'r') as f:
    for line in f:
        tmp = line.split(' ')
        val_name.append(tmp[0])
        val_label.append(tmp[1][:-1])
val_name = np.array(val_name)
val_label = np.array(val_label)
f.close()

In [10]:
with open(TEST_PATH, 'r') as f:
    for line in f:
        tmp = line.split(' ')
        test_name.append(tmp[0])
test_name = np.array(test_name)
f.close()

In [11]:
train_data = []
zeros = [0] * 70
for name in train_name:
    if name not in filelist_simple:
        tmp = np.argwhere(train_name == name)[0][0]
        train_label[tmp] = 'NULL'
        train_data.append(zeros)
    elif name in filelist_simple:
        train_data.append(features[np.argwhere(filelist_simple == name)[0][0]])
train_data = np.array(train_data)

In [12]:
val_data = []
zeros = [0] * 70
for name in val_name:
    if name not in filelist_simple:
        tmp = np.argwhere(val_name == name)[0][0]
        val_label[tmp] = 'NULL'
        val_data.append(zeros)
    elif name in filelist_simple:
        val_data.append(features[np.argwhere(filelist_simple == name)[0][0]])
val_data = np.array(val_data)

In [13]:
test_data = []
zeros = [0] * 70
for name in test_name:
    if name not in filelist_simple:
        test_data.append(zeros)
    elif name in filelist_simple:
        test_data.append(features[np.argwhere(filelist_simple == name)[0][0]])
test_data = np.array(test_data)

In [14]:
train_label_001 = train_label.copy()
for i in range(train_label.shape[0]):
    if train_label[i] != 'P001':
        train_label_001[i] = 'NULL'

In [15]:
train_label_002 = train_label.copy()
for i in range(train_label.shape[0]):
    if train_label[i] != 'P002':
        train_label_002[i] = 'NULL'

In [16]:
train_label_003 = train_label.copy()
for i in range(train_label.shape[0]):
    if train_label[i] != 'P003':
        train_label_003[i] = 'NULL'

In [17]:
val_label_001 = val_label.copy()
for i in range(val_label.shape[0]):
    if val_label[i] != 'P001':
        val_label_001[i] = 'NULL'

In [18]:
val_label_002 = val_label.copy()
for i in range(val_label.shape[0]):
    if val_label[i] != 'P002':
        val_label_002[i] = 'NULL'

In [19]:
val_label_003 = val_label.copy()
for i in range(val_label.shape[0]):
    if val_label[i] != 'P003':
        val_label_003[i] = 'NULL'

## LightGBM classifier (MFCC feature)

In [20]:
lgbm_001 = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced',
                      boosting_type = 'goss', n_estimators = 500, learning_rate = 0.002,
                      reg_alpha = 0.5, reg_beta = 0.5, max_depth = 30)
lgbm_001.fit(train_data, train_label_001)
val_prob_001 = lgbm_001.predict_proba(val_data).T
average_precision_score(y_true = val_label_001, y_score = val_prob_001[1], pos_label = 'P001')

0.19325615727695478

In [21]:
lgbm_002 = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced',
                      boosting_type = 'goss', n_estimators = 500, learning_rate = 0.002,
                      reg_alpha = 0.5, reg_beta = 0.5, max_depth = 30)
lgbm_002.fit(train_data, train_label_002)
val_prob_002 = lgbm_002.predict_proba(val_data).T
average_precision_score(y_true = val_label_002, y_score = val_prob_002[1], pos_label = 'P002')

0.4185909278600208

In [22]:
lgbm_003 = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced',
                      boosting_type = 'goss', n_estimators = 500, learning_rate = 0.002,
                      reg_alpha = 0.5, reg_beta = 0.5, max_depth = 30)
lgbm_003.fit(train_data, train_label_003)
val_prob_003 = lgbm_003.predict_proba(val_data).T
average_precision_score(y_true = val_label_003, y_score = val_prob_003[1], pos_label = 'P003')

0.22571787956680175

## ASRS feature extraction

**Path setting for ASRS raw data**

In [23]:
PATH_ASRS = '/home/ubuntu/asrs/*.txt'
filelist_asrs = []

**import raw ASRS data**

In [24]:
for file in glob(PATH_ASRS):
    filelist_asrs.append(file)

In [68]:
filelist_asrs_simple = np.array([name[18:25] for name in filelist_asrs])

In [30]:
def concatenate_list_data(list):
    result= ''
    for element in list:
        result += str(element)
    return result

In [32]:
text_asrs = []
for i in range(len(filelist_asrs)):
    with open (filelist_asrs[i], "r") as myfile:
        data = myfile.readlines()
        data = concatenate_list_data(data)
    text_asrs.append(data)

**Generating BoW features**

In [39]:
vect = CountVectorizer(stop_words="english")
bow = vect.fit_transform(text_asrs).toarray()
norm_bow = normalize(bow, norm = 'l1', axis=1)

In [40]:
norm_bow.shape

(2226, 6986)

**Splitting dataset (train / val / test)**

In [69]:
asrs_train_label = train_label.copy()
asrs_val_label = val_label.copy()

In [70]:
asrs_train_data = []
zeros = [0] * 6986
for name in train_name:
    if name not in filelist_asrs_simple:
        tmp = np.argwhere(train_name == name)[0][0]
        asrs_train_label[tmp] = 'NULL'
        asrs_train_data.append(zeros)
    elif name in filelist_asrs_simple:
        asrs_train_data.append(norm_bow[np.argwhere(filelist_asrs_simple == name)[0][0]])
asrs_train_data = np.array(asrs_train_data)

In [71]:
asrs_val_data = []
zeros = [0] * 6986
for name in val_name:
    if name not in filelist_asrs_simple:
        tmp = np.argwhere(val_name == name)[0][0]
        asrs_val_label[tmp] = 'NULL'
        asrs_val_data.append(zeros)
    elif name in filelist_asrs_simple:
        asrs_val_data.append(norm_bow[np.argwhere(filelist_asrs_simple == name)[0][0]])
asrs_val_data = np.array(asrs_val_data)

In [72]:
asrs_test_data = []
zeros = [0] * 6986
for name in test_name:
    if name not in filelist_asrs_simple:
        asrs_test_data.append(zeros)
    elif name in filelist_asrs_simple:
        asrs_test_data.append(norm_bow[np.argwhere(filelist_asrs_simple == name)[0][0]])
asrs_test_data = np.array(asrs_test_data)

In [73]:
asrs_train_label_001 = asrs_train_label.copy()
for i in range(asrs_train_label.shape[0]):
    if asrs_train_label[i] != 'P001':
        asrs_train_label_001[i] = 'NULL'

In [74]:
asrs_train_label_002 = asrs_train_label.copy()
for i in range(asrs_train_label.shape[0]):
    if asrs_train_label[i] != 'P002':
        asrs_train_label_002[i] = 'NULL'

In [75]:
asrs_train_label_003 = asrs_train_label.copy()
for i in range(asrs_train_label.shape[0]):
    if asrs_train_label[i] != 'P003':
        asrs_train_label_003[i] = 'NULL'

In [76]:
asrs_val_label_001 = asrs_val_label.copy()
for i in range(asrs_val_label.shape[0]):
    if asrs_val_label[i] != 'P001':
        asrs_val_label_001[i] = 'NULL'

In [77]:
asrs_val_label_002 = asrs_val_label.copy()
for i in range(asrs_val_label.shape[0]):
    if asrs_val_label[i] != 'P002':
        asrs_val_label_002[i] = 'NULL'

In [78]:
asrs_val_label_003 = asrs_val_label.copy()
for i in range(asrs_val_label.shape[0]):
    if asrs_val_label[i] != 'P003':
        asrs_val_label_003[i] = 'NULL'

## LightGBM classifier (ASRS features)

In [85]:
asrs_lgbm_001 = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced',
                      boosting_type = 'goss', n_estimators = 10000, learning_rate = 0.002,
                      reg_alpha = 0.5, reg_beta = 0.5, max_depth = 30)
asrs_lgbm_001.fit(asrs_train_data, asrs_train_label_001)
asrs_val_prob_001 = asrs_lgbm_001.predict_proba(asrs_val_data).T
average_precision_score(y_true = asrs_val_label_001, y_score = asrs_val_prob_001[1], pos_label = 'P001')

0.08172935069886589

In [87]:
asrs_lgbm_002 = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced',
                      boosting_type = 'goss', n_estimators = 10000, learning_rate = 0.002,
                      reg_alpha = 0.5, reg_beta = 0.5, max_depth = 30)
asrs_lgbm_002.fit(asrs_train_data, asrs_train_label_002)
asrs_val_prob_002 = asrs_lgbm_002.predict_proba(asrs_val_data).T
average_precision_score(y_true = asrs_val_label_002, y_score = asrs_val_prob_002[1], pos_label = 'P002')

0.03220559678193491

In [91]:
asrs_lgbm_003 = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced',
                      boosting_type = 'goss', n_estimators = 500, learning_rate = 0.002,
                      reg_alpha = 0.5, reg_beta = 0.5, max_depth = 30)
asrs_lgbm_003.fit(asrs_train_data, asrs_train_label_003)
asrs_val_prob_003 = asrs_lgbm_003.predict_proba(asrs_val_data).T
average_precision_score(y_true = asrs_val_label_003, y_score = asrs_val_prob_003[1], pos_label = 'P003')

0.13441639081809278

## Score Files

In [49]:
P001_mfcc = lgbm_001.predict_proba(test_data).T[1]
P002_mfcc = lgbm_002.predict_proba(test_data).T[1]
P003_mfcc = lgbm_003.predict_proba(test_data).T[1]
P001_asrs = asrs_lgbm_001.predict_proba(asrs_test_data).T[1]
P002_asrs = asrs_lgbm_002.predict_proba(asrs_test_data).T[1]
P003_asrs = asrs_lgbm_003.predict_proba(asrs_test_data).T[1]
P001_best = lgbm_001.predict_proba(test_data).T[1]
P002_best = lgbm_002.predict_proba(test_data).T[1]
P003_best = lgbm_003.predict_proba(test_data).T[1]

In [51]:
names = ['P001_mfcc.lst', 'P002_mfcc.lst', 'P003_mfcc.lst',
         'P001_asrs.lst', 'P002_asrs.lst', 'P003_asrs.lst',
         'P001_best.lst', 'P002_best.lst', 'P003_best.lst']
scores = [P001_mfcc, P002_mfcc, P003_mfcc,
         P001_asrs, P002_asrs, P003_asrs,
         P001_best, P002_best, P003_best]

In [61]:
for idx in range(len(names)):
    with open(names[idx], 'w') as f:
        for score in scores[idx]:
            f.write("{}\n".format(str(score)))
        f.close()