# Import packages and set path

In [1]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [2]:
import os
import glob
import pandas as pd
import numpy as np
from scipy import signal
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_validate
import torch
from torch import nn

# Base directory of the competition data (note the trailing slash).
path = '/home/18rvb/ELEC872/competition/Competition/'

In [3]:
# train_meta.txt has no header row: each space-separated line holds a training
# file name followed by its label.

train_meta = pd.read_csv(
    path + '/train_meta.txt',
    delimiter=' ',
    header=None,
    names=['filename', 'labels'],
)

In [4]:
# Remove the '[' and ']' characters from the label strings.
# regex=False makes pandas treat both patterns as literal text; the default of
# `regex` differs between pandas versions, and '[' alone is not a valid
# regular expression.
train_meta['labels'] = (
    train_meta['labels']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)

In [5]:
# Convert the label strings to numbers in a single vectorised call instead of
# invoking pd.to_numeric once per element; values that cannot be parsed become
# NaN (errors='coerce').
train_meta['labels'] = pd.to_numeric(train_meta['labels'], errors='coerce')

In [6]:
# Coerce the labels column to a numeric dtype; unparseable entries become NaN.
# Running this on an already numeric column leaves its values unchanged.
train_meta['labels'] = pd.to_numeric(train_meta['labels'], errors='coerce')

In [7]:
# check if balanced dataset

# train_meta.labels.value_counts()

In [8]:
# Read every training file listed in train_meta into a NumPy array, keeping
# the same order as the 'filename' column.
num_list = [np.genfromtxt(path + fname) for fname in train_meta['filename']]

In [9]:
# Attach the loaded arrays to the metadata frame (one array per listed file).
train_meta['features'] = num_list

In [10]:
from sklearn.preprocessing import minmax_scale

In [11]:
# Apply sklearn's minmax_scale (with its default settings) to each recording
# separately.
scaled_values = [minmax_scale(sample) for sample in train_meta.features]

In [12]:
# Store the min-max scaled recordings next to the raw ones.
# NOTE(review): 'scaled' is not referenced by any later cell shown here.
train_meta['scaled'] = scaled_values

In [13]:
# Possible features: rms, mean, max, min, med, skew, kurt, std, iqr, max_energy_freq, mean_freq, median_freq, waveform_length, zero_crossing

# For every recording, take the mean of each of its rows; the result is one
# list of per-row means per recording.
means = [[np.mean(row) for row in sample] for sample in train_meta.features]

train_meta['means'] = means

In [14]:
# For every recording, take the median of each of its rows; the result is one
# list of per-row medians per recording.
median = [[np.median(row) for row in sample] for sample in train_meta.features]

train_meta['median'] = median

In [15]:
# For every recording, take the standard deviation of each of its rows; the
# result is one list of per-row values per recording.
std = [[np.std(row) for row in sample] for sample in train_meta.features]

train_meta['std'] = std

In [16]:
# For every recording, take the root-mean-square of each of its rows; the
# result is one list of per-row RMS values per recording.
rms = [
    [np.sqrt(np.mean(row ** 2)) for row in sample]
    for sample in train_meta.features
]

train_meta['rms'] = rms

In [17]:
# Display the assembled frame: file name, label, raw and scaled recordings,
# and the per-row summary columns.
train_meta

Unnamed: 0,filename,labels,features,scaled,means,median,std,rms
0,/train/GiLImBvDWs.csv,5,"[[-9.66962, -1.20624, -0.763766, -3.86514, 1.0...","[[0.566780549817258, 0.17854933381618454, 0.99...","[-3.211244555555555, -3.207782, -3.20647433333...","[-0.763766, -0.772598, -0.791928, -0.822014, -...","[5.896175141545032, 5.880682855680641, 5.86252...","[6.7139387020852945, 6.698678691251001, 6.6821..."
1,/train/VQEyZLyBMp.csv,5,"[[1.38216, -0.128894, -2.18949, -8.38661, 0.27...","[[0.2734943334482953, 0.39448590438490483, 0.0...","[1.1413136555555556, 1.149737022222222, 1.1586...","[0.275239, 0.25868, 0.245538, 0.23784, 0.22358...","[5.968897493169099, 5.9594234160174775, 5.9505...","[6.077033334145694, 6.069318138938332, 6.06231..."
2,/train/AdgWYaoORE.csv,5,"[[-8.91717, -0.687015, -1.48425, -2.78385, 0.5...","[[0.4745658403369415, 0.7434271672075722, 0.07...","[-2.937389777777778, -2.932377444444445, -2.92...","[-0.687015, -0.698856, -0.710341, -0.720778, -...","[5.111957573458653, 5.114029609246989, 5.11374...","[5.895792477643244, 5.895094258868253, 5.89177..."
3,/train/lKhsadfBkL.csv,5,"[[2.37224, -0.749576, -0.448302, -6.5086, 0.57...","[[0.29074703213231956, 0.18891469573442898, 0....","[0.238337888888889, 0.23560888888888887, 0.217...","[0.575349, 0.582675, 0.584939, 0.633389, 0.653...","[2.6670133308286776, 2.6983997285319745, 2.755...","[2.6776416967357473, 2.708666211157235, 2.7638..."
4,/train/DKwGHgrHLG.csv,5,"[[-8.81887, -0.0930824, -0.567127, 4.8778, -0....","[[0.6999064197978475, 0.2572575567216022, 0.86...","[-2.2249259333333335, -2.220695777777778, -2.2...","[-0.0930824, -0.101238, -0.109498, -0.117654, ...","[6.902218425274151, 6.892162149960471, 6.88511...","[7.251959362752469, 7.241090307321667, 7.23330..."
...,...,...,...,...,...,...,...,...
1345,/train/KmaBQPpvuV.csv,0,"[[5.1333, -1.03626, 4.28043, -20.2308, 2.03229...","[[0.8466238453420969, 0.0, 0.9999999999999999,...","[3.35037, 3.3527384444444444, 3.33179888888888...","[2.40088, 2.46057, 2.47082, 2.44175, 2.30596, ...","[12.028850382953847, 12.028936154911227, 12.02...","[12.4867217744446, 12.48744009378168, 12.48107..."
1346,/train/THvCkkaZzj.csv,0,"[[2.35843, 1.58066, 1.10583, -24.4188, 1.8761,...","[[0.17336729805209217, 0.5800102419545734, 0.4...","[1.3071604444444445, 1.2822006666666668, 1.219...","[1.58066, 1.5715, 1.58053, 1.57022, 1.53138, 1...","[11.372685329352743, 11.363242801631428, 11.34...","[11.447560440023688, 11.43535419295917, 11.411..."
1347,/train/kqdPRRxErl.csv,0,"[[-1.0766, -0.650192, 4.8696, -21.6198, -1.976...","[[0.36967408034961613, 0.2330570725537875, 0.8...","[0.6638921111111113, 0.7039058888888884, 0.796...","[-0.764809, -0.751949, -0.916217, -1.04173, -1...","[11.746792317965335, 11.828143714019394, 11.57...","[11.765537917860154, 11.849070310362286, 11.60..."
1348,/train/XMCflmfeyQ.csv,0,"[[0.664292, 1.28113, 2.70629, -19.0547, 0.5641...","[[0.6883984312153334, 0.5930026583475427, 0.57...","[1.5015572222222222, 1.4337159999999995, 1.370...","[1.28113, 1.01986, 0.719702, 0.67021, 0.769737...","[10.081449476160767, 10.202951480173034, 10.25...","[10.192659006951517, 10.303191761557246, 10.34..."


In [18]:
# X_train, X_test, y_train, y_test = train_test_split(train_meta.rms, train_meta.labels, test_size=0.1, random_state=42)

In [19]:
# Turn each per-row summary column into a tensor (one row per recording).
# Columns are accessed with brackets because 'median' and 'std' are also
# DataFrame method names, so attribute access would return the method.
tensor_rms, tensor_mean, tensor_median, tensor_std = (
    torch.tensor(train_meta[column].tolist())
    for column in ('rms', 'means', 'median', 'std')
)

labels_train = torch.tensor(train_meta['labels'].tolist())

In [20]:
# Element-wise average of the four feature tensors.
# NOTE(review): `result` is not used by the later cells shown here; the grid
# search and the final model are fitted on tensor_rms only.
result = torch.stack(
    [tensor_rms, tensor_mean, tensor_median, tensor_std]
).mean(dim=0)

# Hyperparameter search

In [21]:
# Exhaustive search over 2 * 6 * 3 * 3 * 3 * 2 = 648 XGBoost configurations,
# each scored by accuracy with 3-fold cross-validation on all available cores.
gsc = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid={
        "n_estimators": [50, 100],
        "learning_rate": (0.05, 0.10, 0.20, 0.30, 0.40, 0.50),
        "max_depth": [3, 5, 10],
        "min_child_weight": [3, 5, 9],
        "gamma": [0.0, 0.3, 0.5],
        "colsample_bytree": [0.3, 0.5],
    },
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)

In [22]:
# Run the grid search on the per-row RMS features only.
# NOTE: the recorded run below took about 22 minutes with 16 workers.
gsc.fit(tensor_rms, labels_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 1944 out of 1944 | elapsed: 22.1min finished


GridSearchCV(cv=3, estimator=XGBClassifier(), n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.5],
                         'gamma': [0.0, 0.3, 0.5],
                         'learning_rate': (0.05, 0.1, 0.2, 0.3, 0.4, 0.5),
                         'max_depth': [3, 5, 10], 'min_child_weight': [3, 5, 9],
                         'n_estimators': [50, 100]},
             scoring='accuracy', verbose=1)

In [23]:
# Hyperparameter combination with the best cross-validated accuracy.
gsc.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0.3,
 'learning_rate': 0.2,
 'max_depth': 3,
 'min_child_weight': 3,
 'n_estimators': 50}

# Cross validation metrics

In [26]:
# Classifier configured with the hyperparameters reported by the grid search.
model = XGBClassifier(n_estimators=50, learning_rate=0.2, max_depth=3, min_child_weight=3, gamma=0.3, colsample_bytree=0.5)
n_splits = 5
# Shuffled folds need a fixed seed; without random_state every run draws
# different folds and the reported accuracy cannot be reproduced.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# A single metric name; cross_validate then reports it under 'test_score'.
scoring = 'accuracy'
cv_results = cross_validate(model, tensor_rms, labels_train, cv=kf, scoring=scoring, return_train_score=False)

In [27]:
# Mean accuracy across the cross-validation folds.
cv_results['test_score'].mean()

0.3214814814814815

In [28]:
# Fit the model on the full training set (per-row RMS features); this fitted
# model is the one used for the test-set predictions.
model.fit(tensor_rms, labels_train)

XGBClassifier(colsample_bytree=0.5, gamma=0.3, learning_rate=0.2,
              min_child_weight=3, n_estimators=50, objective='multi:softprob')

In [36]:
test_path = '/home/18rvb/ELEC872/competition/Competition/test'
test_paths = []

# The walk variable is named `root`, not `path`: the notebook already uses a
# variable called `path` for the data base directory when reading the training
# files, and rebinding it here would break re-running those cells.
# Directory and file names are sorted so the traversal order, and therefore the
# order of the saved predictions, does not depend on the filesystem.
for root, directories, files in os.walk(test_path):
    directories.sort()  # in-place sort steers the order of the top-down walk
    for file in sorted(files):
        test_paths.append(os.path.join(root, file))

In [44]:
# Read every discovered test file into a NumPy array, in test_paths order.
num_list_test = [np.genfromtxt(file_path) for file_path in test_paths]

In [45]:
# Per-row root-mean-square for every test recording, computed the same way as
# the training 'rms' feature.
test_rms = [
    [np.sqrt(np.mean(row ** 2)) for row in sample]
    for sample in num_list_test
]

In [47]:
# Stack the per-file RMS lists into a single tensor for prediction.
tensor_rms_test = torch.tensor(test_rms)

In [48]:
# Sanity check: (number of test files, RMS values per file).
tensor_rms_test.shape

torch.Size([150, 250])

In [49]:
# Class-probability estimates for every test recording.
predicted_probs = model.predict_proba(tensor_rms_test)

In [62]:
# For each test recording keep the index of its highest-probability class.
predicted_results = [np.argmax(probs) for probs in predicted_probs]

In [71]:
import pickle as pkl

# Persist the predicted class indices as a pickled Python list.
with open(
    '/home/18rvb/ELEC872/competition/Competition/output/outputs_xgboost_32.pkl',
    'wb+',
) as out_file:
    pkl.dump(predicted_results, out_file)

# predicted_results.to_csv('/home/18rvb/ELEC872/competition/Competition/output/outputs_xgboost_32.csv')

In [72]:
# Read the predictions back to verify the file that was just written.
# pickle.load can execute arbitrary code, so only use it on trusted files such
# as this one, which the notebook produced itself.
with open(
    '/home/18rvb/ELEC872/competition/Competition/output/outputs_xgboost_32.pkl',
    'rb',
) as in_file:
    read_file = pkl.load(in_file)

In [70]:
# Inspect the predictions that were read back from the pickle file.
read_file

[7,
 10,
 0,
 10,
 8,
 14,
 9,
 11,
 7,
 9,
 5,
 2,
 0,
 5,
 11,
 1,
 12,
 12,
 10,
 1,
 1,
 9,
 0,
 8,
 4,
 5,
 4,
 3,
 10,
 2,
 6,
 0,
 8,
 0,
 7,
 2,
 11,
 2,
 4,
 2,
 1,
 0,
 4,
 9,
 12,
 9,
 12,
 0,
 14,
 5,
 5,
 2,
 1,
 1,
 10,
 5,
 12,
 3,
 11,
 3,
 0,
 14,
 8,
 0,
 11,
 7,
 11,
 11,
 0,
 2,
 7,
 4,
 0,
 1,
 2,
 0,
 4,
 9,
 3,
 8,
 11,
 11,
 11,
 11,
 0,
 5,
 12,
 14,
 1,
 14,
 8,
 0,
 8,
 8,
 9,
 0,
 0,
 7,
 1,
 13,
 2,
 7,
 2,
 14,
 0,
 3,
 11,
 5,
 12,
 9,
 2,
 7,
 7,
 13,
 9,
 7,
 9,
 0,
 0,
 2,
 13,
 11,
 10,
 9,
 1,
 14,
 8,
 7,
 6,
 0,
 12,
 8,
 8,
 14,
 14,
 9,
 12,
 0,
 13,
 4,
 2,
 7,
 3,
 8,
 4,
 12,
 4,
 7,
 14,
 10]