In [79]:
# Feed-forward neural network implemented in PyTorch
# for imbalanced binary classification
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [80]:
# Read the two CSV files: the training set first, then the set to predict on.
# Later cells use the "Sequence" column of both and the "Active" column of the
# training set. (Original note: 112000 and 48000 rows of 4-letter sequences.)
train_csv, predict_csv = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [81]:
# data encoding
# Letter -> column offset inside one 21-wide one-hot group.
# Deliberately not named `dict`, so the builtin stays usable in later cells.
LETTER_TO_INDEX = {
    'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4,
    'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
    'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
    'S': 15, 'T': 16, 'U': 17, 'V': 18, 'W': 19, 'Y': 20,
}


def encoding(data):
    """One-hot encode fixed-length letter sequences.

    Each of the first 4 letters of a sequence gets its own group of 21
    columns; the column set to 1 inside a group is looked up in
    ``LETTER_TO_INDEX``.

    Parameters
    ----------
    data : iterable of str
        Sequences to encode (pandas Series, list, ndarray, ...). Items are
        visited in iteration order, so a Series is read by position and its
        index labels do not matter.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of sequences, 84)`` filled with 0/1.

    Raises
    ------
    KeyError
        If a letter is not a key of ``LETTER_TO_INDEX``.
    IndexError
        If a string sequence has fewer than 4 letters.
    """
    sequences = list(data)
    group_width = len(LETTER_TO_INDEX)   # 21 columns per position
    seq_len = 4
    results = np.zeros((len(sequences), group_width * seq_len))
    for row, sequence in enumerate(sequences):
        for pos in range(seq_len):
            # Offset of the position's group plus the letter's slot in it.
            results[row, group_width * pos + LETTER_TO_INDEX[sequence[pos]]] = 1
    return results

In [82]:
# Labels: the "Active" column as a NumPy array, one entry per training row.
train_y = train_csv["Active"].values
# Features: one-hot encoding of the "Sequence" column,
# 4 positions * 21 letters = 84 columns per row.
train_x = encoding(train_csv["Sequence"])
predict_x = encoding(predict_csv["Sequence"])

In [83]:
# config
SEED = 42                  # fixed seed so Restart & Run All repeats the same run
test_split_ratio = 0.2     # fraction of labelled rows held out for evaluation
zero_weight = 0.2          # sampling weight of a class-0 row; class-1 rows get 1/zero_weight
input_nodes = 84           # 4 sequence positions * 21 one-hot columns
hidden_nodes = 64          # width of both hidden layers
output_nodes = 1           # one raw output per sample
dropout = 0.5              # drop probability used before the output layer
EPOCHS = 150
BATCH_SIZE = 128
LEARNING_RATE = 0.0005

# Seed the global NumPy and PyTorch generators once, up front. scikit-learn's
# train_test_split draws from the global NumPy generator when no random_state
# is passed; the weighted sampler, weight initialisation and dropout draw from
# PyTorch's. NOTE(review): some CUDA kernels may still be non-deterministic.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [84]:
# train-test split
# Stratify on the labels so both parts keep the same class proportions;
# with a rare positive class an unstratified split lets that share drift.
X_train, X_test, y_train, y_test = train_test_split(
    train_x, train_y, test_size=test_split_ratio, stratify=train_y
)

In [85]:
# standard scaler
# The scaler is fitted on the training split only; the held-out split and the
# prediction set are transformed with those same training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test, X_predict = (scaler.transform(features) for features in (X_test, predict_x))

In [86]:
## train data structure
class trainData(Dataset):
    """Dataset that pairs every feature row with its label."""

    def __init__(self, X_data, y_data):
        # Only references are stored; nothing is copied or converted here.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The feature container decides how many samples exist.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

## test data    
class testData(Dataset):
    """Label-free dataset: indexing returns the feature row alone."""

    def __init__(self, X_data):
        # Only a reference is stored; nothing is copied or converted here.
        self.X_data = X_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        return features

In [87]:
# package data for NN
# Everything becomes a FloatTensor; only the training set carries labels.
train_data = trainData(*map(torch.FloatTensor, (X_train, y_train)))
test_data, predict_data = (
    testData(torch.FloatTensor(features)) for features in (X_test, X_predict)
)

In [88]:
# for imbalanced dataset
# Row counts per class (position 0 -> label 0, position 1 -> label 1); only printed below.
class_count = np.array([np.where(y_train == label)[0].size for label in range(2)])

# Label 0 is drawn with weight zero_weight, label 1 with its reciprocal.
weight = [zero_weight, 1.0/zero_weight]
# One weight per training row, looked up by that row's label, as a float64 tensor.
samples_weight = torch.from_numpy(
    np.array([weight[label] for label in y_train])
).double()

# Each pass over the loader draws as many rows as there are training samples.
Sampler = WeightedRandomSampler(weights=samples_weight, num_samples=len(samples_weight))
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=Sampler)

# No batch_size is passed for the evaluation loaders, so PyTorch's default applies.
test_loader = DataLoader(test_data)
predict_loader = DataLoader(predict_data)

print(class_count)
print(samples_weight)
# Report the 0/1 mix of every batch the weighted sampler produces.
for i, (data, target) in enumerate(train_loader):
    batch_labels = target.numpy()
    n_zero, n_one = (len(np.where(batch_labels == value)[0]) for value in (0, 1))
    print( "batch index {}, 0/1: {}/{}".format(i, n_zero, n_one))

[86208  3392]
tensor([0.2000, 0.2000, 0.2000,  ..., 0.2000, 0.2000, 0.2000],
       dtype=torch.float64)
batch index 0, 0/1: 66/62
batch index 1, 0/1: 61/67
batch index 2, 0/1: 58/70
batch index 3, 0/1: 72/56
batch index 4, 0/1: 69/59
batch index 5, 0/1: 74/54
batch index 6, 0/1: 66/62
batch index 7, 0/1: 62/66
batch index 8, 0/1: 70/58
batch index 9, 0/1: 71/57
batch index 10, 0/1: 66/62
batch index 11, 0/1: 75/53
batch index 12, 0/1: 62/66
batch index 13, 0/1: 72/56
batch index 14, 0/1: 68/60
batch index 15, 0/1: 65/63
batch index 16, 0/1: 63/65
batch index 17, 0/1: 73/55
batch index 18, 0/1: 62/66
batch index 19, 0/1: 74/54
batch index 20, 0/1: 67/61
batch index 21, 0/1: 54/74
batch index 22, 0/1: 62/66
batch index 23, 0/1: 72/56
batch index 24, 0/1: 52/76
batch index 25, 0/1: 58/70
batch index 26, 0/1: 63/65
batch index 27, 0/1: 57/71
batch index 28, 0/1: 61/67
batch index 29, 0/1: 69/59
batch index 30, 0/1: 76/52
batch index 31, 0/1: 65/63
batch index 32, 0/1: 63/65
batch index 33

In [89]:
class binaryClassification(nn.Module):
    """Feed-forward network with two hidden layers that returns raw outputs.

    No sigmoid is applied in ``forward``; callers apply it themselves (the
    notebook pairs this model with ``nn.BCEWithLogitsLoss``).

    Parameters
    ----------
    n_inputs : int, optional
        Number of input features. Defaults to the global ``input_nodes``.
    n_hidden : int, optional
        Width of both hidden layers. Defaults to the global ``hidden_nodes``.
    n_outputs : int, optional
        Number of output units. Defaults to the global ``output_nodes``.
    dropout_p : float, optional
        Dropout probability. Defaults to the global ``dropout``.
    """

    def __init__(self, n_inputs=None, n_hidden=None, n_outputs=None, dropout_p=None):
        super().__init__()
        # Fall back to the notebook's config values for anything not passed in,
        # so the existing no-argument call keeps working unchanged.
        if n_inputs is None:
            n_inputs = input_nodes
        if n_hidden is None:
            n_hidden = hidden_nodes
        if n_outputs is None:
            n_outputs = output_nodes
        if dropout_p is None:
            dropout_p = dropout

        # Attribute names and creation order are kept as they were, so
        # state_dict keys and the order of random initialisation do not change.
        self.layer_1 = nn.Linear(n_inputs, n_hidden)
        self.layer_2 = nn.Linear(n_hidden, n_hidden)
        self.layer_out = nn.Linear(n_hidden, n_outputs)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(n_hidden)
        self.batchnorm2 = nn.BatchNorm1d(n_hidden)

    def forward(self, inputs):
        """Map a batch of feature rows to one row of raw outputs per sample."""
        # Each hidden layer is Linear -> ReLU -> BatchNorm.
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        # Dropout is applied once, right before the output layer.
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

In [90]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [91]:
# Create the network and move it to the selected device.
model = binaryClassification().to(device)
# The loss works on the raw model outputs (the sigmoid is part of the loss).
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [92]:
def binary_acc(y_pred, y_test):
    """Return the batch accuracy in percent, rounded to a whole number.

    ``y_pred`` holds raw model outputs; they are passed through a sigmoid and
    rounded to 0/1 before being compared element-wise with ``y_test``. The
    number of matches is divided by the first dimension of ``y_test``.
    """
    predicted_labels = torch.sigmoid(y_pred).round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    fraction_correct = n_correct / y_test.shape[0]
    return torch.round(fraction_correct * 100)

In [93]:
# Put dropout and batch norm into training behaviour before the first epoch.
model.train()
for e in range(1, EPOCHS + 1):
    # Running sums over all batches of this epoch.
    epoch_loss, epoch_acc = 0, 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # Add a trailing dimension so the targets line up with the model output.
        targets = y_batch.unsqueeze(1)

        logits = model(X_batch)
        batch_loss = criterion(logits, targets)
        batch_acc = binary_acc(logits, targets)

        # Clear old gradients, backpropagate this batch, take one optimiser step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()
        epoch_acc += batch_acc.item()

    # Both figures are averages over the batches of the epoch.
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.18877 | Acc: 92.964
Epoch 002: | Loss: 0.08044 | Acc: 97.330
Epoch 003: | Loss: 0.05492 | Acc: 98.113
Epoch 004: | Loss: 0.04640 | Acc: 98.413
Epoch 005: | Loss: 0.04052 | Acc: 98.607
Epoch 006: | Loss: 0.03636 | Acc: 98.743
Epoch 007: | Loss: 0.03215 | Acc: 98.861
Epoch 008: | Loss: 0.03066 | Acc: 98.931
Epoch 009: | Loss: 0.02776 | Acc: 99.014
Epoch 010: | Loss: 0.02629 | Acc: 99.060
Epoch 011: | Loss: 0.02524 | Acc: 99.091
Epoch 012: | Loss: 0.02433 | Acc: 99.127
Epoch 013: | Loss: 0.02355 | Acc: 99.173
Epoch 014: | Loss: 0.02093 | Acc: 99.236
Epoch 015: | Loss: 0.02061 | Acc: 99.244
Epoch 016: | Loss: 0.02063 | Acc: 99.260
Epoch 017: | Loss: 0.02121 | Acc: 99.241
Epoch 018: | Loss: 0.01853 | Acc: 99.329
Epoch 019: | Loss: 0.02127 | Acc: 99.230
Epoch 020: | Loss: 0.01872 | Acc: 99.279
Epoch 021: | Loss: 0.01749 | Acc: 99.360
Epoch 022: | Loss: 0.01708 | Acc: 99.346
Epoch 023: | Loss: 0.01645 | Acc: 99.391
Epoch 024: | Loss: 0.01623 | Acc: 99.391
Epoch 025: | Los

In [94]:
# Collect one 0/1 prediction per held-out sample as a flat list of ints.
y_test_list = []
model.eval()  # evaluation behaviour for dropout and batch norm
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        # Sigmoid then round turns each raw output into a 0/1 label.
        y_test_tag = torch.round(torch.sigmoid(y_test_pred))
        # Flatten every batch before appending its values: the list stays flat
        # for any loader batch size, not only for one sample per batch.
        y_test_list.extend(y_test_tag.cpu().int().numpy().ravel().tolist())

In [95]:
# F1 score of the held-out predictions against the held-out labels.
f1_score(y_true=y_test, y_pred=np.array(y_test_list))

0.871521610420367

In [95]:
# Collect one 0/1 prediction per row of the prediction set as a flat list of ints.
y_predict_list = []
model.eval()  # evaluation behaviour for dropout and batch norm
with torch.no_grad():
    for X_batch in predict_loader:
        X_batch = X_batch.to(device)
        y_predict_pred = model(X_batch)
        # Sigmoid then round turns each raw output into a 0/1 label.
        y_predict_tag = torch.round(torch.sigmoid(y_predict_pred))
        # Flatten every batch before appending its values: the list stays flat
        # for any loader batch size, not only for one sample per batch.
        y_predict_list.extend(y_predict_tag.cpu().int().numpy().ravel().tolist())

In [96]:
# Write the predictions to disk without an index column or a header row.
submission = pd.DataFrame(np.array(y_predict_list))
submission.to_csv('sample_nn.csv', index=False, header=False, float_format='%.0f')