In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn import svm
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# data loading
train_csv = pd.read_csv("train.csv")  # 112000*'ABCD' - 0/1
predict_csv = pd.read_csv("test.csv") # 48000*'ABCD

In [3]:
# data encoding
dict = {'A':0, 'C':1, 'D':2, 'E':3, 'F':4,
        'G':5, 'H':6, 'I':7, 'K':8, 'L':9,
        'M':10, 'N':11, 'P':12, 'Q':13, 'R':14,
        'S':15, 'T':16, 'U':17, 'V':18, 'W':19, 'Y':20}
def encoding(data):
    results = np.zeros((data.shape[0],21*4))
    for i in range(data.shape[0]):
        for j in range(4):
            results[i][21*j+dict[data[i][j]]] = 1
    return results
train_x = encoding(train_csv["Sequence"])     # 112000*84 ndarray
predict_x = encoding(predict_csv["Sequence"]) #  48000*84 ndarray
# labels
train_y = train_csv["Active"].values          # 112000*1 ndarray

In [14]:
# config
results = np.zeros(predict_x.shape[0])
test_split_ratio = 0.2
solution = 'NN' # 'SVM','MLP','NN'
# for SVM
SVC_params = {'C':1.0,'kernel':'rbf','class_weight':'balanced'} # class_weight to tackle imbalanced data
# for MLP: ‘logistic’, ‘tanh’, ‘relu’
MLP_params = {'activation':'logistic', 'max_iter':500}
# for NN
zero_weight = 0.2 # smaller, more likely to select 1
input_nodes = 84 # for input dim
hidden_nodes = 64 # for hidden layer
output_nodes = 1 # 0/1
dropout = 0.2 # p=1 for all zeros
activate_func = nn.ReLU()
criterion = nn.BCEWithLogitsLoss()
epochs = 150 
batchSize = 256
learning_rate = 0.0005
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [5]:
# train-test split
X_train, X_test, Y_train, Y_test = train_test_split(train_x, train_y, test_size=test_split_ratio)

In [6]:
# SVM (8min)
if(solution == 'SVM'):
    model = make_pipeline(StandardScaler(),svm.SVC(**SVC_params))
    model.fit(X_train,Y_train)
    y_test_result = model.predict(X_test)
    score = f1_score(Y_test, y_test_result)
    print("f1 score = ",score)
    if(score>0.5):
        results = model.predict(predict_x)
        pd.DataFrame(results).to_csv('sample_SVM.csv', index=False, header=False, float_format='%.0f')
    else:
        print("too poor performance")
else:
    print("SVM is not selected")

SVM is not selected


In [7]:
# MLP: Multilayer Perceptron (2min)
if(solution == 'MLP'):
    # normalize data, because MLP is sensitive to feature scaling
    scaler = StandardScaler()  
    X_train = scaler.fit_transform(X_train)  
    X_test = scaler.transform(X_test)
    X_predict = scaler.transform(predict_x)
    model = MLPClassifier(**MLP_params)
    model.fit(X_train,Y_train)
    y_test_result = model.predict(X_test)
    score = f1_score(Y_test, y_test_result)
    print("f1 score = ",score)
    if(score>0.5):
        results = model.predict(X_predict)
        np.savetxt('sample_MLP.csv', results, delimiter=',', fmt='%i')
    else:
        print("too poor performance")
else:
    print("MLP is not selected")

MLP is not selected


In [8]:
# Build a NN by pytorch
# standard scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_predict = scaler.transform(predict_x)

## define data structure
class trainData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
    def __len__ (self):
        return len(self.X_data)

## test data    
class testData(Dataset):
    def __init__(self, X_data):
        self.X_data = X_data
    def __getitem__(self, index):
        return self.X_data[index]
    def __len__ (self):
        return len(self.X_data)

In [9]:
# package data for NN
train_data = trainData(torch.FloatTensor(X_train), torch.FloatTensor(Y_train))
test_data = testData(torch.FloatTensor(X_test))
predict_data = testData(torch.FloatTensor(X_predict))

In [10]:
# imbalanced sampling
class_count = np.array([len(np.where(Y_train == t)[0]) for t in range(2)])    # return num of 0s and 1s
weight = [zero_weight, 1.0/zero_weight]                                       # define weight for 0s and 1s
samples_weight = np.array([weight[t] for t in Y_train])                       # give weight for EACH data sample
samples_weight = torch.from_numpy(samples_weight).double()                    # turn to torch (double)
Sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
# load data for each epoch
train_loader = DataLoader(train_data, batch_size=batchSize, sampler=Sampler)
test_loader = DataLoader(test_data)
predict_loader = DataLoader(predict_data)
# balanced data check
print(class_count)
for i, (data, label) in enumerate(train_loader):
    if i > 9 : break # i = 0 ~ 9
    print( "batch index {}, 0/1: {}/{}".format(i,len(np.where(label.numpy() == 0)[0]),len(np.where(label.numpy() == 1)[0])))

[86219  3381]
batch index 0, 0/1: 120/136
batch index 1, 0/1: 138/118
batch index 2, 0/1: 136/120
batch index 3, 0/1: 128/128
batch index 4, 0/1: 126/130
batch index 5, 0/1: 126/130
batch index 6, 0/1: 142/114
batch index 7, 0/1: 114/142
batch index 8, 0/1: 124/132
batch index 9, 0/1: 123/133


In [11]:
# build NN
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        self.layer_1 = nn.Linear(input_nodes, hidden_nodes) 
        self.layer_2 = nn.Linear(hidden_nodes, hidden_nodes)
        self.layer_out = nn.Linear(hidden_nodes, output_nodes) 
        self.F = activate_func
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm = nn.BatchNorm1d(hidden_nodes)
        
    def forward(self, inputs):
        x = self.F(self.layer_1(inputs))
        x = self.batchnorm(x)
        x = self.F(self.layer_2(x))
        x = self.batchnorm(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

In [15]:
# start training
model = binaryClassification().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model.train()
for e in range(epochs):
    epoch_loss = 0
    f1_avg = 0
    i = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch).squeeze() #torch.Size([128,1])---squeeze()--->torch.Size([128]), match y_batch
        loss = criterion(y_pred, y_batch)
        y_pred = torch.round(torch.sigmoid(y_pred)) # turn output to 0/1
        f1 = f1_score(y_batch.cpu().detach().numpy(), y_pred.cpu().detach().numpy())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        f1_avg += f1
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.8f} | F1_score: {f1_avg/len(train_loader):.3f}')

Epoch 000: | Loss: 0.21622792 | F1_score: 0.920
Epoch 001: | Loss: 0.06981066 | F1_score: 0.979
Epoch 002: | Loss: 0.04646413 | F1_score: 0.986
Epoch 003: | Loss: 0.03705940 | F1_score: 0.989
Epoch 004: | Loss: 0.03291193 | F1_score: 0.991
Epoch 005: | Loss: 0.02717069 | F1_score: 0.992
Epoch 006: | Loss: 0.02507608 | F1_score: 0.993
Epoch 007: | Loss: 0.02307484 | F1_score: 0.993
Epoch 008: | Loss: 0.02060723 | F1_score: 0.994
Epoch 009: | Loss: 0.01954998 | F1_score: 0.995
Epoch 010: | Loss: 0.01775368 | F1_score: 0.995
Epoch 011: | Loss: 0.01730847 | F1_score: 0.995
Epoch 012: | Loss: 0.01679663 | F1_score: 0.995
Epoch 013: | Loss: 0.01630706 | F1_score: 0.995
Epoch 014: | Loss: 0.01443403 | F1_score: 0.996
Epoch 015: | Loss: 0.01426485 | F1_score: 0.996
Epoch 016: | Loss: 0.01363760 | F1_score: 0.996
Epoch 017: | Loss: 0.01126288 | F1_score: 0.997
Epoch 018: | Loss: 0.01165066 | F1_score: 0.996
Epoch 019: | Loss: 0.01118116 | F1_score: 0.997
Epoch 020: | Loss: 0.01077127 | F1_score

In [16]:
# test result
y_test_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_tag = torch.round(torch.sigmoid(y_test_pred))
        y_test_list.append(y_test_tag.int().item()) 
f1_score(Y_test,np.array(y_test_list))

0.6334355828220859

In [19]:
y_predict_list = []
model.eval()
with torch.no_grad():
    for X_batch in predict_loader:
        X_batch = X_batch.to(device)
        y_predict_pred = model(X_batch)
        y_predict_tag = torch.round(torch.sigmoid(y_predict_pred))
        y_predict_list.append(y_predict_tag.int().item())
pd.DataFrame(np.array(y_predict_list)).to_csv('sample_NN.csv', index=False, header=False, float_format='%.0f')

In [27]:
85/3*2

131.76470588235293

In [None]:
hidden layer: 样本数/（输入+输出）/alpha
隐藏神经元的数量应在输入层的大小和输出层的大小之间。
隐藏神经元的数量应为输入层大小的2/3加上输出层大小的2/3。
隐藏神经元的数量应小于输入层大小的两倍。
batchsize： 尽量大 for batchnorm
dropout
relu
epoch(early stop)
这个NN调崩了他妈的
看solution_NN吧