In [1]:
from mxnet import autograd, nd, init
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from mxboard import *
from mxnet.gluon import data as gdata
import os
from mxnet.gluon import nn
import numpy as np
import random
from mxnet.gluon import loss
from mxnet.gluon.trainer import Trainer
import time

In [2]:
def load_all_data():
    root = 'data/raw_exp'
    file_list = os.listdir(root)
    file_labels = dict(zip(file_list, list(range(1,len(file_list)+1))))
    samples_list = []

    for file in file_list:
        with open(os.path.join(root,file), 'r') as f:
            first_row = f.readline()
            first_row = first_row.strip().split('\t')
            for sample in first_row[1:]:
                if int(sample.strip()[-3:-2]) == 0:
                    # case
                    samples_list.append((sample, file_labels[file]))
                else:
                    # control
                    samples_list.append((sample, 0))
    sample2labels = pd.DataFrame(samples_list,columns=['sampleID', 'label'])
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_sample(np.array(sample2labels['sampleID']).reshape(-1, 1), 
                                              np.array(sample2labels['label'])
                                             ) # 过采样
    allsamples = pd.DataFrame({'sampleID':X_resampled[:,0], 'label':y_resampled})
    length = allsamples.shape[0]
    index_list = list(range(length))
    train_num, validation_num, test_num= 0.7, 0.2, 0.1
    random.shuffle(index_list)
    train_index = index_list[0:int(length*train_num)]
    validation_index = index_list[int(length*train_num): int(length*train_num) + int(length*validation_num)]
    test_index = index_list[int(length * train_num) + int(length * validation_num):]

    train_samples = allsamples.loc[train_index]
    validation_samples = allsamples.loc[validation_index]
    test_samples = allsamples.loc[test_index]
    
    return train_samples, validation_samples, test_samples

In [3]:
exp = pd.read_csv('data/exp_data.csv')

In [4]:
train_samples, validation_samples, test_samples = load_all_data()
train_x = nd.array(exp[train_samples['sampleID']]).transpose()
train_y = nd.array(train_samples['label'])

In [5]:
net = nn.Sequential()
net.add(nn.Dense(3000, activation='relu'),
        nn.BatchNorm(),
        nn.Dropout(0.5),
        nn.Dense(1000, activation='relu'),
        nn.BatchNorm(),
        nn.Dropout(0.5),
        nn.Dense(100, activation='relu'),
        nn.BatchNorm(),
        nn.Dropout(0.5),
        nn.Dense(33))
net.initialize(init.Normal(sigma=0.01))

In [6]:
net.collect_params('.*dense.*')

sequential0_ (
  Parameter dense0_weight (shape=(3000, 0), dtype=float32)
  Parameter dense0_bias (shape=(3000,), dtype=float32)
  Parameter dense1_weight (shape=(1000, 0), dtype=float32)
  Parameter dense1_bias (shape=(1000,), dtype=float32)
  Parameter dense2_weight (shape=(100, 0), dtype=float32)
  Parameter dense2_bias (shape=(100,), dtype=float32)
  Parameter dense3_weight (shape=(33, 0), dtype=float32)
  Parameter dense3_bias (shape=(33,), dtype=float32)
)

In [7]:
cross_entropy = loss.SoftmaxCrossEntropyLoss()
trainer = Trainer(net.collect_params(), 'adam', {'learning_rate': 0.1})

In [9]:
batch_size = 200
dataset = gdata.ArrayDataset(train_x, train_y)
data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)

In [10]:
num_epochs = 20
sw = SummaryWriter(logdir='./logs2', flush_secs=2)
params = net.collect_params('.*dense.*')
param_names = params.keys()
ls = 0
for epoch in range(1, num_epochs + 1):
    train_loss_sum, train_acc_sum, n, start = 0., 0., 0., time.time()
    for X, Y in data_iter:
        with autograd.record():
            pre = net(X)
            l = cross_entropy(pre, Y).sum()
        l.backward()
        trainer.step(batch_size)
        train_loss_sum += l.sum().asscalar()
        train_acc_sum += (pre.argmax(axis=1)==Y).sum().asscalar()
        n += len(Y)
        
        sw.add_histogram(tag='cross_entropy', values=train_loss_sum / n, global_step=ls)
        sw.add_histogram(tag='train_acc', values=train_acc_sum / n, global_step=ls)
        
        for i, name in enumerate(param_names):
            sw.add_histogram(tag=name, values=net.collect_params()[name].grad(), global_step=ls, bins=1000) 
        ls += 1
    print('epoch %d, loss %.4f, train acc %.3f,  time %.1f sec' % 
          (epoch, train_loss_sum / n, train_acc_sum / n,  time.time() - start))

epoch 1, loss 0.8110, train acc 0.755,  time 142.8 sec
epoch 2, loss 0.3232, train acc 0.897,  time 137.9 sec
epoch 3, loss 0.2348, train acc 0.925,  time 140.3 sec
epoch 4, loss 0.2226, train acc 0.936,  time 146.5 sec
epoch 5, loss 0.2201, train acc 0.945,  time 158.2 sec
epoch 6, loss 0.3120, train acc 0.938,  time 184.6 sec
epoch 7, loss 0.3026, train acc 0.947,  time 214.7 sec
epoch 8, loss 0.3173, train acc 0.950,  time 195.7 sec
epoch 9, loss 0.5553, train acc 0.926,  time 192.7 sec
epoch 10, loss 0.4940, train acc 0.936,  time 194.1 sec


KeyboardInterrupt: 

In [11]:
sw.close()

In [10]:
for x, y in data_iter:
    break

In [11]:
net(x)


[[-0.09682716 -0.06125428 -0.00473381 ... -0.0888713  -0.00932187
  -0.15131769]
 [-0.0924625  -0.07597667  0.01104103 ... -0.06827219 -0.00386821
  -0.13062906]
 [-0.09791607 -0.08168846 -0.00248691 ... -0.09105153 -0.01040389
  -0.12076826]
 ...
 [-0.08215083 -0.07469714 -0.00171283 ... -0.10362029 -0.01176906
  -0.13257726]
 [-0.09476141 -0.08650095 -0.0232383  ... -0.06999294 -0.00254738
  -0.13142964]
 [-0.07482515 -0.0936146  -0.01798228 ... -0.04998816 -0.0169401
  -0.1297883 ]]
<NDArray 200x33 @cpu(0)>

In [15]:
y


[ 0. 15. 17. 31. 16.  8.  6. 13.  2. 17. 27. 28. 11.  0. 25. 31.  8.  1.
 19. 18. 12.  0. 31. 11. 15. 10. 11.  1.  3.  2. 12. 31. 22. 25. 26. 11.
 18.  7. 23. 22. 12. 11.  8.  4. 12. 18.  0. 15. 30.  4. 22.  0. 26.  5.
  4.  1. 19.  7. 19.  1. 30. 29.  7. 30. 10. 14. 19. 10. 14.  9. 13.  4.
 21. 31.  8. 21.  7. 22.  8. 28. 14. 22. 21. 19. 20. 18. 21. 25.  5. 32.
 12.  0. 24. 22. 30. 22. 25.  8. 27. 26. 15.  7.  9.  7. 32. 27. 17. 17.
 13. 25. 18. 10.  5. 20. 22. 11. 21. 30. 14. 28. 25. 18. 28. 14. 24. 25.
  0.  4. 21. 26. 14.  0. 22. 20. 12. 24. 25.  4. 32.  9. 19.  6.  3. 23.
  8. 23. 21.  2. 13. 21.  2. 30. 14. 12.  5. 25. 14. 14. 31. 13. 13.  5.
 21. 30. 21. 10. 14.  9. 10. 20. 18. 19.  2.  5. 28. 32. 31. 22. 20. 25.
 19.  0. 26. 20. 15. 26.  2. 27. 30. 20. 23. 12. 30. 21.  8. 31. 30.  2.
 17.  8.]
<NDArray 200 @cpu(0)>

In [16]:
y_h = nd.zeros(shape=(y.shape[0],33))
for i, x in enumerate(y):
    y_h[i, x] = 1
    
y_h


[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 200x33 @cpu(0)>