In [2]:
import deepchem as dc
from rdkit import Chem
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# Load the Tox21 dataset; the middle element of the returned triple holds the
# (train, valid, test) splits, the other two elements are discarded.
_, (train, valid, test), _ = dc.molnet.load_tox21()

# Pull the feature array X, the label array y and the sample-weight array w
# out of every split, in train -> valid -> test order.
(train_X, train_Y, train_w), (valid_X, valid_Y, valid_w), (test_X, test_Y, test_w) = (
    (split.X, split.y, split.w) for split in (train, valid, test)
)

# One output line per kind of array, listing the train / valid / test shapes.
print(*(features.shape for features in (train_X, valid_X, test_X)))  # feature vectors
print(*(labels.shape for labels in (train_Y, valid_Y, test_Y)))  # labels
print(*(weights.shape for weights in (train_w, valid_w, test_w)))  # sample weights

Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.
(6264, 1024) (783, 1024) (784, 1024)
(6264, 12) (783, 12) (784, 12)
(6264, 12) (783, 12) (784, 12)


In [4]:
# Keep only the first task (column 0) of the multi-column label and weight
# matrices, so the rest of the notebook works with 1-D label / weight vectors.
#
# The columns are sliced from the dataset objects (train.y, train.w, ...) instead
# of from the previously assigned train_Y / train_w. This makes the cell
# idempotent: slicing the already-reduced 1-D arrays with [:, 0] on a second run
# would raise an IndexError.
train_Y, train_w = train.y[:, 0], train.w[:, 0]
valid_Y, valid_w = valid.y[:, 0], valid.w[:, 0]
test_Y, test_w = test.y[:, 0], test.w[:, 0]

print(train_X.shape,valid_X.shape,test_X.shape) # shapes of the feature vectors
print(train_Y.shape,valid_Y.shape,test_Y.shape) # shapes of the label vectors
print(train_w.shape,valid_w.shape,test_w.shape) # shapes of the sample-weight vectors

(6264, 1024) (783, 1024) (784, 1024)
(6264,) (783,) (784,)
(6264,) (783,) (784,)


In [17]:
# --- Optimisation settings ---
learning_rate = 1e-3  # step size handed to the Adam optimizer
batch_size = 50       # number of samples per minibatch
n_epochs = 10         # number of full passes over the training set

# --- Network settings ---
d = train_X.shape[-1]  # dimensionality of a feature vector (last axis of train_X)
n_hidden = 50          # number of units in the hidden layer
# NOTE: despite its name, this value is fed to the `keep_prob` placeholder during
# training, i.e. it is the probability of *keeping* a unit; 1 disables dropout.
dropout_prob = 1

In [18]:
tf.reset_default_graph()

# Inputs. The batch dimension is left as None so minibatches of any size can be fed.
with tf.name_scope("placeholders"):
    x = tf.placeholder(tf.float32, shape=(None, d))  # feature vectors
    y = tf.placeholder(tf.float32, shape=(None,))    # labels
    keep_prob = tf.placeholder(tf.float32)           # dropout keep probability

# Hidden layer: affine transform -> ReLU -> dropout.
with tf.name_scope("hidden-layer"):
    # n_hidden is the number of units in this layer (not the number of layers).
    W = tf.Variable(tf.random_normal(shape=(d, n_hidden)))
    b = tf.Variable(tf.random_normal(shape=(n_hidden,)))
    hidden_activation = tf.nn.relu(tf.matmul(x, W) + b)

    # Apply dropout to the activations; keep_prob is supplied at run time.
    x_hidden = tf.nn.dropout(hidden_activation, keep_prob)

# Output layer: a single logit per sample.
with tf.name_scope("output"):
    W = tf.Variable(tf.random_normal(shape=(n_hidden, 1)))
    b = tf.Variable(tf.random_normal(shape=(1,)))
    y_logit = tf.matmul(x_hidden, W) + b

    # The sigmoid turns the logit into a probability ...
    y_one_prob = tf.sigmoid(y_logit)

    # ... and rounding turns the probability into a hard 0/1 prediction.
    y_pred = tf.round(y_one_prob)

# Loss: sigmoid cross-entropy per sample, summed over the minibatch.
with tf.name_scope("loss"):
    # Add a trailing axis to y so the labels line up with the (batch, 1) logits.
    y_expand = tf.expand_dims(y, 1)
    entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_expand, logits=y_logit)
    l = tf.reduce_sum(entropy)

# Optimizer: Adam minimising the summed cross-entropy.
with tf.name_scope("optim"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.minimize(l)

# Summaries: log the loss as a scalar and merge all summaries into one op.
with tf.name_scope("summaries"):
    tf.summary.scalar("loss", l)
    merged = tf.summary.merge_all()

In [19]:
# Event-file writer for the training run; it is handed the current default
# graph and later receives the per-step summaries from the training loop.
train_writer = tf.summary.FileWriter(
    logdir='/tmp/logistic-train',
    graph=tf.get_default_graph(),
)

In [20]:
step = 0  # global minibatch counter, used as the step index of the logged summaries
N = train_X.shape[0]  # number of training samples

# NOTE(review): the sample weights (train_w) are not used during training, while the
# accuracy reported afterwards is weighted by them -- confirm this is intended.

# Open a session, train for n_epochs, then predict on all three splits.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(n_epochs):
        # Walk over the training set in consecutive minibatches; the final
        # minibatch of an epoch is shorter when N is not a multiple of batch_size.
        for pos in range(0, N, batch_size):
            batch_X = train_X[pos:pos+batch_size]
            batch_Y = train_Y[pos:pos+batch_size]
            # dropout_prob is fed as the keep probability of the dropout op.
            feed_dict = {x: batch_X,y: batch_Y,keep_prob: dropout_prob}
            _,summary,loss = sess.run([train_op,merged,l], feed_dict=feed_dict)
            print("Epoch: %d, step: %d, loss: %f "% (epoch,step,loss))
            train_writer.add_summary(summary,step)

            step += 1

    # Push buffered summary events to disk so the log is complete once training
    # finishes; the writer is only flushed (not closed) so this cell can be re-run.
    train_writer.flush()

    # Predictions on every split, with dropout disabled (keep_prob = 1.0).
    train_y_pred = sess.run(y_pred, feed_dict={x:train_X,keep_prob:1.0})
    valid_y_pred = sess.run(y_pred, feed_dict={x:valid_X,keep_prob:1.0})
    test_y_pred = sess.run(y_pred, feed_dict={x:test_X,keep_prob:1.0})

Epoch: 0, step: 0, loss: 359.763672 
Epoch: 0, step: 1, loss: 459.091278 
Epoch: 0, step: 2, loss: 570.972717 
Epoch: 0, step: 3, loss: 434.572113 
Epoch: 0, step: 4, loss: 385.434814 
Epoch: 0, step: 5, loss: 360.103699 
Epoch: 0, step: 6, loss: 384.246216 
Epoch: 0, step: 7, loss: 422.724243 
Epoch: 0, step: 8, loss: 513.125061 
Epoch: 0, step: 9, loss: 268.475037 
Epoch: 0, step: 10, loss: 361.075348 
Epoch: 0, step: 11, loss: 534.608643 
Epoch: 0, step: 12, loss: 414.832336 
Epoch: 0, step: 13, loss: 412.949432 
Epoch: 0, step: 14, loss: 226.162949 
Epoch: 0, step: 15, loss: 296.734772 
Epoch: 0, step: 16, loss: 310.046234 
Epoch: 0, step: 17, loss: 185.366333 
Epoch: 0, step: 18, loss: 258.668030 
Epoch: 0, step: 19, loss: 230.514435 
Epoch: 0, step: 20, loss: 165.606537 
Epoch: 0, step: 21, loss: 281.208282 
Epoch: 0, step: 22, loss: 257.746613 
Epoch: 0, step: 23, loss: 249.002365 
Epoch: 0, step: 24, loss: 281.886444 
Epoch: 0, step: 25, loss: 181.461105 
Epoch: 0, step: 26, lo

Epoch: 2, step: 339, loss: 74.562393 
Epoch: 2, step: 340, loss: 95.968666 
Epoch: 2, step: 341, loss: 31.784515 
Epoch: 2, step: 342, loss: 53.929775 
Epoch: 2, step: 343, loss: 54.766060 
Epoch: 2, step: 344, loss: 0.893663 
Epoch: 2, step: 345, loss: 36.268486 
Epoch: 2, step: 346, loss: 5.092955 
Epoch: 2, step: 347, loss: 79.986008 
Epoch: 2, step: 348, loss: 114.023643 
Epoch: 2, step: 349, loss: 0.432268 
Epoch: 2, step: 350, loss: 24.947395 
Epoch: 2, step: 351, loss: 29.198671 
Epoch: 2, step: 352, loss: 16.074434 
Epoch: 2, step: 353, loss: 42.453430 
Epoch: 2, step: 354, loss: 5.276576 
Epoch: 2, step: 355, loss: 24.746326 
Epoch: 2, step: 356, loss: 55.824570 
Epoch: 2, step: 357, loss: 24.485044 
Epoch: 2, step: 358, loss: 39.277912 
Epoch: 2, step: 359, loss: 6.667773 
Epoch: 2, step: 360, loss: 35.444794 
Epoch: 2, step: 361, loss: 41.480236 
Epoch: 2, step: 362, loss: 39.187111 
Epoch: 2, step: 363, loss: 14.931925 
Epoch: 2, step: 364, loss: 0.880635 
Epoch: 2, step: 3

Epoch: 5, step: 661, loss: 52.090515 
Epoch: 5, step: 662, loss: 6.497435 
Epoch: 5, step: 663, loss: 49.390697 
Epoch: 5, step: 664, loss: 13.658253 
Epoch: 5, step: 665, loss: 60.592060 
Epoch: 5, step: 666, loss: 78.104271 
Epoch: 5, step: 667, loss: 134.803207 
Epoch: 5, step: 668, loss: 35.247746 
Epoch: 5, step: 669, loss: 0.174694 
Epoch: 5, step: 670, loss: 88.410004 
Epoch: 5, step: 671, loss: 0.338308 
Epoch: 5, step: 672, loss: 11.212546 
Epoch: 5, step: 673, loss: 54.511585 
Epoch: 5, step: 674, loss: 63.029167 
Epoch: 5, step: 675, loss: 62.199371 
Epoch: 5, step: 676, loss: 2.004107 
Epoch: 5, step: 677, loss: 46.868847 
Epoch: 5, step: 678, loss: 33.239120 
Epoch: 5, step: 679, loss: 19.047379 
Epoch: 5, step: 680, loss: 6.918561 
Epoch: 5, step: 681, loss: 0.218017 
Epoch: 5, step: 682, loss: 12.988924 
Epoch: 5, step: 683, loss: 3.785741 
Epoch: 5, step: 684, loss: 57.038433 
Epoch: 5, step: 685, loss: 47.446526 
Epoch: 5, step: 686, loss: 73.641113 
Epoch: 5, step: 68

Epoch: 7, step: 922, loss: 73.482338 
Epoch: 7, step: 923, loss: 0.286778 
Epoch: 7, step: 924, loss: 8.609140 
Epoch: 7, step: 925, loss: 42.313396 
Epoch: 7, step: 926, loss: 52.369671 
Epoch: 7, step: 927, loss: 50.787037 
Epoch: 7, step: 928, loss: 0.064136 
Epoch: 7, step: 929, loss: 37.671146 
Epoch: 7, step: 930, loss: 15.461261 
Epoch: 7, step: 931, loss: 12.674550 
Epoch: 7, step: 932, loss: 2.986162 
Epoch: 7, step: 933, loss: 0.270014 
Epoch: 7, step: 934, loss: 11.941789 
Epoch: 7, step: 935, loss: 1.479437 
Epoch: 7, step: 936, loss: 43.774857 
Epoch: 7, step: 937, loss: 36.488197 
Epoch: 7, step: 938, loss: 46.469341 
Epoch: 7, step: 939, loss: 3.803274 
Epoch: 7, step: 940, loss: 1.206325 
Epoch: 7, step: 941, loss: 0.244662 
Epoch: 7, step: 942, loss: 1.486928 
Epoch: 7, step: 943, loss: 78.858177 
Epoch: 7, step: 944, loss: 18.936865 
Epoch: 7, step: 945, loss: 36.809414 
Epoch: 7, step: 946, loss: 23.601091 
Epoch: 7, step: 947, loss: 0.811280 
Epoch: 7, step: 948, lo

Epoch: 9, step: 1204, loss: 19.631275 
Epoch: 9, step: 1205, loss: 7.668934 
Epoch: 9, step: 1206, loss: 0.757760 
Epoch: 9, step: 1207, loss: 16.851528 
Epoch: 9, step: 1208, loss: 19.557930 
Epoch: 9, step: 1209, loss: 51.225204 
Epoch: 9, step: 1210, loss: 14.975698 
Epoch: 9, step: 1211, loss: 10.066834 
Epoch: 9, step: 1212, loss: 3.978287 
Epoch: 9, step: 1213, loss: 0.455304 
Epoch: 9, step: 1214, loss: 1.240374 
Epoch: 9, step: 1215, loss: 21.150238 
Epoch: 9, step: 1216, loss: 56.264973 
Epoch: 9, step: 1217, loss: 4.302561 
Epoch: 9, step: 1218, loss: 50.575157 
Epoch: 9, step: 1219, loss: 0.152793 
Epoch: 9, step: 1220, loss: 35.614662 
Epoch: 9, step: 1221, loss: 13.671635 
Epoch: 9, step: 1222, loss: 3.279485 
Epoch: 9, step: 1223, loss: 9.558020 
Epoch: 9, step: 1224, loss: 29.168476 
Epoch: 9, step: 1225, loss: 23.548368 
Epoch: 9, step: 1226, loss: 0.813265 
Epoch: 9, step: 1227, loss: 7.514618 
Epoch: 9, step: 1228, loss: 0.072373 
Epoch: 9, step: 1229, loss: 36.256355

In [21]:
from sklearn.metrics import accuracy_score

# Sample-weighted classification accuracy of the rounded predictions on each split.
train_weighted_score = accuracy_score(
    y_true=train_Y, y_pred=train_y_pred, sample_weight=train_w
)
valid_weighted_score = accuracy_score(
    y_true=valid_Y, y_pred=valid_y_pred, sample_weight=valid_w
)
test_weighted_score = accuracy_score(
    y_true=test_Y, y_pred=test_y_pred, sample_weight=test_w
)

# Report the three scores in train -> valid -> test order.
print("Train Weighted Classification Accuracy: %f" % train_weighted_score)
print("Valid Weighted Classification Accuracy: %f" % valid_weighted_score)
print("Test Weighted Classification Accuracy: %f" % test_weighted_score)

Train Weighted Classification Accuracy: 0.777218
Valid Weighted Classification Accuracy: 0.644231
Test Weighted Classification Accuracy: 0.684198
