# Test CNN_Virus code and refactor

The original code is available [here](https://github.com/MaHaoran627/CNN_Virus).

In [1]:
import numpy as np
import sys
from pathlib import Path

p = Path('../src').resolve().absolute()
if p.is_dir():
    sys.path.insert(1, str(p))
    print(f"Added following path: {p}")
else:
    print(f"There is no directory {p}")

%load_ext autoreload
%autoreload 2

Added following path: /home/vtec/projects/metagenomics/src


In [2]:
# Expected locations of the CNN_Virus repository and of its data files,
# resolved relative to the notebook's working directory.
# The assertion messages name the missing directory so a failure is self-explanatory.
cnn_repo = Path('../repos/cnn_virus/').resolve()
assert cnn_repo.is_dir(), f"CNN_Virus repository directory not found: {cnn_repo}"

data = Path('../data/cnn_virus').resolve()
assert data.is_dir(), f"CNN_Virus data directory not found: {data}"


# Preprocessing data

In [3]:
from preprocessing import get_learning_weights, get_params_50mer, get_kmer_from_50mer
from preprocessing import DataGenerator_from_50mer

from architecture import build_model

In [4]:
# Parse the benchmarking file and check that the three returned collections have equal length.
# The path is built from the already-resolved `data` directory instead of repeating
# the '../data/cnn_virus' literal.
p2file = data / 'ICTV_50mer_benchmarking.txt'
f_matrix, f_labels, f_pos = get_kmer_from_50mer(p2file)
len(f_matrix), len(f_labels), len(f_pos)

(100000, 100000, 100000)

In [5]:
# Look at the first entry of each of the three collections returned by get_kmer_from_50mer.
i = 0
f_matrix[i], f_labels[i], f_pos[i]

('CTACATGACCCTGACACTCAGCTACGAGATGTCAAATTTTGGGGGCAATGAAAGCAACACCCTTTTTAAGGTAGACAACCACACATATGTGCAACTAGATCGTCCACACAGTCCGCAGTTCCTTGTTCAGCTCAATGAAACACTTCGAAG',
 '120',
 '3')

In [6]:
# Input files, all located in the `data` directory.
filepath_train = data / "50mer_training"                    # training file
filepath_val = data / "50mer_validating"                    # validating file
filepath_weights = data / "weights_of_classes"              # learning weights file
filepath_benchmark = data / "ICTV_50mer_benchmarking.txt"   # benchmarking file

# Paths for saving the loss and the model; both live under ../saved.
# The separator after ".." is required: "..saved" would name a directory
# inside the current working directory rather than a sibling of it.
filepath_loss = Path("../saved/Multi_task_model.loss").resolve()
filepath_model = Path("../saved/checkpoint/best_model.h5").resolve()

In [7]:
# Nucleotide letter -> integer index lookup (A, C, G, T and N).
d_nucl = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}

# NOTE(review): the generator used for training is fed from `filepath_val` and the one
# used for validation from `filepath_benchmark`; `filepath_train` is not read in this cell.
# Looks like a deliberate shortcut for quick experiments — confirm before a real training run.
# NOTE(review): the benchmarking record displayed earlier in the notebook is 150 characters
# long — verify that `val_generator` yields batches compatible with the model input.
# This also rebinds `f_matrix`, `f_labels` and `f_pos`, which an earlier cell filled
# from the benchmarking file.
f_matrix, f_labels, f_pos = get_kmer_from_50mer(filepath_val)
f_matrix_val, f_labels_val, f_pos_val = get_kmer_from_50mer(filepath_benchmark)

params = get_params_50mer()
d_weights = get_learning_weights(filepath_weights)

training_generator = DataGenerator_from_50mer(f_matrix, f_labels, f_pos, **params)
val_generator = DataGenerator_from_50mer(f_matrix_val, f_labels_val, f_pos_val, **params)

In [8]:
# Length of the generator — presumably the number of batches per epoch (TODO confirm).
len(training_generator)

977

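The generator is accessed through the [`tf.keras.utils.Sequence`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) interface: `len()` gives the number of batches and indexing returns a single batch.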

In [9]:
# Fetch the first batch through the indexing protocol.
tg = training_generator[0]

In [10]:
# Check the size and type of the batch, and the types of its first two elements.
len(tg), type(tg), type(tg[0]), type(tg[1])

(2, tuple, numpy.ndarray, dict)

In [11]:
# Unpack the batch into its two elements, then list the keys of the second one.
inp, out_d = tg
out_d.keys()

dict_keys(['output1', 'output2'])

In [12]:
# Shapes of the input batch and of the 'output1' / 'output2' entries of the output dict.
inp.shape, out_d['output1'].shape, out_d['output2'].shape

((1024, 50, 5), (1024, 187), (1024, 10))

In [13]:
# First two items of the batch, first four positions along the second axis, full last axis.
inp[:2, :4, :]

array([[[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.]],

       [[0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.]]])

In [14]:
# Keep a handle on both output arrays, then show the first ten columns
# of the first two rows of 'output1'.
out1 = out_d['output1']
out2 = out_d['output2']

out1[:2, :10]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

# Build model and review it

In [16]:
# Instantiate the model and print its layer-by-layer summary.
model = build_model()
model.summary()

Creating Model
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 50, 5)]      0           []                               
                                                                                                  
 conv1d_4 (Conv1D)              (None, 50, 512)      13312       ['input_2[0][0]']                
                                                                                                  
 batch_normalization_5 (BatchNo  (None, 50, 512)     2048        ['conv1d_4[0][0]']               
 rmalization)                                                                                     
                                                                                                  
 max_pooling1d_3 (MaxPooling1D)  (None, 25, 512)     0           ['batch_norm

In [17]:
# Compile with Adam; both outputs use categorical cross-entropy and accuracy is tracked.
model.compile(
    optimizer='adam',
    loss={
        'output1': 'categorical_crossentropy',
        'output2': 'categorical_crossentropy',
    },
    metrics=['accuracy'],
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# NOTE(review): `es` is created but not handed to `fit`, and `fit` receives no validation
# data, so the `val_loss` it monitors would not be available anyway. `val_generator` looks
# like the intended validation source — confirm it yields batches compatible with the model
# before adding `validation_data=val_generator, callbacks=[es]`.
# `d_weights` is likewise unused: no `class_weight` is passed.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Short two-epoch run on the training generator only.
result = model.fit(training_generator, epochs=2, verbose=1)
