Author: Wolfgang Black <br>
date_modified: 2022-07-24
    
# Notebook purpose:
    
This notebook is meant to be run in Google Colab inside a /test/ folder placed at the same level as /src/. It is meant to run a few tests on the training data, the data artifacts, and the config generated via updated_config.py

This notebook is used for debugging /test/test.py

In [1]:
import os
import sys
import json
import pytest
from collections import Counter

from google.colab import drive

# Mount Google Drive so the project checkout is reachable from the Colab VM.
drive.mount('/content/drive/')

# Use an absolute path so this cell can be re-run: a relative 'drive/MyDrive/...'
# only resolves while the working directory is still /content, which is no longer
# true once the chdir below has executed.
base_dir = '/content/drive/MyDrive/Colab Notebooks/ProtCNN/src/'
os.chdir(base_dir)

# The project helpers are imported after the chdir so that `utils` is looked up
# from src/ (assumes the working directory is on sys.path - TODO confirm).
# NOTE(review): later cells call reader, build_labels, build_vocab and SequenceData,
# which presumably come from these star imports; prefer explicit imports once it is
# confirmed which module defines each name.
from utils.datautils import *
from utils.modelutils import *

# NOTE: the working directory stays at src/ for the rest of the notebook; the next
# cell opens './config/config.json' relative to it.

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Read the JSON run configuration (path is relative to the current working
# directory), parse it into `config`, and echo it for inspection.
with open('./config/config.json') as json_data_file:
    config = json.loads(json_data_file.read())
print(config)

{'data_config': {'DATA_DIR': './../../../PFAM_database/data/random_split/', 'MAX_LEN': 120, 'TEST_BATCH_SIZE': 100, 'TEST_RANGE': [0, 10000]}, 'model_config': {'NUM_MODELS': 1, 'MODEL_DIR': '../models/', 'NUM_RES_BLOCKS': 3, 'FILTERS': 128, 'D_RATE': [2, 3, 3], 'MAX_LEN': 120, 'DROPOUT': 0.5, 'L2_FACTOR': 0.0001, 'OPT': 'adam', 'LOSS': 'sparse_categorical_crossentropy', 'METRICS': ['accuracy'], 'EPOCHS': 5, 'BATCH_SIZE': 256}}


In [3]:
# Stage 1: read the training split and derive the lookup artifacts from it.
print("building artifacts for training",'\n')

# reader / build_labels / build_vocab are not defined in this notebook; presumably
# they come from the star-imported utils modules - check there for return types.
train_data, train_targets = reader('train',config['data_config']['DATA_DIR'])
fam2label = build_labels(train_targets)
word2id = build_vocab(train_data)

# Stage 2: wrap the same split in SequenceData and fetch its dictionaries.
print('building dataset dictionaries','\n')
# NOTE(review): `train_data` is rebound here from reader()'s first return value to a
# SequenceData instance, so the value returned by reader() is no longer reachable
# under this name after this line.
train_data = SequenceData(word2id, fam2label, config['data_config']['MAX_LEN'], config['data_config']['DATA_DIR'],"train")
# Later cells read train_dict['target'] and train_dict['sequence'].
train_dict = train_data.get_data_dictionaries()

building artifacts for training 

There are 17930 labels. 

AA dictionary formed. the length of dictionary is: 21. 

building dataset dictionaries 



In [4]:
def verify_labels_in_encoded_values(y: list, fam2label:dict):
    """Check that every label present in ``y`` is one of the encoder's values.

    Args:
        y: collection of encoded labels; its distinct elements are compared.
        fam2label: label encoder mapping; only its values are consulted.

    Raises:
        AssertionError: if ``y`` contains a label that is not among the values
            of ``fam2label``.
    """
    # Iterating a Counter yields its distinct keys, i.e. the distinct labels in y.
    observed_labels = set(Counter(y))
    all_known = observed_labels.issubset(fam2label.values())
    assert all_known, 'verify_labels_in_encoded_values FAILED - there are labels in y that dne in the encoder'
    print('verify_labels_in_encoded_values PASSED')

In [5]:
# Run the label-membership check on train_dict['target'] against fam2label.
verify_labels_in_encoded_values(train_dict['target'], fam2label)

verify_labels_in_encoded_values PASSED


In [6]:
def verify_unique_labels(y: list, fam2labels: dict):
    """Check that ``y`` holds no more distinct labels than the encoder has entries.

    Args:
        y: collection of encoded labels.
        fam2labels: label encoder mapping; only its size is used.

    Raises:
        AssertionError: if the number of distinct labels in ``y`` exceeds
            ``len(fam2labels)``.
    """
    # len() of a Counter is the number of distinct elements it has seen.
    distinct_in_data = len(Counter(y))
    encoder_size = len(fam2labels)
    assert distinct_in_data <= encoder_size, 'failed, the number unique labels in training is greater than the expected number'
    print('verify_unique_labels PASSED: There are not more labels encoded in the data than are possible')

In [7]:
# Run the distinct-label count check on train_dict['target'] against fam2label.
verify_unique_labels(train_dict['target'], fam2label)

verify_unique_labels PASSED: There are not more labels encoded in the data than are possible


In [8]:
def verify_feature_shapes(x: dict, max_len,word2id):
    """Check the per-sample shape of the features stored under ``x['sequence']``.

    Args:
        x: mapping whose 'sequence' entry exposes a ``.shape`` attribute.
        max_len: expected size of the second dimension.
        word2id: vocabulary mapping; its length is the expected third dimension.

    Raises:
        AssertionError: if the shape, ignoring the first dimension, is not
            ``(max_len, len(word2id))``.
    """
    # Drop the leading dimension and compare only the remaining ones.
    actual_trailing = x['sequence'].shape[1:]
    expected_trailing = (max_len, len(word2id))
    assert actual_trailing == expected_trailing, 'verified_feature_shapes failed - the shape of the features did not equal = (max_len,len(word2id))'
    print('Passed! Feature shape and configs for model Input layer are the same')

In [9]:
# Run the feature-shape check on train_dict using the configured MAX_LEN and the vocabulary size.
verify_feature_shapes(train_dict, config['data_config']['MAX_LEN'],word2id)

Passed! Feature shape and configs for model Input layer are the same


In [12]:
def check_config_dtypes(config):
    """Report which config entries do not have the expected Python type.

    Each entry named in the table below is read as ``config[section][key]`` and
    compared against its expected type. A type mismatch does not raise: the
    outcome is printed, either a pass message or a header followed by the list
    of offending ``section:key`` names.

    Args:
        config: nested dict with 'data_config' and 'model_config' sections.

    Returns:
        None. The result is reported on stdout only.

    Raises:
        KeyError: if a section or key named in the table is missing from ``config``.
    """
    # (key, expected type) pairs per section, in the order they are reported.
    expected_types = {
        'data_config': (
            ('DATA_DIR', str),
            ('MAX_LEN', int),
            ('TEST_BATCH_SIZE', int),
            ('TEST_RANGE', list),
        ),
        'model_config': (
            ('BATCH_SIZE', int),
            ('DROPOUT', float),
            ('D_RATE', list),
            ('EPOCHS', int),
            ('FILTERS', int),
            ('L2_FACTOR', float),
            ('LOSS', str),
            ('MAX_LEN', int),
            ('METRICS', list),
            ('MODEL_DIR', str),
            ('NUM_MODELS', int),
            ('NUM_RES_BLOCKS', int),
            ('OPT', str),
        ),
    }

    broken_configs = []
    for section, fields in expected_types.items():
        for key, expected in fields:
            value = config[section][key]
            # bool is a subclass of int, so without the second test a JSON
            # true/false would be accepted for an integer setting. No entry in
            # the table expects a bool, so a bool is always a mismatch.
            type_ok = isinstance(value, expected) and not isinstance(value, bool)
            if not type_ok:
                # Label each failure with the entry that was actually checked.
                broken_configs.append('\n {}:{}'.format(section, key))

    if not broken_configs:
        print('Config passes dtype test')
        return

    # Existing report format: a header line, then the raw list, whose entries
    # carry a leading newline and which ends with a lone '\n' element.
    broken_configs.append('\n')
    print('config is broken, see following list')
    print(broken_configs)
  

In [13]:
# Run the dtype check on the loaded config; the result is printed, not returned.
check_config_dtypes(config)

Config passes dtype test


In [None]:
# Cross-field consistency checks on the loaded config.
# D_RATE, NUM_RES_BLOCKS and TEST_RANGE are not defined as bare names in this
# notebook, so the values are read from `config`.
assert len(config['model_config']['D_RATE']) == config['model_config']['NUM_RES_BLOCKS'], 'len(D_RATE) must equal NUM_RES_BLOCKS'
# `len(x) == 0 or 2` parses as `(len(x) == 0) or 2`, which is always truthy;
# a membership test expresses the intended "zero or two entries".
assert len(config['data_config']['TEST_RANGE']) in (0, 2), 'TEST_RANGE must be empty or hold exactly two entries'
print('Config consistency checks PASSED')
