In [1]:
%load_ext autoreload
import xcf
import matplotlib.pyplot as plt
import keras_nlp
from keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

2023-05-07 17:17:43.722837: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Tokenizer vocabulary: four special tokens followed by the single-letter
# residue codes (every capital letter except 'J').
AA_VOCAB = ['<PAD>', '<START>', '<END>', '<MASK>', *'ABCDEFGHIKLMNOPQRSTUVWXYZ']
MAX_SEQUENCE_LENGTH = 600  # Not counting <START> and <END> tokens

# Hyperparameters shared by the baseline model construction and training cells.
BASELINE_N_LSTM_UNITS = 128
BASELINE_DROPOUT = 0.5
BASELINE_LEARNING_RATE = 1e-4
BASELINE_N_CLASSES = 1
BASELINE_BATCH_SIZE = 256
BASELINE_EPOCHS = 10

# NOTE(review): not referenced by any cell visible in this notebook section;
# presumably used by masked-token experiments elsewhere — confirm before removing.
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32

In [3]:
def classify_acidity(x):
    """Map a pH value to a binary acidity label encoded as a string.

    Args:
        x: pH value; must support comparison with a float.

    Returns:
        '1' when ``x`` is strictly greater than 7.0, '0' when ``x`` is less
        than or equal to 7.0, and ``None`` when neither comparison holds
        (for example when ``x`` is NaN).
    """
    if x > 7.0:
        return '1'
    if x <= 7.0:
        return '0'
    # Neither comparison was true (e.g. NaN): no label can be assigned.
    return None

# Load the 'clean' variant of the pH prediction table from the project database.
data = xcf.load_data_from_db(
    'data_pHpred_from_data_raw_2023-04-07',
    mode='clean',
)

# Put a space between residues so that every amino-acid letter becomes its own
# whitespace-delimited word.
data['seq_as_words'] = data['sequence'].apply(lambda residues: ' '.join(residues))

# Keep only the model input and the raw pH, then derive the binary label from it.
data = data[['seq_as_words', 'pH']]
data['acidity'] = data['pH'].apply(classify_acidity)

data.head()

Unnamed: 0_level_0,seq_as_words,pH,acidity
rcsb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4BOB,G A M G D E Q S S G E I N H T L Y D E Q S N G ...,7.5,1
4BOD,G A M G D G Q S N G E A K V K K I E F S E F T ...,7.5,1
4BOF,M T A Q T P I H V Y S E I G K L K K V L L H R ...,5.5,0
4BPD,G H H H H H H E L A N N T T G F T R I I K A A ...,5.6,0
4BPM,M H H H H H H S P A L P A F L L C S T L L V I ...,6.7,0


In [5]:
data_train, data_val, data_test = xcf.split_df(data)

# Word-level tokenizer over the amino-acid vocabulary. Tokens that are not in
# AA_VOCAB are mapped to 'X', and sequence_length fixes the output width.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=AA_VOCAB,
    sequence_length=MAX_SEQUENCE_LENGTH,
    oov_token='X',
)

# Model inputs: token ids for the space-separated sequences of each split.
data_train_X, data_val_X, data_test_X = [
    tokenizer(split['seq_as_words'].to_numpy(dtype='str'))
    for split in (data_train, data_val, data_test)
]

# Regression targets (scaled pH) — currently disabled in favour of the binary label.
# label_scaler = MinMaxScaler(feature_range=(0,1))
# data_train_Y = label_scaler.fit_transform(data_train['pH'].to_numpy(dtype='float').reshape(-1,1))
# data_val_Y = label_scaler.transform(data_val['pH'].to_numpy(dtype='float').reshape(-1,1))
# data_test_Y = label_scaler.transform(data_test['pH'].to_numpy(dtype='float').reshape(-1,1))

# Model targets: the acidity label as an int8 column vector, one row per sample.
data_train_Y, data_val_Y, data_test_Y = [
    split['acidity'].to_numpy(dtype='int8').reshape(-1, 1)
    for split in (data_train, data_val, data_test)
]


2023-05-07 17:17:47.771944: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-07 17:17:47.772496: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [6]:
# One-hot encode the token ids of every split; the encoding width is the
# tokenizer's vocabulary size and uint8 keeps the arrays compact.
data_train_X_1h, data_val_X_1h, data_test_X_1h = [
    to_categorical(token_ids, num_classes=tokenizer.vocabulary_size(), dtype='uint8')
    for token_ids in (data_train_X, data_val_X, data_test_X)
]

In [16]:
%autoreload
lit_LSTM = xcf.models.compile_Liu2017_LSTM_3(
    tokenizer=tokenizer,
    seq_length=MAX_SEQUENCE_LENGTH,
    n_hidden=BASELINE_N_LSTM_UNITS,
    dropout=BASELINE_DROPOUT,
    lr=BASELINE_LEARNING_RATE,
    n_classes=BASELINE_N_CLASSES
)


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 600, 29)]         0         
                                                                 
 masking_3 (Masking)         (None, 600, 29)           0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 256)              161792    
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 162,049
Trainable params: 162,049
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Halt training when the validation loss (EarlyStopping's default monitor)
# has not improved for three consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Fit on the one-hot training split and evaluate on the validation split
# after every epoch.
history_lit_LSTM = lit_LSTM.fit(
    x=data_train_X_1h,
    y=data_train_Y,
    batch_size=BASELINE_BATCH_SIZE,
    epochs=BASELINE_EPOCHS,
    validation_data=(data_val_X_1h, data_val_Y),
    callbacks=[early_stopping],
)

Epoch 1/10


2023-05-07 17:09:20.932036: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1213563000 exceeds 10% of free system memory.
2023-05-07 17:09:23.796801: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]
2023-05-07 17:09:26.240729: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]




KeyboardInterrupt: 

In [31]:
# Run the model over the one-hot validation inputs, using the training batch size.
result = lit_LSTM.predict(
    x=data_val_X_1h,
    batch_size=BASELINE_BATCH_SIZE,
)



In [51]:
# Tally how many validation samples fall into each predicted class.
#
# The tally is sized from the width of `result` instead of a hard-coded three
# classes, so it stays correct whatever number of outputs the model was built
# with (a fixed {0, 1, 2} dict raises KeyError for wider models).
n_outputs = result.shape[-1]

if n_outputs == 1:
    # Single-output model: the arg-max of a one-element row is always 0, which
    # would put every sample in class 0. Threshold the score instead; 0.5 is
    # the midpoint between the 0/1 acidity labels.
    # NOTE(review): assumes the model emits a score on the same 0..1 scale as
    # the labels (e.g. a sigmoid output) — confirm against the model definition.
    categories = {0: 0, 1: 0}
    for row in result:
        categories[int(row[0] > 0.5)] += 1
else:
    # Multi-output model: the predicted class is the index of the largest score
    # (first index wins on ties).
    categories = {category: 0 for category in range(n_outputs)}
    for row in result:
        category = list(row).index(max(row))
        categories[category] += 1

categories

{0: 8612, 1: 58, 2: 48}