In [1]:
import os
import pandas as pd
import numpy as np
# To remove these warnings : "Your kernel may have been built without NUMA support."
#   run these 2 lines before importing tensorflow
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'        # or any {'0', '1', '2', '3'}

import tensorflow as tf 
random_state = 10
np.random.seed(random_state)
tf.random.set_seed(random_state)


# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline

# Seaborn for easier visualization
import seaborn as sns
### sns.set_style('darkgrid')

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, LabelBinarizer
from sklearn.pipeline import make_pipeline, Pipeline

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Keras
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical 

from scikeras.wrappers import KerasClassifier

In [2]:
# Import the script from different folder
import sys  
sys.path.append('../scripts')

import file_utilities as fu

## ProSE embeddings


Initiaize arguments

In [3]:
# Define arguments for the file_paths function
task = 'acp'
ptmodel = 'prose'
file_base = 'train'
model = 'prose_dlm'
emb_layer = 'layer'
pool = 'avg'  


### ProSE DLM model - prose_dlm

- **Pooling Operation:  `avg`**

### Train Dataset

Run the script `file_paths` to prepare paths. The default root data folder is *../data*.

In [4]:
# Prepare paths
path_pt, _, path_fa = fu.file_paths(ptmodel, task, file_base, model, pool)
print('', path_fa, '\n', path_pt)

 ../data/acp/train_data.fa 
 ../data/acp/prose/train/acp_train_dlm_avg


In [5]:
X_train, y_train, sequence_id_train = fu.read_embeddings(path_fa, path_pt, pool, emb_layer)

Shape of embeddings: 		(1378, 6165)
Length of target label list:	1378
Length of sequential ids list:	1378


### Test Dataset

In [6]:
# Update arguments
file_base = 'test'
# Prepare paths
path_pt, _, path_fa = fu.file_paths(ptmodel, task, file_base, model, pool)
print('', path_fa, '\n', path_pt)

 ../data/acp/test_data.fa 
 ../data/acp/prose/test/acp_test_dlm_avg


In [7]:
X_test, y_test, sequence_id_test = fu.read_embeddings(path_fa, path_pt, pool, emb_layer)

Shape of embeddings: 		(344, 6165)
Length of target label list:	344
Length of sequential ids list:	344


### Label encoding

- Our target variable represents a binary category which has been coded as numbers 0 and 1, hence we will have to encode it. To achieve that we will use  the `to_categorical()` function from the Keras utilities package. 
- The two lines of code below accomplishes that in both training and test datasets.

In [8]:
# Encode labels
# y_train_e = to_categorical(y_train)
# y_test_e = to_categorical(y_test)
# print(f"{y_train_e.shape}\n{y_test_e.shape}")

In [9]:
# lb = LabelBinarizer()

# y_train_lb = lb.fit_transform(y_train)
# y_test_lb = lb.transform(y_test)
# y_train_lb

In [10]:
le = LabelEncoder()
y_train_e = le.fit_transform(y_train)
y_test_e = le.transform(y_test)
y_train_e

array([0, 0, 1, ..., 0, 1, 0])

#### Implement the Scikit-Learn classifier interface.

In [11]:
# Import model build function "create_model"from the script "keras_model.py"
import keras_model

In [None]:
def create_model(optimizer="adam", dropout=0.1, n_features=60, n_units=64):
    model = Sequential()
    model.add(Dense(n_units, activation='relu', input_shape=(n_features,)))
    model.add(Dropout(dropout), )
    model.add(Dense(n_units, activation='relu'))
    model.add(Dropout(dropout), )          
    model.add(Dense(1, activation='sigmoid'))
   # model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics=["accuracy"])
    return model

In [12]:
# Create sklearn-like classifier
clf = KerasClassifier(
    model=create_model, 
    loss='binary_crossentropy', 
    optimizer='adam', n_features=X_train.shape[1], dropout=0.5, verbose=0)

  keras_clf = KerasClassifier(build_fn=keras_model.create_model, n_features=X_train.shape[1], verbose=0)


In [13]:
num_pca_components = 60

In [14]:
# Define model with pipeline
model = Pipeline(
    steps = (
        ('scaler', MinMaxScaler()),
        # ('pca', PCA(num_pca_components)),
        ('nnc', keras_clf)
    )
)

In [15]:
# model.fit(X_train, y_train_lb)

In [16]:
model.get_params()

{'memory': None,
 'steps': (('scaler', MinMaxScaler()),
  ('nnc', <keras.wrappers.scikit_learn.KerasClassifier at 0x7f37557beca0>)),
 'verbose': False,
 'scaler': MinMaxScaler(),
 'nnc': <keras.wrappers.scikit_learn.KerasClassifier at 0x7f37557beca0>,
 'scaler__clip': False,
 'scaler__copy': True,
 'scaler__feature_range': (0, 1),
 'nnc__n_features': 6165,
 'nnc__verbose': 0,
 'nnc__build_fn': <function keras_model.create_model(optimizer='adam', dropout=0.1, n_features=60, n_units=64)>}

In [17]:
# Define parameters for Keras classifier

param_grid = {
    'nnc__epochs': [10],
    # 'nnc__n_units': [64, 128, 100],
    #'kerasclassifier__init': [ 'uniform', 'zeros', 'normal', ], 
    'nnc__batch_size':[32],
    #'kerasclassifier__optimizer':['RMSprop', 'Adam', 'Adamax', 'sgd'],
    'nnc__dropout': [0.1]
}

In [18]:
# Create the GridSearchCV model
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
grid = GridSearchCV(model, param_grid, verbose=0, cv=5, n_jobs=4)

In [19]:
# Train the model with GridSearch
grid.fit(X_train, y_train_e)

In [20]:
print(f"Best Score: {grid.best_score_}  using:\n{grid.best_params_}")

Best Score: 0.5159183204174042  using:
{'nnc__batch_size': 32, 'nnc__dropout': 0.1, 'nnc__epochs': 10}


In [21]:
grid.cv_results_

{'mean_fit_time': array([33.88087163]),
 'std_fit_time': array([13.47376174]),
 'mean_score_time': array([1.40353389]),
 'std_score_time': array([2.47065904]),
 'param_nnc__batch_size': masked_array(data=[32],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_nnc__dropout': masked_array(data=[0.1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_nnc__epochs': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'nnc__batch_size': 32, 'nnc__dropout': 0.1, 'nnc__epochs': 10}],
 'split0_test_score': array([0.48550725]),
 'split1_test_score': array([0.65579712]),
 'split2_test_score': array([0.4710145]),
 'split3_test_score': array([0.47999999]),
 'split4_test_score': array([0.48727274]),
 'mean_test_score': array([0.51591832]),
 'std_test_score': array([0.07016831]),
 'rank_test_score': array([1], dtype=int32)}

In [22]:
# from sklearn.metrics import get_scorer_names
# get_scorer_names()

In [23]:
print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

Training Data Score: 0.6937590837478638
Testing Data Score: 0.6569767594337463


In [25]:
# Make predictions with the hypertuned model
y_pred = grid.predict(X_test)
y_pred.shape



(344, 1)

In [26]:
accuracy_score(y_test, y_pred)

0.6569767441860465

In [27]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[129  43]
 [ 75  97]]


In [27]:
# Normalized confusion matrix
cm = np.around(cm / cm.sum(axis=1)[:, np.newaxis], 2)
print(cm)

[[0.74 0.26]
 [0.28 0.72]]


In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.74      0.73       172
           1       0.73      0.72      0.72       172

    accuracy                           0.73       344
   macro avg       0.73      0.73      0.73       344
weighted avg       0.73      0.73      0.73       344



In [31]:
print(np.array(y_test)[:10])
print(y_pred[:10])

[0 0 0 1 0 0 1 1 1 0]
[1 0 0 1 1 0 1 0 0 1]


In [75]:
pipe.predict(X_test[55:56])

array([1])

### Creating classifier path

In [40]:
# check what to use joblib(.sav), PyTorch (.pt), Keras(.h5) or something else

clf = ['rf', 'lr', 'xgb', 'nn', 'knn']

tail = os.path.split(path_pt)[1].split('_')
tail.pop(1)

pipeline_name = f'{"_".join(tail)}_{clf[3]}_pipeline.sav'
model_name = f'{"_".join(tail)}_{clf[3]}.h5'
model_path = os.path.join('../saved_models', model_name)
pipeline_path = os.path.join('../saved_models', pipeline_name)
print(model_path)
print(pipeline_path)

../saved_models/acp_dlm_avg_nn.h5
../saved_models/acp_dlm_avg_nn_pipeline.sav


In [36]:
# Save the best estimator
# This will convert GridSearch object to pipeline object
# It will also partially destroyed grid object, 
# If needed, it could be reran or reconstructed using saved model

nn_model_s = grid.best_estimator_

In [37]:
type(nn_model_s)

sklearn.pipeline.Pipeline

In [41]:
# Save the Keras model first
nn_model_s.named_steps['nnc'].model.save(model_path)

In [42]:
# This hack allows us to save the pipeline
nn_model_s.named_steps['nnc'].model = None

In [46]:
import joblib
# Save the pipeline
joblib.dump(nn_model_s, pipeline_path)

['../saved_models/acp_dlm_avg_nn_pipeline.sav']

In [None]:
# We do not need this pipeline object anymore
## del dl_model_s

In [47]:
from tensorflow.keras.models import load_model

In [48]:
# Load the pipeline first
nn_model = joblib.load(pipeline_path)

In [49]:
# Then, load the Keras model
nn_model.named_steps['nnc'].model = load_model(model_path)

In [50]:
print(nn_model.score(X_test, y_test))

0.7267441749572754


In [51]:
# Classification metrics
y_predl = nn_model.predict(X_test)
cm = confusion_matrix(y_test, y_predl)
print(cm)

[[127  45]
 [ 49 123]]


In [52]:
sequence_id_test[55:56]

['Protein_seq_ts0056']

In [None]:
GVGDIFRKIVSTIKNVV

In [53]:
y_test[55:56], y_predl[55:56]

([0], array([0]))

In [56]:
type(nn_model)

sklearn.pipeline.Pipeline

In [57]:
nn_model.get_params()

{'memory': None,
 'steps': [('scaler', MinMaxScaler()),
  ('pca', PCA(n_components=60)),
  ('nnc', <keras.wrappers.scikit_learn.KerasClassifier at 0x7efcbfa1f430>)],
 'verbose': False,
 'scaler': MinMaxScaler(),
 'pca': PCA(n_components=60),
 'nnc': <keras.wrappers.scikit_learn.KerasClassifier at 0x7efcbfa1f430>,
 'scaler__clip': False,
 'scaler__copy': True,
 'scaler__feature_range': (0, 1),
 'pca__copy': True,
 'pca__iterated_power': 'auto',
 'pca__n_components': 60,
 'pca__n_oversamples': 10,
 'pca__power_iteration_normalizer': 'auto',
 'pca__random_state': None,
 'pca__svd_solver': 'auto',
 'pca__tol': 0.0,
 'pca__whiten': False,
 'nnc__verbose': 0,
 'nnc__dropout': 0.3,
 'nnc__epochs': 10,
 'nnc__build_fn': <function keras_model.create_model(optimizer='adam', dropout=0.1, n_features=60, n_units=64)>}