# Section 05: Classification IV

<a rel="license" href="https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt"><img alt="Attribution-NonCommercial-ShareAlike 4.0 International" src="https://mirrors.creativecommons.org/presskit/buttons/88x31/svg/by-nc-sa.eu.svg" title="This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License"/></a>

End-to-end

In [2]:
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import *
from tensorflow import keras

### define some metrics which are shown during the training process

In [None]:
# Metrics that Keras reports for every epoch while a model is being trained.
METRICS = [
    # Raw confusion-matrix counts.
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    # Summary statistics derived from the predictions.
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    # Area under the curve: first with Keras' default curve, then with the
    # precision-recall curve selected explicitly.
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),
]

# Logistic Neural Regression
#### The following function 'emulates' logistic regression as an artificial neural network in the simplest form possible
#### For this simple network (just one input), the Keras/TensorFlow Sequential API can be used, which expects all layers to be given in sequential order
#### however, for 'emulating' logistic regression, there is only one layer with a sigmoid activation function

In [None]:
def log_neural_model(metrics=METRICS):
    """
    Description:
        Build and compile a logistic regression expressed as a one-layer neural network.

    Arguments:
        metrics list: Keras metrics that are tracked during training (defaults to METRICS).

    Returns:
        The compiled keras.Sequential model.
    """
    # A single unit with sigmoid activation is all that is needed to mimic
    # logistic regression. In principle further layers, e.g.
    # Dense(128, activation="relu"), could be placed in front of it, but for
    # the current features (and without additional techniques such as
    # normalization) this is unlikely to improve the results significantly.
    sigmoid_unit = keras.layers.Dense(1, activation="sigmoid")
    network = keras.Sequential([sigmoid_unit])

    network.compile(
        loss=keras.losses.BinaryCrossentropy(),
        optimizer=keras.optimizers.Adam(),
        metrics=metrics,
    )
    return network

### Training and inference can be done analogously to 'usual' logistic regression

In [None]:
def train_logistic_ANN(feature_file, model_file, epochs=20, batch_size=32):
    """
    Description:
        Function for training of the logistic neural regression using keras/tensorflow.
        Rows whose label is -1 are discarded, a random 10% of the remaining rows is held
        out as validation data, and the fitted model is pickled to `model_file`.

    Arguments:
        feature_file str: Filename of the (tab-separated) feature file for training.
        model_file str: Filename of the output model.
        epochs int: number of epochs, i.e. how often the full training data set is processed during training
        batch_size int: batch size number, i.e. how many instances are processed simultaneously

    Returns:
        None. The trained model is written to `model_file`.
    """
    from sklearn.model_selection import train_test_split

    feature_df = pd.read_csv(feature_file, sep="\t")

    model = log_neural_model()

    # Remove feature vectors with label -1
    feature_df = feature_df[feature_df['label'] != -1]
    labels = feature_df['label']
    # Everything that is neither positional/sample metadata nor the label is a feature
    data = feature_df.drop(
        columns=['chrom', 'start', 'end', 'sample', 'label'])

    # split data in train and validation/test data
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.1)

    # train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory if there is one.
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # The context manager guarantees that the file is flushed and closed.
    with open(model_file, "wb") as model_handle:
        pickle.dump(model, model_handle)

In [1]:
def infer_logistic_ANN(feature_file, model_file):
    """
    Description:
        Function for inference using logistic neural regression implemented in keras/tensorflow

    Arguments:
        feature_file str: Filename of the (tab-separated) feature file for inference.
        model_file str: Filename of the trained ANN.

    Returns:
        The value returned by the model's `predict` method for the feature columns.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code contained in the
    # file - only load model files that come from a trusted source.
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    feature_df = pd.read_csv(feature_file, sep="\t")
    data = feature_df.drop(columns=['chrom', 'start', 'end', 'sample'])

    # Training removes the 'label' column before fitting. If the inference file
    # carries true labels as well, drop them so that the model sees the same
    # feature columns it was trained on.
    data = data.drop(columns=['label'], errors='ignore')

    predictions = model.predict(data)

    return predictions

## Logistic Neural Regression with additional feature-processing layers
#### in this ANN, some of the features run through additional layers before all of them are merged for the final logistic regression step
#### In this case, as multiple inputs are given, we have to use the slightly more complicated functional API

In [None]:
def log_neural_model_extended(feature_df_tons, feature_df_pairwise, feature_df_rest, metrics=METRICS, cnn = False):
    """
    Description:
        Build and compile a three-input model in which the '-ton' and pairwise-distance
        features pass through an additional layer before being merged with the remaining
        features for the final logistic regression (sigmoid) step.

    Arguments:
        feature_df_tons Pd.Dataframe: '-ton' features; only its number of columns is used.
        feature_df_pairwise Pd.Dataframe: pairwise-distance features; only its number of columns is used.
        feature_df_rest Pd.Dataframe: remaining features; only its number of columns is used.
        metrics list: Keras metrics that are tracked during training (defaults to METRICS).
        cnn bool: if true, Conv1D layers are used for the additional processing, otherwise Dense layers.

    Returns:
        The compiled Keras model with inputs [tons, pairwise, rest] and one sigmoid output.
    """
    # Use the module-level `keras` alias for every layer so that all parts of
    # the graph come from the same Keras implementation.
    layers = keras.layers

    # determining the shape of the inputs; the two processed inputs get a
    # trailing axis of length 1, the remaining features stay a flat vector
    tons_input = layers.Input((feature_df_tons.shape[1], 1))
    pairwise_input = layers.Input((feature_df_pairwise.shape[1], 1))
    rest_input = layers.Input((feature_df_rest.shape[1],))

    if cnn:
        tons_processed = layers.Conv1D(filters=32, kernel_size=5)(tons_input)
        flat_layer1 = layers.Flatten()(tons_processed)

        pairwise_processed = layers.Conv1D(filters=32, kernel_size=5)(pairwise_input)
        flat_layer2 = layers.Flatten()(pairwise_processed)
    else:
        tons_processed = layers.Dense(4, activation="relu")(tons_input)
        flat_layer1 = layers.Flatten()(tons_processed)

        pairwise_processed = layers.Dense(4, activation="relu")(pairwise_input)
        flat_layer2 = layers.Flatten()(pairwise_processed)

    # Concatenate the processed (flattened) features and the vector input
    concat_layer = layers.Concatenate()([flat_layer1, flat_layer2, rest_input])

    # final neural logistic regression / sigmoid layer
    output = layers.Dense(1, activation="sigmoid")(concat_layer)

    model = keras.Model(inputs=[tons_input, pairwise_input, rest_input], outputs=output)

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)

    return model

In [None]:
def partition_feature_df(feature_df):
    """
    Description:
        Split a feature dataframe column-wise into three dataframes, decided purely by the column names.

    Arguments:
        feature_df Pd.Dataframe: Pandas dataframe containing the features of the training/test data

    Returns:
        tuple: (feature_df_tons, feature_df_pairwise, feature_df_rest), i.e. the columns whose
        name contains '-ton', the columns whose name starts with 'pairwised_dist', and a
        dataframe without any of those columns.
    """

    def _is_pairwise(name):
        return name.startswith("pairwised_dist")

    def _is_ton(name):
        return '-ton' in name

    all_columns = feature_df.columns

    pairwise_part = feature_df[[name for name in all_columns if _is_pairwise(name)]]
    tons_part = feature_df[[name for name in all_columns if _is_ton(name)]]

    # whatever belongs to neither of the two groups forms the remainder
    grouped = [name for name in all_columns if _is_ton(name) or _is_pairwise(name)]
    rest_part = feature_df.drop(columns=grouped)

    return tons_part, pairwise_part, rest_part

In [None]:
def train_logistic_ANN_extended(feature_file, model_file, epochs=20, batch_size=32):
    """
    Description:
        Function for training of the extended logistic neural regression model implemented in keras/tensorflow.
        Rows whose label is -1 are discarded, a random 10% of the remaining rows is held
        out as validation data, and the fitted model is pickled to `model_file`.

    Arguments:
        feature_file str: Filename of the (tab-separated) feature file for training.
        model_file str: Filename of the output model.
        epochs int: number of epochs, i.e. how often the full training data set is processed during training
        batch_size int: batch size number, i.e. how many instances are processed simultaneously

    Returns:
        None. The trained model is written to `model_file`.
    """
    from sklearn.model_selection import train_test_split

    feature_df = pd.read_csv(feature_file, sep="\t")

    # Remove feature vectors with label -1
    feature_df = feature_df[feature_df['label'] != -1]
    labels = feature_df['label']

    data = feature_df.drop(
        columns=['chrom', 'start', 'end', 'sample', 'label'])

    # split data in train and validation/test data
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.1)

    # for this architecture, we have to separate the data frame in different inputs
    X_train_tons, X_train_pairwise, X_train_rest = partition_feature_df(X_train)
    X_test_tons, X_test_pairwise, X_test_rest = partition_feature_df(X_test)

    # The model needs the column counts of the three inputs, so it can only be
    # built once the data has been partitioned.
    model = log_neural_model_extended(X_train_tons, X_train_pairwise, X_train_rest)

    # train the model
    model.fit(
        [X_train_tons, X_train_pairwise, X_train_rest],
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=([X_test_tons, X_test_pairwise, X_test_rest], y_test),
    )

    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory if there is one.
    model_dir = os.path.dirname(model_file)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # The context manager guarantees that the file is flushed and closed.
    with open(model_file, "wb") as model_handle:
        pickle.dump(model, model_handle)

In [None]:
def infer_logistic_ANN_extended(feature_file, model_file):
    """
    Description:
        Function for inference using the extended logistic neural regression model implemented in keras/tensorflow

    Arguments:
        feature_file str: Filename of the (tab-separated) feature file for inference.
        model_file str: Filename of the ANN.

    Returns:
        The value returned by the model's `predict` method for the three partitioned inputs.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code contained in the
    # file - only load model files that come from a trusted source.
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    feature_df = pd.read_csv(feature_file, sep="\t")

    data = feature_df.drop(columns=['chrom', 'start', 'end', 'sample'])

    # Training removes the 'label' column before fitting. If the inference file
    # carries true labels as well, drop them; otherwise they would end up in
    # the "rest" input and no longer match the trained input width.
    data = data.drop(columns=['label'], errors='ignore')

    feature_df_tons, feature_df_pairwise, feature_df_rest = partition_feature_df(data)

    predictions = model.predict([feature_df_tons, feature_df_pairwise, feature_df_rest])

    return predictions

### The results can - if the true labels of the test data set are available - be evaluated by means of precision-recall curves

In [2]:
def show_precision_recall_curve(predictions, y_test, title="logistic neural network"):
    from sklearn.metrics import precision_recall_curve

    precision, recall, thresholds = precision_recall_curve(y_test.astype(int), predictions)
    plt.plot(recall, precision, marker='.', label=title)

    plt.xlabel('Recall')
    plt.ylabel('Precision')