In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext tensorboard
import datetime, os

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the project files
# stored there can be reached by the cells below. force_remount=True is passed
# so the mount is redone even when a previous mount already exists.
from google.colab import drive
drive.mount('/content/drive',  force_remount=True)

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/PHI/ToF_ML/src

/content/drive/MyDrive/PHI/ToF_ML/src


In [None]:
from data_generator import DataGenerator

In [None]:
# Load the calibrated spectra table. The path is relative to the working
# directory chosen by the %cd cell; later cells read its 'precise_channels' and
# 'precise_intensities' columns.
norm_data = pd.read_csv('../data/all_calibrated_data.csv')

In [None]:
from ast import literal_eval

# The two 'precise_*' columns are read from CSV as text. literal_eval turns each
# cell back into the Python literal it spells out (presumably a list - a later
# cell calls len() on the parsed intensities). The raw text columns are dropped
# once the parsed versions exist.
norm_data['channels'] = norm_data['precise_channels'].map(literal_eval)
norm_data['intensities'] = norm_data['precise_intensities'].map(literal_eval)
norm_data.drop(columns=['precise_channels', 'precise_intensities'], inplace=True)

In [None]:
# Remove rows that contain any missing value, then keep only the spectra that
# hold at least one intensity reading. The .copy() gives norm_data its own data
# instead of a slice of the unfiltered frame.
norm_data.dropna(inplace=True)
has_readings = norm_data['intensities'].map(len) > 0
norm_data = norm_data.loc[has_readings].copy()

In [None]:
from data_transformation import generate_data, mass_formula, generate_calibrated_data

# Build the training table from one batch generated with an all-zero error spec
# followed by 12 batches for each of three settings (second argument 2, 3, 4).
# NOTE(review): the meaning of generate_data's positional arguments is defined
# in data_transformation and is not visible from this notebook - confirm before
# changing any of them.
batches = [generate_data(norm_data, 2, 2, True, [0, 0, 0], slope_index=2)]
for num in range(3):
    for _ in range(12):
        batches.append(
            generate_data(norm_data, num + 2, 1, True, [0.334, 0.667, 1], True, True, slope_index=2)
        )

# Concatenate once at the end: calling pd.concat inside the loop re-copies every
# previously accumulated row on each iteration. The call order above is the
# same as before, so the resulting row order is unchanged.
erred = pd.concat(batches, axis=0)
del batches

# Shift every positive target down by one; zero and negative targets are kept.
erred['target'] = erred['target'].apply(lambda a: a - 1 if a > 0 else a)
erred.reset_index(inplace=True, drop=True)
erred = generate_calibrated_data(erred, slope_index=2)

In [None]:
from data_transformation import get_suspicious_peaks, get_peak_suspiciousness
from data_transformation import get_isotope_data, get_ranges
# Derive ranges from the 'Isotope Masses' column of the isotope table, then
# apply get_peak_suspiciousness to each spectrum's masses and store the result
# in a new 'peak_sussness' column.
# NOTE(review): 4000 is passed both to get_ranges and to
# get_peak_suspiciousness; it looks like a shared limit, but its meaning (and
# that of the True/False flags) lives in data_transformation - confirm, and
# consider giving it a named constant.
isotope_data = get_isotope_data()
ranges = get_ranges(isotope_data['Isotope Masses'], 4000)
erred['peak_sussness'] = erred['masses'].apply(get_peak_suspiciousness, args=(ranges, True, False, 4000))

In [None]:
erred.describe()

Unnamed: 0,Mass/Time,MassOffset,StartFlightTime,SpecBinSize,target,err_prop_slope,err_prop_offset
count,53169.0,53169.0,53169.0,53169.0,53169.0,53169.0,53169.0
mean,0.342188,-0.461535,1.982834,0.813724,0.646711,-1.4e-05,-1.2e-05
std,0.08158,0.411939,4.475478,1.988243,0.477996,0.002696,0.002706
min,0.166933,-1.672351,-0.929,0.128,0.0,-0.009999,-0.01
25%,0.376696,-0.712313,0.0,0.128,0.0,-6.3e-05,-6.1e-05
50%,0.379314,-0.51313,0.0,0.138,1.0,0.0,0.0
75%,0.382785,-4.6e-05,0.0,1.248,1.0,6e-05,6.2e-05
max,0.467555,0.001721,21.571,20.0,1.0,0.009998,0.01


In [None]:
def get_padded_list(masses, length=4356, fill_value=0):
    """Return a new list holding ``masses`` right-padded with ``fill_value``.

    Parameters
    ----------
    masses : iterable
        Values to copy. The input object itself is never modified.
    length : int, default 4356
        Target length of the result. 4356 is the value that used to be
        hard-coded in this function.
    fill_value : object, default 0
        Value appended until the list reaches ``length``.

    Returns
    -------
    list
        A copy of ``masses`` followed by as many ``fill_value`` entries as are
        needed to reach ``length``. Inputs that already have ``length`` or more
        items are returned unpadded; nothing is ever truncated.
    """
    padded = list(masses)
    # A negative repeat count yields an empty list, so over-long inputs pass
    # through unchanged.
    padded.extend([fill_value] * (length - len(padded)))
    return padded

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the intensities of every spectrum independently: the scaler is
# re-fitted on each row, so no statistics are shared between spectra.
x = []
scl = MinMaxScaler()
for row in erred.itertuples():
    column = np.asarray(row.intensities, dtype=float).reshape(-1, 1)
    # MinMaxScaler needs 2-D (n_samples, 1) input. The result is flattened back
    # to 1-D so that padding it with scalar zeros afterwards gives a regular
    # numeric sequence instead of a mix of length-1 arrays and plain ints.
    x.append(scl.fit_transform(column).ravel())

In [None]:
# Pad every scaled spectrum to the common length and store it as a (1, n) row.
# np.ravel flattens a column-shaped (n, 1) input first, so the padded list is a
# flat sequence of numbers. The reshape lives in a lambda because
# `args=((1, -1))` is simply the tuple (1, -1): pandas would then call
# np.reshape(a, 1, -1), i.e. shape=1 and order=-1, not shape=(1, -1).
erred['padded_intensities'] = pd.Series(x).apply(
    lambda scaled: np.array(get_padded_list(np.ravel(scaled))).reshape(1, -1)
)

In [None]:
# Pad every mass list to the common length and store it as a (1, n) row.
# The reshape lives in a lambda because `args=((1, -1))` is simply the tuple
# (1, -1): pandas would then call np.reshape(a, 1, -1), i.e. shape=1 and
# order=-1, not shape=(1, -1).
erred['padded_masses'] = erred['masses'].apply(
    lambda masses: np.array(get_padded_list(masses)).reshape(1, -1)
)

In [None]:
# Assemble one dense tensor of shape (n_samples, n_points, 2): channel 0 holds
# the scaled intensities, channel 1 the masses. np.dstack applied directly to
# the two Series only produced an (n_samples, 2) object array whose cells were
# themselves arrays, which a Conv1D network cannot consume.
# np.vstack raises if the per-row arrays do not all share the same length.
# float32 halves the memory footprint of this large array.
n_points = erred['padded_masses'].iloc[0].shape[-1]
X = np.empty((len(erred), n_points, 2), dtype=np.float32)
for channel, column in enumerate(('padded_intensities', 'padded_masses')):
    X[:, :, channel] = np.vstack(erred[column].tolist())
y = np.array(erred['target'])

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): 906 looks like the number of rows produced by the first
# generate_data call - confirm; the value is hard-coded rather than derived.
N_FIRST_BLOCK = 906
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Hold out 10 rows from the first block and 270 from the remainder as the
# validation set. Sampling is done WITHOUT replacement: with replacement the
# same row could be drawn twice, so the validation set would contain duplicates
# and np.delete would remove fewer rows than were selected.
indices = np.concatenate((
    split_rng.choice(N_FIRST_BLOCK, size=10, replace=False),
    split_rng.choice(np.arange(N_FIRST_BLOCK, X.shape[0]), size=270, replace=False),
))
X_val = X[indices]
y_val = y[indices]
# X and y are rebound here, so this cell is not idempotent: re-running it
# without re-running the cell that builds X removes a further 280 rows.
X = np.delete(X, indices, axis=0)
y = np.delete(y, indices, axis=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
X_train.shape

(42311, 2)

In [None]:
np.array(erred['padded_masses'][0]).reshape((1, -1)).shape

(1, 4356)

In [None]:
import tensorflow as tf

# Ask TensorFlow for the GPU device name once and branch on whether the
# returned name is empty.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device:{}'.format(gpu_name))

Please install GPU version of TF


In [None]:
### InceptionTime / GoogLeNet

In [None]:
import tensorflow.keras as keras

# Each run logs into its own timestamped sub-directory of "logs", which is the
# directory the %tensorboard cell at the end of the notebook points at.
logdir = os.path.join("logs", f"{datetime.datetime.now():%Y%m%d-%H%M%S}")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

In [None]:

class Classifier_INCEPTION:
    """Inception-style 1-D convolutional classifier built with Keras.

    The network is a stack of ``depth`` inception modules (three parallel
    Conv1D branches with different kernel sizes plus a max-pool branch),
    optionally joined by a residual shortcut after every third module, followed
    by global average pooling and a softmax output layer.

    Every file is written to ``output_directory + <file name>`` (plain string
    concatenation), so ``output_directory`` must end with a path separator.

    The class relies on names provided by the surrounding notebook: ``keras``,
    ``tf``, ``np``, ``time``, ``tensorboard_callback``, ``save_logs``,
    ``calculate_metrics`` and ``save_test_duration``. They are looked up when
    the methods run, not when the class is defined.
    """

    def __init__(self, output_directory, input_shape, nb_classes, verbose=False, build=True, batch_size=64,
                 nb_filters=32, use_residual=True, use_bottleneck=True, depth=6, kernel_size=41, nb_epochs=1500):
        """Store the hyper-parameters and, if ``build`` is true, build the model.

        Parameters
        ----------
        output_directory : str
            Prefix for every file written by this object.
        input_shape : tuple
            Shape of one sample, passed unchanged to ``keras.layers.Input``.
        nb_classes : int
            Number of units of the softmax output layer.
        verbose : bool or int
            Forwarded to ``model.fit`` as ``verbose``.
        build : bool
            When true, the model is built, compiled and its initial weights are
            saved to ``model_init.hdf5``. When false, ``self.model`` is ``None``.
        batch_size : int or None
            Mini-batch size; ``None`` lets ``fit`` derive one from the data.
        nb_filters : int
            Filters per convolutional branch.
        use_residual : bool
            Add a shortcut connection after every third inception module.
        use_bottleneck : bool
            Insert a 1x1 bottleneck convolution in front of the branches.
        depth : int
            Number of inception modules.
        kernel_size : int
            Base kernel size; the stored value is ``kernel_size - 1``.
        nb_epochs : int
            Number of training epochs.
        """
        self.output_directory = output_directory

        self.nb_filters = nb_filters
        self.use_residual = use_residual
        self.use_bottleneck = use_bottleneck
        self.depth = depth
        # Stored one smaller than requested: with the default 41 the three
        # branch kernels become 40, 20 and 10.
        self.kernel_size = kernel_size - 1
        self.callbacks = None
        self.batch_size = batch_size
        self.bottleneck_size = 32
        self.nb_epochs = nb_epochs
        self.verbose = verbose
        # Always defined, so an instance created with build=False can be
        # detected explicitly instead of failing with AttributeError.
        self.model = None

        if build:
            self.model = self.build_model(input_shape, nb_classes)
            self.model.save_weights(self.output_directory + 'model_init.hdf5')

    def _inception_module(self, input_tensor, stride=1, activation='linear'):
        """Apply one inception module to ``input_tensor`` and return the result.

        The module concatenates three Conv1D branches (kernel sizes
        ``kernel_size``, ``kernel_size // 2`` and ``kernel_size // 4``) with a
        max-pool + 1x1 Conv1D branch, then applies batch normalisation and ReLU.
        """
        # The bottleneck is skipped when the input has a single channel.
        if self.use_bottleneck and int(input_tensor.shape[-1]) > 1:
            input_inception = keras.layers.Conv1D(filters=self.bottleneck_size, kernel_size=1,
                                                  padding='same', activation=activation, use_bias=False)(input_tensor)
        else:
            input_inception = input_tensor

        # Clamp to 1 so a small kernel_size cannot produce a zero-width kernel.
        branch_kernel_sizes = [max(1, self.kernel_size // (2 ** i)) for i in range(3)]

        branches = [
            keras.layers.Conv1D(filters=self.nb_filters, kernel_size=branch_kernel,
                                strides=stride, padding='same', activation=activation,
                                use_bias=False)(input_inception)
            for branch_kernel in branch_kernel_sizes
        ]

        # The pooling branch reads the un-bottlenecked input.
        pooled = keras.layers.MaxPool1D(pool_size=3, strides=stride, padding='same')(input_tensor)
        branches.append(keras.layers.Conv1D(filters=self.nb_filters, kernel_size=1,
                                            padding='same', activation=activation, use_bias=False)(pooled))

        x = keras.layers.Concatenate(axis=2)(branches)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation(activation='relu')(x)
        return x

    def _shortcut_layer(self, input_tensor, out_tensor):
        """Add a residual connection from ``input_tensor`` to ``out_tensor``.

        ``input_tensor`` is projected with a 1x1 convolution to the channel
        count of ``out_tensor`` and batch-normalised before the two are summed
        and passed through ReLU.
        """
        shortcut_y = keras.layers.Conv1D(filters=int(out_tensor.shape[-1]), kernel_size=1,
                                         padding='same', use_bias=False)(input_tensor)
        shortcut_y = keras.layers.BatchNormalization()(shortcut_y)

        x = keras.layers.Add()([shortcut_y, out_tensor])
        x = keras.layers.Activation('relu')(x)
        return x

    def build_model(self, input_shape, nb_classes):
        """Build and compile the network; also sets ``self.callbacks``.

        Returns the compiled ``keras`` model. The callbacks are a learning-rate
        reducer and a best-model checkpoint (both monitoring the training
        loss) plus the notebook-level ``tensorboard_callback``.
        """
        input_layer = keras.layers.Input(input_shape)

        x = input_layer
        input_res = input_layer

        for d in range(self.depth):

            x = self._inception_module(x)

            # A shortcut closes every group of three modules.
            if self.use_residual and d % 3 == 2:
                x = self._shortcut_layer(input_res, x)
                input_res = x

        gap_layer = keras.layers.GlobalAveragePooling1D()(x)

        output_layer = keras.layers.Dense(nb_classes, activation='softmax')(gap_layer)

        model = keras.models.Model(inputs=input_layer, outputs=output_layer)

        model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(),
                      metrics=['accuracy', 'AUC'])

        reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50,
                                                      min_lr=0.0001)

        file_path = self.output_directory + 'best_model.hdf5'

        model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss',
                                                           save_best_only=True, )

        self.callbacks = [reduce_lr, model_checkpoint, tensorboard_callback]

        return model

    def fit(self, x_train, y_train, x_val, y_val, y_true, plot_test_acc=False):
        """Train the model, save it, and write predictions and logs to disk.

        Parameters
        ----------
        x_train, y_train
            Training inputs and one-hot targets.
        x_val, y_val
            Only used to monitor the validation loss (when ``plot_test_acc`` is
            true) and for the final prediction; they are NOT used for training.
        y_true
            Integer class labels of ``x_val``, compared with the arg-max of the
            predictions when the metrics are computed.
        plot_test_acc : bool
            When true, validation data is passed to ``model.fit`` and the loss
            curves are plotted by ``save_logs``.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        RuntimeError
            If the model has not been built, or if TensorFlow reports no GPU.
        """
        if self.model is None:
            raise RuntimeError('fit() called before a model was built or assigned to self.model')
        if not tf.test.gpu_device_name():
            # Raise instead of calling exit(): exit() is not a dependable way to
            # stop a cell inside a notebook kernel, and an exception stops the
            # training run right here.
            raise RuntimeError('error no gpu: TensorFlow reports no GPU device, aborting fit()')

        if self.batch_size is None:
            # At least 1, so data sets with fewer than 10 samples still train.
            mini_batch_size = max(1, int(min(x_train.shape[0] / 10, 16)))
        else:
            mini_batch_size = self.batch_size

        start_time = time.time()

        if plot_test_acc:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=self.nb_epochs,
                                  verbose=self.verbose, validation_data=(x_val, y_val), callbacks=self.callbacks)
        else:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=self.nb_epochs,
                                  verbose=self.verbose, callbacks=self.callbacks, )

        duration = time.time() - start_time

        self.model.save(self.output_directory + 'last_model.hdf5')

        # predict() reloads best_model.hdf5 from disk, i.e. the checkpoint with
        # the lowest training loss, not necessarily the final weights.
        y_pred = self.predict(x_val, y_true, x_train, y_train, y_val,
                              return_df_metrics=False)

        # save predictions
        np.save(self.output_directory + 'y_pred.npy', y_pred)

        # convert the predicted class probabilities to integer labels
        y_pred = np.argmax(y_pred, axis=1)

        # Called for its side effects: it writes the history and metric CSVs.
        save_logs(self.output_directory, hist, y_pred, y_true, duration,
                  plot_test_acc=plot_test_acc)

        keras.backend.clear_session()

        return 0

    def predict(self, x_test, y_true, x_train, y_train, y_test, return_df_metrics=True):
        """Predict with the checkpointed best model.

        The model is loaded from ``output_directory + 'best_model.hdf5'``;
        ``self.model`` is not used. ``x_train``, ``y_train`` and ``y_test`` are
        accepted but not read by this method.

        Returns
        -------
        pandas.DataFrame or numpy.ndarray
            With ``return_df_metrics=True``: the frame produced by
            ``calculate_metrics`` for the arg-max predictions versus
            ``y_true`` (duration reported as 0.0). Otherwise the raw model
            output; in that case the prediction time is also written to
            ``test_duration.csv``.
        """
        start_time = time.time()
        model_path = self.output_directory + 'best_model.hdf5'
        model = keras.models.load_model(model_path)
        y_pred = model.predict(x_test, batch_size=self.batch_size)
        if return_df_metrics:
            y_pred = np.argmax(y_pred, axis=1)
            df_metrics = calculate_metrics(y_true, y_pred, 0.0)
            return df_metrics
        else:
            test_duration = time.time() - start_time
            save_test_duration(self.output_directory + 'test_duration.csv', test_duration)
            return y_pred

In [None]:
def calculate_metrics(y_true, y_pred, duration):
    """Return a one-row DataFrame of classification metrics.

    Parameters
    ----------
    y_true, y_pred
        True and predicted class labels, passed to the scikit-learn scorers.
    duration
        Value stored unchanged in the 'duration' column.

    Returns
    -------
    pandas.DataFrame
        Index [0], columns 'precision', 'accuracy', 'recall' and 'duration'.
        Precision and recall are macro-averaged.
    """
    # Built directly from the values; the earlier np.zeros(..., dtype=np.float)
    # template fails on NumPy >= 1.24, where the np.float alias no longer exists.
    return pd.DataFrame(
        {
            'precision': [precision_score(y_true, y_pred, average='macro')],
            'accuracy': [accuracy_score(y_true, y_pred)],
            'recall': [recall_score(y_true, y_pred, average='macro')],
            'duration': [duration],
        },
        index=[0],
        columns=['precision', 'accuracy', 'recall', 'duration'],
    )


def save_test_duration(file_name, test_duration):
    """Write ``test_duration`` to ``file_name`` as a one-row CSV.

    The file has a single column named 'test_duration' and no index column.
    """
    # Built directly from the value; the earlier np.zeros(..., dtype=np.float)
    # template fails on NumPy >= 1.24, where the np.float alias no longer exists.
    res = pd.DataFrame({'test_duration': [test_duration]}, index=[0])
    res.to_csv(file_name, index=False)

def save_logs(output_directory, hist, y_pred, y_true, duration,
              lr=True, plot_test_acc=True):
    """Persist the training history, the metrics and a best-epoch summary.

    Files written (each as ``output_directory + name``): 'history.csv',
    'df_metrics.csv', 'df_best_model.csv' and, when ``plot_test_acc`` is true,
    'epochs_loss.png'.

    Parameters
    ----------
    output_directory : str
        Prefix for the output files; it is concatenated with the file names.
    hist
        Object whose ``history`` attribute maps metric names to per-epoch lists.
    y_pred, y_true
        Predicted and true labels forwarded to ``calculate_metrics``.
    duration
        Training duration forwarded to ``calculate_metrics``.
    lr : bool
        When true, record the learning rate of the best epoch if the history
        contains one.
    plot_test_acc : bool
        When true, the validation loss/accuracy of the best epoch are recorded
        and the loss curves are plotted; otherwise those columns stay 0.0.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``calculate_metrics``.
    """
    hist_df = pd.DataFrame(hist.history)
    hist_df.to_csv(output_directory + 'history.csv', index=False)

    df_metrics = calculate_metrics(y_true, y_pred, duration)
    df_metrics.to_csv(output_directory + 'df_metrics.csv', index=False)

    # "Best" epoch = the one with the lowest training loss.
    index_best_model = hist_df['loss'].idxmin()
    row_best_model = hist_df.loc[index_best_model]

    # dtype=float: the np.float alias no longer exists in NumPy >= 1.24.
    df_best_model = pd.DataFrame(data=np.zeros((1, 6), dtype=float), index=[0],
                                 columns=['best_model_train_loss', 'best_model_val_loss', 'best_model_train_acc',
                                          'best_model_val_acc', 'best_model_learning_rate', 'best_model_nb_epoch'])
    df_best_model['best_model_train_loss'] = row_best_model['loss']
    df_best_model['best_model_train_acc'] = row_best_model['accuracy']
    if plot_test_acc:
        df_best_model['best_model_val_loss'] = row_best_model['val_loss']
        df_best_model['best_model_val_acc'] = row_best_model['val_accuracy']
    if lr:
        # The learning rate is only present when a callback logged it, and the
        # key differs between Keras releases ('lr' vs 'learning_rate'). When
        # neither exists the column keeps its 0.0 placeholder instead of
        # raising KeyError.
        for lr_key in ('lr', 'learning_rate'):
            if lr_key in row_best_model.index:
                df_best_model['best_model_learning_rate'] = row_best_model[lr_key]
                break
    df_best_model['best_model_nb_epoch'] = index_best_model

    df_best_model.to_csv(output_directory + 'df_best_model.csv', index=False)

    if plot_test_acc:
        # plot losses
        plot_epochs_metric(hist, output_directory + 'epochs_loss.png')

    return df_metrics


def plot_epochs_metric(hist, file_name, metric='loss'):
    """Save a train-versus-validation curve of ``metric`` to ``file_name``.

    Reads ``hist.history[metric]`` and ``hist.history['val_' + metric]``, so the
    history must contain both series. The figure is closed after saving.
    """
    fig, ax = plt.subplots()
    for series_name in (metric, 'val_' + metric):
        ax.plot(hist.history[series_name])
    ax.set_title('model ' + metric)
    ax.set_ylabel(metric, fontsize='large')
    ax.set_xlabel('epoch', fontsize='large')
    ax.legend(['train', 'val'], loc='upper left')
    fig.savefig(file_name, bbox_inches='tight')
    plt.close(fig)

In [None]:
X_train.shape

(42312, 2)

In [None]:
# Build the classifier; with build=True the constructor also writes
# '../data/model_init.hdf5'.
# NOTE(review): input_shape=(2, 2000) does not obviously match the data prepared
# earlier (spectra are padded to 4356 points, with intensities and masses as the
# two features). Keras Conv1D normally expects (steps, channels) - confirm the
# intended shape against X_train before training.
c = Classifier_INCEPTION('../data/', (2, 2000), 2, build=True, verbose=True, batch_size=40, nb_epochs=600, depth=5, use_bottleneck=False)

ValueError: ignored

In [None]:
NB_CLASSES = 2  # must equal the nb_classes given to Classifier_INCEPTION


def one_hot(labels, nb_classes=NB_CLASSES):
    """Return ``labels`` one-hot encoded as a (n_samples, nb_classes) float array.

    Unlike ``pd.get_dummies``, the width is fixed by ``nb_classes`` rather than
    by the labels that happen to be present, so a split that is missing a class
    still has the right number of columns.

    Raises
    ------
    ValueError
        If a label lies outside ``0 .. nb_classes - 1``.
    """
    flat = np.asarray(labels).reshape(-1).astype(int)
    # Checked explicitly because a negative label would otherwise silently
    # index the identity matrix from the end.
    if flat.size and (flat.min() < 0 or flat.max() >= nb_classes):
        raise ValueError('labels must lie in the range 0..{}'.format(nb_classes - 1))
    return np.eye(nb_classes, dtype=np.float32)[flat]


y_train_dummies = one_hot(y_train)
y_val_dummies = one_hot(y_val)

In [None]:
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace the freshly built network with a previously saved model.
# NOTE(review): this loads '../best_model.hdf5', whereas `c` was created with
# output_directory '../data/' and therefore checkpoints to
# '../data/best_model.hdf5' - confirm which file is intended.
c.model = keras.models.load_model('../best_model.hdf5')

In [None]:
c.fit(X_train, y_train_dummies, X_val, y_val_dummies, y_val, plot_test_acc=True)

In [None]:
# Score the held-out test split. With the default return_df_metrics=True,
# predict() returns the one-row metrics frame from calculate_metrics. It reloads
# best_model.hdf5 from the classifier's output directory instead of using
# c.model, and does not read the X_train / y_train / second y_test arguments.
c.predict(X_test, y_test,X_train, y_train, y_test )

In [None]:
# Display the per-epoch training history.
# NOTE(review): save_logs writes history.csv under the classifier's
# output_directory ('../data/' for `c`), but this reads '../history.csv' -
# confirm the path.
pd.read_csv('../history.csv')

In [None]:
%tensorboard --logdir logs