In [1]:
import time
from sys import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
%matplotlib inline

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/My Drive/PHI/ToF_ML/src

/content/drive/My Drive/PHI/ToF_ML/src


In [4]:
from setup import data_setup
from data_generator import DataGenerator
from model_trainer import ModelTrainer

In [5]:
# Create the project's data object and read a DataFrame from it.
# NOTE(review): the variable name suggests dg.df() returns normalised data — confirm in setup.py.
dg = data_setup()
norm_data = dg.df()

In [6]:
# Load the isotope table from the project's data_transformation module;
# it is passed to get_ranges() in the next cell.
from data_transformation import get_isotope_data, get_hydrocarbs
isotope_data = get_isotope_data()

In [7]:
from data_transformation import get_suspicious_peaks, get_peak_suspiciousness, get_ranges
# NOTE(review): the meaning of these arguments is defined by dg.calibrated_df,
# which is not visible in this notebook — confirm against the data generator.
original_data = dg.calibrated_df(True, .005, .01,use_ranges=True, ranges=[0, 0, 0.5], cat=True)
ranges = get_ranges(isotope_data, 2000)
# Shift every positive target down by one; zero and negative targets are left unchanged.
original_data['target'] = original_data['target'].apply(lambda a: a - 1 if a > 0 else a)
# Derive two per-row columns from 'masses' using the project helpers and the ranges above.
original_data['sus_peaks'] = original_data['masses'].apply(get_suspicious_peaks, args=(ranges, .1))
original_data['peak_sussness'] = original_data['masses'].apply(get_peak_suspiciousness, args=(ranges, True))

In [8]:
# Load the processed CAS table (path is relative to the directory set by %cd above);
# its 'Calibration' column is parsed in a later cell.
df = pd.read_csv('../data/processed_cas.csv')

In [None]:
# Scratch cell: keep one raw 'Calibration' entry for inspection.
# `test` is not referenced again in the visible part of the notebook.
test = df['Calibration'][3]

In [9]:
# Map each calibrant label to the position recorded the first time it appears.
# Dicts keep insertion order, so `labels` / `positions` come out in first-seen order.
first_position = {}
for row in df.itertuples():
    # Every ')'-terminated chunk is one calibrant entry; the trailing remainder is dropped.
    for entry in row.Calibration.split(')')[:-1]:
        fields = entry.split('(')[1].split(',')
        label = fields[1]
        if label in first_position:
            continue
        first_position[label] = float(fields[2].strip())

labels = list(first_position.keys())
positions = list(first_position.values())

In [10]:
# Calibrant table: one row per label, ordered from the highest position to the
# lowest, with a fresh 0..n-1 index.
df2 = (
    pd.DataFrame({'labels': labels, 'positions': positions})
    .sort_values('positions', ascending=False)
    .reset_index(drop=True)
)

In [11]:
def get_x(masses, intensities, x=12, thresh=0.1):
    '''
    Distance from mass ``x`` to the most intense peak found within ``thresh`` of it.

    Arguments:
        masses: iterable of peak masses.
        intensities: iterable of peak intensities, aligned position-by-position
            with ``masses``.
        x: mass to search around.
        thresh: half-width of the search window; only peaks with
            ``abs(mass - x) < thresh`` are considered.

    Returns:
        ``abs(mass - x)`` for the in-window peak with the highest intensity
        (the first such peak when intensities tie), or ``-1`` when no peak
        lies inside the window.
    '''
    # None (rather than -1) marks "nothing found yet" so that a genuine
    # intensity of -1 cannot be mistaken for the sentinel.
    best_intensity = None
    best_distance = -1
    for mass, intensity in zip(masses, intensities):
        distance = abs(mass - x)
        if distance < thresh and (best_intensity is None or intensity > best_intensity):
            best_intensity = intensity
            best_distance = distance
    return best_distance

In [12]:
# Empty frame with one column per calibrant label (in df2 order) plus 'target';
# it is populated in a later cell.
training_data = pd.DataFrame(columns=list(df2['labels'])+['target'])

In [13]:
from data_transformation import generate_data
# Original note: "10,000 Examples only offset error".
# One batch generated with [0, 0, 0], followed by ten batches generated with
# [0, 0, 1] and the extra trailing flag, stacked row-wise in generation order.
batches = [generate_data(norm_data, 2, 2, True, [0, 0, 0])]
batches.extend(generate_data(norm_data, 2, 2, True, [0, 0, 1], True) for _ in range(10))
erred = pd.concat(batches, axis=0)
#erred['target'] = erred['target'].apply(lambda a: a - 1 if a > 0 else a)
# Hand the combined frame to the data generator and read back its calibrated version.
dg.set_df(erred)
erred = dg.calibrated_df()

In [14]:
# One feature row per spectrum in `erred`: for every calibrant position in df2,
# the distance reported by get_x (or -1 when no peak is close enough), followed
# by the row's target.
# The rows are collected first and the frame is built once; assigning
# training_data.loc[i] inside the loop re-allocates the frame on every row.
feature_rows = [
    [get_x(row.masses, row.intensities, x=pos, thresh=0.1) for pos in df2['positions']]
    + [row.target]
    for row in erred.itertuples()
]
training_data = pd.DataFrame(feature_rows, columns=training_data.columns, dtype=float)

In [15]:
from sklearn.model_selection import train_test_split
# Features are every column except 'target'.
features = training_data.drop('target', axis=1)
# Add a trailing channel axis of size 1 to match the (steps, 1) input the
# convolutional models below are built with. The sample and feature counts are
# taken from the data instead of being hard-coded, so the cell keeps working
# when the number of generated rows or calibrant columns changes.
X = features.to_numpy().reshape(*features.shape, 1)
y = training_data['target']
# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from tensorflow.keras.layers import Dense, Flatten, Input, Conv1D, BatchNormalization
from tensorflow.keras.layers import AveragePooling1D, MaxPooling1D, Layer, Concatenate
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import Model, Sequential

In [17]:
def lenet(input_shape=(309, 1), n_classes=2):
    '''
    Build and compile a LeNet-style 1D convolutional classifier.

    Arguments:
        input_shape: (steps, channels) shape of one sample. Defaults to the
            (309, 1) shape this notebook was written for.
        n_classes: number of units in the softmax output layer.

    Returns:
        A compiled Sequential model: two tanh Conv1D + average-pooling stages,
        then dense layers of 500 and 100 tanh units and a softmax output.
        It is compiled with categorical cross-entropy, the Adam optimizer and
        the 'accuracy' and 'AUC' metrics.
    '''
    model = Sequential([
        Conv1D(filters=8, kernel_size=10, activation='tanh', input_shape=input_shape),
        AveragePooling1D(),
        Conv1D(filters=16, kernel_size=20, activation='tanh'),
        AveragePooling1D(),
        Flatten(),
        Dense(units=500, activation='tanh'),
        Dense(units=100, activation='tanh'),
        Dense(units=n_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC'])
    return model

In [18]:
# Wrap lenet() in the scikit-learn style Keras classifier and train it on the training split.
# class_weight gives class 0 eleven times the weight of class 1.
estimator2 = KerasClassifier(build_fn = lenet, epochs = 300, batch_size = 30, verbose = 1, class_weight={0:11, 1:1})
estimator2.fit(X_train, y_train)
# Class probabilities for the held-out split.
preds2 = estimator2.predict_proba(X_test)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78





In [None]:
import tensorflow.keras as keras
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import LabelEncoder

In [None]:
def calculate_metrics(y_true, y_pred, duration):
    '''
    Summarise classification quality in a one-row DataFrame.

    Arguments:
        y_true: true class labels.
        y_pred: predicted class labels.
        duration: value stored unchanged in the 'duration' column.

    Returns:
        DataFrame with index [0] and the columns 'precision', 'accuracy',
        'recall' and 'duration'. Precision and recall are macro-averaged.
    '''
    # The builtin float replaces the np.float alias, which NumPy removed in 1.24.
    res = pd.DataFrame(data=np.zeros((1, 4), dtype=float), index=[0],
                       columns=['precision', 'accuracy', 'recall', 'duration'])
    res['precision'] = precision_score(y_true, y_pred, average='macro')
    res['accuracy'] = accuracy_score(y_true, y_pred)
    res['recall'] = recall_score(y_true, y_pred, average='macro')
    res['duration'] = duration
    return res


def save_test_duration(file_name, test_duration):
    '''
    Write ``test_duration`` to ``file_name`` as a one-row CSV.

    Arguments:
        file_name: path of the CSV file to create or overwrite.
        test_duration: value stored in the single 'test_duration' column.
    '''
    # The builtin float replaces the np.float alias, which NumPy removed in 1.24.
    res = pd.DataFrame(data=np.zeros((1, 1), dtype=float), index=[0],
                       columns=['test_duration'])
    res['test_duration'] = test_duration
    res.to_csv(file_name, index=False)

def save_logs(output_directory, hist, y_pred, y_true, duration,
              lr=True, plot_test_acc=True):
    '''
    Write the training history, the evaluation metrics and a summary of the
    best epoch to CSV files whose names are appended to ``output_directory``.

    The "best" epoch is the one with the lowest training loss.

    Arguments:
        output_directory: path prefix; file names are concatenated directly
            onto it, so it should end with a path separator.
        hist: object exposing a ``history`` mapping of metric name to
            per-epoch values (as returned by Keras ``Model.fit``).
        y_pred: predicted class labels, forwarded to calculate_metrics.
        y_true: true class labels, forwarded to calculate_metrics.
        duration: training duration, forwarded to calculate_metrics.
        lr: when true, also record the learning rate of the best epoch.
        plot_test_acc: when true, also record the validation loss/accuracy of
            the best epoch and plot the loss curves.

    Returns:
        The metrics DataFrame produced by calculate_metrics.

    Raises:
        KeyError: if a required metric is missing from the history.
    '''
    def first_available(row, *keys):
        # Keras has logged these values under different names across versions
        # ('acc' vs 'accuracy', 'lr' vs 'learning_rate'); accept either.
        for key in keys:
            if key in row.index:
                return row[key]
        raise KeyError(keys[0])

    hist_df = pd.DataFrame(hist.history)
    hist_df.to_csv(output_directory + 'history.csv', index=False)

    df_metrics = calculate_metrics(y_true, y_pred, duration)
    df_metrics.to_csv(output_directory + 'df_metrics.csv', index=False)

    index_best_model = hist_df['loss'].idxmin()
    row_best_model = hist_df.loc[index_best_model]

    # The builtin float replaces the np.float alias, which NumPy removed in 1.24.
    df_best_model = pd.DataFrame(data=np.zeros((1, 6), dtype=float), index=[0],
                                 columns=['best_model_train_loss', 'best_model_val_loss', 'best_model_train_acc',
                                          'best_model_val_acc', 'best_model_learning_rate', 'best_model_nb_epoch'])

    df_best_model['best_model_train_loss'] = row_best_model['loss']
    df_best_model['best_model_train_acc'] = first_available(row_best_model, 'acc', 'accuracy')
    if plot_test_acc:
        df_best_model['best_model_val_loss'] = row_best_model['val_loss']
        df_best_model['best_model_val_acc'] = first_available(row_best_model, 'val_acc', 'val_accuracy')
    if lr:
        df_best_model['best_model_learning_rate'] = first_available(row_best_model, 'lr', 'learning_rate')
    df_best_model['best_model_nb_epoch'] = index_best_model

    df_best_model.to_csv(output_directory + 'df_best_model.csv', index=False)

    if plot_test_acc:
        # plot losses
        # NOTE(review): plot_epochs_metric is not defined in the visible part of
        # this notebook — confirm it is available before calling with plot_test_acc=True.
        plot_epochs_metric(hist, output_directory + 'epochs_loss.png')

    return df_metrics

In [None]:
class Classifier_INCEPTION:
    '''
    Inception-style 1D convolutional classifier built with Keras.

    Model files and prediction artefacts are written to paths formed by
    appending file names directly to ``output_directory``, so that prefix
    should end with a path separator.
    '''

    def __init__(self, output_directory, input_shape, nb_classes, verbose=False, build=True, batch_size=64,
                 nb_filters=32, use_residual=True, use_bottleneck=True, depth=6, kernel_size=41, nb_epochs=1500):
        '''
        Store the hyper-parameters and optionally build the network.

        Arguments:
            output_directory: path prefix for every file this object writes or reads.
            input_shape: shape of one sample, passed to the Keras Input layer.
            nb_classes: number of units in the softmax output layer.
            verbose: print the model summary after building; also forwarded to
                Keras ``fit`` as its verbosity.
            build: when true, build and compile the model and save its initial
                weights to 'model_init.hdf5'.
            batch_size: mini-batch size; None lets fit() choose one.
            nb_filters: filters per convolution branch of an inception module.
            use_residual: add a shortcut connection after every third module.
            use_bottleneck: put a 1x1 convolution in front of the branches when
                the input has more than one channel.
            depth: number of stacked inception modules.
            kernel_size: stored as ``kernel_size - 1``; the branch kernel sizes
                are that value divided by 1, 2 and 4.
            nb_epochs: number of training epochs.
        '''
        self.output_directory = output_directory

        self.nb_filters = nb_filters
        self.use_residual = use_residual
        self.use_bottleneck = use_bottleneck
        self.depth = depth
        self.kernel_size = kernel_size - 1
        self.callbacks = None
        self.batch_size = batch_size
        self.bottleneck_size = 32
        self.nb_epochs = nb_epochs
        # Assigned unconditionally: fit() reads self.verbose and self.model even
        # when the instance was created with build=False.
        self.verbose = verbose
        self.model = None

        if build:
            self.model = self.build_model(input_shape, nb_classes)
            if verbose:
                self.model.summary()
            self.model.save_weights(self.output_directory + 'model_init.hdf5')

    def _inception_module(self, input_tensor, stride=1, activation='linear'):
        '''
        Apply one inception module to ``input_tensor``.

        The module runs three parallel convolutions (kernel sizes
        ``self.kernel_size`` divided by 1, 2 and 4) on the optionally
        bottlenecked input, plus a max-pool followed by a 1x1 convolution on
        the raw input. The four outputs are concatenated along the channel
        axis, batch-normalised and passed through a ReLU.
        '''
        if self.use_bottleneck and int(input_tensor.shape[-1]) > 1:
            # A 1x1 convolution reduces the channel count before the wide kernels.
            input_inception = keras.layers.Conv1D(filters=self.bottleneck_size, kernel_size=1,
                                                  padding='same', activation=activation, use_bias=False)(input_tensor)
        else:
            input_inception = input_tensor

        # kernel_size_s = [3, 5, 8, 11, 17]
        kernel_size_s = [self.kernel_size // (2 ** i) for i in range(3)]

        conv_list = [
            keras.layers.Conv1D(filters=self.nb_filters, kernel_size=branch_kernel,
                                strides=stride, padding='same', activation=activation,
                                use_bias=False)(input_inception)
            for branch_kernel in kernel_size_s
        ]

        # The pooling branch works on the un-bottlenecked input.
        max_pool_1 = keras.layers.MaxPool1D(pool_size=3, strides=stride, padding='same')(input_tensor)

        conv_6 = keras.layers.Conv1D(filters=self.nb_filters, kernel_size=1,
                                     padding='same', activation=activation, use_bias=False)(max_pool_1)

        conv_list.append(conv_6)

        x = keras.layers.Concatenate(axis=2)(conv_list)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation(activation='relu')(x)
        return x

    def _shortcut_layer(self, input_tensor, out_tensor):
        '''
        Add a residual connection from ``input_tensor`` to ``out_tensor``.

        The shortcut is projected with a 1x1 convolution to the channel count
        of ``out_tensor`` and batch-normalised before the addition; the sum is
        passed through a ReLU.
        '''
        shortcut_y = keras.layers.Conv1D(filters=int(out_tensor.shape[-1]), kernel_size=1,
                                         padding='same', use_bias=False)(input_tensor)
        shortcut_y = keras.layers.BatchNormalization()(shortcut_y)

        x = keras.layers.Add()([shortcut_y, out_tensor])
        x = keras.layers.Activation('relu')(x)
        return x

    def build_model(self, input_shape, nb_classes):
        '''
        Build and compile the network, and set up the training callbacks.

        ``self.depth`` inception modules are stacked, with a residual shortcut
        after every third one when ``self.use_residual`` is set, followed by
        global average pooling and a softmax layer with ``nb_classes`` units.

        As a side effect ``self.callbacks`` is set to a ReduceLROnPlateau and a
        ModelCheckpoint callback. Both monitor the training loss; the
        checkpoint keeps the best model in 'best_model.hdf5'.

        Returns:
            The compiled Keras model.
        '''
        input_layer = keras.layers.Input(input_shape)

        x = input_layer
        input_res = input_layer

        for d in range(self.depth):

            x = self._inception_module(x)

            if self.use_residual and d % 3 == 2:
                x = self._shortcut_layer(input_res, x)
                input_res = x

        gap_layer = keras.layers.GlobalAveragePooling1D()(x)

        output_layer = keras.layers.Dense(nb_classes, activation='softmax')(gap_layer)

        model = keras.models.Model(inputs=input_layer, outputs=output_layer)

        model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(),
                      metrics=['accuracy'])

        reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50,
                                                      min_lr=0.0001)

        file_path = self.output_directory + 'best_model.hdf5'

        model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss',
                                                           save_best_only=True)

        self.callbacks = [reduce_lr, model_checkpoint]

        return model

    def fit(self, x_train, y_train, x_val, y_val, y_true, plot_test_acc=False, class_weights=None):
        '''
        Train the model, then predict on ``x_val`` with the best checkpoint.

        Arguments:
            x_train, y_train: training samples and one-hot targets.
            x_val, y_val: passed to Keras as validation data only when
                ``plot_test_acc`` is true; ``x_val`` is always the input of the
                prediction made after training.
            y_true: forwarded to predict().
            plot_test_acc: monitor validation loss/accuracy during training.
            class_weights: optional class-weight mapping forwarded to Keras.

        Side effects: writes 'last_model.hdf5' and 'y_pred.npy' under the
        output prefix and clears the Keras session before returning.

        Returns:
            The array returned by predict() for ``x_val``.
        '''
        if not tf.test.gpu_device_name():
            # NOTE(review): exit() ends the process/kernel when no GPU is visible.
            print('error no gpu')
            exit()
        # x_val and y_val are only used to monitor the test loss and NOT for training

        if self.batch_size is None:
            mini_batch_size = int(min(x_train.shape[0] / 10, 16))
        else:
            mini_batch_size = self.batch_size

        start_time = time.time()

        if plot_test_acc:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=self.nb_epochs,
                                  verbose=self.verbose, validation_data=(x_val, y_val),
                                  callbacks=self.callbacks, class_weight=class_weights)
        else:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=self.nb_epochs,
                                  verbose=self.verbose, callbacks=self.callbacks, class_weight=class_weights)

        duration = time.time() - start_time

        self.model.save(self.output_directory + 'last_model.hdf5')

        y_pred = self.predict(x_val, y_true, x_train, y_train, y_val,
                              return_df_metrics=False)

        # save predictions
        np.save(self.output_directory + 'y_pred.npy', y_pred)

        # convert the predicted from binary to integer
        #y_pred = np.argmax(y_pred, axis=1)

        #df_metrics = save_logs(self.output_directory, hist, y_pred, y_true, duration,
        #                       plot_test_acc=plot_test_acc)

        keras.backend.clear_session()

        return y_pred

    def predict(self, x_test, y_true, x_train, y_train, y_test, return_df_metrics=False):
        '''
        Predict with the checkpoint stored in 'best_model.hdf5'.

        The model is loaded from disk on every call; ``self.model`` is not used.
        ``x_train``, ``y_train`` and ``y_test`` are accepted but not read.

        Arguments:
            x_test: samples to predict.
            y_true: true class labels; only read when ``return_df_metrics`` is true.
            return_df_metrics: when true, return the calculate_metrics frame for
                the arg-max predictions instead of the raw model output.

        Returns:
            The raw model output for ``x_test``, or a metrics DataFrame when
            ``return_df_metrics`` is true. In the former case the elapsed time
            is also written to 'test_duration.csv'.
        '''
        start_time = time.time()
        model_path = self.output_directory + 'best_model.hdf5'
        model = keras.models.load_model(model_path)
        y_pred = model.predict(x_test, batch_size=self.batch_size)
        if return_df_metrics:
            y_pred = np.argmax(y_pred, axis=1)
            df_metrics = calculate_metrics(y_true, y_pred, 0.0)
            return df_metrics
        else:
            test_duration = time.time() - start_time
            save_test_duration(self.output_directory + 'test_duration.csv', test_duration)
            return y_pred

In [None]:
# Build a 3-module Inception classifier for (309, 1) inputs and 2 classes;
# the files it writes use '../' as their path prefix.
c = Classifier_INCEPTION('../', (309, 1), 2, build=True, verbose=True, batch_size=20, nb_epochs=100, depth=3)

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 309, 1)]     0                                            
__________________________________________________________________________________________________
max_pooling1d_15 (MaxPooling1D) (None, 309, 1)       0           input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_82 (Conv1D)              (None, 309, 32)      1280        input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_83 (Conv1D)              (None, 309, 32)      640         input_6[0][0]                    
____________________________________________________________________________________________

In [None]:
import tensorflow as tf

In [None]:
# One-hot encode the targets for the softmax / categorical cross-entropy model.
# NOTE(review): get_dummies only creates columns for the classes present in each
# split — confirm both splits contain every class.
y_train_dummies = np.array(pd.get_dummies(y_train))
y_val_dummies =  np.array(pd.get_dummies(y_test))

In [None]:
import time
# Train the Inception model. The held-out split is passed as x_val / y_val and
# again as y_true; class 0 is weighted ten times as heavily as class 1.
c.fit(X_train, y_train_dummies, X_test, y_val_dummies, y_val_dummies, class_weights={0:10, 1:1})

NameError: ignored

In [None]:
# NOTE(review): the predictions are made on X_train, so any score computed from
# `preds` is a training-set score, not a held-out one.
preds = c.predict(X_train, y_train, X_train, y_train, y_test)

In [None]:
# Index of the highest probability in every row of `preds` (the lowest index
# wins on ties), scored against the training targets.
predictions = list(np.argmax(preds, axis=1))
accuracy_score(y_train, predictions)

0.9233567486201706

In [None]:
# Peek at ten held-out targets (positions 60-69 of y_test).
y_test[60:70]

4375    1.0
9927    1.0
1347    1.0
6123    1.0
9727    1.0
2485    1.0
107     0.0
6162    1.0
2517    1.0
3209    1.0
Name: target, dtype: float64

In [None]:
# Model output for row 66 of `preds`.
# NOTE(review): if `preds` still holds the X_train predictions computed above,
# this row does not correspond to row 66 of y_test.
preds[66]

array([0.03952207, 0.96047795], dtype=float32)