In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
%matplotlib inline

In [2]:
# Enable the TensorBoard notebook extension for this session.
%load_ext tensorboard
import datetime, os
# Timestamped log directory (time at which this cell runs); it is read later
# by the TensorBoard callback created in Classifier_INCEPTION.build_model.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [3]:
# Work from the project's src directory so the relative '../data' and
# '../models' paths used below resolve.
# NOTE(review): absolute Colab/Drive path - presumably only valid on the
# author's mounted drive; confirm before running elsewhere.
%cd /content/drive/MyDrive/PHI/ToF_ML/src

/content/drive/MyDrive/PHI/ToF_ML/src


In [4]:
from ast import literal_eval

# Load the spectra table; the list-valued columns are stored in the CSV as
# their string representation and are parsed back into Python lists here.
data = pd.read_csv('../data/fixed_1400.csv')
for list_column in ('masses', 'channels', 'intensities'):
    data[list_column] = data[list_column].apply(literal_eval)

In [5]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,file_name,Mass/Time,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportions_identified,original_proportions_identified,diff,prop_diff_in_low,calibration,adjusted_proportion_identified,proportion_identified
0,0106301.cas,0.387384,-0.275004,0.0,1.248,"[2644.0367300000003, 3505.0183700000002, 4162....","[73874, 1234, 138, 610, 1216, 4159, 8958, 1084...","[1.0065519723918102, 2.015029094672708, 3.0191...",0.001298,0.002255,0.418033,0.398438,0.000958,0.738174,0,0.540984,0.515625
1,0107316.cas,0.387113,-0.278302,0.0,1.248,"[2647.00072, 3508.9949100000003, 4164.59326000...","[49864, 1034, 168, 4696, 8247, 13992, 17903, 2...","[1.00101811517532, 2.0077555328930656, 3.00565...",0.001537,0.002586,0.131783,0.129771,0.001049,0.682225,0,0.51938,0.51145
2,0110203.cas,0.379037,-0.271056,0.0,4.992,"[1973.87665, 2049.0706800000003, 2122.01224, 2...","[23352, 74717, 10387, 947, 12344, 9121, 249, 4...","[11.998071176139083, 13.003971096434277, 14.01...",0.00164,0.001858,0.388889,0.388889,0.000218,0.1331,0,0.444444,0.444444
3,0110212.cas,0.379177,-0.269744,0.0,4.992,"[672.00298, 891.90543, 1970.94521, 2046.11295,...","[34398, 304, 2223, 3521, 5205, 6509, 99, 115, ...","[1.0045194511091773, 2.012140966655108, 11.978...",0.001337,0.002206,0.37963,0.37963,0.000868,0.649178,0,0.592593,0.592593
4,0116511.cas,0.38336,-0.302184,0.0,1.248,"[2726.98153, 3594.53649, 4265.28736, 7866.5038...","[42995, 602, 151, 17912, 9536, 17609, 29604, 5...","[1.0049940659400325, 2.0094784706009245, 3.022...",0.001397,0.002171,0.418367,0.336066,0.000774,0.554114,0,0.581633,0.467213


In [6]:
from data_transformation import generate_data, mass_formula, generate_calibrated_data

# Collect every generated batch in a list and concatenate once at the end.
# Concatenating inside the loop re-copies the ever-growing frame on each of
# the 30 iterations; a single concat yields the same rows in the same order.
# NOTE(review): the meaning of the positional arguments to generate_data is
# defined in data_transformation, which is not visible here.
erred_batches = [generate_data(data, 2, 2, [0, 0, 0], slope_index=2)]
for num in range(3):
    for _ in range(10):
        erred_batches.append(
            generate_data(data, num + 2, 1, [0.334, 0.667, 1], True, True, slope_index=2)
        )
#erred['target'] = erred['target'].apply(lambda a: a - 1 if a > 0 else a)
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
erred = pd.concat(erred_batches, axis=0, ignore_index=True)
erred = generate_calibrated_data(erred, slope_index=2)

In [7]:
def get_spectra_summary(masses, scaled_intens, slope, offset, length=2000):
    """Summarise a spectrum on a unit-mass grid.

    Each peak is assigned to the bin given by its mass rounded to the nearest
    integer. When several peaks share a bin, the one with the highest
    intensity is kept.

    Parameters
    ----------
    masses : sequence of float
        Peak masses.
    scaled_intens : sequence
        Peak intensities, aligned with ``masses``. Each entry may be a scalar
        or a one-element array (e.g. a row of a ``(n, 1)`` array).
    slope, offset : float
        Calibration values stored in the last column of the result.
    length : int, default 2000
        Number of mass bins.

    Returns
    -------
    numpy.ndarray of shape ``(2, length + 1)``
        Row 0 holds the kept intensity per bin and row 1 holds the kept peak's
        residual ``mass - round(mass)``. The final column holds
        ``(slope, offset)``.
    """
    # scaled intens might not be a good tie breaker for peaks at same mass
    summary = np.zeros([2, length + 1])
    # Track occupancy explicitly: a residual of exactly 0 is a legitimate
    # value and must not be mistaken for "bin still empty".
    filled = np.zeros(length, dtype=bool)

    for i, mass in enumerate(masses):
        index = int(round(mass))
        # Keep peaks strictly inside the spectrum bins. Index ``length`` is
        # the calibration column, and negative indices would wrap around to it.
        if not 0 <= index < length:
            continue
        # Accept both plain scalars and one-element arrays.
        intensity = float(np.ravel(scaled_intens[i])[0])
        if not filled[index] or intensity > summary[0][index]:
            summary[1][index] = mass - index
            summary[0][index] = intensity
            filled[index] = True

    summary[0][-1] += slope
    summary[1][-1] += offset
    return summary

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale each spectrum's intensities independently: the scaler is
# re-fitted on every row, so each row is scaled by its own min and max.
scl = MinMaxScaler()
scaled_intens = []
for row in erred.itertuples():
    intensity_column = np.array(row.intensities).reshape((-1, 1))
    intensities = scl.fit_transform(intensity_column)
    scaled_intens.append(intensities)

In [None]:
# Regression targets: the two error-proportion columns as a fresh 2-column array.
errors = erred[['err_prop_slope', 'err_prop_offset']].to_numpy(copy=True)

In [None]:
# Targets: one-hot encoded 'target' followed by the two error columns.
one_hot_targets = pd.get_dummies(erred['target']).to_numpy()
y = np.concatenate([one_hot_targets, errors], axis=1)

# Inputs: one (2, 2001) summary per row, matching get_spectra_summary's
# default length of 2000 bins plus the calibration column.
X = np.zeros((len(erred), 2, 2001))
for i, row in enumerate(erred.itertuples()):
    # NOTE(review): row[2] is positional (second column of `erred`) and is
    # passed as the slope - presumably the 'Mass/Time' column; confirm against
    # the column order produced by generate_calibrated_data.
    X[i] = get_spectra_summary(row.masses, scaled_intens[i], row[2],
                               row.MassOffset)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation set: 10 rows from the first 1441 rows and 270 rows
# from the remainder.
# NOTE(review): 1441 is presumably the size of the first generate_data batch
# in `erred` - confirm.
# Sampling is done WITHOUT replacement. np.random.randint can return the same
# index more than once, which duplicates validation rows and leaves the
# validation set smaller than intended.
indices = np.concatenate((
    np.random.choice(1441, size=10, replace=False),
    np.random.choice(np.arange(1441, X.shape[0]), size=270, replace=False),
))
X_val = X[indices]
y_val = y[indices]
# Remove the held-out rows before the train/test split so the sets are disjoint.
X = np.delete(X, indices, axis=0)
y = np.delete(y, indices, axis=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Convert the three input arrays to TensorFlow tensors.
X_train, X_test, X_val = (
    tf.convert_to_tensor(split) for split in (X_train, X_test, X_val)
)

In [None]:
def split_y(y):
    """Split a combined target array into the two model targets.

    The split is made on a copy of ``y``: columns 0-2 become ``'error_cat'``
    and columns 3-5 (as many as exist) become ``'error_amt'``. Any columns
    from index 6 onward are discarded.
    """
    category_part, amount_part, _ = np.hsplit(y.copy(), np.array([3, 6]))
    return {'error_cat': category_part, 'error_amt': amount_part}

In [None]:
# Turn each combined target array into the {'error_cat', 'error_amt'} dict.
y_train, y_test, y_val = map(split_y, (y_train, y_test, y_val))

In [None]:
# Report which GPU TensorFlow sees, if any.
gpu_name = tf.test.gpu_device_name()
print('Default GPU Device:{}'.format(gpu_name) if gpu_name
      else "Please install GPU version of TF")

Default GPU Device:/device:GPU:0


In [None]:
# Inverse-frequency weight for each of the three target classes.
class_weights = {
    label: 1 / (np.sum(erred['target'] == label) / len(erred))
    for label in range(3)
}

In [None]:
import tensorflow.keras as keras

In [None]:
def weighted_categorical_crossentropy(y_true, y_pred, weights):
    """Categorical cross-entropy scaled by a (true class, predicted class) weight.

    Parameters
    ----------
    y_true : tensor
        One-hot targets, indexed as ``[:, class]``.
    y_pred : tensor
        Predicted class scores, indexed as ``[:, class]``.
    weights : 2-D array
        ``weights[c_t, c_p]`` is the factor applied to a sample whose true
        class is ``c_t`` and whose highest-scoring predicted class is ``c_p``.

    Returns
    -------
    tensor
        Per-sample cross-entropy multiplied by the selected weight.
    """
    # Imported locally: neither name is imported anywhere else in the notebook,
    # so the original body raised NameError on first use.
    from itertools import product
    from tensorflow.keras import backend as K

    nb_cl = len(weights)
    final_mask = K.zeros_like(y_pred[:, 0])
    # One-hot mask of the arg-max prediction for each sample.
    y_pred_max = K.max(y_pred, axis=1)
    y_pred_max = K.reshape(y_pred_max, (K.shape(y_pred)[0], 1))
    y_pred_max_mat = K.cast(K.equal(y_pred, y_pred_max), K.floatx())
    for c_p, c_t in product(range(nb_cl), range(nb_cl)):
        final_mask += (weights[c_t, c_p] * y_pred_max_mat[:, c_p] * y_true[:, c_t])
    # tf.keras backend signature is categorical_crossentropy(target, output);
    # the original passed the arguments in the opposite order.
    return K.categorical_crossentropy(y_true, y_pred) * final_mask

In [None]:
def calculate_metrics(y_true, y_pred, duration):
    """Build a one-row metrics frame for a set of class predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels, passed straight to the scoring
        functions.
    duration : float
        Value stored in the ``duration`` column.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``precision``, ``accuracy``, ``recall`` (precision
        and recall are macro-averaged) and ``duration``.
    """
    # Built directly from the values. The previous zero-filled template used
    # ``np.float``, which no longer exists in NumPy >= 1.24.
    return pd.DataFrame(
        {
            'precision': [precision_score(y_true, y_pred, average='macro')],
            'accuracy': [accuracy_score(y_true, y_pred)],
            'recall': [recall_score(y_true, y_pred, average='macro')],
            'duration': [duration],
        },
        index=[0],
    )


def save_test_duration(file_name, test_duration):
    """Write ``test_duration`` to ``file_name`` as a one-row, one-column CSV.

    The CSV has a single ``test_duration`` column and no index column.
    """
    # Built directly from the value. The previous zero-filled template used
    # ``np.float``, which no longer exists in NumPy >= 1.24.
    res = pd.DataFrame({'test_duration': [test_duration]}, index=[0])
    res.to_csv(file_name, index=False)


def _find_history_column(columns, metric, validation=False):
    """Return the history column holding ``metric``, or None if there is none.

    An exact match (``metric`` / ``val_metric``) is preferred. Otherwise the
    first column ending in ``_metric`` on the requested side (training or
    validation) is used, which covers per-output names such as
    ``error_cat_accuracy``.
    """
    exact = ('val_' if validation else '') + metric
    if exact in columns:
        return exact
    for column in columns:
        if column.startswith('val_') == validation and column.endswith('_' + metric):
            return column
    return None


def save_logs(output_directory, hist, y_pred, y_true, duration,
              lr=True, plot_test_acc=True):
    """Write training history, metrics and a best-epoch summary to disk.

    Files written under ``output_directory`` (used as a plain string prefix):
    ``history.csv``, ``df_metrics.csv``, ``df_best_model.csv`` and, when
    ``plot_test_acc`` is true, ``epochs_loss.png``.

    Parameters
    ----------
    output_directory : str
        Prefix prepended to every output file name.
    hist : object
        Must expose a ``history`` dict of per-epoch lists.
    y_pred, y_true : array-like
        Class labels forwarded to ``calculate_metrics``.
    duration : float
        Forwarded to ``calculate_metrics``.
    lr : bool, default True
        Whether to record the learning rate of the best epoch.
    plot_test_acc : bool, default True
        Whether validation columns are recorded and the loss curve is plotted.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``calculate_metrics``.

    Notes
    -----
    The "best" epoch is the one with the lowest training ``loss``. Summary
    fields whose history column is missing keep their initial value of 0.0.
    """
    hist_df = pd.DataFrame(hist.history)
    hist_df.to_csv(output_directory + 'history.csv', index=False)

    df_metrics = calculate_metrics(y_true, y_pred, duration)
    df_metrics.to_csv(output_directory + 'df_metrics.csv', index=False)

    index_best_model = hist_df['loss'].idxmin()
    row_best_model = hist_df.loc[index_best_model]
    history_columns = list(hist_df.columns)

    # ``float`` replaces ``np.float``, which no longer exists in NumPy >= 1.24.
    df_best_model = pd.DataFrame(data=np.zeros((1, 6), dtype=float),
                                 index=[0],
                                 columns=['best_model_train_loss',
                                          'best_model_val_loss',
                                          'best_model_train_acc',
                                          'best_model_val_acc',
                                          'best_model_learning_rate',
                                          'best_model_nb_epoch'])
    df_best_model['best_model_train_loss'] = row_best_model['loss']
    if plot_test_acc:
        df_best_model['best_model_val_loss'] = row_best_model['val_loss']

    # Multi-output models prefix metric names with the output name, so the
    # plain 'accuracy' key is not guaranteed to exist.
    train_acc_column = _find_history_column(history_columns, 'accuracy')
    if train_acc_column is not None:
        df_best_model['best_model_train_acc'] = row_best_model[train_acc_column]
    if plot_test_acc:
        val_acc_column = _find_history_column(history_columns, 'accuracy',
                                              validation=True)
        if val_acc_column is not None:
            df_best_model['best_model_val_acc'] = row_best_model[val_acc_column]

    if lr:
        # NOTE(review): the learning-rate callback logs 'lr' in older
        # tf.keras and presumably 'learning_rate' in newer Keras - confirm.
        for lr_key in ('lr', 'learning_rate'):
            if lr_key in history_columns:
                df_best_model['best_model_learning_rate'] = row_best_model[lr_key]
                break
    df_best_model['best_model_nb_epoch'] = index_best_model

    df_best_model.to_csv(output_directory + 'df_best_model.csv', index=False)

    if plot_test_acc:
        # plot losses
        plot_epochs_metric(hist, output_directory + 'epochs_loss.png')

    return df_metrics


def plot_epochs_metric(hist, file_name, metric='loss'):
    """Save a train-vs-validation curve of ``metric`` per epoch to ``file_name``.

    Reads ``hist.history[metric]`` and ``hist.history['val_' + metric]``.
    The figure is closed after saving.
    """
    fig, ax = plt.subplots()
    for series_key in (metric, 'val_' + metric):
        ax.plot(hist.history[series_key])
    ax.set_title('model ' + metric)
    ax.set_ylabel(metric, fontsize='large')
    ax.set_xlabel('epoch', fontsize='large')
    ax.legend(['train', 'val'], loc='upper left')
    fig.savefig(file_name, bbox_inches='tight')
    plt.close(fig)


class Classifier_INCEPTION:
    """Inception-style 1D convolutional network with two output heads.

    The model built by ``build_model`` returns a dict with
    ``'error_cat'`` (softmax over ``nb_classes``) and ``'error_amt'``
    (2 linear units). Checkpoints, logs and predictions are written using
    ``output_directory`` as a plain string prefix.
    """

    def __init__(self, output_directory, input_shape, nb_classes, verbose=False,
                 build=True, batch_size=64, nb_filters=32, use_residual=True,
                 use_bottleneck=True, depth=6, kernel_size=41, nb_epochs=1500):
        """Store hyper-parameters and, when ``build`` is true, build the model.

        Building also saves the initial weights to
        ``output_directory + 'model_init.hdf5'``.
        """
        self.output_directory = output_directory

        self.nb_filters = nb_filters
        self.use_residual = use_residual
        self.use_bottleneck = use_bottleneck
        self.depth = depth
        # The largest convolution kernel is one less than the requested size.
        self.kernel_size = kernel_size - 1
        self.callbacks = None
        self.batch_size = batch_size
        self.bottleneck_size = 32
        self.nb_epochs = nb_epochs
        self.verbose = verbose

        if build:
            self.model = self.build_model(input_shape, nb_classes)
            self.model.save_weights(self.output_directory + 'model_init.hdf5')

    @staticmethod
    def _class_probabilities(y_pred):
        """Return the classification output from a model prediction.

        A dict-output model returns a dict from ``predict``; in that case the
        ``'error_cat'`` entry is used. Anything else is returned unchanged.
        """
        if isinstance(y_pred, dict):
            return y_pred['error_cat']
        return y_pred

    @staticmethod
    def _class_labels(y_true):
        """Return 1-D integer class labels from the supported target formats.

        Accepts a target dict (its ``'error_cat'`` entry is used), a 2-D
        one-hot array (arg-max per row), or labels that are already 1-D.
        """
        if isinstance(y_true, dict):
            y_true = y_true['error_cat']
        labels = np.asarray(y_true)
        if labels.ndim > 1:
            if labels.shape[-1] > 1:
                labels = np.argmax(labels, axis=1)
            else:
                # Column vector of labels: flatten, there is nothing to arg-max.
                labels = labels.reshape(-1)
        return labels

    def _inception_module(self, input_tensor, stride=1, activation='linear'):
        """Apply one inception block to ``input_tensor``.

        The block is an optional 1x1 bottleneck, three parallel convolutions
        with kernel sizes ``kernel_size // 2**i`` for ``i`` in 0..2, and a
        max-pool branch followed by a 1x1 convolution. The four branches are
        concatenated on the channel axis, batch-normalised and passed
        through ReLU.
        """
        if self.use_bottleneck and int(input_tensor.shape[-1]) > 1:
            input_inception = keras.layers.Conv1D(filters=self.bottleneck_size, kernel_size=1,
                                                  padding='same', activation=activation, use_bias=False)(input_tensor)
        else:
            input_inception = input_tensor

        # kernel_size_s = [3, 5, 8, 11, 17]
        kernel_size_s = [self.kernel_size // (2 ** i) for i in range(3)]

        conv_list = []

        for i in range(len(kernel_size_s)):
            conv_list.append(keras.layers.Conv1D(filters=self.nb_filters, kernel_size=kernel_size_s[i],
                                                 strides=stride, padding='same', activation=activation, use_bias=False)(
                input_inception))

        # The pooling branch reads the raw block input, not the bottleneck output.
        max_pool_1 = keras.layers.MaxPool1D(pool_size=3, strides=stride, padding='same')(input_tensor)

        conv_6 = keras.layers.Conv1D(filters=self.nb_filters, kernel_size=1,
                                     padding='same', activation=activation, use_bias=False)(max_pool_1)

        conv_list.append(conv_6)

        x = keras.layers.Concatenate(axis=2)(conv_list)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation(activation='relu')(x)
        return x

    def _shortcut_layer(self, input_tensor, out_tensor):
        """Add a residual connection from ``input_tensor`` to ``out_tensor``.

        The shortcut is projected with a 1x1 convolution to the channel count
        of ``out_tensor`` and batch-normalised before the add and ReLU.
        """
        shortcut_y = keras.layers.Conv1D(filters=int(out_tensor.shape[-1]), kernel_size=1,
                                         padding='same', use_bias=False)(input_tensor)
        shortcut_y = keras.layers.BatchNormalization()(shortcut_y)

        x = keras.layers.Add()([shortcut_y, out_tensor])
        x = keras.layers.Activation('relu')(x)
        return x

    def build_model(self, input_shape, nb_classes):
        """Build and compile the two-headed model and set ``self.callbacks``.

        The last position along the final input axis is split off as
        ``calib_values``; everything before it is the spectrum fed to the
        inception stack. The TensorBoard callback reads the notebook-level
        ``log_dir`` variable.
        """
        input_layer = keras.layers.Input(input_shape)
        calib_values = keras.layers.Lambda(lambda x: x[:, :, -1])(input_layer)
        spectrum = keras.layers.Lambda(lambda x: x[:, :, :-1])(input_layer)

        input_dropout = keras.layers.Dropout(rate=0.2)(spectrum)
        x = input_dropout
        input_res = input_dropout

        for d in range(self.depth):

            x = self._inception_module(x)

            # Residual shortcut after every third inception block.
            if self.use_residual and d % 3 == 2:
                x = self._shortcut_layer(input_res, x)
                input_res = x

        gap_layer = keras.layers.GlobalAveragePooling1D()(x)

        # NOTE(review): `concat` is never consumed. The dropout below reads
        # `gap_layer`, so `calib_values` does not reach either output head.
        # This looks unintended, but wiring it in changes the architecture and
        # invalidates existing checkpoints, so it is only flagged here.
        concat = keras.layers.Concatenate()([gap_layer, calib_values])

        dropout_gap = keras.layers.Dropout(rate=.2)(gap_layer)

        dl = keras.layers.Dense(128)(dropout_gap)

        dropout_dense = keras.layers.Dropout(rate=.2)(dl)

        dl2 = keras.layers.Dense(256)(dropout_dense)

        error_output_layer = keras.layers.Dense(2, activation='linear')(dl2)

        # The classification head also sees the regression head's output.
        concat_final = keras.layers.Concatenate()([dl2, error_output_layer])

        class_output_layer = keras.layers.Dense(nb_classes,
                                          activation='softmax')(concat_final)

        model = keras.models.Model(inputs=input_layer,
                                   outputs={'error_cat':class_output_layer,
                                            'error_amt': error_output_layer})

        model.compile(loss={'error_cat':'categorical_crossentropy',
                            'error_amt': 'mse'},
                      loss_weights = {'error_cat': .3,
                                      'error_amt': .6},
                      optimizer=keras.optimizers.Adam(),
                      metrics={'error_cat': ['accuracy', 'AUC'],
                               'error_amt': ['mse']})

        reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss',
                                                      factor=0.5, patience=50,
                                                      min_lr=0.0001)

        file_path = self.output_directory + 'best_model.hdf5'

        model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path,
                                                           monitor='loss',
                                                           save_best_only=True,)
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir,
                                                              histogram_freq=1)

        # NOTE(review): 'val_loss' is only produced when fit() passes
        # validation data, i.e. when plot_test_acc is true.
        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       patience=20)


        self.callbacks = [reduce_lr, model_checkpoint, tensorboard_callback,
                          early_stopping]

        return model

    def fit(self, x_train, y_train, x_val, y_val, y_true, plot_test_acc=False):
        """Train the model, save it, and log metrics for ``x_val``.

        Parameters
        ----------
        x_train, y_train
            Training inputs and targets, passed to ``model.fit``.
        x_val, y_val
            Passed as validation data only when ``plot_test_acc`` is true.
            ``x_val`` is always used for the final prediction.
        y_true
            Reference labels for ``x_val``: a target dict, a one-hot array, or
            1-D labels.
        plot_test_acc : bool, default False
            Whether to validate during training and plot the loss curve.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        RuntimeError
            If TensorFlow reports no GPU device.
        """
        if not tf.test.gpu_device_name():
            # Raise, do not call exit(): inside a notebook exit() is not a
            # reliable way to stop the current cell from running on.
            raise RuntimeError('error no gpu')
        # x_val and y_val are only used to monitor the test loss and NOT for training

        if self.batch_size is None:
            mini_batch_size = int(min(x_train.shape[0] / 10, 16))
        else:
            mini_batch_size = self.batch_size

        start_time = time.time()

        if plot_test_acc:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size,
                                  epochs=self.nb_epochs, verbose=self.verbose,
                                  validation_data=(x_val, y_val),
                                  callbacks=self.callbacks)
        else:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size,
                                  epochs=self.nb_epochs, verbose=self.verbose,
                                  callbacks=self.callbacks)

        duration = time.time() - start_time

        self.model.save(self.output_directory + 'last_model.hdf5')

        y_pred = self.predict(x_val, y_true, x_train, y_train, y_val,
                              return_df_metrics=False)

        # save predictions
        np.save(self.output_directory + 'y_pred.npy', y_pred)

        # The model returns a dict of outputs. Take the classification head
        # and convert probabilities / one-hot targets to integer labels, so
        # the metric functions receive comparable 1-D arrays.
        y_pred = np.argmax(self._class_probabilities(y_pred), axis=1)
        y_true = self._class_labels(y_true)

        save_logs(self.output_directory, hist, y_pred, y_true, duration,
                  plot_test_acc=plot_test_acc)

        keras.backend.clear_session()

        return 0#

    def predict(self, x_test, y_true, x_train, y_train, y_test,
                return_df_metrics=True):
        """Predict on ``x_test`` with the checkpoint saved as ``best_model.hdf5``.

        Parameters
        ----------
        x_test
            Inputs to predict on.
        y_true
            Reference labels (target dict, one-hot array or 1-D labels). Only
            used when ``return_df_metrics`` is true.
        x_train, y_train, y_test
            Unused; kept for interface compatibility.
        return_df_metrics : bool, default True
            If true, return the metrics frame from ``calculate_metrics``.
            Otherwise write ``test_duration.csv`` and return the raw model
            output.
        """
        start_time = time.time()
        model_path = self.output_directory + 'best_model.hdf5'
        model = keras.models.load_model(model_path)
        y_pred = model.predict(x_test, batch_size=self.batch_size)
        if return_df_metrics:
            y_pred = np.argmax(self._class_probabilities(y_pred), axis=1)
            df_metrics = calculate_metrics(self._class_labels(y_true), y_pred, 0.0)
            return df_metrics
        else:
            test_duration = time.time() - start_time
            save_test_duration(self.output_directory + 'test_duration.csv',
                               test_duration)
            return y_pred

In [None]:
# Sanity check: (samples, 2, 2001) is expected for the training tensor.
X_train.shape

TensorShape([35514, 2, 2001])

In [None]:
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Build the classifier: checkpoints and logs go under '../data/', the input
# is a (2, 2001) summary, and there are 3 target classes.
c = Classifier_INCEPTION(
    '../data/',
    (2, 2001),
    3,
    build=True,
    verbose=True,
    batch_size=60,
    nb_epochs=600,
    depth=5,
    use_bottleneck=True,
)

In [None]:
# Print the layer-by-layer summary of the freshly built model.
c.model.summary()

In [None]:
# Train with X_val / y_val as validation data (plot_test_acc=True); y_val is
# also passed as the y_true reference for the post-training metrics.
c.fit(X_train, y_train, X_val, y_val, y_val, plot_test_acc=True)

In [None]:
# Persist the model currently held by `c` under ../models.
c.model.save('../models/scaled_intens_whole_num_subbed_masses_3_cat_dropout')

In [None]:
 # NOTE(review): this loads a different path from the one saved in the cell
 # above (no '_3_cat_dropout' suffix), and the loaded model is not assigned to
 # a name - confirm which saved model is intended.
 keras.models.load_model('../models/scaled_intens_whole_num_subbed_masses')

<tensorflow.python.keras.engine.functional.Functional at 0x7fabe3e13150>

In [None]:
# Evaluate on the validation split. predict() reloads the checkpoint saved as
# '<output_directory>best_model.hdf5' and returns a one-row metrics frame.
c.predict(X_val, y_val, X_train, y_train, y_test)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,accuracy,recall,duration
0,0.500845,0.792857,0.544176,0.0


In [None]:
%tensorboard --logdir log_dir

In [None]:
from keras.utils import to_categorical
from PIL import Image
# Data Generator Example: read in data small amt at a time to save memory
# https://towardsdatascience.com/building-a-multi-output-convolutional-neural-network-with-keras-ed24c7bc1178
class UtkFaceDataGenerator():
    """
    Batch generator for the UTKFace dataset, intended for training a Keras
    multi-output model.

    Relies on the module-level names TRAIN_TEST_SPLIT, dataset_dict, IM_WIDTH
    and IM_HEIGHT.
    """
    def __init__(self, df):
        self.df = df

    def generate_split_indexes(self):
        """
        Shuffle the row positions and split them into train / validation / test.

        Also adds the 'gender_id' and 'race_id' columns to the frame and
        records the maximum age in ``self.max_age``.
        """
        shuffled = np.random.permutation(len(self.df))
        n_train_val = int(len(self.df) * TRAIN_TEST_SPLIT)
        train_val_idx = shuffled[:n_train_val]
        test_idx = shuffled[n_train_val:]

        # The same ratio is applied a second time to carve validation rows
        # out of the non-test part.
        n_train = int(n_train_val * TRAIN_TEST_SPLIT)
        train_idx = train_val_idx[:n_train]
        valid_idx = train_val_idx[n_train:]

        # converts alias to id
        def gender_to_id(gender):
            return dataset_dict['gender_alias'][gender]

        def race_to_id(race):
            return dataset_dict['race_alias'][race]

        self.df['gender_id'] = self.df['gender'].map(gender_to_id)
        self.df['race_id'] = self.df['race'].map(race_to_id)
        self.max_age = self.df['age'].max()

        return train_idx, valid_idx, test_idx

    def preprocess_image(self, img_path):
        """
        Open the image, resize it to (IM_WIDTH, IM_HEIGHT) and scale pixel
        values by 1/255.
        """
        resized = Image.open(img_path).resize((IM_WIDTH, IM_HEIGHT))
        return np.array(resized) / 255.0

    def generate_images(self, image_idx, is_training, batch_size=16):
        """
        Yield ``(images, [ages, races, genders])`` batches for the given row
        positions.

        When ``is_training`` is true the positions are cycled indefinitely.
        Otherwise a single pass is made, and a trailing partial batch is not
        yielded.
        """
        # accumulators for the batch currently being filled
        batch_images, batch_ages, batch_races, batch_genders = [], [], [], []
        keep_going = True
        while keep_going:
            for idx in image_idx:
                record = self.df.iloc[idx]

                age_value = record['age']
                race_value = record['race_id']
                gender_value = record['gender_id']
                image_path = record['file']

                image = self.preprocess_image(image_path)

                # Age is normalised by the maximum age seen in the frame.
                batch_ages.append(age_value / self.max_age)
                batch_races.append(to_categorical(race_value, len(dataset_dict['race_id'])))
                batch_genders.append(to_categorical(gender_value, len(dataset_dict['gender_id'])))
                batch_images.append(image)

                # emit the batch once it is full, then start a new one
                if len(batch_images) >= batch_size:
                    yield np.array(batch_images), [np.array(batch_ages),
                                                   np.array(batch_races),
                                                   np.array(batch_genders)]
                    batch_images, batch_ages, batch_races, batch_genders = [], [], [], []

            keep_going = bool(is_training)
                
# NOTE(review): `df`, TRAIN_TEST_SPLIT and dataset_dict are not defined in the
# visible part of this notebook; this example cell presumably comes from the
# linked UTKFace tutorial and will not run as-is - confirm or remove.
data_generator = UtkFaceDataGenerator(df)
train_idx, valid_idx, test_idx = data_generator.generate_split_indexes() 