## Preprocessing Steps
**Spike removal / filtering methods**
<br>
    -- Reduction of spike events by special design of the instrument (Zhao, 2003)
<br>
    -- Automatic Spike Removal Algorithm for Raman Spectra: wavelet transform (spike removal raman filter from matlab)
<br>
    -- Missing point polynomial filter (I have the code)
<br>
    -- Robust smoothing filter
<br>
    -- Moving window filter 
<br>
**Remove background Autofluorescence noise**
<br>
--IModPoly (Chad A Lieber and Anita Mahadevan-Jansen. Automated method for subtraction offluorescence from biological raman spectra.Applied spectroscopy, 57(11):1363â€“1367,2003) (https://github.com/michaelstchen/modPolyFit)(Faster technique)
 <br>
--Zhiming Zhang (An intelligent background-correction algorithm for highly fluorescent samples in raman spectroscopy: https://onlinelibrary.wiley.com/doi/abs/10.1002/jrs.2500)(https://github.com/zmzhang/baselineWavelet)
<br>
--Vancouver Raman Algorithm (Jianhua Zhao: http://journals.sagepub.com/doi/abs/10.1366/000370207782597003) 
<br>
--EMD (Empirical  Mode Decomposition) (https://github.com/laszukdawid/PyEMD)
<br>
**Smoothing (Denoising)**
<br>
-- Savisky-Golay filtering (Scipi package):  https://github.com/scipy/scipy/blob/master/scipy/signal/_savitzky_golay.py
<br>
-- Moving Average/median
<br>
--CARS (Coherent Anti-Stokes Raman spectroscopy) 
<br>
**Normalize**
<br>
--Min/Max method (I have the code).
<br>
--Vector based 
<br>
**Spectral and intensity re-calibration**

**Normal**
<br>
Individual patients with 5 sample points in blood is 381
<br>
Individual patients with 3 sample points in blood is 228

**Disease 1:**

Individual patients with 5 sample points in blood is 144.
<br>
Individual patients with 3 sample points in blood is 20.


In [113]:
'''
Class dealing with the Raman data
'''
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
from imblearn.over_sampling import SMOTE
import random
import os
import pickle
import pandas as pd
import matplotlib
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
from convertwdf import *
from wdfReader import * 
from scipy import sparse
from scipy.sparse.linalg import spsolve
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Activation, Input, BatchNormalization, MaxPooling1D, Bidirectional,LSTM
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPool1D, Flatten , Embedding, GlobalMaxPool1D
from keras.models import Model
from keras.optimizers import SGD, Adam, rmsprop
#%matplotlib inline 
#https://github.com/MacDumi/Deconvolution
#python3 Deconvolution_test.py /home/titli/Documents/Deconvolution-master/0151.txt 
#https://www.pnas.org/content/114/31/8247

In [114]:
def normalize(data):
    _min = np.min(data)
    _max = np.max(data)
    return (data - _min) / (_max - _min)
def getspikes(fileID):
    
    x_data= fileID.get_xdata()
    spectra= fileID.get_spectra()
    return (x_data, spectra)

In [115]:
patient_array_1 = [] #patients in disease1
patient_array_0 = [] #patients in disease0
spectra_array0_980 = [] #spectrum in disease0
spectra_array0_1700 = [] #spectrum in disease1
spectra_array1_980 = [] #spectrum in disease1
spectra_array1_1700 = [] #spectrum in disease1

In [116]:
rootdir = '/home/titli/Documents/normal'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        if( x[5] == '1_0-5-1'):
            if (str(x[7]) not in patient_array_0):
                patient_array_0.append(x[7])  
patient_array_0 = list(set(patient_array_0))
print('Individual patients', len(patient_array_0))

Individual patients 381


In [117]:
rootdir = '/home/titli/Documents/disease1'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        if( x[5] == '1_0-5-1'):
            if (str(x[7]) not in patient_array_1):
                patient_array_1.append(x[7]) 
patient_array_1 = list(set(patient_array_1))
print('Individual patients', len(patient_array_1))

Individual patients 144


In [118]:
date_list_1 = []
rootdir = '/home/titli/Documents/disease1'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        date_list_1.append(subdir)
date_list_1 = list(set(date_list_1))
print('Individual date-patient', len(date_list_1))

Individual date-patient 330


In [119]:
date_list_0 = []
rootdir = '/home/titli/Documents/normal'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        date_list_0.append(subdir)
date_list_0 = list(set(date_list_0))
print('Individual date-patient', len(date_list_0))

Individual date-patient 1213


In [120]:
total_spectra_1 = []
for dirnames in date_list_1:
    names = dirnames.split("/")
    if (str(names[7]) in patient_array_1 and str(names[8]== 980)):
        path, dirs, files = next(os.walk(dirnames)) # get all the file names
        newdirnames = '/'.join(names[:-1])+'/1700'
        # looping through file names
        for f in files:
            filenames980 = dirnames + "/"+f
            filenames1700 = newdirnames + "/"+f
            wdfIle980 = wdfReader(filenames980)
            X, spectra980 = getspikes(wdfIle980)
            try:
                wdfIle1700 = wdfReader(filenames1700)
                X, spectra1700 = getspikes(wdfIle1700)
            except OSError:
                wdfIle1700 = np.zeros(1015)
            total_spectra_1.append(np.append(normalize(spectra980), normalize(spectra1700)))     
        try:
            patient_array_1.remove(names[7])
        except ValueError:
            continue
total_df_1= pd.DataFrame(total_spectra_1)
labels_1 =  pd.DataFrame({'labels': np.ones(len(total_df_1))})

In [121]:
total_spectra_0 = []
for dirnames in date_list_0:
    names = dirnames.split("/")
    if (str(names[7]) in patient_array_0 and str(names[8]== 980)): 
        path, dirs, files = next(os.walk(dirnames)) # get all the file names
        newdirnames = '/'.join(names[:-1])+'/1700'
        # looping through file names
        for f in files:
            filenames980 = dirnames + "/"+f
            filenames1700 = newdirnames + "/"+f
            wdfIle980 = wdfReader(filenames980)
            X, spectra980 = getspikes(wdfIle980)
            try:
                wdfIle1700 = wdfReader(filenames1700)
                X, spectra1700 = getspikes(wdfIle1700)
            except OSError:
                wdfIle1700 = np.ones(1015)*10000
            total_spectra_1.append(np.append(normalize(spectra980), normalize(spectra1700)))     
            total_spectra_0.append(np.append(normalize(spectra980), normalize(spectra1700)))   
        try:
            patient_array_0.remove(names[7])
        except ValueError:
            continue
total_df_0= pd.DataFrame(total_spectra_0)
labels_0 =  pd.DataFrame({'labels': np.zeros(len(total_df_0))})

In [122]:
len(labels_0)

1905

### Create train-validation split 

In [142]:
total_df_train = pd.concat([total_df_1[:int(len(total_df_1)*0.8)],total_df_0[:int(len(total_df_0)*0.8)]], axis = 0)
total_df_train = total_df_train.apply(lambda x: [y if y <= 1e-5 else 1e-4 for y in x])
labels_df_train = pd.concat([labels_1[:int(len(total_df_1)*0.8)],labels_0[:int(len(total_df_0)*0.8)]], axis = 0)
indices=list(range(0,len(total_df_train)))
random.shuffle(indices)
X_train = total_df_train.values[indices].reshape(total_df_train.shape[0],total_df_train.shape[1],1)
y_train = labels_df_train.values[indices]
y_train_labels = to_categorical(y_train, num_classes=2)

In [124]:
560+1524

2084

In [125]:
140+381

521

In [143]:
total_df_val = pd.concat([total_df_1[int(len(total_df_1)*0.8):],total_df_0[int(len(total_df_0)*0.8):]], axis = 0)
total_df_val = total_df_val.apply(lambda x: [y if y <= 1e-5 else 1e-4 for y in x])
labels_df_val = pd.concat([labels_1[int(len(total_df_1)*0.8):],labels_0[int(len(total_df_0)*0.8):]], axis = 0)
indices=list(range(0,len(total_df_val)))
random.shuffle(indices)
X_val = total_df_val.values[indices].reshape(total_df_val.shape[0],total_df_val.shape[1],1)
y_val = labels_df_val.values[indices]
y_val_labels = to_categorical(y_val, num_classes=2)

In [144]:
def kraub_method():
    inp =  Input(shape=(2030, 1))
    x = Conv1D(32, kernel_size = 7, strides= 1,padding='valid', activation='relu')(inp)
    x = Conv1D(16, kernel_size = 5, strides= 1, padding='valid', activation='relu')(x)
    x = Flatten()(x)
    x = Dropout(0.01)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.01)(x)
    x = Dense(256, activation='relu')(x)
    preds = Dense(2, activation='softmax')(x)
    model = Model(inp, preds)
    model.compile(loss= 'categorical_crossentropy',
              optimizer= 'rmsprop',
              metrics=['acc'])
    return model

In [145]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
weight_path="{}_model_step2.hdf5".format('boat_detector')
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, 
                             save_best_only=True, mode='min', save_weights_only = True)

reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=5, min_lr=0.0001)
early = EarlyStopping(monitor="val_loss", 
                      mode="min", 
                      patience=10) 
callbacks_list = [checkpoint, early, reduceLROnPlat]



In [146]:
model = kraub_method()
history = model.fit(X_train, y_train_labels, batch_size= 5, epochs=65, validation_data=(X_val, y_val_labels),callbacks=callbacks_list)

Train on 2084 samples, validate on 521 samples
Epoch 1/65

Epoch 00001: val_loss improved from inf to 0.58347, saving model to boat_detector_model_step2.hdf5
Epoch 2/65

Epoch 00002: val_loss improved from 0.58347 to 0.58198, saving model to boat_detector_model_step2.hdf5
Epoch 3/65

Epoch 00003: val_loss did not improve from 0.58198
Epoch 4/65

Epoch 00004: val_loss did not improve from 0.58198
Epoch 5/65

Epoch 00005: val_loss did not improve from 0.58198
Epoch 6/65

Epoch 00006: val_loss did not improve from 0.58198
Epoch 7/65

Epoch 00007: val_loss did not improve from 0.58198
Epoch 8/65

Epoch 00008: val_loss did not improve from 0.58198
Epoch 9/65

Epoch 00009: val_loss did not improve from 0.58198
Epoch 10/65

Epoch 00010: val_loss did not improve from 0.58198
Epoch 11/65

Epoch 00011: val_loss did not improve from 0.58198
Epoch 12/65

Epoch 00012: val_loss did not improve from 0.58198

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.


In [130]:
model_json = model.to_json()
with open("model_step3.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model_step3.h5")
print("Saved model to disk")

Saved model to disk


In [131]:
patient_array_1 = [] #patients in disease1
patient_array_0 = [] #patients in disease0
patient_array_2 = [] #patients in disease1
patient_array_3 = [] #patients in disease0
spectra_array0_980 = [] #spectrum in disease0
spectra_array0_1700 = [] #spectrum in disease1
spectra_array1_980 = [] #spectrum in disease1
spectra_array1_1700 = [] #spectrum in disease1

In [132]:
rootdir = '/home/titli/Documents/test/disease1'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        if( x[6] == '1_0-5-1'):
            if (str(x[8]) not in patient_array_1):
                patient_array_1.append(x[8]) 
patient_array_1= set(patient_array_1)

In [133]:
date_list_1 = []
rootdir = '/home/titli/Documents/test/disease1'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        date_list_1.append(subdir)
date_list_1 = list(set(date_list_1))

In [134]:
rootdir = '/home/titli/Documents/test/normal'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        if( x[6] == '1_0-5-1'):
            if (str(x[8]) not in patient_array_0):
                patient_array_0.append(x[8])    
patient_array_0= set(patient_array_0)

In [135]:
date_list_0 = []
rootdir = '/home/titli/Documents/test/normal'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print (os.path.join(subdir, file))
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        date_list_0.append(subdir)
date_list_0 = list(set(date_list_0))

In [136]:
total_spectra_1 = []
for dirnames in date_list_1:
    names = dirnames.split("/")
    if ( str(names[9]== 980) and str(names[8]) in patient_array_1): #str(names[8]) in patient_array_1 and
        path, dirs, files = next(os.walk(dirnames)) # get all the file namesone
        newdirnames = '/'.join(names[:-1])+'/1700'
        # looping through file names
        for f in files:
            
            filenames980 = dirnames + "/"+f
            filenames1700 = newdirnames + "/"+f
            wdfIle980 = wdfReader(filenames980)
            X, spectra980 = getspikes(wdfIle980)
            try:
                wdfIle1700 = wdfReader(filenames1700)
                X, spectra1700 = getspikes(wdfIle1700)
            except OSError:
                wdfIle1700 = np.ones(1015)*10000
            total_spectra_1.append(np.append(normalize(spectra980), normalize(spectra1700))) 
        try:
            patient_array_1.remove(names[8])
        except ValueError:
            continue

total_df_1_test= pd.DataFrame(total_spectra_1)
labels_test_1 =  pd.DataFrame({'labels': np.ones(len(total_df_1_test))})

In [137]:
total_spectra_0 = []
for dirnames in date_list_0:
    names = dirnames.split("/")
    if (str(names[8]) in patient_array_0 and str(names[9]== 980)):
        path, dirs, files = next(os.walk(dirnames)) # get all the file names
        newdirnames = '/'.join(names[:-1])+'/1700'
        # looping through file names
        for f in files:
            filenames980 = dirnames + "/"+f
            filenames1700 = newdirnames + "/"+f
            wdfIle980 = wdfReader(filenames980)
            X, spectra980 = getspikes(wdfIle980)
            wdfIle1700 = wdfReader(filenames1700)
            X, spectra1700 = getspikes(wdfIle1700)
            total_spectra_0.append(np.append(normalize(spectra980), normalize(spectra1700)))   
        try:
            patient_array_0.remove(names[8])
        except ValueError:
            continue
total_df_0_test= pd.DataFrame(total_spectra_0)
labels_test_0 =  pd.DataFrame({'labels': np.zeros(len(total_df_0_test))})

In [138]:
total_df_test = pd.concat([total_df_1_test,total_df_0_test], axis = 0)
total_df_test = total_df_test.apply(lambda x: [y if y <= 1e-5 else 1e-4 for y in x])
X_test = total_df_test.values
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],1)
labels_df_test = pd.concat([labels_test_0,labels_test_1], axis = 0)
y_test = labels_df_test.values
y_test = to_categorical(y_test, num_classes=2)
model1_test_y = model.predict(X_test, batch_size=10, verbose=1)



In [139]:
model1_test_y[model1_test_y > 0.5] = 1
model1_test_y[model1_test_y <= 0.5] = 0

In [140]:
def F1_score(pred_test_y, actuals):

    predictions =[]
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    
    for i in range (len(pred_test_y)):
        if ((pred_test_y[i,0]==1) & (actuals[i,0]==1)):
            true_pos = true_pos+1
        elif((pred_test_y[i,0]==0) & (actuals[i,0]==0)):
            true_neg = true_neg+1
        elif((pred_test_y[i,0]==1) & (actuals[i,0]==0)):
            false_pos = false_pos +1
        elif((pred_test_y[i,0]==0) & (actuals[i,0]==1)):
            false_neg = false_neg+1
    #prec=true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos+false_neg)
    accur=(true_pos+true_neg)/(true_pos+false_pos+ true_neg+ false_neg)
    #F1=2*(prec*recall/(prec+recall))
    #FPR = false_pos/(false_pos+true_neg)
    return (true_pos, false_pos, true_neg, false_neg, accur)

In [141]:
print((F1_score(model1_test_y, y_test)))

(450, 100, 0, 0, 0.8181818181818182)
