## Preprocessing Steps
**Spike removal / filtering methods**
<br>
    -- Reduction of spike events by special design of the instrument (Zhao, 2003)
<br>
    -- Automatic Spike Removal Algorithm for Raman Spectra: wavelet transform (spike removal raman filter from matlab)
<br>
    -- Missing point polynomial filter (I have the code)
<br>
    -- Robust smoothing filter
<br>
    -- Moving window filter 
<br>
**Remove background Autofluorescence noise**
<br>
--IModPoly (Chad A. Lieber and Anita Mahadevan-Jansen. Automated method for subtraction of fluorescence from biological Raman spectra. Applied Spectroscopy, 57(11):1363–1367, 2003) (https://github.com/michaelstchen/modPolyFit) (faster technique)
 <br>
--Zhiming Zhang (An intelligent background-correction algorithm for highly fluorescent samples in Raman spectroscopy: https://onlinelibrary.wiley.com/doi/abs/10.1002/jrs.2500)(https://github.com/zmzhang/baselineWavelet)
<br>
--Vancouver Raman Algorithm (Jianhua Zhao: http://journals.sagepub.com/doi/abs/10.1366/000370207782597003) 
<br>
--EMD (Empirical  Mode Decomposition) (https://github.com/laszukdawid/PyEMD)
<br>
**Smoothing (Denoising)**
<br>
-- Savitzky-Golay filtering (SciPy package):  https://github.com/scipy/scipy/blob/master/scipy/signal/_savitzky_golay.py
<br>
-- Moving Average/median
<br>
--CARS (Coherent Anti-Stokes Raman spectroscopy) 
<br>
**Normalize**
<br>
--Min/Max method (I have the code).
<br>
--Vector based 
<br>
**Spectral and intensity re-calibration**

**Normal**
<br>
Number of individual patients with 5 sample points in blood: 471
<br>
Number of individual patients with 3 sample points in blood: 228

**Disease 1:**

Number of individual patients with 5 sample points in blood: 153.
<br>
Number of individual patients with 3 sample points in blood: 20.


In [1]:
'''
Class dealing with the Raman data
'''
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
from imblearn.over_sampling import SMOTE
import random
import os
import pickle
import pandas as pd
import matplotlib
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
from convertwdf import *
from wdfReader import * 
from scipy import sparse
from scipy.sparse.linalg import spsolve
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Activation, Input, BatchNormalization, MaxPooling1D, Bidirectional,LSTM
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPool1D, Flatten , Embedding, GlobalMaxPool1D
from keras.models import Model
from keras.optimizers import SGD, Adam, rmsprop
#%matplotlib inline 
#https://github.com/MacDumi/Deconvolution
#python3 Deconvolution_test.py /home/titli/Documents/Deconvolution-master/0151.txt 
#https://www.pnas.org/content/114/31/8247

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def normalize(data):
    """Min-max scale ``data`` into the range [0, 1].

    Args:
        data: numeric array-like (here: the intensities of one spectrum).

    Returns:
        ``(data - min) / (max - min)``. When every value is identical the
        range is zero; an all-zero float array of the same shape is returned
        instead of dividing by zero (which previously produced NaN values).

    Raises:
        ValueError: if ``data`` is empty (raised by ``np.min``).
    """
    _min = np.min(data)
    _max = np.max(data)
    span = _max - _min
    # Only a scalar zero range is special-cased; any non-scalar reduction
    # result falls through to the original arithmetic unchanged.
    if np.ndim(span) == 0 and span == 0:
        return np.zeros_like(data, dtype=float)
    return (data - _min) / span
def getspikes(fileID):
    """Return ``(x_data, spectra)`` read from an opened spectrum reader.

    ``fileID`` must provide ``get_xdata()`` and ``get_spectra()``; they are
    called in that order and their results are returned as a 2-tuple.
    """
    return fileID.get_xdata(), fileID.get_spectra()

In [3]:
# Accumulators filled by the loading cells below:
#   patient_array_* - patient identifiers already seen (1 = disease 1, 0 = normal)
#   spectra_array*  - normalised spectra kept for each class
patient_array_1, patient_array_0 = [], []
spectra_array0, spectra_array1 = [], []

In [4]:
# Disease 1: walk the data directory and keep one normalised spectrum per patient.
# A file is considered only when path component 5 is '1_0-5-1' and component 8
# is '980'; component 7 is used as the patient identifier.
rootdir = '/home/titli/Documents/disease1'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        # Paths with fewer than nine components cannot match; checking the
        # depth first avoids an IndexError on x[8] for shallow files.
        if len(x) <= 8 or x[5] != '1_0-5-1' or x[8] != '980':
            continue
        if x[7] in patient_array_1:
            continue
        # NOTE(review): the patient is recorded before the length check, so a
        # patient whose first matching file is too short contributes no
        # spectrum at all - confirm this is intended.
        patient_array_1.append(x[7])
        wdfIle = wdfReader(txt)
        X, spectra = getspikes(wdfIle)  # read x-axis values and intensities
        if len(X) < 1015:
            continue
        spectra = normalize(spectra)
        spectra_array1.append(spectra)
# One row per kept spectrum, with a matching column of 1-labels.
spectra_array_1 = pd.DataFrame(spectra_array1)
labels_1 = pd.DataFrame({'labels': np.ones((len(spectra_array1),), dtype=int)})

In [5]:
# Number of disease-1 label rows (one per spectrum kept by the loading cell).
len(labels_1)

96

### Normal patients

In [6]:
# Normal patients: walk the data directory and keep one normalised spectrum per patient.
# A file is considered only when path component 5 is '1_0-5-1' and component 8
# is '980'; component 7 is used as the patient identifier.
rootdir = '/home/titli/Documents/normal'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        # Paths with fewer than nine components cannot match; checking the
        # depth first avoids an IndexError on x[8] for shallow files.
        if len(x) <= 8 or x[5] != '1_0-5-1' or x[8] != '980':
            continue
        if x[7] in patient_array_0:
            continue
        # NOTE(review): the patient is recorded before the length check, so a
        # patient whose first matching file is too short contributes no
        # spectrum at all - confirm this is intended.
        patient_array_0.append(x[7])
        wdfIle = wdfReader(txt)
        X, spectra = getspikes(wdfIle)  # read x-axis values and intensities
        if len(X) < 1015:
            continue
        spectra = normalize(spectra)
        spectra_array0.append(spectra)
# One row per kept spectrum, with a matching column of 0-labels.
spectra_array_0 = pd.DataFrame(spectra_array0)
labels_0 = pd.DataFrame({'labels': np.zeros((len(spectra_array0),), dtype=int)})

In [7]:
# Number of normal-class label rows (one per spectrum kept by the loading cell).
len(labels_0)

381

In [9]:
# Stack the normal spectra (label 0) above the disease-1 spectra (label 1),
# then apply one shared random permutation so rows and labels stay aligned.
# The shuffle is unseeded, so the row order differs between runs.
total_df = pd.concat([spectra_array_0, spectra_array_1])
labels_df = pd.concat([labels_0, labels_1])
indices = list(range(len(total_df)))
random.shuffle(indices)
X, y = total_df.values[indices], labels_df.values[indices]

In [10]:
# Train/validation split: the first 80% of the shuffled rows train the model,
# the remaining 20% are held out. The slice does no class balancing.
split_val = int(len(X) * 0.8)
X_train, X_test = X[:split_val], X[split_val:]
y_train, y_test = y[:split_val], y[split_val:]
# Append a trailing axis of size 1 so each spectrum becomes (points, 1).
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))
# One-hot encode the integer labels into two columns.
y_train_labels = to_categorical(y_train, num_classes=2)
y_test_labels = to_categorical(y_test, num_classes=2)

In [11]:
def kraub_method(input_length=1015, n_classes=2):
    """Build and compile the 1-D convolutional classifier for single-channel spectra.

    Layers, in order: Conv1D(32 filters, width 7), Conv1D(16 filters, width 5),
    both 'valid'-padded with ReLU; Flatten; then two Dense(256, ReLU) layers,
    each preceded by Dropout(0.01); and a softmax output layer.

    Args:
        input_length: number of points per spectrum. The default of 1015
            matches the length check used when the spectra are loaded.
        n_classes: number of softmax output units (default 2).

    Returns:
        A compiled Keras ``Model`` using categorical cross-entropy loss, the
        'rmsprop' optimizer and the 'acc' metric.
    """
    inp = Input(shape=(input_length, 1))
    x = Conv1D(32, kernel_size=7, strides=1, padding='valid', activation='relu')(inp)
    x = Conv1D(16, kernel_size=5, strides=1, padding='valid', activation='relu')(x)
    x = Flatten()(x)
    x = Dropout(0.01)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.01)(x)
    x = Dense(256, activation='relu')(x)
    preds = Dense(n_classes, activation='softmax')(x)
    model = Model(inp, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model
    

In [12]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
# Training callbacks, all monitoring validation loss.
# Best-so-far weights (weights only, not the full model) are written to
# 'boat_detector_model_step1.hdf5'.
# NOTE(review): the 'boat_detector' prefix looks carried over from another
# project - confirm before renaming, since it changes the output file name.
weight_path="{}_model_step1.hdf5".format('boat_detector')
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, 
                             save_best_only=True, mode='min', save_weights_only = True)

# Multiply the learning rate by 0.8 when val_loss plateaus, never going below 1e-4.
# NOTE(review): newer Keras releases name the `epsilon` argument `min_delta` -
# confirm against the installed version.
reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=5, min_lr=0.0001)
# Stop training once val_loss has not improved for 10 epochs.
# NOTE(review): early stopping and the LR reduction share patience=10, so the
# reduction may never take effect before training stops - confirm intent.
early = EarlyStopping(monitor="val_loss", 
                      mode="min", 
                      patience=10) 
callbacks_list = [checkpoint, early, reduceLROnPlat]



In [13]:
# Build a fresh network and train it for up to 65 epochs in mini-batches of 10.
# The held-out split is passed as validation data, so it also drives the
# val_loss-based callbacks in callbacks_list.
model = kraub_method()
history = model.fit(
    X_train,
    y_train_labels,
    batch_size=10,
    epochs=65,
    validation_data=(X_test, y_test_labels),
    callbacks=callbacks_list,
)

Train on 381 samples, validate on 96 samples
Epoch 1/65

Epoch 00001: val_loss improved from inf to 0.68052, saving model to boat_detector_model_step1.hdf5
Epoch 2/65

Epoch 00002: val_loss improved from 0.68052 to 0.56730, saving model to boat_detector_model_step1.hdf5
Epoch 3/65

Epoch 00003: val_loss improved from 0.56730 to 0.42742, saving model to boat_detector_model_step1.hdf5
Epoch 4/65

Epoch 00004: val_loss did not improve from 0.42742
Epoch 5/65

Epoch 00005: val_loss did not improve from 0.42742
Epoch 6/65

Epoch 00006: val_loss did not improve from 0.42742
Epoch 7/65

Epoch 00007: val_loss did not improve from 0.42742
Epoch 8/65

Epoch 00008: val_loss did not improve from 0.42742
Epoch 9/65

Epoch 00009: val_loss did not improve from 0.42742
Epoch 10/65

Epoch 00010: val_loss did not improve from 0.42742
Epoch 11/65

Epoch 00011: val_loss did not improve from 0.42742
Epoch 12/65

Epoch 00012: val_loss did not improve from 0.42742
Epoch 13/65

Epoch 00013: val_loss did not i

In [19]:
# Write the model architecture (not the weights) to model_step1.json.
# The JSON is produced before the file is opened, so a failure in to_json()
# does not leave an empty file behind.
model_json = model.to_json()
with open("model_step1.json", "w") as json_file:
    json_file.write(model_json)
# Saving the weights here is disabled; the lines below are kept for reference.
# serialize weights to HDF5
#model.save_weights("model_step1.h5")
#print("Saved model to disk")

### Test set

In [20]:
# Reset the accumulators before loading the separate test directories:
#   patient_array_* - patient identifiers already seen (1 = disease 1, 0 = normal)
#   spectra_array*  - normalised spectra kept for each class
patient_array_1, patient_array_0 = [], []
spectra_array0, spectra_array1 = [], []

In [21]:
# Disease 1 (test directory): keep one normalised spectrum per patient.
# The root is one level deeper than the training data, so the path components
# shift by one: component 6 must be '1_0-5-1', component 9 must be '980', and
# component 8 is used as the patient identifier.
rootdir = '/home/titli/Documents/test/disease1'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        # Paths with fewer than ten components cannot match; checking the
        # depth first avoids an IndexError on x[9] for shallow files.
        if len(x) <= 9 or x[6] != '1_0-5-1' or x[9] != '980':
            continue
        if x[8] in patient_array_1:
            continue
        # NOTE(review): the patient is recorded before the length check, so a
        # patient whose first matching file is too short contributes no
        # spectrum at all - confirm this is intended.
        patient_array_1.append(x[8])
        wdfIle = wdfReader(txt)
        X, spectra = getspikes(wdfIle)  # read x-axis values and intensities
        if len(X) < 1015:
            continue
        spectra = normalize(spectra)
        spectra_array1.append(spectra)
# One row per kept spectrum, with a matching column of 1-labels.
spectra_array_1 = pd.DataFrame(spectra_array1)
labels_test_1 = pd.DataFrame({'labels': np.ones((len(spectra_array1),), dtype=int)})

In [22]:
# Display the disease-1 test spectra DataFrame as the cell output.
spectra_array_1

In [23]:
# Normal patients (test directory): keep one normalised spectrum per patient.
# Component 6 of the path must be '1_0-5-1', component 9 must be '980', and
# component 8 is used as the patient identifier.
rootdir = '/home/titli/Documents/test/normal'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        txt = os.path.join(subdir, file)
        x = txt.split("/")
        # Paths with fewer than ten components cannot match; checking the
        # depth first avoids an IndexError on x[9] for shallow files.
        if len(x) <= 9 or x[6] != '1_0-5-1' or x[9] != '980':
            continue
        if x[8] in patient_array_0:
            continue
        # NOTE(review): the patient is recorded before the length check, so a
        # patient whose first matching file is too short contributes no
        # spectrum at all - confirm this is intended.
        patient_array_0.append(x[8])
        wdfIle = wdfReader(txt)
        X, spectra = getspikes(wdfIle)  # read x-axis values and intensities
        if len(X) < 1015:
            continue
        spectra = normalize(spectra)
        spectra_array0.append(spectra)
# One row per kept spectrum, with a matching column of 0-labels.
spectra_array_0 = pd.DataFrame(spectra_array0)
labels_test_0 = pd.DataFrame({'labels': np.zeros((len(spectra_array0),), dtype=int)})

In [24]:
# Assemble the spectra loaded from the test directories: normal rows first,
# then disease-1 rows, with the labels stacked in the same order.
total_df_test = pd.concat([spectra_array_0, spectra_array_1])
labels_df_test = pd.concat([labels_test_0, labels_test_1])
# Append a trailing axis of size 1 so each spectrum becomes (points, 1).
X_test = total_df_test.values
X_test = X_test.reshape(X_test.shape + (1,))
# One-hot encode the labels and score every test spectrum with the trained model.
y_test = to_categorical(labels_df_test.values, num_classes=2)
model1_test_y = model.predict(X_test, batch_size=10, verbose=1)



In [25]:
# Binarise the predicted scores in place at 0.5: strictly greater becomes 1,
# less-or-equal becomes 0. Both masks are taken from the untouched scores.
above_half = model1_test_y > 0.5
at_most_half = model1_test_y <= 0.5
model1_test_y[above_half] = 1
model1_test_y[at_most_half] = 0

In [26]:
def F1_score(pred_test_y, actuals):
    """Count confusion-matrix entries on column 0 and return them with accuracy.

    Despite its name, this function does not return an F1 value. Only column 0
    of each input is inspected, and a value of 1 there is treated as the
    "positive" outcome. Rows whose column-0 values are not exactly 0 or 1 are
    not counted in any category.

    NOTE(review): with two-column one-hot labels, column 0 presumably
    corresponds to label 0 - confirm that this is the intended positive class.

    Args:
        pred_test_y: 2-D array of binarised predictions, one row per sample.
        actuals: 2-D array of true one-hot labels, with at least as many rows.

    Returns:
        Tuple ``(true_pos, false_pos, true_neg, false_neg, accur)``. ``accur``
        is ``(true_pos + true_neg)`` over the number of counted rows, or 0.0
        when no row was counted.
    """
    pred = np.asarray(pred_test_y)[:, 0]
    # Like the original row-by-row loop, ignore any extra rows in `actuals`.
    act = np.asarray(actuals)[:len(pred), 0]

    true_pos = int(np.sum((pred == 1) & (act == 1)))
    true_neg = int(np.sum((pred == 0) & (act == 0)))
    false_pos = int(np.sum((pred == 1) & (act == 0)))
    false_neg = int(np.sum((pred == 0) & (act == 1)))

    # Precision and recall were computed but never returned, and raised
    # ZeroDivisionError when no positives were predicted or present; they are
    # dropped. Accuracy is guarded against an empty denominator.
    counted = true_pos + false_pos + true_neg + false_neg
    accur = (true_pos + true_neg) / counted if counted else 0.0
    return (true_pos, false_pos, true_neg, false_neg, accur)

In [27]:
# Show (true_pos, false_pos, true_neg, false_neg, accuracy) for the test set.
print(F1_score(model1_test_y, y_test))

(88, 0, 0, 2, 0.9777777777777777)
