In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sys import path
%matplotlib inline

In [2]:
path.insert(0, '../src')
from data_generator import DataGenerator
from model_trainer import ModelTrainer

In [3]:
from data_transformation import get_better_spectra, get_precise_peaks
dg = DataGenerator('../data/classification_cas_data.csv')
norm_data = dg.df()

In [4]:
data = get_better_spectra(dir='../data/SpectraCsvFiles_BkgndSubtractWatsonPeakFinder/')
norm_data.sort_values('file_name', inplace=True)
data.sort_values('file_name', inplace=True)
norm_data = pd.merge(data, norm_data, on='file_name')

In [5]:
peaks = get_precise_peaks(norm_data, ['precise_channels', 'precise_intensities'])
norm_data['peaks'] = peaks
dg.set_df(norm_data)

In [6]:
original_data = dg.calibrated_df(True, use_ranges=True, cat=True)
original_data['num_peaks'] = original_data['peaks'].apply(len)

In [16]:
def get_new_target(num):
    if num == 2:
        return 1
    else:
        return 0
original_data['new_target'] = original_data['target'].apply(get_new_target)

In [17]:
max(original_data['masses'].apply(max))

1961.7678321458432

In [18]:
from data_transformation import get_isotope_data, get_isotope_mass_list
isotope_data = get_isotope_data()
nom_masses_low = get_isotope_mass_list(isotope_data, False, 2000)
nom_masses_high = get_isotope_mass_list(isotope_data, True, 2000)

In [36]:
def get_spectra(masses, intensities, nom_masses_low, nom_masses_high):
    spectra = [0 for x in range(2000)]
    spectra_intensities = [0 for x in range(2000)]
    for i, mass in enumerate(masses):
        j = round(mass)
        num = mass - nom_masses_low[j]
        spectra_intensities[j] = intensities[i]
        if num < 0:
            spectra[j] = num
        else:
            spectra[j] = mass - nom_masses_high[j]
    return spectra, spectra_intensities

In [20]:
from sklearn.preprocessing import MinMaxScaler
spectra = []
intensities = []
for row in original_data.itertuples():
    a, b = get_spectra(row.masses, row.precise_intensities, nom_masses_low, nom_masses_high)
    spectra.append(np.array(a))
    intensities.append(np.array(b))
spectra = np.vstack(spectra)
intensities = np.vstack(intensities)
scl = MinMaxScaler()
scl.fit(intensities)
intensities = scl.transform(intensities)

In [21]:
X = np.dstack([spectra, intensities])
y = original_data['new_target']

### FFNN / CNN Setup

In [24]:
from tensorflow.keras.layers import Dense, Flatten, Input, Conv1D, AveragePooling1D
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import Model, Sequential
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def baseline_model():
    x_in = Input(shape=(2000,))
    dense1 = Dense(2000, activation='sigmoid')(x_in)
    dense2 = Dense(2000, activation='sigmoid')(dense1)
    dense3 = Dense(1000, activation='sigmoid')(dense2)
    dense5 = Dense(2, activation='sigmoid')(dense3)
    model = Model(inputs=x_in, outputs=dense5)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy', 'AUC'])
    return model

In [25]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
estimator = KerasClassifier(build_fn = baseline_model, epochs = 100, batch_size = 10, verbose = 0)
results = cross_val_score(estimator, spectra, y, cv = kfold)

In [26]:
np.mean(results)

0.8454556345939637

In [27]:
results

array([0.86263734, 0.87845302, 0.85635358, 0.8232044 , 0.80662984])

## LeNet Conv NN

In [28]:
def lenet():
    model = Sequential()

    model.add(Conv1D(filters=8, kernel_size=20, activation='relu', input_shape=(2000, 2)))
    model.add(AveragePooling1D())

    model.add(Conv1D(filters=20, kernel_size=80, activation='relu'))
    model.add(AveragePooling1D())

    model.add(Flatten())

    model.add(Dense(units=500, activation='sigmoid'))

    model.add(Dense(units=100, activation='relu'))

    model.add(Dense(units=2, activation = 'softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC'])
    return model

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
estimator = KerasClassifier(build_fn = lenet, epochs = 100, batch_size = 10, verbose = 0)
estimator.fit(X_train, y_train)

<tensorflow.python.keras.callbacks.History at 0x7fb70900b050>

In [31]:
preds = estimator.predict_proba(X_test)



In [32]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [33]:
roc_auc_score(pd.get_dummies(y_test), preds)

0.9421495327102803

In [34]:
predictions = []
for i in range(len(preds)):
    predictions.append(np.where(preds[i]==max(preds[i]))[0][0])

In [35]:
accuracy_score(y_test, predictions)

0.9010989010989011

## Retrain Models with only offset error

In [48]:
offset_data = dg.calibrated_df(True, use_ranges=True,ranges=[0, 0, 0.5], cat=True)
offset_data['num_peaks'] = original_data['peaks'].apply(len)

In [50]:
spectra = []
intensities = []
for row in offset_data.itertuples():
    a, b = get_spectra(row.masses, row.precise_intensities, nom_masses_low, nom_masses_high)
    spectra.append(np.array(a))
    intensities.append(np.array(b))
spectra = np.vstack(spectra)
intensities = np.vstack(intensities)
scl = MinMaxScaler()
scl.fit(intensities)
intensities = scl.transform(intensities)
X = np.dstack([spectra, intensities])
y = offset_data['target']

In [51]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
estimator = KerasClassifier(build_fn = baseline_model, epochs = 100, batch_size = 10, verbose = 0)
results = cross_val_score(estimator, spectra, y, cv = kfold)

In [52]:
np.mean(results)

0.5077287435531617

In [54]:
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
estimator = KerasClassifier(build_fn = lenet, epochs = 100, batch_size = 10, verbose = 0)

In [59]:
np.mean(results)

0.49668508768081665

In [58]:
results = cross_val_score(estimator, X, y, cv = kfold)

### Manipulation Of Offset Error

In [63]:
dg.set_df(norm_data)
norm_data = dg.calibrated_df(False)

In [191]:
def check_for_carbon(data):
    carbon_candidates = []
    indices = []
    for row in data.itertuples():
        carb = 0
        max = 0 
        ind = -1
        for i, mass in enumerate(row.masses):
            if round(mass) == 12 and abs(mass - 12) < .01:
                if row.intensities[i] > max:
                    ind = i
                    carb = mass
                    max = row.intensities[i]
        carb = abs(round(carb) - carb)
        carbon_candidates.append(carb)
    return carbon_candidates

In [311]:
carbs = np.array(check_for_carbon(offset_data))
indices = pd.Series(carbs)!=0
carbs = carbs[carbs!=0]

In [312]:
carbs = carbs.reshape(758, 1)
a = carbs + carbs * 0.01
b = carbs - carbs * 0.01

In [314]:
X = abs(np.hstack([carbs, a, b]))
y = offset_data['target'][indices]

In [315]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from model_trainer import ModelTrainer
models = [RandomForestClassifier, XGBClassifier, LGBMClassifier]
mt = ModelTrainer(models, X, y, ['rfc', 'xgb', 'lgbm'])

In [316]:
accs, index_pred = mt.kfold_models(5)



In [326]:
def check_for_elements(data):
    elements = []
    for row in data.itertuples():
        elems = [-1 for num in range(150)]
        maxs = [-1 for num in range(150)] 
        inds = []
        for i, mass in enumerate(row.masses):
            index = round(mass)
            if index < 150:
                val = mass - index
                if abs(mass - index) < .01:
                    if row.intensities[i] > maxs[round(mass)]:
                        maxs[index] = row.intensities[i]
                        elems[index] = val
        elems = pd.Series(elems)
        elems = elems[elems != -1]
        elements.append(list(elems))
    return elements

In [327]:
elements = check_for_elements(offset_data)

In [331]:
data = pd.DataFrame(elements)
data['target'] = offset_data['target']

In [349]:
a = data[[1, 2, 3, 4 , 5, 6, 'target']].copy()
a.dropna(inplace=True)

In [370]:
y = a['target']
X = a.drop('target', axis=1).to_numpy()

In [371]:
X_add = X + X * 0.01
X_sub = X - X * 0.01

In [372]:
X = np.hstack([X, X_add, X_sub])

In [363]:
models = [RandomForestClassifier, XGBClassifier, LGBMClassifier]
mt = ModelTrainer(models, X, y, ['rfc', 'xgb', 'lgbm'])

In [364]:
accs, index_pred = mt.kfold_models(5)



In [365]:
accs

[0.5336824779290739, 0.5642974711955708, 0.530046386353434]

In [376]:
def baseline_model():
    x_in = Input(shape=(18,))
    dense1 = Dense(200, activation='sigmoid')(x_in)
    dense2 = Dense(200, activation='sigmoid')(dense1)
    dense3 = Dense(100, activation='sigmoid')(dense2)
    dense5 = Dense(2, activation='sigmoid')(dense3)
    model = Model(inputs=x_in, outputs=dense5)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy', 'AUC'])
    return model

In [377]:
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
estimator = KerasClassifier(build_fn = baseline_model, epochs = 100, batch_size = 10, verbose = 0)
results = cross_val_score(estimator, X, y, cv = kfold)

In [379]:
np.mean(results)

0.5116938471794128

In [400]:
def lenet():
    model = Sequential()

    model.add(Conv1D(filters=2, kernel_size=2, activation='relu', input_shape=(18, 1)))
    model.add(AveragePooling1D())

    model.add(Conv1D(filters=1, kernel_size=4, activation='relu'))
    model.add(AveragePooling1D())

    model.add(Flatten())

    model.add(Dense(units=500, activation='sigmoid'))

    model.add(Dense(units=100, activation='relu'))

    model.add(Dense(units=2, activation = 'softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC'])
    return model

In [401]:
X_train, X_test, y_train, y_test = train_test_split(X.reshape((817, 18, 1)), y, test_size=0.2)

In [402]:
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
estimator = KerasClassifier(build_fn = lenet, epochs = 100, batch_size = 10, verbose = 0)
estimator.fit(X_train, y_train)

<tensorflow.python.keras.callbacks.History at 0x7fb76c64f950>

In [403]:
preds = estimator.predict_proba(X_test)
predictions = []
for i in range(len(preds)):
    predictions.append(np.where(preds[i]==max(preds[i]))[0][0])
roc_auc_score(pd.get_dummies(y_test), preds)
accuracy_score(y_test, predictions)



0.5426829268292683