In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sys import path
%matplotlib inline

In [2]:
path.insert(0, '../src')
from setup import data_setup
from data_generator import DataGenerator
from model_trainer import ModelTrainer

In [3]:
dg = data_setup()
norm_data = dg.df()

In [4]:
from data_transformation import get_isotope_data, get_hydrocarbs
isotope_data = get_isotope_data()

In [5]:
from data_transformation import get_suspicious_peaks, get_peak_suspiciousness, get_ranges
original_data = dg.calibrated_df(True, .005, .01,use_ranges=True, ranges=[0, 0, 0.5], cat=True)
ranges = get_ranges(isotope_data, 2000)
original_data['target'] = original_data['target'].apply(lambda a: a - 1 if a > 0 else a)
original_data['sus_peaks'] = original_data['masses'].apply(get_suspicious_peaks, args=(ranges, .1))
original_data['peak_sussness'] = original_data['masses'].apply(get_peak_suspiciousness, args=(ranges, True))

In [158]:
from data_transformation import mass_formula
diffs = []
errs = []
no_mans_lands_err = []
no_mans_lands_norm = []
masses = []
means = 0
for row in norm_data.itertuples():
    diff = []
    mass = []
    nml_err = []
    nml_norm = []
    mean = 0
    err_offset, amt_err_offset = add_error(row.MassOffset)
    errs.append(amt_err_offset)
    for peak in row.peaks:
        init_mass = mass_formula(peak[0], row.SpecBinSize, row.StartFlightTime, row[4], row.MassOffset)
        i = int(init_mass)
        if init_mass > ranges[i][0] and init_mass < ranges[i][1]:
            nml_norm.append(init_mass)                                                
        new_mass = mass_formula(peak[0], row.SpecBinSize, row.StartFlightTime, row[4], err_offset)
        i = int(new_mass)
        if new_mass > ranges[i][0] and new_mass < ranges[i][1]:
            nml_err.append(new_mass)  
        mass.append(init_mass)
        diff.append(init_mass - new_mass)
        mean += abs(init_mass - new_mass)
    diffs.append(diff)
    no_mans_lands_err.append(nml_err)
    no_mans_lands_norm.append(nml_norm)
    masses.append(mass)
    mean = mean / len(row.peaks)
    means += mean
means = means / len(norm_data)

In [64]:
df = pd.read_csv('../data/processed_cas.csv')

In [65]:
test = df['Calibration'][3]

In [66]:
labels = []
positions = []
for row in df.itertuples():
    for split in row.Calibration.split(')')[:-1]:
        calibrators = split.split('(')[1].split(',')
        if calibrators[1] not in labels:
            labels.append(calibrators[1])
            positions.append(float(calibrators[2].strip()))

In [69]:
df2 = pd.DataFrame({'labels': labels, 'positions': positions})
df2 = df2.sort_values('positions', ascending=False).reset_index(drop=True)

In [81]:
def get_x(masses, intensities, x=12, thresh=0.1):
    '''
    Get all peaks in data near a specific mass x.
    '''
    row_x = -1
    max = -1
    for i, mass in enumerate(masses):
        dif = abs(mass-x)
        inten = intensities[i]
        if dif < thresh and (inten > max or max == -1):
            max = inten
            row_x = dif
    return row_x

In [116]:
training_data = pd.DataFrame(columns=list(df2['labels'])+['target'])

In [90]:
from data_transformation import generate_data
# 10,000 Examples only offset error
erred = generate_data(norm_data, 2, 2, True, [0, 0, 0])
for _ in range(10):
    erred = pd.concat([erred, generate_data(norm_data, 2, 2, True, [0, 0, 1], True)], axis=0)
#erred['target'] = erred['target'].apply(lambda a: a - 1 if a > 0 else a)
dg.set_df(erred)
erred = dg.calibrated_df()

In [117]:
for i, row in enumerate(erred.itertuples()):
    dists = []
    for pos in df2['positions']:
        dists.append(get_x(row.masses, row.intensities, x=pos, thresh=0.1))
    training_data.loc[i] = dists + [row.target]

In [122]:
from sklearn.model_selection import train_test_split
X = training_data.drop('target', axis=1)
X = X.to_numpy().reshape(9966, 309, 1)
y = training_data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [123]:
from tensorflow.keras.layers import Dense, Flatten, Input, Conv1D, BatchNormalization
from tensorflow.keras.layers import AveragePooling1D, MaxPooling1D, Layer, Concatenate
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import Model, Sequential

In [124]:
def lenet():
    model = Sequential()

    model.add(Conv1D(filters=8, kernel_size=10, activation='tanh', input_shape=(309, 1)))
    model.add(AveragePooling1D())

    model.add(Conv1D(filters=16, kernel_size=20, activation='tanh'))
    model.add(AveragePooling1D())

    model.add(Flatten())

    model.add(Dense(units=500, activation='tanh'))

    model.add(Dense(units=100, activation='tanh'))

    model.add(Dense(units=2, activation = 'softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC'])
    return model

In [138]:
estimator2 = KerasClassifier(build_fn = lenet, epochs = 200, batch_size = 20, verbose = 1, class_weight={0:11, 1:1})
estimator2.fit(X_train, y_train)
preds2 = estimator2.predict_proba(X_test)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
  8/399 [..............................] - ETA: 2s - loss: 0.7485 - accuracy: 0.5099 - auc: 0.6167

KeyboardInterrupt: 

In [132]:
preds2 = estimator2.predict_proba(X_test)



In [137]:
np.argmax(preds2)

2383

In [134]:
y_test

6692    1.0
7286    1.0
1111    1.0
1672    1.0
8247    1.0
       ... 
8604    1.0
1287    1.0
7592    1.0
8959    1.0
8244    1.0
Name: target, Length: 1994, dtype: float64