In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime, then switch the working
# directory to the project's src folder.
# NOTE(review): the relative path '../data/...' used further down, and
# presumably the local-module imports (setup, data_generator, model_trainer),
# depend on this working directory — confirm before running elsewhere.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/PHI/ToF_ML/src

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/PHI/ToF_ML/src


In [5]:
from setup import data_setup
from data_generator import DataGenerator
from model_trainer import ModelTrainer

In [6]:
# data_setup() is a project helper (imported from setup); it presumably returns
# the project's DataGenerator — confirm in setup.py.
dg = data_setup()
# Table of spectra that the cells below iterate over row by row and extend
# with new columns (it is mutated in place, despite the "original" in the name).
original_data = dg.calibrated_df()

In [28]:
from data_transformation import get_frags
# Load the fragment table via the project helper get_frags. The path is
# relative, so it depends on the current working directory.
frags = get_frags('../data/Fragment Table.csv')
# Discard whatever index get_frags returned and renumber the rows 0..n-1, so
# that an integer label on this table is the same as the row position.
frags.reset_index(inplace=True, drop=True)
# The 'FragmentMass' column; passed as the `frags` argument of get_frags_dists.
spots = frags['FragmentMass']

In [30]:
def get_frags_dists(masses, frags, thresh=0.005):
    '''
    Determines which elemental / compound masses correspond
    to actual spectra masses and returns both the fragments
    and the distance between each fragment and its related mass in
    the given spectra.

    Every mass is compared against its nearest entry in frags; the mass
    counts as identified when the absolute difference is strictly smaller
    than thresh.

    Arguments:
        masses: iterable of numeric masses taken from one spectrum.
        frags: sequence (list, array or pandas Series) of known fragment
            masses. It must be sorted in ascending order. Values are read
            in positional order, so a Series index is ignored.
        thresh: exclusive upper bound on |fragment - mass| for a match.

    Returns:
        A tuple (found_masses, found_frags, dists) of three parallel lists,
        ordered like masses: the identified masses, the fragment mass
        each one was matched to, and the absolute distance between them.
        All three are empty when nothing matches or when frags is empty.

    A mass exactly halfway between two fragments is matched to the lower one.
    '''
    found_masses, found_frags, dists = [], [], []

    frag_values = np.asarray(frags, dtype=float)
    if frag_values.size == 0:
        # Nothing to match against; the previous implementation raised here.
        return found_masses, found_frags, dists

    mass_list = list(masses)
    last = frag_values.size - 1

    # For each mass, searchsorted returns the position of the first fragment
    # that is >= mass, so the nearest fragment sits at that position or at
    # the one just before it. This also covers both ends of the table, which
    # the earlier hand-written bisection could skip for a two-entry table.
    insert_positions = np.searchsorted(
        frag_values, np.asarray(mass_list, dtype=float)
    )

    for mass, pos in zip(mass_list, insert_positions):
        below = max(int(pos) - 1, 0)
        above = min(int(pos), last)
        below_dist = abs(frag_values[below] - mass)
        above_dist = abs(frag_values[above] - mass)

        if above_dist < below_dist:
            nearest, dist = above, above_dist
        else:
            nearest, dist = below, below_dist

        # A NaN distance compares False here, so NaN masses are never matched.
        if dist < thresh:
            found_masses.append(mass)
            found_frags.append(frag_values[nearest])
            dists.append(dist)

    return found_masses, found_frags, dists


def get_closest(i, frags, mass):
    '''
    Starting from index i, walks to the neighbouring index whose value
    in frags is nearer to mass, and returns the index where neither
    neighbour is strictly nearer.

    The right-hand neighbour is tried before the left-hand one, and a
    neighbour at an equal distance does not cause a move.
    '''
    last = len(frags) - 1
    while True:
        here = abs(frags[i] - mass)
        if i < last and abs(frags[i + 1] - mass) < here:
            i += 1
        elif i > 0 and abs(frags[i - 1] - mass) < here:
            i -= 1
        else:
            return i

In [31]:
# Per-spectrum matching features, computed at a tight and a loose distance
# threshold. The difference between the two mean distances is what
# get_calibration later compares against the tight-threshold mean.
LOW_THRESH = 0.003
HIGH_THRESH = 0.007

dists_low_thresh = []
dists_high_thresh = []
nums = []
props = []
for row in original_data.itertuples():
    matched, _, low_dists = get_frags_dists(row.masses, spots, thresh=LOW_THRESH)
    _, _, high_dists = get_frags_dists(row.masses, spots, thresh=HIGH_THRESH)

    n_peaks = len(row.masses)
    nums.append(len(matched))
    # A spectrum without peaks has no defined proportion; use NaN rather than
    # dividing by zero.
    props.append(len(matched) / n_peaks if n_peaks else np.nan)
    # np.mean of an empty list warns and returns NaN; make that NaN explicit
    # for spectra where no fragment was matched.
    dists_low_thresh.append(np.mean(low_dists) if low_dists else np.nan)
    dists_high_thresh.append(np.mean(high_dists) if high_dists else np.nan)

original_data['avg_dist_frags_low_thresh'] = dists_low_thresh
original_data['avg_dist_frags_high_thresh'] = dists_high_thresh
original_data['num_indentified_frags'] = nums
original_data['proportions_peaks_identified'] = props
original_data['diff'] = original_data['avg_dist_frags_high_thresh'] - original_data['avg_dist_frags_low_thresh']

In [33]:
# Preview the first rows, including the feature columns added by the previous cell.
original_data.head()

Unnamed: 0,precise_channels,precise_intensities,file_name,Mass/Time,MassOffset,StartFlightTime,SpecBinSize,channels,peaks,mass_channels,masses,intensities,avg_dist_frags_low_thresh,avg_dist_frags_high_thresh,num_indentified_frags,proportions_peaks_identified,diff
0,0 2644.03673 1 3505.01837 2 ...,0 73874 1 1234 2 138 3 ...,0106301.cas,0.387425,-0.275047,0.0,1.248,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(2644.0367300000003, 73874), (3505.0183700000...","[2644.0367300000003, 3505.0183700000002, 4162....","[1.0067357827001502, 2.0154136848060986, 3.019...","[73874, 1234, 138, 610, 1216, 4159, 8958, 1084...",0.00121,0.002547,51,0.398438,0.001337
1,0 2647.00072 1 3508.99491 2 ...,0 49864 1 1034 2 168 3 ...,0107316.cas,0.387337,-0.278302,0.0,1.248,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(2647.00072, 49864), (3508.9949100000003, 103...","[2647.00072, 3508.9949100000003, 4164.59326000...","[1.0025002148906825, 2.0105384152423085, 3.009...","[49864, 1034, 168, 4696, 8247, 13992, 17903, 2...",0.00123,0.002755,17,0.129771,0.001525
2,0 1973.87665 1 2049.07068 2 2122.0...,0 23352 1 74717 2 10387 3 94...,0110203.cas,0.378938,-0.271015,0.0,4.992,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(1973.87665, 23352), (2049.0706800000003, 747...","[1973.87665, 2049.0706800000003, 2122.01224, 2...","[11.9915465622805, 12.996908473240481, 14.0108...","[23352, 74717, 10387, 947, 12344, 9121, 249, 4...",0.001164,0.002964,7,0.388889,0.0018
3,0 672.00298 1 891.90543 2 197...,0 34398 1 304 2 2223 3 ...,0110212.cas,0.379297,-0.269824,0.0,4.992,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(672.00298, 34398), (891.90543, 304), (1970.9...","[672.00298, 891.90543, 1970.94521, 2046.11295,...","[1.0051638174978783, 2.0134261084051874, 11.98...","[34398, 304, 2223, 3521, 5205, 6509, 99, 115, ...",0.001326,0.002605,41,0.37963,0.001279
4,0 2726.98153 1 3594.53649 2 ...,0 42995 1 602 2 151 3 ...,0116511.cas,0.383481,-0.302161,0.0,1.248,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(2726.98153, 42995), (3594.53649, 602), (4265...","[2726.98153, 3594.53649, 4265.28736, 7866.5038...","[1.0058658675679875, 2.011083088854326, 3.0245...","[42995, 602, 151, 17912, 9536, 17609, 29604, 5...",0.001521,0.003114,41,0.336066,0.001593


In [None]:
def get_calibration(data, modifier=.5, prop_thresh=0.65):
    '''
    Builds a 0/1 flag for every row yielded by data.itertuples().

    A row is flagged 1 only when both conditions hold:
      * row.diff is strictly less than
        row.avg_dist_frags_low_thresh * modifier
      * row.proportions_peaks_identified is strictly greater than
        prop_thresh
    Every other row, including rows where a comparison involves NaN,
    is flagged 0.

    Returns a list of ints in row order.
    '''
    return [
        1 if (row.diff < row.avg_dist_frags_low_thresh * modifier
              and row.proportions_peaks_identified > prop_thresh) else 0
        for row in data.itertuples()
    ]

In [None]:
# Store the 0/1 calibration flag per spectrum, using modifier=0.5 and
# prop_thresh=0.55 (lower than get_calibration's 0.65 default).
original_data['calibration'] = get_calibration(original_data, .5, .55)