In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sys import path
%matplotlib inline

In [2]:
path.insert(0, '../src')
from setup import data_setup
from data_generator import DataGenerator
from model_trainer import ModelTrainer

In [3]:
# Build the project's data object via data_setup() (imported from ../src/setup).
dg = data_setup()
# NOTE(review): norm_data is not referenced by any later cell visible in this
# notebook; presumably a DataFrame of normalised spectra — confirm or remove.
norm_data = dg.df()

In [4]:
def get_frags(loc='../data/FragLibData32_Converted2010 (1)/Fragment Table.csv'):
    '''
    Load the fragment table, a CSV file that has no header row.

    Parameters
    ----------
    loc : str, path object or file-like
        Location of the CSV; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file with the columns FragmentMass,
        FragmentLabel, Isotopes, Formula and FragmentID, indexed 0..n-1.
    '''
    columns = ['FragmentMass', 'FragmentLabel', 'Isotopes', 'Formula', 'FragmentID']
    # The file starts directly with data. Reading it with header=None keeps the
    # first record as an ordinary row, so its values are parsed exactly like
    # every other row (no truncated Formula, no duplicated index label 0).
    return pd.read_csv(loc, header=None, names=columns)

In [233]:
# Load the fragment table and renumber its rows 0..n-1, so that label-based
# lookups such as frags_column[i] line up with row positions.
frags = get_frags().reset_index(drop=True)

In [238]:
# Read one spectrum export and keep only its 'SOFH' column as a Series.
spec = pd.read_csv('../data/PET_rrCalibd_Pos-37-012.csv')['SOFH']

In [239]:
# Each of the rows read below holds a '<label>: <value>' string; the text after
# ': ' is parsed as a float.
# NOTE(review): the row positions (5, 10, 11, 14) and the data start at row 20
# are hard-coded for this file's layout — confirm before reusing on other exports.
slope = float(spec[10].split(': ')[1])
offset = float(spec[11].split(': ')[1])
time = float(spec[5].split(': ')[1])
binsize = float(spec[14].split(': ')[1])
# Everything from row 20 onwards is converted to float and re-indexed from 0.
peaks = spec[20:].copy().reset_index(drop=True).apply(float)

In [240]:
from data_transformation import mass_formula
# Run mass_formula on every peak value, passing the bin size, time, slope and
# offset parsed from the spectrum file as extra arguments.
peaks_perfect = peaks.apply(mass_formula, args=(binsize, time, slope, offset))

In [241]:
# Reference masses that peaks are matched against: the FragmentMass column of
# the fragment table. get_stats reads this name as a notebook global.
spots = frags['FragmentMass']

In [242]:
def get_frags_dists(masses, frags, thresh=0.005):
    '''
    Match spectrum masses to reference fragment masses.

    For every value in ``masses`` the nearest entry of ``frags`` is looked
    up; the pair is kept when their absolute difference is strictly below
    ``thresh``.

    Parameters
    ----------
    masses : iterable of float
        Masses taken from a spectrum.
    frags : sequence of float (list, ndarray or pandas Series)
        Reference fragment masses. Must be sorted in ascending order, since
        the lookup is a binary search. Values are read by position, so the
        index labels of a Series do not matter.
    thresh : float
        Maximum (exclusive) distance for a mass to count as identified.

    Returns
    -------
    found_masses : list
        The entries of ``masses`` that were matched, in input order.
    found_frags : list
        The fragment mass matched to each entry of ``found_masses``.
    dists : list
        Absolute distance between each matched pair.
    '''
    frag_values = np.asarray(frags)
    found_masses, found_frags, dists = [], [], []
    if frag_values.size == 0:
        # Nothing to match against.
        return found_masses, found_frags, dists

    last = frag_values.size - 1
    for mass in masses:
        # Position of the first fragment that is >= mass; the nearest
        # fragment is either that one or the one just before it.
        insert_at = int(np.searchsorted(frag_values, mass))
        left = max(insert_at - 1, 0)
        right = min(insert_at, last)
        # On an exact tie the lower fragment is kept.
        if abs(frag_values[right] - mass) < abs(frag_values[left] - mass):
            best = right
        else:
            best = left
        dist = abs(frag_values[best] - mass)
        # A NaN mass yields a NaN distance, which never passes this test.
        if dist < thresh:
            found_masses.append(mass)
            found_frags.append(frag_values[best])
            dists.append(dist)
    return found_masses, found_frags, dists

In [243]:
def get_closest(i, frags, mass):
    '''
    Walk from position ``i`` towards the entry of ``frags`` nearest to ``mass``.

    At each step the upper neighbour is tried first, then the lower one; the
    walk moves only when the neighbour is strictly closer and stops as soon as
    neither is. Returns the position where the walk stopped.
    '''
    while True:
        here = abs(frags[i] - mass)
        has_upper = len(frags) > i + 1
        if has_upper and here > abs(frags[i + 1] - mass):
            i += 1
            continue
        has_lower = i - 1 >= 0
        if has_lower and here > abs(frags[i - 1] - mass):
            i -= 1
            continue
        return i

In [338]:
# Baseline run: match the masses computed with the unmodified offset against
# the fragment masses, then report how many matched and their mean distance.
masses, fragments, distances = get_frags_dists(peaks_perfect, spots, thresh=0.007)
print('num: ', len(masses))
print('mean dist: ', np.mean(distances))

num:  182
mean dist:  0.0012963278514642824


In [330]:
# Raise the offset by 0.1% of its own value and recompute the masses with it.
offset_add = offset + offset * .001
peaks_add_offset = peaks.apply(mass_formula, args=(binsize, time, slope, offset_add))

In [342]:
# Match the masses computed with the raised offset.
# NOTE(review): this run uses thresh=0.003 while the baseline and the
# lowered-offset runs use thresh=0.007, so the printed counts and mean
# distances are not directly comparable — confirm this is intended.
masses, fragments, distances = get_frags_dists(peaks_add_offset, spots, thresh=0.003)
print('num: ', len(masses))
print('mean dist: ', np.mean(distances))

num:  103
mean dist:  0.001345759991282151


In [336]:
# Lower the offset by 0.1% of its own value and recompute the masses with it.
offset_sub = offset - offset * .001
peaks_sub_offset = peaks.apply(mass_formula, args=(binsize, time, slope, offset_sub))

In [337]:
# Match the masses computed with the lowered offset and report the same two
# summary numbers as the baseline run.
masses, fragments, distances = get_frags_dists(peaks_sub_offset, spots, thresh=0.007)
print('num: ', len(masses))
print('mean dist: ', np.mean(distances))

num:  148
mean dist:  0.0018591080592085267


In [351]:
# Table of spectra used for the rest of the notebook. Later cells read its
# 'masses', 'precise_channels', 'SpecBinSize', 'StartFlightTime' and
# 'MassOffset' fields and add new columns to it in place.
original_data = dg.calibrated_df()

In [352]:
dists_low_thresh = []
dists_high_thresh = []
nums = []
props = []
for row in original_data.itertuples():
    masses, _, distances = get_frags_dists(row.masses, spots, thresh=0.003)
    nums.append(len(masses))
    props.append(len(masses) / len(row.masses))
    dists_low_thresh.append(np.mean(distances))
    _, _, distances = get_frags_dists(row.masses, spots, thresh=0.007)
    dists_high_thresh.append(np.mean(distances))

In [353]:
# Attach the per-row statistics to original_data as new columns (in place).
original_data['avg_dist_frags_low_thresh'] = dists_low_thresh
original_data['avg_dist_frags_high_thresh'] = dists_high_thresh
original_data['num_indentified_frags'] = nums
original_data['proportions_peaks_identified'] = props
# 'diff' is how much the mean distance grows when the threshold is widened
# from 0.003 to 0.007; get_calibration reads it as row.diff.
original_data['diff'] = original_data['avg_dist_frags_high_thresh'] - original_data['avg_dist_frags_low_thresh']

In [394]:
def get_calibration(data, modifier=.5, prop_thresh=0.65):
    '''
    Assign a 0/1 calibration label to every row of ``data``.

    A row is labelled 1 only when both conditions hold:
      * its ``diff`` is below ``avg_dist_frags_low_thresh * modifier``
      * its ``proportions_peaks_identified`` is above ``prop_thresh``
    Every other row is labelled 0.

    Returns a list of ints with one entry per row, in row order.
    '''
    def passes(row):
        # First gate: the distance growth must stay under the scaled low-threshold mean.
        if not row.diff < row.avg_dist_frags_low_thresh * modifier:
            return False
        # Second gate: enough of the row's peaks must have been identified.
        return row.proportions_peaks_identified > prop_thresh

    return [1 if passes(row) else 0 for row in data.itertuples()]

In [405]:
# Label every spectrum 0/1, overriding get_calibration's defaults
# (modifier=.5, prop_thresh=0.65) with modifier=.4 and prop_thresh=.6.
original_data['calibration'] = get_calibration(original_data, .4, .6)

In [431]:
from data_transformation import mass_formula
def get_cols(data, amt):
    '''
    Collect matching statistics for every row of ``data`` under three
    mass variants: the row's stored masses, masses recomputed with the
    offset raised by ``amt`` (as a fraction of MassOffset), and masses
    recomputed with the offset lowered by the same fraction.

    Returns a nested list ``cols[stat][variant]`` where each leaf is a list
    with one value per row. ``stat`` is 0 = number of matched masses,
    1 = proportion matched, 2 = low-threshold mean distance,
    3 = high-threshold mean distance; ``variant`` is 0 = stored masses,
    1 = raised offset, 2 = lowered offset.

    Prints the row index of every row as it is processed.
    '''
    cols = [[[] for _ in range(3)] for _ in range(4)]
    for row in data.itertuples():
        print('row: ', row[0])
        mass_lists = [row.masses]
        for sign in [1, -1]:
            channels = pd.Series(row.precise_channels)
            shifted_offset = row.MassOffset + sign * row.MassOffset * amt
            # row[4] is read by position (field 0 is the index); presumably
            # the mass-slope column — TODO confirm against the frame's layout.
            formula_args = (row.SpecBinSize, row.StartFlightTime, row[4], shifted_offset)
            mass_lists.append(channels.apply(mass_formula, args=formula_args))
        for variant in range(3):
            n_found, proportion, low_mean, high_mean = get_stats(mass_lists[variant])
            for stat, value in enumerate((n_found, proportion, low_mean, high_mean)):
                cols[stat][variant].append(value)
    return cols

In [432]:
def get_stats(row_masses):
    '''
    Summarise how well ``row_masses`` match the global ``spots`` masses.

    Returns a 4-tuple:
      * number of masses matched at thresh=0.003
      * that number divided by ``len(row_masses)``
      * mean match distance at thresh=0.003 (0 when nothing matched)
      * mean match distance at thresh=0.007 (0 when nothing matched)
    '''
    matched, _, low_dists = get_frags_dists(row_masses, spots, thresh=0.003)
    _, _, high_dists = get_frags_dists(row_masses, spots, thresh=0.007)
    # np.mean of an empty list is nan, so fall back to 0 when there are no matches.
    low_mean = np.mean(low_dists) if len(low_dists) > 0 else 0
    high_mean = np.mean(high_dists) if len(high_dists) > 0 else 0
    n_matched = len(matched)
    return n_matched, n_matched / len(row_masses), low_mean, high_mean

In [433]:
# Compute the statistics for every spectrum with the offset left as is,
# raised by 0.5% and lowered by 0.5%. get_cols prints one line per row.
cols = get_cols(original_data, .005)

row:  0
row:  1
row:  2
row:  3
row:  4
row:  5
row:  6
row:  7
row:  8
row:  9
row:  10
row:  11
row:  12
row:  13
row:  14
row:  15
row:  16
row:  17
row:  18
row:  19
row:  20
row:  21
row:  22
row:  23
row:  24
row:  25
row:  26
row:  27
row:  28
row:  29
row:  30
row:  31
row:  32
row:  33
row:  34
row:  35
row:  36
row:  37
row:  38
row:  39
row:  40
row:  41
row:  42
row:  43
row:  44
row:  45
row:  46
row:  47
row:  48
row:  49
row:  50
row:  51
row:  52
row:  53
row:  54
row:  55
row:  56
row:  57
row:  58
row:  59
row:  60
row:  61
row:  62
row:  63
row:  64
row:  65
row:  66
row:  67
row:  68
row:  69
row:  70
row:  71
row:  72
row:  73
row:  74
row:  75
row:  76
row:  77
row:  78
row:  79
row:  80
row:  81
row:  82
row:  83
row:  84
row:  85
row:  86
row:  87
row:  88
row:  89
row:  90
row:  91
row:  92
row:  93
row:  94
row:  95
row:  96
row:  97
row:  98
row:  99
row:  100
row:  101
row:  102
row:  103
row:  104
row:  105
row:  106
row:  107
row:  108
row:  109
row:  110


row:  831
row:  832
row:  833
row:  834
row:  835
row:  836
row:  837
row:  838
row:  839
row:  840
row:  841
row:  842
row:  843
row:  844
row:  845
row:  846
row:  847
row:  848
row:  849
row:  850
row:  851
row:  852
row:  853
row:  854
row:  855
row:  856
row:  857
row:  858
row:  859
row:  860
row:  861
row:  862
row:  863
row:  864
row:  865
row:  866
row:  867
row:  868
row:  869
row:  870
row:  871
row:  872
row:  873
row:  874
row:  875
row:  876
row:  877
row:  878
row:  879
row:  880
row:  881
row:  882
row:  883
row:  884
row:  885
row:  886
row:  887
row:  888
row:  889
row:  890
row:  891
row:  892
row:  893
row:  894
row:  895
row:  896
row:  897
row:  898
row:  899
row:  900
row:  901
row:  902
row:  903
row:  904
row:  905


In [421]:
# Scratch check: the mean of an empty list is nan, which is why get_stats
# tests len() before averaging.
np.mean([])

nan

In [445]:
from sklearn.model_selection import train_test_split
# Feature table: for each variant (norm = stored masses, add = raised offset,
# sub = lowered offset) the four statistics returned by get_cols.
d = {'num_norm': cols[0][0], 'prop_norm': cols[1][0],'diff_low_norm': cols[2][0],'diff_high_norm': cols[3][0], 'num_add':cols[0][1], 'prop_add':cols[1][1], 'diff_low_add':cols[2][1], 'diff_high_add':cols[3][1], 'num_sub': cols[0][2], 'prop_sub':cols[1][2], 'diff_low_sub':cols[2][2], 'diff_high_sub':cols[3][2]}
training_data = pd.DataFrame(d)
X = training_data
# NOTE(review): the 'calibration' label was produced by get_calibration from
# the low/high-threshold mean distances and the matched proportion of
# row.masses, i.e. the same statistics that appear above as the *_norm
# features. The test score may therefore largely measure how well the model
# recovers that rule — confirm this is intended.
y = original_data['calibration']
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [447]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the fit and the reported accuracy repeat on every run;
# 42 matches the seed already passed to train_test_split. Without it each
# execution grows different trees and the score drifts between runs.
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
rfc.fit(X_train, y_train)
# Mean accuracy on the held-out 20% of rows.
rfc.score(X_test, y_test)

0.9065934065934066