In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sys import path
%matplotlib inline

In [2]:
# Colab-only setup. The Drive mount is commented out, so presumably Drive has
# to be mounted already for the %cd below to succeed.
#from google.colab import drive
#drive.mount('/content/drive')
# Work from the project's src directory so that `data_transformation` can be
# imported and the relative '../data' paths used below resolve.
# NOTE(review): hard-coded absolute path; adjust when running outside this Drive layout.
%cd /content/drive/MyDrive/PHI/ToF_ML/src

/content/drive/MyDrive/PHI/ToF_ML/src


In [3]:
from data_transformation import get_frags
# Keep only the 'FragmentMass' column of the fragment table; `spots` is what
# later cells pass to get_best_offset as `frags`.
spots = get_frags()['FragmentMass']

In [4]:
def C(channels, spec_bin_size, start_time,  mass_over_time, mass_offset):
    '''
    Convert channel numbers to masses with the quadratic calibration formula
    ((channels * .001 * spec_bin_size + start_time) * mass_over_time
    + mass_offset) ** 2.

    channels may be any sequence accepted by np.array; the result is an
    ndarray with one mass per channel.
    '''
    scaled = np.array(channels) * .001 * spec_bin_size + start_time
    root = scaled * mass_over_time + mass_offset
    return root ** 2

In [5]:
from data_transformation import recalibrate, get_best_offset

In [6]:
from ast import literal_eval

# The list-valued columns are stored in the CSV as their string form, so each
# cell is parsed back into a Python list after loading.
df_read = pd.read_csv('../data/ssl_csvs_converted_to_df.csv')
for list_col in ('intensities', 'channels'):
    df_read[list_col] = df_read[list_col].apply(literal_eval)

In [7]:
def mass_formula(channels: np.ndarray, spec_bin_size, start_time,  mass_over_time, mass_offset):
    '''
    Fast conversion from flight time to mass.

    Computes ((channels * .001 * spec_bin_size + start_time) * mass_over_time
    + mass_offset) ** 2 element-wise.

    Arguments -------
    channels: array of channel values. Plain lists and tuples are also
              accepted and converted to an ndarray first (they used to raise
              TypeError when multiplied by a float). Other inputs, such as
              scalars or pandas Series, are used as given.
    spec_bin_size, start_time, mass_over_time, mass_offset: calibration
              parameters applied in the formula above.

    Returns -------
    The masses, with the same shape as channels.
    '''
    if isinstance(channels, (list, tuple)):
        # The frames in this notebook store channels as Python lists.
        channels = np.asarray(channels)
    return ((channels * .001 * spec_bin_size + start_time) * mass_over_time + mass_offset)**2

In [9]:
def generate_calibrated_data(data, numba=False):
    '''
    Return a copy of data with a new 'masses' column holding, for each row,
    the masses computed from that row's channels and calibration columns
    (SpecBinSize, StartFlightTime, MassOverTime, MassOffset).

    The input frame is not modified. With a truthy numba, each row is
    converted by numba_mass_formula instead of mass_formula; that name is
    not defined in the visible notebook cells and must already be in scope.
    '''
    calibrated = data.copy()

    def _row_masses(row):
        # Read the calibration parameters for this spectrum.
        bin_size = row.SpecBinSize
        slope = row.MassOverTime
        intercept = row.MassOffset
        start = row.StartFlightTime
        channel_arr = np.array(row.channels)
        if numba:
            return numba_mass_formula(channel_arr, bin_size, start,
                                      slope, intercept)
        return mass_formula(channel_arr, bin_size, start, slope, intercept)

    calibrated['masses'] = [_row_masses(row) for row in calibrated.itertuples()]
    return calibrated

In [8]:
# Import the project versions of these helpers and compute masses for every
# spectrum whose channel list is non-empty.
# NOTE(review): the positional `4` is interpreted by the imported
# generate_calibrated_data, whose signature is not visible here — confirm
# what it means. With the notebook-defined version it would land in `numba`.
from data_transformation import get_fragment_stats, generate_calibrated_data
df_read = generate_calibrated_data(df_read[df_read['channels'].apply(len)> 0], 4)
# reset_index without drop=True keeps the previous row labels as an 'index'
# column. Re-running this cell on the already-reset frame raises, because an
# 'index' column already exists.
df_read.reset_index(inplace=True)

In [10]:
# Compute fragment-match statistics for the as-loaded calibration; the
# proportion column is named 'original_proportion_identified' via prop_name.
df = get_fragment_stats(df_read, prop_name='original_proportion_identified')

In [None]:
# Persist the statistics frame (row labels are not written).
df.to_csv('../data/new_1900_without_peakless_spectra.csv', index=False)

In [None]:
# Move the 'index' column to the last position: pop removes it from the frame
# in place and returns it, and the assignment appends it again at the end.
a = df.pop('index')
df['index'] = a

In [None]:
def get_best_offset(spectrum, slope_range, offset_range, prev=0,
                    offsets=30, slopes=20, first=True, frags=None):
    '''
    Grid-search the slope/offset adjustment that best calibrates a spectrum.

    Calibration is measured using mass fragments. A spectrum with more matches
    to known masses is better calibrated than one with fewer, and a spectrum
    whose matches lie very close to the known masses is better calibrated than
    one whose matches lie further away.

    Each candidate (slope, offset) pair is scored by ``recalibrate``, which
    returns a proportion plus a low and a high distance. A candidate replaces
    the current best when its proportion is strictly higher, or when the
    proportion ties and both distances are strictly smaller. Every grid value
    is tried with both signs. When this call's best proportion beats ``prev``,
    the function recurses on ranges rebuilt around the best values found
    (20 offsets x 5 slopes) and keeps the recursive result if its proportion
    is at least as good.

    Arguments -------
    spectrum: row from dataframe containing information on a spectrum. This
              function reads ``MassOffset`` and positional element 2 from it,
              plus ``file_name`` and ``proportions_peaks_identified`` when
              ``first`` is True.
    slope_range: two-element sequence; ``slopes`` evenly spaced slope
                 adjustments between its entries are tried. Slope errors
                 are typically smaller than offset errors.
    offset_range: two-element sequence; ``offsets`` evenly spaced offset
                  adjustments between its entries are tried. Recursive calls
                  receive a range rebuilt around the best offset so far.
    prev: best proportion reached by the calling level; recursion only
          continues while this call improves on it.
    offsets: number of offset grid points.
    slopes: number of slope grid points.
    first: True for the top-level call; enables the progress printout.
    frags: pandas Series of known fragment masses. Anything that is not a
           Series is replaced by the result of ``get_frags()``.

    Returns -------
    Tuple ``(proportions, low_distances, high_distances, ys, best_prop,
    best_offset, best_slope)``. The first four are the per-candidate lists
    from the last slope pass in which the best was improved (or those of the
    recursive call when its result is adopted); the last three describe the
    winning candidate.
    '''
    if first:
        print('optimizing ' + spectrum.file_name)
        print('initial proportion: ' + str(spectrum.proportions_peaks_identified))
    if not isinstance(frags, pd.Series):
        frags = get_frags()
    proportions = [0]
    low_distances = []
    high_distances = []
    best_prop = 0
    # NOTE(review): these two start as values read from the spectrum itself,
    # but are later overwritten with candidate adjustments (`val`,
    # `slope_val`). If no candidate ever wins, the spectrum's own values are
    # what gets returned — confirm that is intended.
    best_offset = spectrum.MassOffset
    # NOTE(review): positional lookup; presumably the slope column — confirm
    # against the column order of the frame the row comes from.
    best_slope = spectrum[2]
    best_ld = 1
    best_hd = 1
    ys = [] 
    i = 0
    # Every grid value is tried with a positive and a negative sign.
    mults = [1, -1]
    slope_space = np.linspace(slope_range[0], slope_range[1], slopes)
    while 1:
        slope = slope_space[i]
        for slope_mult in mults:
            slope_val = slope * slope_mult
            # Per-pass records; only kept if this pass improves the best.
            props = []
            low_dists = []
            high_dists = []
            y = []
            space = np.linspace(offset_range[0], offset_range[1], offsets)
            j = 0
            changed = False
            while 1:
                offset = space[j]
                for mult in mults:
                    improved = False
                    val = mult * offset
                    y.append(val)
                    # NOTE(review): the meaning of the trailing 4 is defined
                    # by recalibrate, which is not visible here.
                    prop, low, high = recalibrate(spectrum, frags, slope_val, val, 4)
                    if prop > best_prop:
                        improved=True
                    elif (prop == best_prop and best_hd - high > 0 and
                          best_ld - low > 0):
                        # Tie on proportion: require both distances to shrink.
                        improved = True
                    props.append(prop)
                    low_dists.append(low)
                    high_dists.append(high)
                    if improved:
                        best_prop = prop 
                        best_offset = val
                        best_slope = slope_val
                        best_ld = low
                        best_hd = high
                        # mult * best_offset undoes the sign, recovering the
                        # grid value so its position can be looked up. `edge`
                        # is near 0 when the winner sits mid-grid and near 1
                        # when it sits at either end of the grid.
                        edge = (np.where(space == mult * best_offset)[0][0] + .1)/ len(space)
                        edge = 2 * abs(.5 - edge)
                        # Same normalised position, for the slope grid.
                        slope_edge = np.where(slope_space == slope_mult * best_slope)[0][0]
                        slope_edge = 2 * abs(.5 - (slope_edge + .1) / len(slope_space))
                        changed = True            
                j += 1
                if j >= len(space):
                    break
            if changed:
                ys = y 
                proportions = props 
                low_distances = low_dists
                high_distances = high_dists
        i += 1
        if i >= len(slope_space):
            break
    print(best_prop)
    # NOTE(review): `edge` and `slope_edge` are only bound once a candidate
    # has won; with a negative `prev` and no winner this branch would hit
    # unbound names.
    if best_prop > prev:
            # New search ranges: best value -/+ (.5 / edge) times itself, so
            # a winner near the middle of the grid yields a wider range.
            a = best_offset - (.5/edge) * best_offset
            b = best_offset + (.5/edge) * best_offset
            c = best_slope + (.5 / slope_edge) * best_slope
            d = best_slope - (.5 / slope_edge) * best_slope
            p, ld, hd, yss, bp, bo, bs = get_best_offset(spectrum,
                                                            slope_range=[c,d],
                                                            offset_range=[a, b],
                                                            prev=best_prop,
                                                            offsets=20,
                                                            slopes=5, 
                                                            first=False, 
                                                            frags=frags)
            # Adopt the refined result only if it is at least as good.
            if bp >= best_prop:
                proportions = p
                low_distances = ld
                high_distances = hd
                ys = yss
                best_prop = bp
                best_offset = bo
                best_slope = bs
    return (proportions, low_distances, high_distances, ys, best_prop,
            best_offset, best_slope)

In [None]:
# Search the best calibration adjustment for the first chunk of spectra and
# collect the winning offset and slope per file.
# Note: .loc slicing is label-based and inclusive, so 0:478 covers labels 0
# through 478.
offsets = []
slopes = []
files = []
indices = []
for i, row in enumerate(df.loc[0:478].itertuples()):
    print(i)
    # Only the last two return values (best offset, best slope) are needed;
    # the per-candidate lists and the best proportion are discarded.
    (_props, _lows, _highs, _ys, _best_prop,
     best_offset, best_slope) = get_best_offset(row, [.001, .0000001],
                                                [.001, .000001], offsets=20,
                                                slopes=20,
                                                prev=0,
                                                frags=spots)
    offsets.append(best_offset)
    slopes.append(best_slope)
    files.append(row.file_name)
    indices.append(row.index)

In [None]:
# Spot-check a single entry of the fragment-mass series by label.
spots[2238]

148.05236000000005

In [None]:
# Save the adjustments found for this chunk, one row per spectrum.
pd.DataFrame({
    'offsets': offsets,
    'slopes': slopes,
    'names': files,
    'indices': indices,
}).to_csv('../data/updated_calibration_new_1900_first_478.csv', index=False)

In [None]:
# Load the four per-chunk result files. The variable letters do not follow
# chunk order; the cell that concatenates them uses the order a, d, b, c.
a, b, c, d = map(pd.read_csv, [
    '../data/updated_calibration_new_1900_first_478.csv',
    '../data/updated_calibration_new_1900_first_956_478.csv',
    '../data/updated_calibration_new_1900_first_beyond_1434.csv',
    '../data/updated_calibration_new_1900_first_478_956.csv',
])

In [None]:
# Stitch the chunks together in the order a, d, b, c. Row label 478 is dropped
# from the first three chunks — presumably the row each chunk shares with the
# start of the next one, since the chunks were cut with inclusive label slices.
mods_1910 = pd.concat(
    [a.drop(478), d.drop(478), b.drop(478), c],
    ignore_index=True,
)

In [15]:
!ls ../data

 all_calibrated_data.csv
 best_model.hdf5
'calibrating new 1910'
 classification_cas_data.csv
 df_best_model.csv
 df_metrics.csv
 Elements.txt
 epochs_loss.png
 fixed_1400.csv
'fixing 1441 calibration'
'Fragment Table.csv'
 full_data_with_non_calibrateds.csv
 history.csv
 last_model.hdf5
 model_init.hdf5
 new_1900_without_peakless_spectra.csv
 new_spectra
 new_spectra.csv
 processed_cas.csv
 SpectraCsvFiles_BkgndSubtractWatsonPeakFinder
 SSL_Csvs
 ssl_csvs_converted_to_df.csv
 test_duration.csv
 titanic
 training_data_no_NPZ_.0001.csv
 training_data_no_NPZ_.003.csv
 training_data_no_NPZ_.004.csv
 y_pred.npy


In [14]:
# Load the combined adjustments from disk. This rebinds mods_1910, replacing
# whatever an earlier cell may have assembled in memory.
mods_1910 = pd.read_csv('../data/calibrating new 1910/mods_for_1910.csv')

In [14]:
# Show the last rows of the loaded adjustments.
mods_1910.tail()

Unnamed: 0,offsets,slopes,names,indices
1906,2.7e-05,-8.527342e-05,C0057-V1.cas,1907
1907,-0.000947,1e-07,C0058-U1.cas,1908
1908,-0.001637,-5.272632e-05,C0059-U1.cas,1909
1909,-0.000682,4.505495e-08,C0058-V1.cas,1910
1910,-0.000685,-1e-07,C0060-U1.cas,1911


In [12]:
# Work on a copy so the statistics frame `df` itself is left unchanged.
new = df.copy()
new.head()

Unnamed: 0,index,file_name,technique,StartFlightTime,MassOverTime,MassOffset,SpecBinSize,intensities,channels,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportion_identified,original_proportion_identified,diff,prop_diff_in_low,calibration
0,0,C0059-V1.cas,-TofSIMS,0.0,1.69097,0.7095,0.128,"[10106.0, 32018.0, 4303.0, 816.0, 44497.0, 346...","[12729.6134, 13388.07292, 14021.38724, 14625.2...","[12.004495885110552, 13.012398275521816, 14.02...",0.001419,0.003314,0.307692,0.294737,0.001895,1.335205,0
1,1,C0060-V1.cas,-TofSIMS,0.0,1.68665,0.7071,0.128,"[37242.0, 67973.0, 112.0, 5361.0, 4489.0, 2193...","[12773.45818, 13433.49441, 13495.05187, 14068....","[12.004681147019333, 13.01242014410188, 13.108...",0.001261,0.002194,0.409091,0.393939,0.000933,0.740399,0
2,2,C0061-U1.cas,+TofSIMS,0.0,1.68482,0.70859,0.128,"[483.0, 782.0, 1374.0, 2985.0, 152.0, 302.0, 3...","[12779.34033, 13440.22492, 14076.03163, 14689....","[12.003062954788383, 13.010939902638805, 14.01...",0.001245,0.002281,0.428571,0.416058,0.001036,0.831804,0
3,3,C0061-V1.cas,-TofSIMS,0.0,1.69038,0.70903,0.128,"[2675.0, 1381.0, 674.0, 196.0, 93.0, 50539.0, ...","[1381.98178, 1400.517, 1412.53457, 1440.11759,...","[1.016159799036636, 1.024261311539043, 1.02953...",0.001129,0.002034,0.305155,0.24183,0.000905,0.801556,0
4,4,C0062-U1.cas,+TofSIMS,0.0,1.68314,0.70792,0.128,"[565.0, 835.0, 1243.0, 2532.0, 99.0, 220.0, 14...","[12795.06881, 13456.57005, 14092.96684, 14707....","[12.002858579352937, 13.010660285835183, 14.01...",0.001288,0.002298,0.427509,0.424354,0.001011,0.784703,0


In [15]:
# Apply the adjustments to the calibration columns in one vectorised step.
# Each adjustment is relative: value + adjustment * value.
# Rows are matched by label: the row labels of mods_1910 select the rows of
# `new`, and the result is relabelled 0..n-1 in mods_1910 order. Selecting
# whole rows at once keeps the original column dtypes and avoids growing a
# frame one row at a time.
modified_new = new.loc[mods_1910.index].copy().reset_index(drop=True)
slope_adjustment = mods_1910['slopes'].to_numpy()
offset_adjustment = mods_1910['offsets'].to_numpy()
modified_new['MassOverTime'] = (modified_new['MassOverTime']
                                + slope_adjustment * modified_new['MassOverTime'])
modified_new['MassOffset'] = (modified_new['MassOffset']
                              + offset_adjustment * modified_new['MassOffset'])

In [16]:
# Check the adjusted MassOverTime / MassOffset values on the first rows.
modified_new.head()

Unnamed: 0,index,file_name,technique,StartFlightTime,MassOverTime,MassOffset,SpecBinSize,intensities,channels,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportion_identified,original_proportion_identified,diff,prop_diff_in_low,calibration
0,0,C0059-V1.cas,-TofSIMS,0.0,1.69097,0.708925,0.128,"[10106.0, 32018.0, 4303.0, 816.0, 44497.0, 346...","[12729.6134, 13388.07292, 14021.38724, 14625.2...","[12.004495885110552, 13.012398275521816, 14.02...",0.001419,0.003314,0.307692,0.294737,0.001895,1.335205,0
1,1,C0060-V1.cas,-TofSIMS,0.0,1.68665,0.706374,0.128,"[37242.0, 67973.0, 112.0, 5361.0, 4489.0, 2193...","[12773.45818, 13433.49441, 13495.05187, 14068....","[12.004681147019333, 13.01242014410188, 13.108...",0.001261,0.002194,0.409091,0.393939,0.000933,0.740399,0
2,2,C0061-U1.cas,+TofSIMS,0.0,1.68482,0.708105,0.128,"[483.0, 782.0, 1374.0, 2985.0, 152.0, 302.0, 3...","[12779.34033, 13440.22492, 14076.03163, 14689....","[12.003062954788383, 13.010939902638805, 14.01...",0.001245,0.002281,0.428571,0.416058,0.001036,0.831804,0
3,3,C0061-V1.cas,-TofSIMS,0.0,1.69038,0.708417,0.128,"[2675.0, 1381.0, 674.0, 196.0, 93.0, 50539.0, ...","[1381.98178, 1400.517, 1412.53457, 1440.11759,...","[1.016159799036636, 1.024261311539043, 1.02953...",0.001129,0.002034,0.305155,0.24183,0.000905,0.801556,0
4,4,C0062-U1.cas,+TofSIMS,0.0,1.683051,0.707803,0.128,"[565.0, 835.0, 1243.0, 2532.0, 99.0, 220.0, 14...","[12795.06881, 13456.57005, 14092.96684, 14707....","[12.002858579352937, 13.010660285835183, 14.01...",0.001288,0.002298,0.427509,0.424354,0.001011,0.784703,0


In [17]:
# Recompute the 'masses' column from the adjusted calibration parameters.
fixed_1910 = generate_calibrated_data(modified_new)

In [18]:
# Recompute fragment statistics on the recalibrated masses. This rebinds
# fixed_1910 to the result, so re-running the cell feeds the output back in.
fixed_1910 = get_fragment_stats(fixed_1910)

In [19]:
# Inspect the recalibrated frame, now including the new proportion columns.
fixed_1910.head()

Unnamed: 0,index,file_name,technique,StartFlightTime,MassOverTime,MassOffset,SpecBinSize,intensities,channels,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportion_identified,original_proportion_identified,diff,prop_diff_in_low,calibration,adjusted_proportion_identified,proportion_identified
0,0,C0059-V1.cas,-TofSIMS,0.0,1.69097,0.708925,0.128,"[10106.0, 32018.0, 4303.0, 816.0, 44497.0, 346...","[12729.6134, 13388.07292, 14021.38724, 14625.2...","[12.000516134304917, 13.00825496659651, 14.015...",0.00093,0.0014,0.307692,0.294737,0.00047,0.504991,0,0.538462,0.515789
1,1,C0060-V1.cas,-TofSIMS,0.0,1.68665,0.706374,0.128,"[37242.0, 67973.0, 112.0, 5361.0, 4489.0, 2193...","[12773.45818, 13433.49441, 13495.05187, 14068....","[11.999648578820876, 13.007180420340381, 13.10...",0.000994,0.001364,0.409091,0.393939,0.00037,0.372432,1,0.562937,0.542088
2,2,C0061-U1.cas,+TofSIMS,0.0,1.68482,0.708105,0.128,"[483.0, 782.0, 1374.0, 2985.0, 152.0, 302.0, 3...","[12779.34033, 13440.22492, 14076.03163, 14689....","[11.999700337252365, 13.007438841283296, 14.01...",0.001181,0.001583,0.428571,0.416058,0.000402,0.340298,1,0.575188,0.558394
3,3,C0061-V1.cas,-TofSIMS,0.0,1.69038,0.708417,0.128,"[2675.0, 1381.0, 674.0, 196.0, 93.0, 50539.0, ...","[1381.98178, 1400.517, 1412.53457, 1440.11759,...","[1.014923540233473, 1.0230201316204572, 1.0282...",0.001156,0.001566,0.305155,0.24183,0.00041,0.354811,0,0.381443,0.302288
4,4,C0062-U1.cas,+TofSIMS,0.0,1.683051,0.707803,0.128,"[565.0, 835.0, 1243.0, 2532.0, 99.0, 220.0, 14...","[12795.06881, 13456.57005, 14092.96684, 14707....","[12.0010393218355, 13.008711984543739, 14.0164...",0.001284,0.001697,0.427509,0.424354,0.000412,0.321084,1,0.609665,0.605166


In [20]:
# Sanity check: list spectra whose original adjusted proportion is strictly
# higher than the recalibrated one, i.e. rows that recalibration made worse.
fixed_1910[fixed_1910['adjusted_original_proportion_identified'] > fixed_1910['adjusted_proportion_identified']]

Unnamed: 0,index,file_name,technique,StartFlightTime,MassOverTime,MassOffset,SpecBinSize,intensities,channels,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportion_identified,original_proportion_identified,diff,prop_diff_in_low,calibration,adjusted_proportion_identified,proportion_identified


In [21]:
# Reduce the recalibrated frame to the columns shared with the older dataset,
# in the column order used for the merge.
merge1 = fixed_1910[[
    'file_name',
    'MassOverTime',
    'MassOffset',
    'StartFlightTime',
    'SpecBinSize',
    'channels',
    'intensities',
    'masses',
    'adjusted_original_proportion_identified',
]].copy()

In [22]:
# Load the previously fixed dataset; its list-valued columns are stored as
# strings and are parsed back into Python lists.
old = pd.read_csv('../data/fixed_1400.csv')
for list_col in ('intensities', 'channels', 'masses'):
    old[list_col] = old[list_col].apply(literal_eval)
old.head()

Unnamed: 0,file_name,Mass/Time,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportions_identified,original_proportions_identified,diff,prop_diff_in_low,calibration,adjusted_proportion_identified,proportion_identified
0,0106301.cas,0.387384,-0.275004,0.0,1.248,"[2644.0367300000003, 3505.0183700000002, 4162....","[73874, 1234, 138, 610, 1216, 4159, 8958, 1084...","[1.0065519723918102, 2.015029094672708, 3.0191...",0.001298,0.002255,0.398438,0.398438,0.000958,0.738174,0,0.515625,0.515625
1,0107316.cas,0.387113,-0.278302,0.0,1.248,"[2647.00072, 3508.9949100000003, 4164.59326000...","[49864, 1034, 168, 4696, 8247, 13992, 17903, 2...","[1.00101811517532, 2.0077555328930656, 3.00565...",0.001537,0.002586,0.129771,0.129771,0.001049,0.682225,0,0.51145,0.51145
2,0110203.cas,0.379037,-0.271056,0.0,4.992,"[1973.87665, 2049.0706800000003, 2122.01224, 2...","[23352, 74717, 10387, 947, 12344, 9121, 249, 4...","[11.998071176139083, 13.003971096434277, 14.01...",0.00164,0.001858,0.388889,0.388889,0.000218,0.1331,0,0.444444,0.444444
3,0110212.cas,0.379177,-0.269744,0.0,4.992,"[672.00298, 891.90543, 1970.94521, 2046.11295,...","[34398, 304, 2223, 3521, 5205, 6509, 99, 115, ...","[1.0045194511091773, 2.012140966655108, 11.978...",0.001337,0.002206,0.37963,0.37963,0.000868,0.649178,0,0.592593,0.592593
4,0116511.cas,0.38336,-0.302184,0.0,1.248,"[2726.98153, 3594.53649, 4265.28736, 7866.5038...","[42995, 602, 151, 17912, 9536, 17609, 29604, 5...","[1.0049940659400325, 2.0094784706009245, 3.022...",0.001397,0.002171,0.350427,0.336066,0.000774,0.554114,0,0.487179,0.467213


In [35]:
# Summary statistics of the stored MassOffset values in the older dataset.
old.MassOffset.describe()

count    1437.000000
mean       -0.461531
std         0.412082
min        -1.656402
25%        -0.713084
50%        -0.513035
75%        -0.000046
max         0.001711
Name: MassOffset, dtype: float64

In [34]:
# Same sanity check for the older dataset: rows whose original adjusted
# proportion is strictly higher than the current one.
old[old['adjusted_original_proportions_identified'] > old['adjusted_proportion_identified']]

Unnamed: 0,file_name,Mass/Time,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportions_identified,original_proportions_identified,diff,prop_diff_in_low,calibration,adjusted_proportion_identified,proportion_identified


In [24]:
# Select the shared columns from the older dataset and rename the two whose
# names differ, so the result has the same columns as merge1.
merge2 = old[[
    'file_name',
    'Mass/Time',
    'MassOffset',
    'StartFlightTime',
    'SpecBinSize',
    'channels',
    'intensities',
    'masses',
    'adjusted_original_proportions_identified',
]].rename(columns={
    'Mass/Time': 'MassOverTime',
    'adjusted_original_proportions_identified': 'adjusted_original_proportion_identified',
})

In [25]:
# Preview the recalibrated subset before concatenation.
merge1.head()

Unnamed: 0,file_name,MassOverTime,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,adjusted_original_proportion_identified
0,C0059-V1.cas,1.69097,0.708925,0.0,0.128,"[12729.6134, 13388.07292, 14021.38724, 14625.2...","[10106.0, 32018.0, 4303.0, 816.0, 44497.0, 346...","[12.000516134304917, 13.00825496659651, 14.015...",0.307692
1,C0060-V1.cas,1.68665,0.706374,0.0,0.128,"[12773.45818, 13433.49441, 13495.05187, 14068....","[37242.0, 67973.0, 112.0, 5361.0, 4489.0, 2193...","[11.999648578820876, 13.007180420340381, 13.10...",0.409091
2,C0061-U1.cas,1.68482,0.708105,0.0,0.128,"[12779.34033, 13440.22492, 14076.03163, 14689....","[483.0, 782.0, 1374.0, 2985.0, 152.0, 302.0, 3...","[11.999700337252365, 13.007438841283296, 14.01...",0.428571
3,C0061-V1.cas,1.69038,0.708417,0.0,0.128,"[1381.98178, 1400.517, 1412.53457, 1440.11759,...","[2675.0, 1381.0, 674.0, 196.0, 93.0, 50539.0, ...","[1.014923540233473, 1.0230201316204572, 1.0282...",0.305155
4,C0062-U1.cas,1.683051,0.707803,0.0,0.128,"[12795.06881, 13456.57005, 14092.96684, 14707....","[565.0, 835.0, 1243.0, 2532.0, 99.0, 220.0, 14...","[12.0010393218355, 13.008711984543739, 14.0164...",0.427509


In [26]:
# Preview the older subset; its columns should line up with merge1.
merge2.head()

Unnamed: 0,file_name,MassOverTime,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,adjusted_original_proportion_identified
0,0106301.cas,0.387384,-0.275004,0.0,1.248,"[2644.0367300000003, 3505.0183700000002, 4162....","[73874, 1234, 138, 610, 1216, 4159, 8958, 1084...","[1.0065519723918102, 2.015029094672708, 3.0191...",0.398438
1,0107316.cas,0.387113,-0.278302,0.0,1.248,"[2647.00072, 3508.9949100000003, 4164.59326000...","[49864, 1034, 168, 4696, 8247, 13992, 17903, 2...","[1.00101811517532, 2.0077555328930656, 3.00565...",0.129771
2,0110203.cas,0.379037,-0.271056,0.0,4.992,"[1973.87665, 2049.0706800000003, 2122.01224, 2...","[23352, 74717, 10387, 947, 12344, 9121, 249, 4...","[11.998071176139083, 13.003971096434277, 14.01...",0.388889
3,0110212.cas,0.379177,-0.269744,0.0,4.992,"[672.00298, 891.90543, 1970.94521, 2046.11295,...","[34398, 304, 2223, 3521, 5205, 6509, 99, 115, ...","[1.0045194511091773, 2.012140966655108, 11.978...",0.37963
4,0116511.cas,0.38336,-0.302184,0.0,1.248,"[2726.98153, 3594.53649, 4265.28736, 7866.5038...","[42995, 602, 151, 17912, 9536, 17609, 29604, 5...","[1.0049940659400325, 2.0094784706009245, 3.022...",0.350427


In [27]:
# Stack the older dataset on top of the recalibrated one. Each part keeps its
# own row labels here, so labels repeat until the index is reset.
full_dataset = pd.concat([merge2, merge1])

In [30]:
# Display the combined frame (pandas truncates the rendering of long frames).
full_dataset

Unnamed: 0,file_name,MassOverTime,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,adjusted_original_proportion_identified
0,0106301.cas,0.387384,-0.275004,0.0,1.248,"[2644.0367300000003, 3505.0183700000002, 4162....","[73874, 1234, 138, 610, 1216, 4159, 8958, 1084...","[1.0065519723918102, 2.015029094672708, 3.0191...",0.398438
1,0107316.cas,0.387113,-0.278302,0.0,1.248,"[2647.00072, 3508.9949100000003, 4164.59326000...","[49864, 1034, 168, 4696, 8247, 13992, 17903, 2...","[1.00101811517532, 2.0077555328930656, 3.00565...",0.129771
2,0110203.cas,0.379037,-0.271056,0.0,4.992,"[1973.87665, 2049.0706800000003, 2122.01224, 2...","[23352, 74717, 10387, 947, 12344, 9121, 249, 4...","[11.998071176139083, 13.003971096434277, 14.01...",0.388889
3,0110212.cas,0.379177,-0.269744,0.0,4.992,"[672.00298, 891.90543, 1970.94521, 2046.11295,...","[34398, 304, 2223, 3521, 5205, 6509, 99, 115, ...","[1.0045194511091773, 2.012140966655108, 11.978...",0.379630
4,0116511.cas,0.383360,-0.302184,0.0,1.248,"[2726.98153, 3594.53649, 4265.28736, 7866.5038...","[42995, 602, 151, 17912, 9536, 17609, 29604, 5...","[1.0049940659400325, 2.0094784706009245, 3.022...",0.350427
...,...,...,...,...,...,...,...,...,...
3343,C0057-V1.cas,1.690576,0.709349,0.0,0.128,"[13390.71602, 14024.11022, 14627.8413, 15205.0...","[10551.0, 1068.0, 431.0, 43369.0, 22470.0, 97....","[13.010562184883582, 14.018121625729533, 15.01...",0.298507
3344,C0058-U1.cas,1.684620,0.706110,0.0,0.128,"[13450.94882, 14086.54094, 14700.04437, 15861....","[227.0, 488.0, 1396.0, 138.0, 1305.0, 892.0, 2...","[13.007253820832963, 14.014620819235093, 15.02...",0.519126
3345,C0059-U1.cas,1.689831,0.103121,0.0,0.128,"[15540.00119, 16198.67257, 16832.47514, 17444....","[470.0, 948.0, 2328.0, 6228.0, 178.0, 425.0, 1...","[12.002050812136709, 13.009490620172128, 14.01...",0.349785
3346,C0058-V1.cas,1.694800,0.710445,0.0,0.128,"[12693.92252, 13350.93471, 13982.99081, 14584....","[7950.0, 17953.0, 1973.0, 1141.0, 56712.0, 342...","[12.000639967164542, 13.008447457907286, 14.01...",0.381250


In [29]:
# Replace the repeated row labels left by the concatenation with a fresh
# 0..n-1 index; the old labels are discarded (drop=True).
full_dataset.reset_index(inplace=True, drop=True)

In [31]:
# get_fragment_stats is also imported in an earlier cell; this repeat import
# is harmless.
from data_transformation import get_fragment_stats
# Recompute fragment statistics for the combined dataset. This rebinds
# full_dataset to the result, so re-running feeds the output back in.
full_dataset = get_fragment_stats(full_dataset)

In [46]:
# Sanity check on the combined data: rows whose original adjusted proportion
# is strictly higher than the freshly computed one.
full_dataset[full_dataset['adjusted_original_proportion_identified'] > full_dataset['adjusted_proportion_identified']]

Unnamed: 0,file_name,MassOverTime,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,adjusted_original_proportion_identified,avg_dist_frags_low,avg_dist_frags_high,adjusted_proportion_identified,proportion_identified,diff,prop_diff_in_low,calibration
198,CHA152NC.cas,0.3794,4.928744e-06,0.0,0.138,"[66160.94769, 68883.49382, 71502.52432000001, ...","[6114, 12392, 2133, 911, 47630, 175, 26887, 92...","[11.999355928572399, 13.007228974127232, 14.01...",0.481481,0.001418,0.002351,0.481481,0.481481,0.000933,0.657426,0
212,CHB101NE.cas,0.379355,-0.0003188741,0.0,0.138,"[19103.48159, 19183.00116, 19310.47349, 19525....","[355, 1564539, 2480, 215, 364, 453, 366, 369, ...","[0.9995340956203119, 1.0078753399362244, 1.021...",0.247619,0.000877,0.001793,0.247619,0.247619,0.000917,1.045541,0
229,CHB106NE.cas,0.379106,-5.190784e-07,0.0,0.138,"[19114.530430000003, 19195.02395, 19322.1058, ...","[369, 1821309, 2909, 600, 352, 87, 342, 288, 3...","[1.0000161650691828, 1.0084562729738886, 1.021...",0.152047,0.000784,0.001566,0.152047,0.152047,0.000781,0.99617,0
234,CHB107NG.cas,0.3791,-9.898005e-05,0.0,0.138,"[19117.30504, 19197.938769999997, 19325.45575,...","[650, 2945305, 3870, 464, 3151, 330, 728, 508,...","[1.0000775837019045, 1.0085325478374099, 1.021...",0.191489,0.000642,0.001106,0.191489,0.191489,0.000463,0.72147,0
281,CHB144PC.cas,0.376776,-1.78476e-05,0.0,0.138,"[27302.11627, 50940.98393, 66622.00967999999, ...","[479, 1128, 691, 1317, 5921, 23671, 1172, 293,...","[2.0151487874388714, 7.015428774960274, 11.999...",0.469595,0.00106,0.001844,0.469595,0.469595,0.000784,0.739379,0
282,CHB145NA.cas,0.379186,-6.141472e-05,0.0,0.138,"[27122.26129, 66116.96467999999, 66199.46774, ...","[159, 159, 140112, 305, 457, 399855, 747, 5220...","[2.0140864402939513, 11.969452846498372, 11.99...",0.396552,0.00107,0.001649,0.396552,0.396552,0.000578,0.540329,0
305,CHB21NC.cas,0.379113,-0.0001889905,0.0,0.138,"[19112.90874, 19194.01205, 19321.931669999998,...","[473, 1872786, 5121, 324, 73, 759, 506, 396, 3...","[0.9995040397238781, 1.0080061957016224, 1.021...",0.39759,0.000764,0.000903,0.39759,0.39759,0.000139,0.181388,0
306,CHB21ND.cas,0.379125,-5.024241e-05,0.0,0.138,"[19109.49791, 19189.96713, 19317.927330000002,...","[730, 2812299, 6473, 203, 160, 1103, 357, 281,...","[0.9994865352644157, 1.0079222671007846, 1.021...",0.18705,0.001172,0.001338,0.18705,0.18705,0.000166,0.141656,0
319,CHB49NE.cas,0.379121,-6.167168e-05,0.0,0.138,"[22664.71569, 22758.475609999998, 22785.76336,...","[117, 176, 152, 46, 136, 51, 251, 270, 51, 178...","[1.4059462994045338, 1.4176032696417233, 1.421...",0.476923,0.000768,0.001124,0.476923,0.476923,0.000356,0.464031,0
326,CHB53PA.cas,0.380719,-8.856237e-05,0.0,0.138,"[65937.4877, 68647.44929, 71257.45606, 73775.4...","[167, 604, 2846, 11296, 128, 217, 545, 7388, 5...","[12.000784882750796, 13.007521293496552, 14.01...",0.47541,0.000971,0.001729,0.47541,0.353659,0.000758,0.779874,0


In [49]:
# Look up the stored calibration and masses for one specific file.
old[old['file_name']=='CH3 Bi+ Pos-055.cas']

Unnamed: 0,file_name,Mass/Time,MassOffset,StartFlightTime,SpecBinSize,channels,intensities,masses,avg_dist_frags_low,avg_dist_frags_high,adjusted_original_proportions_identified,original_proportions_identified,diff,prop_diff_in_low,calibration,adjusted_proportion_identified,proportion_identified
859,CH3 Bi+ Pos-055.cas,0.38288,-0.712269,0.0,0.128,"[15779.51561, 15890.33012, 15906.35714, 15935....","[701.0, 603.0, 143.0, 220.0, 235.0, 826.0, 255...","[0.003728766501089724, 0.00442151776525263, 0....",0.000937,0.001345,0.408046,0.408046,0.000408,0.435582,0,0.408046,0.408046


In [51]:
# Recompute the masses of row 859 by hand for comparison with its stored
# 'masses' value.
# NOTE(review): the calibration numbers are typed in as literals, presumably
# copied from that row — confirm they still match it.
mass_formula(np.array(old.loc[859]['channels']), 0.128, 0, 0.38288,-0.7122692)

array([3.72873881e-03, 4.42148772e-03, 4.52656217e-03, 4.71848051e-03,
       6.60603888e-03, 7.11796450e-03, 7.52432280e-03, 8.40527920e-03,
       9.67438846e-03, 1.05104098e-02, 1.07326941e-02, 1.09953643e-02,
       1.19062747e-02, 1.20831109e-02, 1.29147271e-02, 1.77023039e-02,
       1.82481459e-02, 2.16964537e-02, 1.00794142e+00, 1.00971362e+00,
       1.01422788e+00, 1.04433281e+00, 1.04747724e+00, 1.05577368e+00,
       1.05960972e+00, 1.06510869e+00, 1.06779570e+00, 1.07197249e+00,
       1.09469018e+00, 1.19993225e+01, 1.20173093e+01, 1.30073463e+01,
       1.30246717e+01, 1.40104732e+01, 1.40150467e+01, 1.40321950e+01,
       1.50234889e+01, 1.59945604e+01, 1.60159343e+01, 1.70025143e+01,
       1.70256698e+01, 1.70379820e+01, 1.79986752e+01, 1.89976985e+01,
       2.39994562e+01, 2.40470233e+01, 2.50071599e+01, 2.50581794e+01,
       2.53332586e+01, 2.55363690e+01, 2.56512555e+01, 2.57202985e+01,
       2.57767153e+01, 2.58582974e+01, 2.59475435e+01, 2.60028469e+01,
      

In [55]:
# Inspect the intensities column of the combined frame.
full_dataset['intensities']

0       [73874, 1234, 138, 610, 1216, 4159, 8958, 1084...
1       [49864, 1034, 168, 4696, 8247, 13992, 17903, 2...
2       [23352, 74717, 10387, 947, 12344, 9121, 249, 4...
3       [34398, 304, 2223, 3521, 5205, 6509, 99, 115, ...
4       [42995, 602, 151, 17912, 9536, 17609, 29604, 5...
                              ...                        
3343    [10551.0, 1068.0, 431.0, 43369.0, 22470.0, 97....
3344    [227.0, 488.0, 1396.0, 138.0, 1305.0, 892.0, 2...
3345    [470.0, 948.0, 2328.0, 6228.0, 178.0, 425.0, 1...
3346    [7950.0, 17953.0, 1973.0, 1141.0, 56712.0, 342...
3347    [408.0, 687.0, 1121.0, 2405.0, 105.0, 181.0, 9...
Name: intensities, Length: 3348, dtype: object

In [71]:
# Write the combined dataset. The 'masses' entries are converted to plain
# Python lists of built-in numbers first: to_csv stores each cell as its
# string form, and the string form of a numpy array (space-separated, elided
# when long) cannot be parsed back with literal_eval. Converting here makes
# the export correct no matter which order the cells run in, and the
# in-memory frame is left untouched.
export_frame = full_dataset.assign(
    masses=full_dataset['masses'].apply(lambda m: np.asarray(m).tolist())
)
export_frame.to_csv('../data/full_corrected_data.csv', index=False)

In [70]:
# Turn every 'masses' entry into a plain Python list.
# NOTE(review): this cell sits after the to_csv cell in the notebook but has
# the earlier execution count (70 vs 71); it looks like it has to run before
# the export for the CSV to be readable again — confirm and reorder.
full_dataset['masses'] = full_dataset['masses'].apply(list)

In [None]:
# Round-trip check: reload the exported CSV and parse each list-valued column
# back from its string form.
test = pd.read_csv('../data/full_corrected_data.csv')
for list_col in ('channels', 'masses', 'intensities'):
    test[list_col] = test[list_col].apply(literal_eval)

In [64]:
# Leftover debugging cell: tries to parse one 'masses' cell on its own. The
# saved output is a SyntaxError.
# NOTE(review): presumably run while 'masses' still held raw strings written
# from numpy arrays; remove once the export/reload round trip works.
literal_eval(test.masses[3300])

SyntaxError: ignored