In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sys import path
%matplotlib inline

In [2]:
path.insert(0, '../src')
from data_generator import DataGenerator
from model_trainer import ModelTrainer

In [31]:
from data_transformation import get_better_spectra, get_precise_peaks
dg = DataGenerator('../data/classification_cas_data.csv')
norm_data = dg.df()

In [42]:
data = get_better_spectra(dir='../data/SpectraCsvFiles_BkgndSubtractWatsonPeakFinder/')
norm_data.sort_values('file_name', inplace=True)
data.sort_values('file_name', inplace=True)
original_data = pd.merge(data, norm_data, on='file_name')

In [43]:
peaks = get_precise_peaks(original_data, ['precise_channels', 'precise_intensities'])
original_data['peaks'] = peaks
dg.set_df(original_data)

In [44]:
from data_transformation import get_isotope_data, get_isotope_mass_list
isotope_data = get_isotope_data()
nom_masses_low = get_isotope_mass_list(isotope_data, False, 2000)
nom_masses_high = get_isotope_mass_list(isotope_data, True, 2000)

In [45]:
original_data = dg.calibrated_df(False)

In [46]:
from data_transformation import get_peaks_near_nom_masses
def get_extreme_peaks(masses, nom_masses_low, nom_masses_high, thresh=0.1):
    '''
    Finds masses in no mans land between extreme isotope mass values.
    Returns peaks below extreme value, values below nearest isotope
    mass but greater than .5 above the isotope mass below them. Also
    returns peaks above nearest isotope mass.
    
    Arguments-------
    masses: data structure containing arrays / lists of masses
    nom_masses_low: list of low extreme isotope masses
    nom_masses_high: list of high extreme isotope masses
    thresh: how far above or below an isotope mass a peak must be for
    it to be selected.
    '''
    num_above = []
    peaks_above = []
    num_below = []
    peaks_below = []
    for row in masses:
        peaks, dists = get_peaks_near_nom_masses(row, nom_masses_low, -1)
        peaks = np.array(peaks)[np.array(dists) < -1 * thresh]
        num_below.append(len(peaks))
        peaks_below.append(peaks)
        peaks, dist = get_peaks_near_nom_masses(row, nom_masses_high, -1, rev=True)
        peaks = np.array(peaks)[np.array(dists) > thresh]
        num_above.append(len(peaks))
        peaks_above.append(peaks)
    return peaks_above, peaks_below

In [47]:
peaks_above, peaks_below = get_extreme_peaks(original_data['masses'], nom_masses_low, nom_masses_high)
outliers = pd.DataFrame(original_data['file_name'])
outliers['peaks_below'] = peaks_below
outliers['num_below'] = outliers['peaks_below'].apply(len)
outliers['peaks_above'] = peaks_above
outliers['num_above'] = outliers['peaks_above'].apply(len)
outliers.sort_values('num_below', inplace=True, ascending=False)
outliers.head()

Unnamed: 0,file_name,peaks_below,num_below,peaks_above,num_above
681,FC724_02.cas,"[261.5863525802578, 1142.5003190574034, 226.52...",349,"[583.0132731240417, 201.9836303307803, 176.956...",577
677,FC722_04.cas,"[261.5078375858391, 242.54690404411895, 426.50...",307,"[594.0356761177015, 411.03562082142247, 42.994...",559
867,TBA002.cas,"[664.5002131278137, 719.5013906155907, 668.503...",291,"[117.93866198431608, 63.964369811383335, 101.9...",512
839,PTMO008.cas,"[554.5069499895332, 596.5082766780985, 582.514...",278,"[240.82398944909326, 234.810623096754, 268.765...",174
666,FC129_01.cas,"[294.5008580818698, 278.51299746892175, 296.52...",266,"[493.79149815910375, 350.78944612495286, 242.8...",88


In [48]:
outliers.describe()

Unnamed: 0,num_below,num_above
count,906.0,906.0
mean,19.512141,102.984547
std,40.553578,124.312188
min,0.0,0.0
25%,0.0,25.0
50%,3.0,64.0
75%,18.0,134.5
max,349.0,1351.0


In [41]:
outliers.describe()

Unnamed: 0,num_below,num_above
count,906.0,906.0
mean,19.8234,103.3234
std,42.461965,125.46409
min,0.0,0.0
25%,0.0,25.0
50%,3.0,64.0
75%,18.0,135.0
max,378.0,1351.0


In [49]:
outliers.to_csv('../data/outliers.csv')

In [8]:
from data_transformation import get_peaks_near_nom_masses
num_below = []
peaks_below = []
for row in original_data.itertuples():
    peaks, dists = get_peaks_near_nom_masses(row.masses, nom_masses_low, 2000)
    dists = np.array(dists)
    peaks = np.array(peaks)
    num_below.append(len(dists[dists < -0.1]))
    peaks_below.append(peaks[dists < -0.1])

In [9]:
original_data['num_below'] = num_below
original_data['peaks_below'] = peaks_below

In [10]:
weird = original_data[['num_below', 'peaks_below', 'file_name']].copy()
weird.sort_values('num_below', ascending=False)
weird.to_csv('extreme_spectra.csv')

In [11]:
from data_transformation import get_error_masses, get_dist_nom_mass, augment_value
def augment_values(values, amount, sign):
    vals = []
    for val in values:
        vals.append(augment_value(val, amount, sign))
    return vals

def get_avg(masses, nom_mass):
    avgs = 0
    for mass in masses:
        avgs += get_dist_nom_mass(mass, nom_mass)
    return avgs / len(masses)

avgs_slope = []
add_avgs_slope = []
sub_avgs_slope = []
avgs_offset = []
add_avgs_offset = []
sub_avgs_offset = []
avgs_both = []
add_avgs_both = []
sub_avgs_both = []
for num in range(50):
    masses, avgs, sl_err, off_err = get_error_masses(norm_data, False, get_dist_nom_mass, args=(nom_masses_low,), add_to='slope')
    avgs_slope.append(avgs)
    p = pd.Series(np.array(masses, dtype=object)[np.array(sl_err) < 0]).apply(augment_values, args=(0.1, 1,))
    add_avgs_slope.append(p.apply(get_avg, args=(nom_masses_low,)))
    masses, avgs, sl_err, off_err = get_error_masses(norm_data, False, get_dist_nom_mass, args=(nom_masses_low,), add_to='offset')
    avgs_offset.append(avgs)
    p = pd.Series(np.array(masses, dtype=object)[np.array(off_err) < 0]).apply(augment_values, args=(0.1, 1,))
    add_avgs_offset.append(p.apply(get_avg, args=(nom_masses_low,)))
    masses, avgs, sl_err, off_err = get_error_masses(norm_data, False, get_dist_nom_mass, args=(nom_masses_low,), add_to='both')
    avgs_both.append(avgs)
    p = pd.Series(np.array(masses, dtype=object)[np.array(off_err) < 0]).apply(augment_values, args=(0.1, 1,))
    add_avgs_both.append(p.apply(get_avg, args=(nom_masses_low,)))

IndexError: list index out of range

In [None]:
default = original_data['masses'].apply(get_avg, args=(nom_masses_low,))

In [None]:
plt.figure(figsize=(20, 10))
plt.hist(default, bins=50, alpha=0.5, label='no error')
plt.vlines(np.mean(default), 0, 60)
plt.hist(avgs_slope[0], bins=50, alpha=0.5, label='slope err')
plt.vlines(np.mean(avgs_slope[0]), 0, 60)
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
plt.hist(default, bins=50, alpha=0.5, label='no error')
plt.vlines(np.mean(default), 0, 60)
plt.hist(avgs_offset[0], bins=50, alpha=0.5, label='offset err')
plt.vlines(np.mean(avgs_offset[0]), 0, 60)
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
plt.hist(default, bins=50, alpha=0.5, label='no error')
plt.vlines(np.mean(default), 0, 60)
plt.hist(avgs_both[0], bins=50, alpha=0.5, label='err both')
plt.vlines(np.mean(avgs_both[0]), 0, 60)
plt.legend()
plt.show()