In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sys import path
%matplotlib inline

In [2]:
path.insert(0, '../src')
from data_generator import DataGenerator

In [2]:
# Build the calibrated frame with error injection enabled (proportion bounds
# are forwarded to DataGenerator.calibrated_df as given).
dg = DataGenerator('./data/classification_cas_data.csv')
err_data = dg.calibrated_df(error=True, low_proportion=.005, high_proportion=.01)

# Count the peaks per row, then keep only rows with at least 20 peaks and
# renumber the rows from 0.
err_data['peak_num'] = err_data['peaks'].apply(len)
has_enough_peaks = err_data['peak_num'] >= 20
err_data = err_data[has_enough_peaks].reset_index(drop=True)

In [3]:
# For every row, measure how far each mass sits from its nearest whole number
# and average that distance over the whole list, its first half and its
# second half (for odd lengths the second half holds the extra element).
avg_dists = []
avg_dists_beg = []
avg_dists_end = []
for spectrum_masses in err_data['masses']:
    n_masses = len(spectrum_masses)
    half = n_masses // 2
    total = 0
    total_first_half = 0
    total_second_half = 0
    for position, mass in enumerate(spectrum_masses):
        offset = abs(mass - round(mass))
        if position >= half:
            total_second_half += offset
        else:
            total_first_half += offset
        total += offset
    avg_dists.append(total / n_masses)
    avg_dists_beg.append(total_first_half / half)
    avg_dists_end.append(total_second_half / (n_masses - half))

In [4]:
# Attach the three per-row averages as feature columns; the lists were built
# by iterating err_data['masses'], so they are in row order.
err_data['avg_dist_whole_num'] = avg_dists
err_data['avg_dist_beg'] = avg_dists_beg
err_data['avg_dist_end'] = avg_dists_end

In [5]:
# Preview the first row whose target is 0.
err_data[err_data['target'] == 0].head(1)

Unnamed: 0,Mass/Time,MassOffset,file_name,StartFlightTime,SpecBinSize,channels,target,err_prop_slope,err_prop_offset,peaks,mass_channels,masses,intensities,peak_num,avg_dist_whole_num,avg_dist_beg,avg_dist_end
0,0.384486,-0.300591,0909402.cas,0.0,1.248,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0.009979,-0.009999,"[(7929, 6046), (8231, 16662), (8515, 1658), (8...","[7929, 8231, 8515, 8518, 8520, 8788, 8794, 879...","[12.27838438225773, 13.314937853539146, 14.328...","[6046, 16662, 1658, 1804, 1799, 53, 83, 66, 66...",38,0.335668,0.383401,0.287935


In [6]:
# Preview the first row whose target is 1, for comparison with the target-0 row.
err_data[(err_data['target'] == 1)].head(1)

Unnamed: 0,Mass/Time,MassOffset,file_name,StartFlightTime,SpecBinSize,channels,target,err_prop_slope,err_prop_offset,peaks,mass_channels,masses,intensities,peak_num,avg_dist_whole_num,avg_dist_beg,avg_dist_end
2,0.387569,-0.280513,1229201.cas,0.0,1.248,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0.0,0.0,"[(2643, 1502), (2645, 1577), (2647, 1550), (26...","[2643, 2645, 2647, 2654, 7728, 7730, 7733, 773...","[0.9957411158786821, 0.9976726701232416, 0.999...","[1502, 1577, 1550, 1390, 85, 92, 105, 98, 98, ...",186,0.166712,0.039488,0.293937


In [7]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
# Features: the three nearest-whole-number distance averages; labels: 'target'.
X = err_data[['avg_dist_whole_num', 'avg_dist_beg', 'avg_dist_end']]
y = err_data['target']

In [8]:
from model_trainer import ModelTrainer

In [9]:
# Model classes (not instances) and their display labels, in matching order.
models = [LogisticRegressionCV, SVC, LGBMClassifier]
mt = ModelTrainer(models, X, y, ['LogRegCV', 'SVC', 'LGBM'])

In [14]:
# NOTE(review): ttt_models presumably runs a train/test split and fits each
# model, printing one score per label — confirm in model_trainer.
accs, preds, X_test, y_test = mt.ttt_models()

LogRegCV: 0.9608938547486033
SVC: 0.9441340782122905
LGBM: 0.9608938547486033




In [None]:
# NOTE(review): 5 is presumably the number of folds — confirm against
# ModelTrainer.kfold_models.
accs2, index_preds = mt.kfold_models(5)

In [16]:
accs2

[0.9709057811813446, 0.949645345552696, 0.9652878036532546]

In [122]:
def model_optimizer(models, names, parameters, param_names, num_seeds=15, verbose=False):
    '''
    Scores every parameter combination of every model class by its
    accuracy averaged over several randomly drawn seeds.

    Relies on names from the notebook namespace: X, y, np,
    parameter_generator and get_kfold_stats.

    Arguments -------
    models = list of model classes; each is called as model_obj(**params)
    names = list of labels, one per model, used to build the result keys
    parameters = for each model, a list of lists of candidate values
    param_names = for each model, the keyword names matching parameters
    num_seeds = how many random seeds (drawn from 1..899) to average over
    verbose = if True, print each parameter dict with its mean accuracy

    Returns -------
    dict with the keys '<name> accs' and '<name> params'. Each value is a
    list containing one inner list per model stored under that name; the
    inner lists hold mean accuracies and parameter dicts in matching order.
    '''
    final = {}
    for name in names:
        final[name + ' accs'] = []
        final[name + ' params'] = []

    for i, model_obj in enumerate(models):
        accs = []
        params = []
        for param in parameter_generator(parameters[i], param_names[i]):
            model = model_obj(**param)
            seed_acc = 0
            for seed in np.random.randint(1, 900, size=num_seeds):
                # NOTE(review): 5 is presumably the fold count and acc[0] the
                # score of the single model passed in — confirm in create_folds.
                acc, _, _ = get_kfold_stats(X, y, 5, seed, models=[model])
                # Divide by the real number of seeds (this was a hard-coded
                # 15) so the value is a true mean for any num_seeds.
                seed_acc += acc[0] / num_seeds
            accs.append(seed_acc)
            params.append(param)
            if verbose:
                print(param, seed_acc)
        final[names[i] + ' accs'].append(accs)
        final[names[i] + ' params'].append(params)
    return final

In [128]:
# Candidate values for the grid search. `names` must list the keyword names
# in the same order as the value lists in `parameters`.
num_leaves = [8, 15, 31]
learning_rate = [.1, .3, .5]
max_depth = [3]
names = ['num_leaves', 'learning_rate', 'max_depth']
parameters = [num_leaves, learning_rate, max_depth]

In [129]:
# NOTE(review): model_optimizer calls parameter_generator, which is defined in
# a later cell of this notebook. On a fresh top-to-bottom run this call raises
# NameError unless that cell has been executed first.
from create_folds import get_kfold_stats
# One model class, so the grid lists are wrapped in single-element lists.
results = model_optimizer([LGBMClassifier], ['LGBM'], [parameters], [names])

In [132]:
def parameter_generator(parameters, names):
    '''
    Given a list of lists containing parameters, and a list of names
    yields every combination of the parameters in the lists.

    Arguments -------
    parameters = list of lists; parameters[i] holds the candidate values
                 for the keyword names[i]
    names = list of keyword names, one per inner list of parameters

    Yields -------
    dict mapping each name to one candidate value. The last parameter
    varies fastest. Nothing is yielded if any candidate list is empty;
    a single empty dict is yielded if there are no parameters at all.

    Raises -------
    ValueError if parameters and names differ in length.
    '''
    # Local import keeps the cell self-contained without touching the
    # notebook's import cell.
    from itertools import product

    if len(parameters) != len(names):
        raise ValueError(
            'parameters and names must have the same length, got %d and %d'
            % (len(parameters), len(names))
        )
    # product advances the right-most list fastest, which is the same order
    # the previous hand-rolled index counter produced.
    for combo in product(*parameters):
        yield dict(zip(names, combo))
            
def increment_index(indices, max_vals):
    '''
    Advances a list of indices by one step, odometer style: the last
    index is bumped and, whenever a position runs past its list length,
    it is reset to 0 and the carry moves one position to the left.
    The left-most index is never reset, so callers can detect the end
    of the sequence when it reaches its list length.

    Arguments -------
    indices = list of indices for lists (modified in place)
    max_vals = length of each list

    Returns -------
    the same indices list, after the update
    '''
    slot = 1  # distance from the right-hand end of both lists
    indices[-slot] += 1
    while indices[-slot] > max_vals[-slot] - 1 and slot < len(indices):
        indices[-slot] = 0
        slot += 1
        indices[-slot] += 1
    return indices

+ Research the mass defect formula to determine where each peak should truly fall
+ Then, for each nominal mass, calculate the distance from this expected value
+ Average these distances over the whole spectrum, or over each section of the spectrum, to determine the mass deviation
+ Then use the mass deviation to classify the spectrum
+ The error lower bound is probably > 0.001