In [1]:
import scipy
from scipy.signal import argrelmax
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [2]:
RAW_DATA_DIR_NAME = '../data/raw/'

In [3]:
train = pd.read_csv(RAW_DATA_DIR_NAME + 'train.csv')
test = pd.read_csv(RAW_DATA_DIR_NAME + 'test.csv')

In [4]:
train = train.drop('target', axis=1)

In [5]:
train.shape, test.shape

((7436, 8), (6952, 8))

In [6]:
df = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)

In [7]:
df.shape

(14388, 8)

In [8]:
def get_peak_features(spec_df, th):
    peak_height = spec_df.spectrum.max()
    peak_pos = argrelmax(spec_df.spectrum.values, order=th)[0]
    num_peaks = len(peak_pos)
    peak_df = spec_df.iloc[peak_pos].sort_values('spectrum')
    peak_df = peak_df.loc[peak_df.spectrum > peak_height / 2]
    num_strong_peaks = len(peak_df)
    if len(peak_df) >=2:
        peak_wave_length2 = peak_df.iloc[-2].name
        peak_intensity2 = peak_df.iloc[-2].spectrum
    else:
        peak_wave_length2 = 0
        peak_intensity2 = 0
    return num_peaks, num_strong_peaks, peak_wave_length2, peak_intensity2

In [9]:
for i, row in tqdm(df.iterrows()):
    spec_df = pd.read_csv(f'{RAW_DATA_DIR_NAME}spectrum_raw/{row.spectrum_filename}', sep='\t', header=None)
    spec_df.columns = ['wave_length', 'spectrum']
    spec_df = spec_df.set_index('wave_length')
    wave_length = spec_df.index[spec_df.values.argmax()]
    df.loc[i, 'peak_wave_length'] = wave_length
    df.loc[i, 'peak_intensity'] = spec_df.values.max()
    df.loc[i, 'mean'] = spec_df.values.mean()
    df.loc[i, 'std'] = spec_df.values.std()
    df.loc[i, 'skew'] = scipy.stats.skew(spec_df.values)
    df.loc[i, 'kurtosis'] = scipy.stats.skew(spec_df.values)
    
    for th in [10, 15, 20, 25, 30, 50, 100, 150, 200]:
        df.loc[i, f'skew_{th}'] = scipy.stats.skew(spec_df[(spec_df.index > wave_length-th/2) & (spec_df.index < wave_length+th/2)].values)
        df.loc[i, f'kurtosis_{th}'] = scipy.stats.kurtosis(spec_df[(spec_df.index > wave_length-th/2) & (spec_df.index < wave_length+th/2)].values)
        df.loc[i, f'mean_{th}'] = np.mean(spec_df[(spec_df.index > wave_length-th/2) & (spec_df.index < wave_length+th/2)].values)
        df.loc[i, f'std_{th}'] = np.std(spec_df[(spec_df.index > wave_length-th/2) & (spec_df.index < wave_length+th/2)].values)

df['em_ev'] = 1240 / df.peak_wave_length
df['ex_ev'] = 1240 / df.exc_wl
df['ev_diff'] = df['em_ev'] - df['ex_ev']
df['ev_ratio'] = df['em_ev'] / df['ex_ev']

14388it [07:50, 30.58it/s]


In [10]:
display(df.head(), df.shape)

Unnamed: 0,spectrum_id,spectrum_filename,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,peak_wave_length,peak_intensity,...,mean_150,std_150,skew_200,kurtosis_200,mean_200,std_200,em_ev,ex_ev,ev_diff,ev_ratio
0,000da4633378740f1ee8,b2e223339f4abce9b400.dat,79ad4647da6de6425abf,850,2,36,140,1313.081,1032.836,1751.0,...,195.123457,348.523199,3.124929,10.629135,144.540509,317.536694,1.200578,1.458824,-0.258246,0.822977
1,000ed1a5a9fe0ad2b7dd,e2f150a503244145e7ce.dat,79ad4647da6de6425abf,780,3,0,168,159.415,1079.008,4219.0,...,490.62069,873.894145,3.267705,10.269234,424.407143,809.275928,1.149204,1.589744,-0.44054,0.722886
2,0016e3322c4ce0700f9a,3d58b7ccaee157979cf0.dat,c695a1e61e002b34e556,780,1,34,29,-610.7688,1380.586,2412.0,...,388.37931,523.478894,2.708492,7.532413,312.917098,475.169927,0.898169,1.589744,-0.691574,0.564977
3,00256bd0f8c6cf5f59c8,ed3641184d3b7c0ae703.dat,c695a1e61e002b34e556,780,2,32,139,1214.618,1146.045,3209.0,...,650.848488,682.439572,2.999545,8.700362,558.064575,614.459246,1.081982,1.589744,-0.507762,0.680602
4,003483ee5ae313d37590,4c63418d39f86dfab9bb.dat,c695a1e61e002b34e556,780,0,45,85,-257.6161,1119.876,3998.0,...,384.944056,827.057737,3.513627,12.075204,323.994413,750.568697,1.107265,1.589744,-0.482478,0.696506


(14388, 54)

In [11]:
temp_df = df.copy()

In [13]:
for i, row in tqdm(df.iterrows()):
    spec_df = pd.read_csv(f'{RAW_DATA_DIR_NAME}spectrum_raw/{row.spectrum_filename}', sep='\t', header=None)
    spec_df.columns = ['wave_length', 'spectrum']
    spec_df = spec_df.set_index('wave_length')
    df.loc[i, f'num_peak'], df.loc[i, f'num_strong_peak'], df.loc[i, f'peak_wave_length2'], df.loc[i, f'peak_intensity2'] = get_peak_features(spec_df, th=50)

df['peak_ratio'] = df['peak_intensity'] / df['mean']
df['peak_ratio2'] = df['peak_intensity2'] / df['mean']

14388it [01:12, 199.06it/s]


In [14]:
display(df.head(), df.shape)

Unnamed: 0,spectrum_id,spectrum_filename,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,peak_wave_length,peak_intensity,...,em_ev,ex_ev,ev_diff,ev_ratio,num_peak,num_strong_peak,peak_wave_length2,peak_intensity2,peak_ratio,peak_ratio2
0,000da4633378740f1ee8,b2e223339f4abce9b400.dat,79ad4647da6de6425abf,850,2,36,140,1313.081,1032.836,1751.0,...,1.200578,1.458824,-0.258246,0.822977,5.0,0.0,0.0,0.0,43.456948,0.0
1,000ed1a5a9fe0ad2b7dd,e2f150a503244145e7ce.dat,79ad4647da6de6425abf,780,3,0,168,159.415,1079.008,4219.0,...,1.149204,1.589744,-0.44054,0.722886,4.0,1.0,0.0,0.0,25.269679,0.0
2,0016e3322c4ce0700f9a,3d58b7ccaee157979cf0.dat,c695a1e61e002b34e556,780,1,34,29,-610.7688,1380.586,2412.0,...,0.898169,1.589744,-0.691574,0.564977,4.0,1.0,0.0,0.0,15.912632,0.0
3,00256bd0f8c6cf5f59c8,ed3641184d3b7c0ae703.dat,c695a1e61e002b34e556,780,2,32,139,1214.618,1146.045,3209.0,...,1.081982,1.589744,-0.507762,0.680602,3.0,1.0,0.0,0.0,6.134806,0.0
4,003483ee5ae313d37590,4c63418d39f86dfab9bb.dat,c695a1e61e002b34e556,780,0,45,85,-257.6161,1119.876,3998.0,...,1.107265,1.589744,-0.482478,0.696506,6.0,1.0,0.0,0.0,28.93166,0.0


(14388, 60)

In [15]:
temp_df = df.copy()

In [16]:
drop_col = ['spectrum_id', 'chip_id', 'exc_wl', 'layout_a', 'layout_x', 'layout_y', 'pos_x']

In [18]:
df = df.drop(drop_col, axis=1)

In [19]:
display(df.head(), df.shape)

Unnamed: 0,spectrum_filename,peak_wave_length,peak_intensity,mean,std,skew,kurtosis,skew_10,kurtosis_10,mean_10,...,em_ev,ex_ev,ev_diff,ev_ratio,num_peak,num_strong_peak,peak_wave_length2,peak_intensity2,peak_ratio,peak_ratio2
0,b2e223339f4abce9b400.dat,1032.836,1751.0,40.292752,172.038539,4.809706,4.809706,0.100252,-1.170818,1312.6,...,1.200578,1.458824,-0.258246,0.822977,5.0,0.0,0.0,0.0,43.456948,0.0
1,e2f150a503244145e7ce.dat,1079.008,4219.0,166.958984,462.975575,6.180535,6.180535,-0.410343,-1.135887,3220.777778,...,1.149204,1.589744,-0.44054,0.722886,4.0,1.0,0.0,0.0,25.269679,0.0
2,3d58b7ccaee157979cf0.dat,1380.586,2412.0,151.577691,327.537364,4.261846,4.261846,-0.375001,-1.434386,1982.222222,...,0.898169,1.589744,-0.691574,0.564977,4.0,1.0,0.0,0.0,15.912632,0.0
3,ed3641184d3b7c0ae703.dat,1146.045,3209.0,523.080947,436.05495,3.319229,3.319229,-0.960049,-0.338949,2682.617333,...,1.081982,1.589744,-0.507762,0.680602,3.0,1.0,0.0,0.0,6.134806,0.0
4,4c63418d39f86dfab9bb.dat,1119.876,3998.0,138.187717,471.548758,5.996866,5.996866,-0.735786,-0.508143,3210.777778,...,1.107265,1.589744,-0.482478,0.696506,6.0,1.0,0.0,0.0,28.93166,0.0


(14388, 53)

In [21]:
df.to_pickle('../data/features/camaro_spectrum.pkl')

In [22]:
df.columns.tolist()

['spectrum_filename',
 'peak_wave_length',
 'peak_intensity',
 'mean',
 'std',
 'skew',
 'kurtosis',
 'skew_10',
 'kurtosis_10',
 'mean_10',
 'std_10',
 'skew_15',
 'kurtosis_15',
 'mean_15',
 'std_15',
 'skew_20',
 'kurtosis_20',
 'mean_20',
 'std_20',
 'skew_25',
 'kurtosis_25',
 'mean_25',
 'std_25',
 'skew_30',
 'kurtosis_30',
 'mean_30',
 'std_30',
 'skew_50',
 'kurtosis_50',
 'mean_50',
 'std_50',
 'skew_100',
 'kurtosis_100',
 'mean_100',
 'std_100',
 'skew_150',
 'kurtosis_150',
 'mean_150',
 'std_150',
 'skew_200',
 'kurtosis_200',
 'mean_200',
 'std_200',
 'em_ev',
 'ex_ev',
 'ev_diff',
 'ev_ratio',
 'num_peak',
 'num_strong_peak',
 'peak_wave_length2',
 'peak_intensity2',
 'peak_ratio',
 'peak_ratio2']