In [328]:
import sys
#Extend the module search path; presumably this is what lets the local
#hydrogenase_processing imports in the next cell resolve - confirm.
#NOTE(review): this is a machine-specific absolute path, so the cell only works where this
#exact folder exists. Consider installing the package into the environment or deriving
#the location relative to the repository instead.
sys.path.append("/Users/anuram/Documents/hydrogenase-ftir/src")

In [329]:
%load_ext autoreload
%autoreload 2

#Setting Up and Importing the Necessary Packages/Libraries
##Package for reading in Bruker OPUS type files
from brukeropusreader import read_file
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline
import pathlib
import numpy as np
import pandas as pd
#Local Functions
from hydrogenase_processing.cut_range import cut_range_subtraction
from hydrogenase_processing.second_deriv import second_deriv
#testing
#find peaks
from scipy.signal import find_peaks
import ast

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [330]:
#Paths to Local Data
#All three locations are relative paths, so they resolve against the current working directory.
#Folder containing the water vapor OPUS file
path_to_water_vapor_data = pathlib.Path("../../data/opus_files/water_vapor")
#Folder for output plots (not referenced by the cells visible in this section)
path_to_output_plots_= pathlib.Path("../../data/output_plots/")
#Folder whose files are read in as the raw test spectra
path_to_all_test_data = pathlib.Path("../../data/opus_files/subtraction_coefficient_testing") 

## Importing the Data from the local paths specified above ##

In [331]:
#Importing Water Vapor
#Build the file location with the pathlib "/" operator instead of formatting the folder
#into a string and relying on implicit adjacent-literal concatenation. read_file is
#already handed pathlib.Path objects when the test spectra are loaded.
wv_data = read_file(path_to_water_vapor_data / 'water vapor 4cm-1.1')

In [332]:
#Every entry of the test-data folder, in sorted path order
second_derivative_test_raw_data = sorted(path_to_all_test_data.iterdir())

#Raw spectra read from disk, keyed by the first four characters of each file name
raw_data = {}

for opus_file in second_derivative_test_raw_data:
    #Skip the macOS folder-metadata file; it is not a spectrum
    if opus_file.name.startswith('.DS_Store'):
        continue
    raw_data[opus_file.name[0:4]] = read_file(opus_file)

print(raw_data.keys())

dict_keys(['007_', '009_', '010_', '011_', '011a', '011b', '011c', '011d', '011e', '011f', '011g', '011h', '011i', '011k', '011l', '011m', '011n', '011o', '011p', '011q', '011r', '011s', '011t', '012_', '013_', '015_', '016_', '020_', '024_', '028_', '032_', '035_', '160 ', '162 ', '164 ', '165 ', '166 ', '167 ', '168 ', '169 ', '170 ', '171 ', '172 ', '173 ', '174 ', '175 ', '176 '])


In [333]:
#Per-sample parameter table from the "uncut_samples" sheet
test_comparisons_parameters_df = pd.read_excel(
    "../../data/test_subtraction_coefficients.xlsx",
    sheet_name="uncut_samples",
)

#Keep only the first four characters of each file name so the values line up with
#the four-character keys used for the raw spectra
test_comparisons_parameters_df["file_name"] = test_comparisons_parameters_df["file_name"].map(
    lambda full_name: full_name[:4]
)

#Same table, looked up by the shortened file name
indexed_test_comparison_parameters = test_comparisons_parameters_df.set_index('file_name')

In [334]:
#Spectra after water vapor subtraction, keyed as '<file name prefix>_cut_range_wv_sub'
cut_range_sub_wv_data = {}

#One entry per row of the parameter table; None where no raw spectrum has that prefix
results = []

for idx, row in indexed_test_comparison_parameters.iterrows():
    if idx not in raw_data:
        results.append(None)
        continue

    raw_data_i = raw_data[idx]
    output_key = f'{idx}_cut_range_wv_sub'
    cut_range_sub_wv_data[output_key] = cut_range_subtraction(
        raw_data_i,
        wv_data,
        row["range_start"],
        row["range_end"],
        SG_poly = 3,
        SG_points = 21,
    )
    #The first element of fit_atm_params is what gets recorded as the coefficient
    subtraction_parameters = cut_range_sub_wv_data[output_key][0][0].fit_atm_params
    results.append(subtraction_parameters[0])

#Store the coefficients next to the parameters they were produced from
indexed_test_comparison_parameters["pb_subtraction_coefficient"] = results

2150 1800
959 1141
2150 1800
959 1141
2150 1800
959 1141
2150 1800
959 1141
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2000 1800
1037 1141
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115
2150 1850
959 1115


In [335]:
def peakDetectionTest(manual_peaks, prospecpy_peaks, tolerance=2):
    """Check which manually identified peaks were also found automatically.

    A manual peak counts as detected when at least one automatically found
    peak, rounded to the nearest whole wavenumber, lies within ``tolerance``
    of it.

    Args:
        manual_peaks: Sequence of manually identified peak wavenumbers.
        prospecpy_peaks: Sequence of automatically found peak wavenumbers
            (may be floats).
        tolerance: Largest absolute difference, in wavenumbers, that still
            counts as a match. Defaults to 2.

    Returns:
        tuple: ``(detected_peak, detection_percent)`` where ``detected_peak``
        is a list of booleans aligned with ``manual_peaks`` and
        ``detection_percent`` is the percentage of ``True`` entries. With no
        manual peaks the result is ``([], 0.0)``.
    """
    # Round (rather than truncate with int()) so a peak slightly below a manual
    # value is treated the same as one slightly above it, and so the comparison
    # uses the same rounded values that are recorded in the results table.
    rounded_peaks = [round(peak) for peak in prospecpy_peaks]

    detected_peak = [
        any(abs(found - man_peak) <= tolerance for found in rounded_peaks)
        for man_peak in manual_peaks
    ]

    # Nothing to detect: avoid dividing by zero.
    if not detected_peak:
        return detected_peak, 0.0

    detection_percent = 100*sum(detected_peak)/len(detected_peak)

    return detected_peak, detection_percent
    

In [336]:
def getSecondDerivative(cut_range_sub_wv_data, sample_name, threshold=0.15):
    """Find peak positions in the negated second derivative of one sample.

    Args:
        cut_range_sub_wv_data: Mapping from sample name to the object that is
            handed to ``second_deriv``.
        sample_name: Key of the sample to analyse.
        threshold: Fraction of the largest negated second-derivative value
            that is passed to ``scipy.signal.find_peaks`` as the minimum
            ``prominence``. Defaults to 0.15.

    Returns:
        list: The x-axis value at each detected peak index, in index order.
    """
    deriv_output = second_deriv(cut_range_sub_wv_data[sample_name], show_plots=False)
    # NOTE(review): element 1 is used as the second-derivative values and
    # element 2 as the matching x-axis values - confirm against second_deriv.
    second_derivative = deriv_output[1]
    x_values = deriv_output[2]

    # Flip the sign because we want the peaks in the negative-y direction.
    flipped = second_derivative * -1
    min_prominence = threshold * max(flipped)

    # find_peaks returns (indices, properties); only the indices are needed.
    peak_indices, _ = find_peaks(flipped, prominence=min_prominence)

    return [x_values[i] for i in peak_indices]
    

In [337]:
def populatePeakTestFile(sample_name, prospecpy_peaks, peak_testing_file):
    """Record the automatic-vs-manual peak comparison for one sample.

    The row is looked up by the part of ``sample_name`` before its first
    underscore. If that row has no usable manual peak list (missing value,
    blank text, or an ``NA`` marker with any surrounding whitespace) the table
    is left untouched.

    Args:
        sample_name: Sample key; the text before the first ``_`` is used as the
            row label in ``peak_testing_file``.
        prospecpy_peaks: Peak wavenumbers found automatically for this sample.
        peak_testing_file: DataFrame indexed by sample label with a
            ``peak_x_val_manual`` column holding a list literal as text.
            Modified in place.

    Returns:
        None. Writes ``peak_x_val_prospecpy``, ``detected_peak``,
        ``detection_percent``, ``num_peaks_in_manual_method`` and
        ``num_peaks_in_Prospecpy_method`` for the matching row.
    """
    sample_number = sample_name.split('_')[0]
    manual_peaks = peak_testing_file.loc[sample_number, 'peak_x_val_manual']

    if pd.isna(manual_peaks):
        return
    # Accept 'NA', 'NA ', ' NA', ... and blank cells as "no manual peaks" instead
    # of matching one exact spelling and letting the others fail in literal_eval.
    if isinstance(manual_peaks, str) and manual_peaks.strip() in ('', 'NA'):
        return

    manual_peaks = ast.literal_eval(manual_peaks)
    peak_detected, detection_rate = peakDetectionTest(manual_peaks, prospecpy_peaks)

    # Write the comparison back into the row for this sample.
    # Wavenumbers are stored rounded to the nearest integer, as text.
    peak_testing_file.loc[sample_number, 'peak_x_val_prospecpy'] = str([round(x) for x in prospecpy_peaks])
    peak_testing_file.loc[sample_number, 'detected_peak'] = str(peak_detected)
    peak_testing_file.loc[sample_number, 'detection_percent'] = str(round(detection_rate))
    peak_testing_file.loc[sample_number, 'num_peaks_in_manual_method'] = len(manual_peaks)
    peak_testing_file.loc[sample_number, 'num_peaks_in_Prospecpy_method'] = len(prospecpy_peaks)
    

In [338]:
#Peak testing table from the "anuradha_test" sheet; its first column becomes the index
peak_testing_file = pd.read_excel(
    "../../data/pre_baseline_testing.xlsx",
    sheet_name="anuradha_test",
    index_col=0,
)

#Find peaks for every subtracted spectrum and write the comparison into the table
for sample_name in cut_range_sub_wv_data:
    prospecpy_peaks = getSecondDerivative(cut_range_sub_wv_data, sample_name)
    populatePeakTestFile(sample_name, prospecpy_peaks, peak_testing_file)

In [339]:
#Remove the 'test' column before displaying the table.
#Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
#re-runnable: a second execution no longer raises KeyError once the column is gone.
peak_testing_file = peak_testing_file.drop(columns='test', errors='ignore')
peak_testing_file

Unnamed: 0_level_0,peak_x_val_manual,peak_x_val_prospecpy,detected_peak,detection_percent,num_peaks_in_manual_method,num_peaks_in_Prospecpy_method
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
160,,,,,,
167,,,,,,
162,,,,,,
011a,"[2092, 2080, 2061, 1978, 1957, 1943, 1937]","[2093, 2081, 1958, 1939]","[True, True, False, False, True, False, True]",57.0,7.0,4.0
011b,"[2093, 2079, 1978, 1957, 1945, 1937]","[2093, 2081, 1958, 1939]","[True, True, False, True, False, True]",67.0,6.0,4.0
011c,"[2092, 2080, 2062, 1972, 1957, 1943, 1937]","[2093, 2081, 1958, 1939]","[True, True, False, False, True, False, True]",57.0,7.0,4.0
011d,"[2092, 2080, 1957, 1943, 1937]","[2093, 2081, 1958, 1939]","[True, True, True, False, True]",80.0,5.0,4.0
011e,"[2092, 2079, 1958, 1946, 1937]","[2093, 2081, 1959, 1947, 1938]","[True, True, True, True, True]",100.0,5.0,5.0
011f,"[2092, 2078, 1957, 1945, 1937]","[2093, 2079, 1979, 1959, 1947, 1938]","[True, True, True, True, True]",100.0,5.0,6.0
011g,"[2086, 1957, 1945, 1937]","[2116, 2087, 2058, 1959, 1947, 1938, 1896]","[True, True, True, True]",100.0,4.0,7.0
