# TG Raman process code

<a target="_blank" href="https://colab.research.google.com/github/usnistgov/TGRS-Plasticizer-Library/blob/main/TG%20Raman%20Library%20work%20process.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Declare relevant Python packages

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import scipy as sp
from scipy.signal import find_peaks
import csv
import requests
from urllib.parse import quote
from itertools import combinations

Identifies files in folder

In [2]:
# Data-source switch used by the cells below: True reads the raw CSV files from
# the local "Raw files" folder, False fetches them from the GitHub repository.
local = False

In [3]:
# Builds csv_files (the raw-data file names) and path (the prefix that is
# prepended to each name when the file is read).
def _is_csv(name):
    """Return True when *name* has a ".csv" extension."""
    return os.path.splitext(name)[1] == ".csv"


if local:
    # selects only the csv files in the local folder
    csv_files = sorted(f for f in os.listdir("Raw files") if _is_csv(f))
    path = "Raw files/"

else:
    # Ask the GitHub API for the full repository tree
    response = requests.get(
        "https://api.github.com/repos/usnistgov/TGRS-Plasticizer-Library/git/trees/main?recursive=1",
        timeout=30,  # do not hang forever if the API is unreachable
    )
    # Surface HTTP failures (e.g. API rate limiting) here instead of as a
    # KeyError on "tree" further down
    response.raise_for_status()
    responsedict = response.json()

    # Keep only csv files stored directly under the raw-data folder and strip
    # the folder prefix so the names match what the local branch produces
    raw_prefix = "Raw files/"
    csv_files = sorted(
        f["path"][len(raw_prefix):]
        for f in responsedict["tree"]
        if f["path"].startswith(raw_prefix) and _is_csv(f["path"])
    )
    path = "https://raw.githubusercontent.com/usnistgov/TGRS-Plasticizer-Library/main/Raw files/"

Declares functions

In [4]:
def Calibration(f, raw_data):
    """Shift the first column of *raw_data* by the calibration value stored in *f*.

    The file is re-read on its own: the first 13 lines are skipped, the next
    line is split on ":" and its second field is taken as the calibration
    shift. Only the first 8 characters of that field are used (significant
    figures). The shift is added in place to the first column of *raw_data*,
    and the same DataFrame is returned.
    """
    header_line = pd.read_csv(f, skiprows=13, nrows=1, header=None, sep=":")

    # Second field of the line, truncated to 8 characters
    shift_text = str(header_line.iloc[0][1])[:8]

    # Wavenumber correction (modifies raw_data itself)
    raw_data.iloc[:, [0]] += float(shift_text)

    return raw_data

In [5]:
# Performs the time gating and snips data to desired wavenumber range
def declare_snip(df, StartTG, EndTG, StartWN_snip_one, EndWN_snip_one):
    """Time-gate *df*, average the gated columns, crop in wavenumber and normalize.

    Parameters
    ----------
    df : DataFrame whose column labels are convertible to float (times) and
        whose index is convertible to float (wavenumbers).
    StartTG, EndTG : time-gate limits, in the units of the column labels.
    StartWN_snip_one, EndWN_snip_one : wavenumber limits, in index units.

    Returns
    -------
    (Target_snip, StartTG, EndTG) where Target_snip is a Series named
    "intensity": the row-wise mean of the gated columns, cropped to the
    wavenumber window and divided by its largest absolute value.

    Raises
    ------
    ValueError
        If any limit is larger than every time / wavenumber in *df*
        (previously this surfaced as an UnboundLocalError).
    """
    time_range = np.array(list(df.columns.values), dtype=float)
    wave_num = np.asarray(df.index, dtype=np.float32)

    # Slice bounds are "first position at or above the limit" + 1. For the end
    # bound the +1 makes the exclusive Python slice include that position.
    # NOTE(review): for the start bound the +1 skips the first column/row that
    # is at or above the limit - confirm this is intended.
    start_snip = _first_index_at_or_above(time_range, StartTG, "StartTG") + 1
    end_snip = _first_index_at_or_above(time_range, EndTG, "EndTG") + 1

    # TG_snip grabs the time range desired for time gating
    TG_snip = df[df.columns[start_snip:end_snip]]

    # Average over the gated times -> one intensity value per wavenumber
    Target_snip = TG_snip.mean(axis=1)
    Target_snip.name = "intensity"

    WNstart_snip_one = (
        _first_index_at_or_above(wave_num, StartWN_snip_one, "StartWN_snip_one") + 1
    )
    WNend_snip_one = (
        _first_index_at_or_above(wave_num, EndWN_snip_one, "EndWN_snip_one") + 1
    )
    Target_snip = Target_snip.iloc[WNstart_snip_one:WNend_snip_one]

    # Normalize Data
    Target_snip = Target_snip / Target_snip.abs().max()

    return Target_snip, StartTG, EndTG


def _first_index_at_or_above(values, threshold, label):
    """Return the position of the first entry of *values* that is >= *threshold*."""
    for position, value in enumerate(values):
        if value - threshold >= 0:
            return position
    raise ValueError(
        f"{label}={threshold} is larger than every available value; "
        "cannot locate the snip boundary"
    )

In [6]:
# Script will generate a folder containing a plot of each data file in the folder. df is Target_snip from the declare_snip function
def Spectra_plot(df, sample_name):
    """Plot one spectrum and save it as a PNG in the "plots" folder.

    Parameters
    ----------
    df : the spectrum to plot (Target_snip from declare_snip); passed directly
        to ``Axes.plot``.
    sample_name : name of the source csv file; its base name (extension
        removed) becomes both the plot title and the PNG file name.

    Returns
    -------
    Path of the saved PNG ("plots/<base name>.png").
    """
    # PLOT AESTHETICS
    # Font sizes are set before the figure is created: rc settings only affect
    # artists created afterwards, so applying them later would leave the
    # figure being saved with the previous settings.
    S_size = 12
    M_size = 14
    L_size = 16

    plt.rc("font", size=S_size)  # controls default text sizes
    plt.rc("axes", titlesize=M_size)  # fontsize of the axes title
    plt.rc("axes", labelsize=M_size)  # fontsize of the x and y labels
    plt.rc("xtick", labelsize=S_size)  # fontsize of the tick labels
    plt.rc("ytick", labelsize=S_size)  # fontsize of the tick labels
    plt.rc("legend", fontsize=S_size)  # legend fontsize
    plt.rc("figure", titlesize=L_size)  # fontsize of the figure title

    # Plotting
    fig, ax = plt.subplots(1, 1)
    ax.plot(df)
    # Sets the title of plot to title of the csv file
    ax.set_title(os.path.splitext(sample_name)[0])
    # Raw string: "\m" is not a valid Python escape sequence, and mathtext
    # needs the backslash preserved. labelpad adjusts the space between the
    # axis label and the plot.
    ax.set_xlabel(r"Raman shift ($\mathregular{cm^{-1}}$)", labelpad=2)
    ax.set_ylabel("intensity", labelpad=10)

    # Export plots
    # next 2 lines create a new folder in current folder to store plots
    directory = "plots"
    os.makedirs(directory, exist_ok=True)
    filename = os.path.splitext(sample_name)[0] + ".png"
    path = os.path.join(directory, filename)
    fig.savefig(path)
    # Close this specific figure so figures do not accumulate across files
    plt.close(fig)

    return path

  "Raman shift ($\mathregular{cm^{-1}}$)", labelpad=2


In [7]:
# Calculates the spectral background
# derived from stackoverflow code written for python 3.6:  https://stackoverflow.com/questions/29156532/python-baseline-correction-library -- Source: "Baseline Correction with Asymmetric Least Squares Smoothing" Paul H. C. Eilers Hans F.M. Boelens. Download access at- https://prod-dcd-datasets-public-files-eu-west-1.s3.eu-west-1.amazonaws.com/dd7c1919-302c-4ba0-8f88-8aa61e86bb9d
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import spsolve

# y is the signal of length L, z is the fitted baseline
# D is the second-order difference matrix
# w is the weight vector (initially unweighted, i.e. all ones); it is updated by the boolean expression at the end of each iteration. If peak regions are known, their weights can be set to zero
# W is a diagonal matrix with w on its diagonal; lam adjusts the balance between the fit term and the smoothness term
# p is the asymmetry parameter - recommended values are between 0.001 and 0.1
# lam: values from 10^4 to 10^6 are a good starting point


def baseline_als(y, lam, p, niter=10, peak_threshold=0.02):
    """Fit a baseline to *y* by asymmetric least squares smoothing.

    Parameters
    ----------
    y : signal (1-D array-like of length L).
    lam : smoothness weight applied to the second-difference penalty.
    p : asymmetry parameter; points above the current baseline get weight p,
        points below it get weight 1 - p.
    niter : number of reweighting iterations (must be at least 1).
    peak_threshold : a point is flagged as a peak when it lies more than this
        far above the fitted baseline. The default 0.02 was tuned using the
        level of noise in a test image and may need to be adjusted.

    Returns
    -------
    (z, peakfind) where z is the fitted baseline returned by the sparse solver
    and peakfind has the same form as ``y - z``, holding 0 where a peak was
    found (y - z > peak_threshold) and 1 elsewhere.

    Raises
    ------
    ValueError
        If niter < 1 (no baseline would be computed).
    """
    if niter < 1:
        raise ValueError("niter must be at least 1")

    L = len(y)
    # constructs a sparse second-difference matrix from diagonals
    D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L - 2))
    # The smoothness penalty does not depend on the weights: build it once
    penalty = lam * D.dot(D.transpose())

    w = np.ones(L)
    for _ in range(niter):
        W = sparse.spdiags(w, 0, L, L)
        # z solves the weighted, penalized least-squares system
        z = spsolve(W + penalty, w * y)
        # asymmetric reweighting for the next pass
        w = p * (y > z) + (1 - p) * (y < z)

    # peakfind identifies where peaks are (assigned zero value) and where
    # there are no peaks (assigned 1 value); only the final baseline matters
    residual = y - z
    peakfind = (residual > peak_threshold) * 0 + (residual <= peak_threshold) * 1

    return z, peakfind

In [8]:
# Identifies the longest run in Raman spectra with no peaks. Used in Signal-to-Background Ratio (SBR) calculation
def run_length(vals, max_length=500):
    """Find the longest run of consecutive entries equal to 1 in *vals*.

    Scanning stops as soon as a run reaches *max_length*. Returns
    ``(best_length, range_end)``: the length of the longest run seen and the
    position of its last element (both 0 when no entry equals 1). When two
    runs tie, the earlier one is reported.
    """
    longest = 0
    last_position = 0
    streak = 0
    for position, flag in enumerate(vals):
        if flag != 1:
            # run interrupted
            streak = 0
            continue
        streak += 1
        if streak > longest:
            longest, last_position = streak, position
        if streak == max_length:
            # long enough - no need to look further
            break

    return longest, last_position

In [9]:
# Calculates Signal-to-Background Ratio (SBR)
def SBRcalc(df):
    """Return the Signal-to-Background Ratio (SBR) of the spectrum *df*.

    SBR = height of the tallest point above the fitted baseline divided by
    twice the standard deviation of the background noise
    (2*STD = 95% confidence interval).
    """
    # Baseline of the whole spectrum and the mask of peak-free points
    baseline, peak_free = baseline_als(df, 10e5, 0.01, niter=15)
    # background subtracted data - height of tallest peak above fitted background
    above_baseline = df - baseline

    # find a long run with no peaks
    longest_run, run_last = run_length(peak_free.values, max_length=500)

    # Default: if no sufficiently long peak-free region is found it is assumed
    # that the range is primarily background or broad peaks, so the noise is
    # estimated from the whole spectrum
    noise_region = df
    if longest_run >= 100:
        # use the peak-free run, starting 50 points in from its beginning
        noise_region = df.iloc[run_last - longest_run + 50 : run_last]

    # Flatten the noise region with a symmetric (p = 0.5) baseline fit
    noise_fit, _ = baseline_als(noise_region, 10e4, 0.5, niter=15)
    noise_std = (noise_region - noise_fit).std()

    # Optional checks while tuning:
    # (noise_region - noise_fit).hist()  bins the error into a histogram
    # noise_region.plot()                plots the region to confirm the fit

    return above_baseline.max() / (2 * noise_std)

In [10]:
# Exports text data and metadata details:
def details_CSV(df, sample_name, StartTG_snip, EndTG_snip, sbr):
    """Write the processed spectrum and its metadata to a text file.

    The file is "Final Processed Files/<sample>_Proc.csv", where <sample> is
    the base name of *sample_name* with its last seven characters removed
    (the "_3D_Raw" suffix of the raw file names). It starts with four metadata
    lines (sample name, normalization protocol, time-gated region, SBR)
    followed by *df* written with ";" as separator.

    Parameters
    ----------
    df : the processed spectrum (anything with a ``to_csv`` method).
    sample_name : name of the raw csv file the spectrum came from.
    StartTG_snip, EndTG_snip : limits of the time gate, reported in ns.
    sbr : signal-to-background ratio to report.

    Returns
    -------
    Path of the file that was written.
    """
    # Adjust name
    Read_in_Name = os.path.splitext(sample_name)[0]
    Sample_Name = Read_in_Name[:-7]

    directory = "Final Processed Files"
    os.makedirs(directory, exist_ok=True)
    filename = Sample_Name + "_Proc" + ".csv"
    path = os.path.join(directory, filename)

    metadata = [
        # 1. sample name
        "Sample: " + Sample_Name + "\n",
        # 2. normalization protocol
        "Intensity data was averaged across the time gated region (df.mean(axis=1)) followed by normalization by dividing the whole range by the absolute value of the highest point\n",
        # 3. gating snip
        "The time gated region is "
        + str(StartTG_snip)
        + " - "
        + str(EndTG_snip)
        + " ns\n",
        # 4. Signal to background
        f"The signal to background ratio (SBR) is: {sbr}\n",
    ]

    # 'w' starts a fresh file; the metadata lines are plain text, so no csv
    # writer is needed
    with open(path, "w") as csvfile:
        csvfile.writelines(metadata)

    # append xy data below the metadata
    df.to_csv(path, mode="a", sep=";")

    return path

In [11]:
# Chunk 1: Reads in csv files and initiates calibration
def Chunk_1(fname, start_snip, end_snip, StartWN_snip_one, EndWN_snip_one):
    """Load one raw time-gated Raman file and reduce it to a single spectrum.

    Steps: read the ";"-separated table that follows the first 15 lines of
    *fname*, apply the wavenumber calibration, use the wavenumber column
    (renamed from "NaN" to "cm-1") as index, then let declare_snip gate
    (start_snip to end_snip), average, crop (StartWN_snip_one to
    EndWN_snip_one) and normalize the data. Returns the resulting spectrum.
    """
    # Reads raw time gate Raman data into a Pandas dataframe
    raw_table = pd.read_csv(fname, sep=";", skiprows=15)

    # Wavenumber calibration
    calibrated = Calibration(fname, raw_table)

    # Renames first column and sets it as index
    indexed = calibrated.rename(columns={"NaN": "cm-1"}).set_index("cm-1", drop=True)

    # declare_snip also hands back the gate limits, which are not needed here
    spectrum, _, _ = declare_snip(
        indexed, start_snip, end_snip, StartWN_snip_one, EndWN_snip_one
    )

    return spectrum

Runs the code

In [13]:
# Time-gate window in nanoseconds -- 5.5 ns - 5.8 ns works well for the plasticizer data library
StartTG_snip = 5.5
EndTG_snip = 5.8

# Wavenumber window (used for the SBR calculation) - the function will
# identify the index of values closest to the input values
StartWN_snip_one = 120
EndWN_snip_one = 2500

current_spectra = []

# Process the csv files one at a time
for f in csv_files:
    # Remote files are addressed by URL, so the name must be percent-encoded
    source = path + f if local else quote(path + f, safe=":/")

    # Calibrates wavenumbers, averages over the snipped region and normalizes
    current_spectra = Chunk_1(
        source, StartTG_snip, EndTG_snip, StartWN_snip_one, EndWN_snip_one
    )

    # Saves a PNG plot of the spectrum in the plots folder
    Spectra_plot(current_spectra, f)

    # Calculates Signal-to-Background Ratio
    SBR = SBRcalc(current_spectra)

    # Writes the processed spectrum together with its metadata
    details_CSV(current_spectra, f, StartTG_snip, EndTG_snip, SBR)

In [44]:
# Find all the unique chemistries
# The last 14 characters of each raw file name (replicate number, "_3D_Raw"
# and ".csv") are removed to obtain the chemistry name
chemistries = [f[:-14] for f in csv_files]
unique_chem, unique_indicies, unique_counts = np.unique(chemistries, return_index=True, return_counts=True)

# Loop through each chemistry
for chem in unique_chem:

    # Exact-name match: a substring test (chem in f) would also collect the
    # files of any other chemistry whose name merely contains this one
    replicalist = [f for f in csv_files if f[:-14] == chem]

    Ave_peak_list = []
    Noise_list = []

    # Find the pairs
    for pair in combinations(replicalist, 2):

        print(pair)

        # Processed files start with 4 metadata lines and 1 column-header line
        # (skiprows=5); column 1 holds the intensity.
        # NOTE(review): the wavenumber column is not used as the index here, so
        # both spectra carry a default integer index and the +/-12 window below
        # is counted in sample positions, not in cm-1 - confirm this is intended.
        spectraA = pd.read_csv('Final Processed Files/'+pair[0].replace('3D_Raw','Proc'), sep=";", skiprows=5, header=None).iloc[:,1]
        spectraB = pd.read_csv('Final Processed Files/'+pair[1].replace('3D_Raw','Proc'), sep=";", skiprows=5, header=None).iloc[:,1]

        # Performs background subtraction on both spectra of the pair
        baseA, _ = baseline_als(spectraA, 10e5, 0.01, niter=15)
        baseB, _ = baseline_als(spectraB, 10e5, 0.01, niter=15)
        specA_bkgsub = spectraA - baseA
        specB_bkgsub = spectraB - baseB

        # Finds the major peak in spectrum A; height=0.4 ignores local maxima
        # below 0.4 of the normalized intensity
        peak_positions, properties = find_peaks(spectraA, height=0.4)
        max_peak_index = np.argmax(np.array(properties["peak_heights"]))
        max_peak = int(peak_positions[max_peak_index])

        # Window of +/-12 index units about the major peak
        bumped = specA_bkgsub.reset_index()
        wavenum = bumped.iloc[:, 0]
        S_Range = wavenum.iloc[max_peak] - 12
        E_Range = wavenum.iloc[max_peak] + 12

        # Subtracts the baseline corrected spectra of the two replicates and
        # keeps the region about the major peak
        Subtracted = specA_bkgsub.subtract(specB_bkgsub)
        Noise = Subtracted.loc[S_Range:E_Range]

        # Height of the major peak in each baseline corrected spectrum
        #Noise_STD = np.std(Noise.to_numpy())
        peakA = specA_bkgsub.max()
        peakB = specB_bkgsub.max()
        peak_heights = [peakA, peakB]
        #Ave_peak = np.mean(peak_heights)

        Ave_peak_list.append(peak_heights)
        Noise_list.append(Noise.to_numpy())

    print(Ave_peak_list)
    print(Noise_list)

    # Calculates SNR based on McCreery 2000 Chap. 4
    #SNR = Ave_peak / (Noise_STD / np.sqrt(2))

    #print(SNR)




('2,2,4-trimethyl-1,3-pentanediol diisobutyrate_10_3D_Raw.csv', '2,2,4-trimethyl-1,3-pentanediol diisobutyrate_99_3D_Raw.csv')
[[0.9733897832073034, 0.9730552924865714]]
[array([ 1.95286476e-03,  1.20431739e-03,  6.70388419e-05, -8.76845188e-04,
       -1.08397874e-03, -9.42419677e-04, -9.30504993e-04, -3.55178977e-04,
        8.18767405e-05,  3.09394953e-04,  2.07607566e-03,  2.03469676e-03,
        3.34490721e-04, -9.69519454e-04, -1.18597663e-03,  1.24745889e-03,
        9.88118452e-04, -1.57816132e-03, -1.78073167e-03, -1.43717795e-03,
       -2.09733196e-03, -1.74358074e-03, -5.26343888e-04,  7.43737632e-04,
        1.96475201e-03])]
('2-ethylhexyl diphenyl phosphate_02_3D_Raw.csv', '2-ethylhexyl diphenyl phosphate_03_3D_Raw.csv')
[[0.9581634394368048, 0.9562968498559946]]
[array([-6.58621061e-04, -9.23175182e-04, -2.41668293e-04, -1.40948910e-04,
       -2.48954575e-04,  1.11561406e-04, -5.74445656e-05,  1.85572646e-04,
        1.27028844e-03,  3.23699380e-04, -6.05322761e-04,  1

In [None]:
# Declare the desired snip region -- 5.5 ns - 5.8 ns works well for the plasticizer data library
StartTG_snip = 5.5
EndTG_snip = 5.8

# Declare wavenumber snip region - the function will identify the index of values closest to the input values
StartWN_snip_one = (
    120  # for SNR calculation, set StartWN_snip > 400 to cut out the Rayleigh-wing
)
EndWN_snip_one = 2500

# Declares variables to calculate SNR from duplicate data
current_name = []
current_spectra = []
current_base = []
current_peaks = []
old_name = []
old_spectra = []
# SNR of every processed file, keyed by file name (0 for the first file of each compound)
SNR_by_file = {}
# Allows commenting out SBR, SNR calculation sections if not needed
# SBR = 0
# SNR = 0

# Reads in the csv files one at a time and isolates the snipped region, adjusts wavenumbers for instrument calibration error
for f in csv_files:
    # Averages over the snipped region, normalizes data, and returns
    if local:
        current_spectra = Chunk_1(
            path + f, StartTG_snip, EndTG_snip, StartWN_snip_one, EndWN_snip_one
        )
    else:
        current_spectra = Chunk_1(
            quote(path + f, safe=":/"),
            StartTG_snip,
            EndTG_snip,
            StartWN_snip_one,
            EndWN_snip_one,
        )
    # Creates a new folder called Plots and creates PNG plots of all the spectra in the folder
    Spectra_plot(current_spectra, f)
    # Calculates Signal-to-Background Ratio
    SBR = SBRcalc(current_spectra)

    # Calculates SNR for data in duplicate as the code iterates through the files in the list
    current_name = f[:-14]  # cuts off file type and unique identifiers

    if current_name == old_name:
        # Performs background subtraction on the current spectra and on the stored spectra of the same compound
        current_base, current_peaks = baseline_als(
            current_spectra, 10e5, 0.01, niter=15
        )
        old_base, old_peaks = baseline_als(old_spectra, 10e5, 0.01, niter=15)
        cur_spec_bkgsub = current_spectra - current_base
        old_spec_bkgsub = old_spectra - old_base

        # Finds major peak in the spectra and selects a range of 24 wavenumber about the peak
        # height=0.4 ignores local maxima below 0.4 of the normalized intensity
        peaks, properties = find_peaks(current_spectra, height=0.4)
        max_peak_index = np.argmax(np.array(properties["peak_heights"]))
        max_peak = int(peaks[max_peak_index])

        bumped = cur_spec_bkgsub.reset_index()
        wavenum = bumped.iloc[:, 0]
        S_Range = wavenum.iloc[max_peak] - 12
        E_Range = wavenum.iloc[max_peak] + 12

        # Subtracts the baseline corrected spectra of the two replicates within the region about the major peak
        Subtracted = cur_spec_bkgsub.subtract(old_spec_bkgsub)
        Noise = Subtracted.loc[S_Range:E_Range]

        # Calculates standard deviation of the noise in the region under the major peak and calculates average height of the major peak between the two spectra
        Noise_STD = np.std(Noise.to_numpy())
        current_peak = cur_spec_bkgsub.max()
        old_peak = old_spec_bkgsub.max()
        peaks = [current_peak, old_peak]
        Ave_peak = np.mean(peaks)

        # Calculates SNR based on McCreery 2000 Chap. 4
        SNR = Ave_peak / (Noise_STD / np.sqrt(2))
    else:
        # First file of a new compound: remember it for comparison with its replicates
        old_name = current_name
        old_spectra = current_spectra
        SNR = 0

    # details_CSV accepts five arguments and does not write SNR to the file
    # (passing SNR as a sixth argument raises TypeError), so the value is kept
    # in SNR_by_file instead
    SNR_by_file[f] = SNR
    details_CSV(current_spectra, f, StartTG_snip, EndTG_snip, SBR)