# Pre-processing


Aqui aplicamos os filtros de pré-processamento para refinar os dados de espectroscopia. Os filtros utilizados são a Correção Multiplicativa de Espalhamento (MSC), a Padronização Normal de Sinal (SNV) e a primeira derivada de Savitzky-Golay.

# Imports


In [20]:
import os
import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler

# Load data

In [25]:
def load_data(filepath):
    """Read the spectral workbook and split it into metadata and spectra.

    Parameters
    ----------
    filepath : str or path-like
        Excel workbook, read with the ``openpyxl`` engine.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(metadata, wavelengths)``: the columns at positions 0-4 and 6,
        and every column from position 7 onward.
    """
    # Column position 5 is not selected into either frame.
    metadata_positions = [0, 1, 2, 3, 4, 6]
    first_spectral_position = 7

    table = pd.read_excel(filepath, engine="openpyxl")
    return (
        table.iloc[:, metadata_positions],
        table.iloc[:, first_spectral_position:],
    )

# Filter algorithms

In [38]:
def msc(input_data, reference=None):
    """Apply Multiplicative Scatter Correction (MSC) to a set of spectra.

    Each spectrum (row) is regressed against a reference spectrum with a
    first-degree least-squares fit, ``spectrum ~ slope * reference +
    intercept``, and then corrected as ``(spectrum - intercept) / slope``.

    Parameters
    ----------
    input_data : array-like of shape (n_samples, n_points)
        Spectra, one per row. NumPy arrays, DataFrames and nested lists are
        accepted; the values are converted to a float array.
    reference : array-like of shape (n_points,), optional
        Spectrum to regress against. Defaults to the column-wise mean of
        ``input_data``.

    Returns
    -------
    pandas.DataFrame
        Corrected spectra with the same shape as ``input_data`` and default
        integer row/column labels.

    Raises
    ------
    ValueError
        If ``input_data`` is not two-dimensional, or ``reference`` does not
        have one value per column of ``input_data``.
    """
    spectra = np.asarray(input_data, dtype=float)
    if spectra.ndim != 2:
        raise ValueError(
            f"input_data must be 2-D (n_samples, n_points); got {spectra.ndim}-D"
        )

    if reference is None:
        reference_spectrum = spectra.mean(axis=0)
    else:
        reference_spectrum = np.asarray(reference, dtype=float).ravel()
        if reference_spectrum.shape[0] != spectra.shape[1]:
            raise ValueError(
                "reference must have one value per column of input_data"
            )

    # np.polyfit accepts a 2-D ``y`` and fits every column independently, so
    # passing the transposed spectra fits all rows in a single call.
    slopes, intercepts = np.polyfit(reference_spectrum, spectra.T, 1)

    corrected = (spectra - intercepts[:, np.newaxis]) / slopes[:, np.newaxis]
    return pd.DataFrame(corrected)

def snv(input_data):
    """Apply Standard Normal Variate (SNV) scaling to each row.

    Every row has its own mean subtracted and is then divided by its own
    standard deviation (NumPy's default ``ddof=0``).

    Parameters
    ----------
    input_data : numpy.ndarray
        Two-dimensional array with one spectrum per row.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with each row centred and scaled.
    """
    # keepdims=True keeps a trailing axis of length 1 so the per-row
    # statistics broadcast across the columns.
    row_mean = np.mean(input_data, axis=1, keepdims=True)
    row_std = np.std(input_data, axis=1, keepdims=True)
    centered = input_data - row_mean
    return centered / row_std

def savitzky_golay(input_data, window_size=41, poly_order=1):
    """Return the Savitzky-Golay first derivative of each row.

    Parameters
    ----------
    input_data : array-like
        Two-dimensional data; the filter runs along axis 1 (across columns).
    window_size : int, default 41
        Passed to ``savgol_filter`` as ``window_length``.
    poly_order : int, default 1
        Passed to ``savgol_filter`` as ``polyorder``.

    Returns
    -------
    numpy.ndarray
        First-derivative (``deriv=1``) filter output, same shape as the input.
    """
    first_derivative = savgol_filter(
        input_data,
        window_size,
        poly_order,
        deriv=1,
        axis=1,
    )
    return first_derivative

# Plot and save

In [39]:
def plot_spectra(wavelengths, processed_data, title, save_path):
    """Plot every processed spectrum on one figure and save it to disk.

    Parameters
    ----------
    wavelengths : array-like or None
        X-axis values. Used when it is a numeric 1-D sequence with one value
        per column of ``processed_data``; otherwise the axis falls back to an
        evenly spaced 350-2500 grid (the behaviour before this argument was
        honoured).
    processed_data : numpy.ndarray or pandas.DataFrame
        Two-dimensional data, one spectrum per row.
    title : str
        Figure title.
    save_path : str or path-like
        Destination passed to ``plt.savefig``.
    """
    n_points = processed_data.shape[1]

    x_values = None
    if wavelengths is not None:
        candidate = np.asarray(wavelengths)
        is_usable_axis = (
            candidate.ndim == 1
            and candidate.shape[0] == n_points
            and np.issubdtype(candidate.dtype, np.number)
        )
        if is_usable_axis:
            x_values = candidate
    if x_values is None:
        x_values = np.linspace(350, 2500, num=n_points)

    figure = plt.figure(figsize=(10, 5))
    try:
        # Transposing makes each original row one line on the plot.
        plt.plot(x_values, processed_data.T, alpha=0.5)
        plt.xlabel("Comprimento de onda (nm)")
        plt.ylabel("Absorbância")
        plt.title(title)
        plt.axhline(y=0, color='k', linewidth=1.5)
        plt.savefig(save_path)
    finally:
        # Close the figure even if plotting or saving fails, so repeated
        # calls do not accumulate open figures.
        plt.close(figure)

def save_processed_data(metadata, processed_data, filename, output_folder, new_wavelengths):
    """Write metadata and processed spectra side by side to an Excel file.

    Rows are paired by position: the first metadata row goes with the first
    processed spectrum, and so on, regardless of the index labels either
    input carries.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Descriptive columns, one row per spectrum.
    processed_data : numpy.ndarray or pandas.DataFrame
        Two-dimensional processed spectra, one per row.
    filename : str
        Name of the workbook created inside ``output_folder``.
    output_folder : str or path-like
        Directory the workbook is written to.
    new_wavelengths : array-like
        Column labels for the spectral columns; must have one entry per
        column of ``processed_data``.

    Raises
    ------
    ValueError
        If ``new_wavelengths`` does not match the number of spectral columns.
    """
    # Build the frame from a plain array: this never relabels the caller's
    # object and drops any index labels ``processed_data`` may carry.
    spectral_values = np.asarray(processed_data)
    processed_df = pd.DataFrame(spectral_values, columns=new_wavelengths)

    # pd.concat(axis=1) aligns on index labels; resetting the metadata index
    # makes the join positional so rows cannot be shuffled or NaN-padded when
    # the metadata index is not the default 0..n-1 range.
    positional_metadata = metadata.reset_index(drop=True)

    result = pd.concat([positional_metadata, processed_df], axis=1)
    result.to_excel(os.path.join(output_folder, filename), index=False)


# Main

In [41]:
def main(filepath=None):
    """Run the pre-processing pipeline and write its results to disk.

    Loads the dataset, applies MSC, SNV and the Savitzky-Golay first
    derivative to the spectra, and for each result writes an Excel workbook
    and a PNG plot into a ``Processed`` folder located two directory levels
    above the input file.

    Parameters
    ----------
    filepath : str, optional
        Path of the input Excel workbook. Defaults to the original
        hard-coded dataset location when omitted.
    """
    if filepath is None:
        filepath = r"C:\Users\luyza\OneDrive\Documentos\spectroscopy_machine_learning_analysis\Data\dataset_cotton_fruit.xlsx"

    root_folder = os.path.dirname(os.path.dirname(filepath))
    output_folder = os.path.join(root_folder, "Processed")
    os.makedirs(output_folder, exist_ok=True)

    metadata, wavelengths = load_data(filepath)

    # The filters index rows and columns, so promote 1-D input to one row.
    spectra = np.atleast_2d(wavelengths.values)

    # Column labels / x-axis for the outputs: an evenly spaced 350-2500 grid
    # with one value per spectral column.
    new_wavelengths = np.linspace(350, 2500, num=spectra.shape[1])

    # Run every filter on the raw spectra before writing any output.
    outputs = (
        (msc(spectra), "msc_processed.xlsx", "MSC Pre-processed Spectra", "msc_plot.png"),
        (snv(spectra), "snv_processed.xlsx", "SNV Pre-processed Spectra", "snv_plot.png"),
        (savitzky_golay(spectra), "sg_processed.xlsx", "Savitzky-Golay 1st Derivative", "sg_plot.png"),
    )

    for processed, workbook_name, plot_title, figure_name in outputs:
        save_processed_data(metadata, processed, workbook_name, output_folder, new_wavelengths)
        plot_spectra(new_wavelengths, processed, plot_title, os.path.join(output_folder, figure_name))

# Run the pipeline only when this module is the entry point (__name__ is
# "__main__"), not when it is imported by another module.
if __name__ == "__main__":
    main()