# Imports

In [4]:
import pandas as pd
from pathlib import Path
import os
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict

# Obtenção dos dados

In [2]:
# Absolute path of the directory one level above the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Folder that holds the spreadsheets read below.
# NOTE(review): the folder is named 'Partial Components Analysis'; confirm this
# is where the pre-processed spreadsheets are actually exported.
base_path = Path(parent_dir).joinpath('Partial Components Analysis')

# Calibration spreadsheets: build the four paths, then load one DataFrame each.
file_path_raw_cal = base_path.joinpath('raw_calibration_data.xlsx')
file_path_msc_cal = base_path.joinpath('msc_calibration_data.xlsx')
file_path_snv_cal = base_path.joinpath('snv_calibration_data.xlsx')
file_path_sg_cal = base_path.joinpath('sg_calibration_data.xlsx')

df_raw_cal, df_msc_cal, df_snv_cal, df_sg_cal = (
    pd.read_excel(workbook)
    for workbook in (
        file_path_raw_cal,
        file_path_msc_cal,
        file_path_snv_cal,
        file_path_sg_cal,
    )
)

# Validation spreadsheets: same layout of names as the calibration set.
file_path_raw_val = base_path.joinpath('raw_validation_data.xlsx')
file_path_msc_val = base_path.joinpath('msc_validation_data.xlsx')
file_path_snv_val = base_path.joinpath('snv_validation_data.xlsx')
file_path_sg_val = base_path.joinpath('sg_validation_data.xlsx')

df_raw_val, df_msc_val, df_snv_val, df_sg_val = (
    pd.read_excel(workbook)
    for workbook in (
        file_path_raw_val,
        file_path_msc_val,
        file_path_snv_val,
        file_path_sg_val,
    )
)

In [5]:
# Data centering
def mean_center_data(X):
    """Return ``X`` with its mean along axis 0 subtracted.

    Args:
        X: Array-like accepted by ``np.mean``; for a 2-D input the mean of
            each column is removed from that column.

    Returns:
        The result of ``X - mean``; the input is not modified.
    """
    column_means = np.mean(X, axis=0)
    centered = X - column_means
    return centered

# Outlier detection using Hotelling's T2 and F-residuals
def detect_outliers(pls, X, Y, n_components):
    """Compute per-sample T2 and residual statistics together with their limits.

    Args:
        pls: Fitted model exposing ``predict(X)`` and an ``x_scores_`` attribute.
        X: Predictor matrix. It is passed to ``pls.predict`` and its length is
            the sample count used in the T2 limit.
        Y: Reference responses, either 1-D ``(n_samples,)`` or 2-D
            ``(n_samples, n_targets)``.
        n_components: Number of components used in the T2 limit formula.

    Returns:
        Tuple ``(T2, Q, T2_limit, Q_limit)`` where ``T2`` and ``Q`` hold one
        value per sample and the limits are scalars.
    """
    n_samples = len(X)
    # Both limits use a fixed factor of 2.
    # NOTE(review): presumably a stand-in for an F critical value - confirm.
    T2_limit = 2 * n_components * (n_samples - 1) / (n_samples - n_components)

    # Bring references and predictions to (n_samples, n_targets) before
    # subtracting. A 1-D Y against a column-shaped prediction would otherwise
    # broadcast into an (n, n) matrix, and two 1-D arrays would make the
    # axis=1 reduction below fail.
    Y_ref = np.asarray(Y)
    Y_ref = Y_ref.reshape(len(Y_ref), -1)
    Y_hat = np.asarray(pls.predict(X))  # predicted once, reused for Q and Q_limit
    Y_hat = Y_hat.reshape(len(Y_hat), -1)
    squared_residuals = (Y_ref - Y_hat) ** 2

    Q_limit = 2 * np.mean(squared_residuals)  # F-residual limit
    Q = np.mean(squared_residuals, axis=1)

    # The scores are the ones stored on the model, not recomputed from X.
    # NOTE(review): they are scaled only by sqrt(n - 1), not by each
    # component's score variance - confirm this matches the intended T2.
    scores = pls.x_scores_
    T2 = np.sum((scores / np.sqrt(scores.shape[0] - 1)) ** 2, axis=1)

    return T2, Q, T2_limit, Q_limit

# Determine the optimal number of components
def determine_optimal_components(X_centered, Y, max_components=15, min_improvement=0.009):
    """Pick the number of PLS components at which the cross-validated R2 plateaus.

    For each candidate count from 1 to ``max_components`` a ``PLSRegression``
    is cross-validated with as many folds as there are samples, and the R2 of
    the pooled predictions is recorded. The chosen count is the first one
    after which adding one more component improves R2 by at most
    ``min_improvement``.

    Args:
        X_centered: Predictor matrix (already centered by the caller).
        Y: Reference responses.
        max_components: Largest number of components to try; must be >= 1.
        min_improvement: R2 gain at or below which an extra component is
            considered not worthwhile.

    Returns:
        Tuple ``(optimal_components, mean_explained_variance)`` where the
        second item is the array of R2 values, one per candidate count.

    Raises:
        ValueError: If ``max_components`` is smaller than 1.
    """
    if max_components < 1:
        raise ValueError("max_components must be at least 1")

    n_samples = len(X_centered)
    scores = []
    for n in range(1, max_components + 1):
        pls = PLSRegression(n_components=n)
        # One fold per sample, i.e. leave-one-out cross-validation.
        Y_pred = cross_val_predict(pls, X_centered, Y, cv=n_samples)
        scores.append(r2_score(Y, Y_pred))

    mean_explained_variance = np.array(scores)

    # gains[i] is the R2 gained by going from i + 1 to i + 2 components.
    gains = np.diff(mean_explained_variance)
    plateau = np.flatnonzero(gains <= min_improvement)

    if plateau.size:
        # Index i corresponds to i + 1 components.
        optimal_components = int(plateau[0]) + 1
    else:
        # R2 never levelled off (or only one candidate was tried): keep every
        # component tested instead of silently falling back to 1.
        optimal_components = len(mean_explained_variance)

    return optimal_components, mean_explained_variance

# Model evaluation
def evaluate_model(pls, X_test, Y_test):
    """Score a fitted model against reference values.

    Args:
        pls: Fitted model exposing ``predict(X)``.
        X_test: Predictor matrix passed to ``pls.predict``.
        Y_test: Reference responses for ``X_test``.

    Returns:
        Tuple ``(r2, rmse, bias, Y_pred, slope, offset)``. ``Y_pred`` is the
        raw output of ``pls.predict``; ``bias`` is the mean of predicted minus
        reference; ``slope`` and ``offset`` come from a degree-1 fit of
        predicted against reference values.
    """
    Y_pred = pls.predict(X_test)
    r2 = r2_score(Y_test, Y_pred)
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))

    # Flatten once and reuse: subtracting a column-shaped prediction from a
    # 1-D reference would otherwise build an (n, n) matrix, and np.ravel also
    # accepts inputs without a .flatten() method.
    reference = np.ravel(Y_test)
    predicted = np.ravel(Y_pred)
    bias = np.mean(predicted - reference)

    # A single fit yields both coefficients (highest degree first).
    slope, offset = np.polyfit(reference, predicted, 1)
    return r2, rmse, bias, Y_pred, slope, offset

# Prediction on new data
def predict_new_data(pls, X_new, Y_reference):
    """Predict ``X_new`` and tabulate the predictions against reference values.

    Args:
        pls: Fitted model exposing ``predict(X)``.
        X_new: Predictor matrix passed to ``pls.predict``.
        Y_reference: Reference values, one per row of ``X_new``; may be 1-D
            or column-shaped.

    Returns:
        DataFrame with columns ``Predicted``, ``Reference`` and ``Difference``
        (predicted minus reference), one row per sample.
    """
    # Flatten both sides before subtracting: a column-shaped prediction minus
    # a 1-D reference would broadcast to (n, n) and break the DataFrame.
    predicted = np.ravel(pls.predict(X_new))
    reference = np.ravel(Y_reference)
    results_df = pd.DataFrame({
        'Predicted': predicted,
        'Reference': reference,
        'Difference': predicted - reference,
    })
    return results_df


In [None]:
# Example usage
# Columns 1.. are used as predictors and column 0 as the response.
# NOTE(review): presumably spectra over 350-2500 nm and a quality attribute -
# confirm against the spreadsheet layout.
X = df_msc_cal.iloc[:, 1:].values
# Keep Y as a column so it lines up with the column-shaped test/reference
# arrays below and with column-shaped model predictions.
Y = df_msc_cal.iloc[:, 0].values.reshape(-1, 1)

# Center the data; keep the calibration mean so later data can be shifted by
# the same amount.
X_mean = np.mean(X, axis=0)
X_centered = mean_center_data(X)

# Determine the optimal number of components
optimal_components, mean_explained_variance = determine_optimal_components(X_centered, Y)
print(f"Número ótimo de componentes: {optimal_components}")

# Train the PLSR model with the optimal number of components
pls = PLSRegression(n_components=optimal_components)
pls.fit(X_centered, Y)

# Detect outliers
T2, Q, T2_limit, Q_limit = detect_outliers(pls, X_centered, Y, optimal_components)
print(f"T2 Limit: {T2_limit}, Q Limit: {Q_limit}")

# Model evaluation on placeholder random data. The feature count follows the
# calibration matrix instead of a hard-coded width.
n_features = X.shape[1]
X_test = np.random.rand(20, n_features)  # Example test data
Y_test = np.random.rand(20, 1)           # Example quality attribute for the test set
# The model was fitted on calibration-centered data, so test data must be
# shifted by the calibration mean, not by its own mean.
X_test_centered = X_test - X_mean

r2, rmse, bias, Y_pred, slope, offset = evaluate_model(pls, X_test_centered, Y_test)
print(f"R2: {r2}, RMSE: {rmse}, Bias: {bias}, Slope: {slope}, Offset: {offset}")

# Plot the results
plt.figure(figsize=(10, 6))
plt.scatter(Y_test, Y_pred, color='blue')
plt.plot(Y_test, slope * Y_test + offset, color='red')

# Add the metrics to the plot
plt.text(0.05, 0.95, f'Slope: {slope:.6f}\nOffset: {offset:.6f}\nRMSE: {rmse:.6f}\nR-Square: {r2:.6f}',
         transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white'))

plt.xlabel('Reference Y')
plt.ylabel('Predicted Y')
plt.title('Predicted vs. Reference')
plt.show()

# Prediction on new data (placeholder random data again)
new_data = np.random.rand(10, n_features)  # New predictor rows
Y_reference = np.random.rand(10, 1)        # Reference quality attribute
# Apply the same calibration-mean shift before predicting.
results_df = predict_new_data(pls, new_data - X_mean, Y_reference)
print(results_df)