# Imports

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Dados

In [2]:
# Directory one level above the notebook's current working directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Folder that holds the calibration / validation workbooks
base_path = Path(parent_dir) / 'Partial Components Analysis'

# Calibration workbooks, one per dataset variant (RAW, MSC, SNV, SG)
(file_path_raw_cal,
 file_path_msc_cal,
 file_path_snv_cal,
 file_path_sg_cal) = (
    base_path / workbook
    for workbook in (
        'RAW_calibration.xlsx',
        'MSC_calibration.xlsx',
        'SNV_calibration.xlsx',
        'SG_calibration.xlsx',
    )
)

df_raw_cal, df_msc_cal, df_snv_cal, df_sg_cal = map(
    pd.read_excel,
    (file_path_raw_cal, file_path_msc_cal, file_path_snv_cal, file_path_sg_cal),
)

# Validation workbooks, same four variants
(file_path_raw_val,
 file_path_msc_val,
 file_path_snv_val,
 file_path_sg_val) = (
    base_path / workbook
    for workbook in (
        'RAW_validation.xlsx',
        'MSC_validation.xlsx',
        'SNV_validation.xlsx',
        'SG_validation.xlsx',
    )
)

df_raw_val, df_msc_val, df_snv_val, df_sg_val = map(
    pd.read_excel,
    (file_path_raw_val, file_path_msc_val, file_path_snv_val, file_path_sg_val),
)

# Testes

In [3]:
# Features: every column from position 6 onward; target: the 'SST' column.
# NOTE(review): assumes the first six columns are not model inputs - confirm against the workbook layout.
X_train = df_msc_cal.iloc[:, 6:]
y_train = df_msc_cal['SST'].values
X_test = df_msc_val.iloc[:, 6:]
y_test = df_msc_val['SST'].values

In [4]:
# Display (rows, columns) of the training and test feature matrices
(X_train.shape, X_test.shape)

((175, 2151), (75, 2151))

In [5]:
# Keep the feature column labels (referenced by the commented-out scaling code in the next cell)
cols = X_train.columns

In [34]:
# Manual standardization of X_train / X_test is currently disabled (commented out).
# NOTE(review): if this is re-enabled, `columns=[cols]` wraps the labels in a list and
# likely produces MultiIndex columns; `columns=cols` is probably what was intended.
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
#X_train = pd.DataFrame(X_train, columns=[cols])
#X_test = pd.DataFrame(X_test, columns=[cols])

In [8]:
# Helper that computes the regression evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Summarize how well ``y_pred`` matches ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted target values. Lists, pandas Series and
        column vectors are accepted; both are flattened to 1-D float arrays
        before any metric is computed.

    Returns
    -------
    dict
        "Correlation coefficient" (Pearson r from ``np.corrcoef``),
        "Mean absolute error", "Root mean squared error",
        "Relative absolute error" and "Root relative squared error"
        (both in percent, relative to always predicting the mean of
        ``y_true``), and "Total Number of Instances".

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.

    Notes
    -----
    The two relative errors divide by the spread of ``y_true`` and are
    therefore undefined when ``y_true`` is constant.
    """
    # Flatten so that plain lists and (n, 1) column vectors behave like 1-D arrays;
    # np.corrcoef treats each row of a 2-D input as a separate variable.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of values "
            f"(got {y_true.size} and {y_pred.size})"
        )

    correlation_coefficient = np.corrcoef(y_true, y_pred)[0, 1]
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Baseline errors of a predictor that always answers mean(y_true)
    mean_y_true = np.mean(y_true)
    baseline_mae = np.mean(np.abs(y_true - mean_y_true))
    baseline_rmse = np.std(y_true)

    relative_absolute_error = 100 * (mae / baseline_mae)
    root_relative_squared_error = 100 * (rmse / baseline_rmse)

    return {
        "Correlation coefficient": correlation_coefficient,
        "Mean absolute error": mae,
        "Root mean squared error": rmse,
        "Relative absolute error": relative_absolute_error,
        "Root relative squared error": root_relative_squared_error,
        "Total Number of Instances": len(y_true)
    }

def display_metrics(title, metrics):
    """Print a titled block with one ``name: value`` line per metric.

    Parameters
    ----------
    title : str
        Heading printed as ``=== title ===`` after a blank line.
    metrics : dict
        Maps metric names to numeric values. Integer values (e.g. an
        instance count) are printed without decimals; every other value is
        printed with four decimal places.
    """
    print(f"\n=== {title} ===")
    for metric, value in metrics.items():
        if isinstance(value, (int, np.integer)):
            # Counts should not be rendered as "175.0000"
            print(f"{metric}: {int(value)}")
        else:
            print(f"{metric}: {value:.4f}")

# SVR configuration: polynomial kernel of degree 1 (the kernel is 'poly', not RBF)
model = SVR(
    C=1,              # regularization parameter
    epsilon=0.001,    # epsilon of the epsilon-insensitive loss
    kernel='poly',    # polynomial kernel
    degree=1,         # polynomial degree
    coef0=1,          # independent term of the polynomial kernel
    tol=0.001,        # tolerance of the stopping criterion
    cache_size=200,   # kernel cache in MB (scikit-learn units); the former 250007 meant ~244 GB
    shrinking=True,   # use the shrinking heuristic
    verbose=False     # do not print solver messages
)

# Fit on the full training set
model.fit(X_train, y_train)

# Performance on the training set itself
y_train_pred = model.predict(X_train)
training_metrics = calculate_metrics(y_train, y_train_pred)
display_metrics("Training Metrics", training_metrics)

# Leave-One-Out cross-validation on the training set
# (LeaveOneOut is imported in the notebook's import cell)
loo = LeaveOneOut()
y_train_cv = cross_val_predict(model, X_train, y_train, cv=loo)
cv_metrics = calculate_metrics(y_train, y_train_cv)
display_metrics("Cross-Validation Metrics", cv_metrics)

# Final evaluation on the held-out set, using the model fitted above on all training rows
y_pred_val = model.predict(X_test)
validation_metrics = calculate_metrics(y_test, y_pred_val)
display_metrics("Validation Metrics", validation_metrics)


=== Training Metrics ===
Correlation coefficient: 0.8412
Mean absolute error: 1.1297
Root mean squared error: 1.4467
Relative absolute error: 51.1629
Root relative squared error: 54.4612
Total Number of Instances: 175.0000

=== Cross-Validation Metrics ===
Correlation coefficient: 0.8206
Mean absolute error: 1.2187
Root mean squared error: 1.5218
Relative absolute error: 55.1948
Root relative squared error: 57.2882
Total Number of Instances: 175.0000

=== Validation Metrics ===
Correlation coefficient: 0.7561
Mean absolute error: 1.1345
Root mean squared error: 1.3880
Relative absolute error: 65.8262
Root relative squared error: 65.9570
Total Number of Instances: 75.0000


In [10]:
# Helper that computes the regression evaluation metrics
# NOTE(review): an earlier cell also defines calculate_metrics; running this cell rebinds the name.
def calculate_metrics(y_true, y_pred):
    """Summarize how well ``y_pred`` matches ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted target values. Lists, pandas Series and
        column vectors are accepted; both are flattened to 1-D float arrays
        before any metric is computed.

    Returns
    -------
    dict
        "Correlation coefficient" (Pearson r from ``np.corrcoef``),
        "Mean absolute error", "Root mean squared error",
        "Relative absolute error" and "Root relative squared error"
        (both in percent, relative to always predicting the mean of
        ``y_true``), and "Total Number of Instances".

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.

    Notes
    -----
    The two relative errors divide by the spread of ``y_true`` and are
    therefore undefined when ``y_true`` is constant.
    """
    # Flatten so that plain lists and (n, 1) column vectors behave like 1-D arrays;
    # np.corrcoef treats each row of a 2-D input as a separate variable.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of values "
            f"(got {y_true.size} and {y_pred.size})"
        )

    correlation_coefficient = np.corrcoef(y_true, y_pred)[0, 1]
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Baseline errors of a predictor that always answers mean(y_true)
    mean_y_true = np.mean(y_true)
    baseline_mae = np.mean(np.abs(y_true - mean_y_true))
    baseline_rmse = np.std(y_true)

    relative_absolute_error = 100 * (mae / baseline_mae)
    root_relative_squared_error = 100 * (rmse / baseline_rmse)

    return {
        "Correlation coefficient": correlation_coefficient,
        "Mean absolute error": mae,
        "Root mean squared error": rmse,
        "Relative absolute error": relative_absolute_error,
        "Root relative squared error": root_relative_squared_error,
        "Total Number of Instances": len(y_true)
    }

def display_metrics(title, metrics):
    """Print a titled block with one ``name: value`` line per metric.

    Parameters
    ----------
    title : str
        Heading printed as ``=== title ===`` after a blank line.
    metrics : dict
        Maps metric names to numeric values. Integer values (e.g. an
        instance count) are printed without decimals; every other value is
        printed with four decimal places.
    """
    print(f"\n=== {title} ===")
    for metric, value in metrics.items():
        if isinstance(value, (int, np.integer)):
            # Counts should not be rendered as "175.0000"
            print(f"{metric}: {int(value)}")
        else:
            print(f"{metric}: {value:.4f}")

# Configure the SVR with parameters similar to the ones used in Weka
model = SVR(
    C=4,              # model complexity (regularization) parameter
    epsilon=0.1,      # epsilon of the epsilon-insensitive loss
    kernel='poly',    # polynomial kernel
    degree=2,         # polynomial degree (exponent)
    gamma=0.001,      # kernel coefficient gamma (unrelated to any cache setting)
    tol=0.001,        # tolerance of the stopping criterion
    cache_size=200,   # kernel cache in MB (scikit-learn units); the former 250007 meant ~244 GB
    verbose=False,    # do not print solver messages
    coef0=1           # independent term of the polynomial kernel
)

# Build the pipeline: standardize the features, then fit the SVR
pipeline = make_pipeline(
    StandardScaler(),  # feature standardization
    model  # SVR model
)

# Fit on the full training set
pipeline.fit(X_train, y_train)

# Performance on the training set itself
y_train_pred = pipeline.predict(X_train)
training_metrics = calculate_metrics(y_train, y_train_pred)
display_metrics("Training Metrics", training_metrics)

# Leave-One-Out cross-validation on the training set; passing the whole pipeline
# means the scaler is part of what gets cross-validated
# (LeaveOneOut is imported in the notebook's import cell)
loo = LeaveOneOut()
y_train_cv = cross_val_predict(pipeline, X_train, y_train, cv=loo)
cv_metrics = calculate_metrics(y_train, y_train_cv)
display_metrics("Cross-Validation Metrics", cv_metrics)

# Final evaluation on the held-out set
y_pred_val = pipeline.predict(X_test)
validation_metrics = calculate_metrics(y_test, y_pred_val)
display_metrics("Validation Metrics", validation_metrics)


=== Training Metrics ===
Correlation coefficient: 0.9605
Mean absolute error: 0.4644
Root mean squared error: 0.7431
Relative absolute error: 21.0324
Root relative squared error: 27.9758
Total Number of Instances: 175.0000

=== Cross-Validation Metrics ===
Correlation coefficient: 0.8446
Mean absolute error: 1.0726
Root mean squared error: 1.4889
Relative absolute error: 48.5783
Root relative squared error: 56.0495
Total Number of Instances: 175.0000

=== Validation Metrics ===
Correlation coefficient: 0.8768
Mean absolute error: 0.8197
Root mean squared error: 1.0471
Relative absolute error: 47.5581
Root relative squared error: 49.7610
Total Number of Instances: 75.0000


In [7]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Pipeline: standardize the features, then fit a polynomial-kernel SVR
pipeline = make_pipeline(StandardScaler(), SVR(kernel='poly'))

# Hyper-parameter grid; the 'svr__' prefix addresses the SVR step of the pipeline
param_grid = {
    'svr__C': [0.1, 1],  # regularization values
    'svr__degree': [2, 3],     # polynomial degrees
    'svr__gamma': ['scale', 'auto', 0.1, 0.01, 0.001],  # kernel coefficient gamma
    'svr__coef0': [0.0, 0.1, 0.5, 1],  # independent term of the kernel
    'svr__epsilon': [0.1, 0.01, 0.001],  # epsilon of the loss function
}

# Grid search scored by (negated) mean absolute error.
# Leave-One-Out yields one fold per training sample: on the 175-row training set
# these are the same splits the hard-coded cv=175 produced, but the setting no
# longer breaks or silently changes meaning if the number of rows changes.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # MAE as the selection metric
    cv=LeaveOneOut(),  # Leave-One-Out cross-validation
    n_jobs=-1,  # use every available processor
    verbose=3  # show progress details
)

# Run the search
grid_search.fit(X_train, y_train)

# Best hyper-parameters
print("Melhores parâmetros encontrados:")
print(grid_search.best_params_)

# Predict on the test set with the best estimator found
y_pred = grid_search.predict(X_test)

# Compute and print the test-set metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nResultados no conjunto de teste:")
print(f"MAE (Mean Absolute Error): {mae:.4f}")
print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")
print(f"R² (Coeficiente de Determinação): {r2:.4f}")


Fitting 175 folds for each of 240 candidates, totalling 42000 fits
Melhores parâmetros encontrados:
{'svr__C': 1, 'svr__coef0': 1, 'svr__degree': 2, 'svr__epsilon': 0.001, 'svr__gamma': 0.001}

Resultados no conjunto de teste:
MAE (Mean Absolute Error): 0.9236
RMSE (Root Mean Squared Error): 1.1644
R² (Coeficiente de Determinação): 0.6938
