# Imports

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_predict
import os
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Dados

In [2]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Definindo o caminho base para a pasta Pre-processamento
base_path = Path(parent_dir) / 'Partial Components Analysis'

file_path_raw_cal = base_path / 'RAW_calibration.xlsx'
file_path_msc_cal = base_path / 'MSC_calibration.xlsx'
file_path_snv_cal = base_path / 'SNV_calibration.xlsx'
file_path_sg_cal = base_path / 'SG_calibration.xlsx'

df_raw_cal = pd.read_excel(file_path_raw_cal)
df_msc_cal = pd.read_excel(file_path_msc_cal)
df_snv_cal = pd.read_excel(file_path_snv_cal)
df_sg_cal = pd.read_excel(file_path_sg_cal)

file_path_raw_val = base_path / 'RAW_validation.xlsx'
file_path_msc_val = base_path / 'MSC_validation.xlsx'
file_path_snv_val = base_path / 'SNV_validation.xlsx'
file_path_sg_val = base_path / 'SG_validation.xlsx'

df_raw_val = pd.read_excel(file_path_raw_val)
df_msc_val = pd.read_excel(file_path_msc_val)
df_snv_val = pd.read_excel(file_path_snv_val)
df_sg_val = pd.read_excel(file_path_sg_val)

# Testes

In [46]:
X_test, y_test = df_msc_val.iloc[:,6:], df_msc_val['SST'].values
X_train, y_train = df_msc_cal.iloc[:,6:], df_msc_cal['SST'].values

In [47]:
X_train.shape, X_test.shape

((175, 2151), (75, 2151))

In [9]:
cols = X_train.columns

In [43]:
#scaler = StandardScaler()

#X_train = scaler.fit_transform(X_train)

#X_test = scaler.transform(X_test)

#X_train = pd.DataFrame(X_train, columns=[cols])

#X_test = pd.DataFrame(X_test, columns=[cols])

In [49]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr

# Função para calcular as métricas
def calculate_metrics(y_true, y_pred):
    correlation_coefficient = np.corrcoef(y_true, y_pred)[0, 1]
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mean_y_true = np.mean(y_true)
    relative_absolute_error = 100 * (mae / np.mean(np.abs(y_true - mean_y_true)))
    root_relative_squared_error = 100 * (rmse / np.std(y_true))

    return {
        "Correlation coefficient": correlation_coefficient,
        "Mean absolute error": mae,
        "Root mean squared error": rmse,
        "Relative absolute error": relative_absolute_error,
        "Root relative squared error": root_relative_squared_error,
        "Total Number of Instances": len(y_true)
    }

def display_metrics(title, metrics):
    print(f"\n=== {title} ===")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# Função para remover colunas com baixa variância
def remove_low_variance_columns(df, threshold=0.01):
    try:
        df_numeric = df.select_dtypes(include=[np.number])
        if df_numeric.empty:
            raise ValueError("O DataFrame não contém colunas numéricas.")
        selector = VarianceThreshold(threshold=threshold)
        selector.fit(df_numeric)
        cols_to_keep = selector.get_support(indices=True)
        if len(cols_to_keep) == 0:
            raise ValueError("Todas as colunas foram removidas devido à baixa variância.")
        return df_numeric.iloc[:, cols_to_keep]
    except Exception as e:
        print(f"Erro ao remover colunas com baixa variância: {e}")
        return df

# Função para detectar e remover colunas que são identificadores
def remove_identifier_columns(df):
    try:
        cols_to_remove = [col for col in df.columns if df[col].nunique() == len(df)]
        if not cols_to_remove:
            raise ValueError("Nenhuma coluna identificadora foi encontrada.")
        return df.drop(cols_to_remove, axis=1)
    except Exception as e:
        print(f"Erro ao remover colunas identificadoras: {e}")
        return df

# Função para remover colunas com baixa correlação com o alvo
def remove_low_correlation_columns(df, target_column, threshold=0.1):
    try:
        correlations = df.apply(lambda x: pearsonr(x, df[target_column])[0] if x.name != target_column else 1)
        cols_to_remove = correlations[correlations.abs() < threshold].index
        if not cols_to_remove:
            raise ValueError("Nenhuma coluna com baixa correlação foi encontrada.")
        return df.drop(cols_to_remove, axis=1)
    except Exception as e:
        print(f"Erro ao remover colunas com baixa correlação: {e}")
        return df

# Verificar e carregar os dados
X_train = pd.read_csv('seu_arquivo.csv')  # Certifique-se de carregar os dados corretamente
y_train = X_train.pop('SST')  # Ajuste conforme o nome da sua coluna alvo

print("Informações do DataFrame original:")
print(X_train.info())
print(X_train.head())

# Aplicar o StandardScaler aos dados
try:
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
except Exception as e:
    print(f"Erro ao aplicar o StandardScaler: {e}")

# Verificar os dados após o escalonamento
print("Informações do DataFrame escalonado:")
print(X_train_scaled.info())
print(X_train_scaled.head())

# Aplicar as funções de limpeza ao dataset escalonado
X_train_cleaned = remove_low_variance_columns(X_train_scaled)
print("Após remoção de baixa variância:")
print(X_train_cleaned.info())

X_train_cleaned = remove_identifier_columns(X_train_cleaned)
print("Após remoção de colunas identificadoras:")
print(X_train_cleaned.info())

X_train_cleaned = remove_low_correlation_columns(X_train_cleaned, 'SST')  # Substitua 'SST' pelo nome da sua coluna alvo
print("Após remoção de baixa correlação:")
print(X_train_cleaned.info())

# Configurar e treinar o modelo SVR
model = SVR(
    C=5,             # Regularização
    epsilon=0.001,     # Tolerância
    kernel='poly',     # Kernel polinomial
    degree=1,          # Grau do polinômio
    coef0=1,           # Coeficiente do polinômio
    tol=1.0E-12        # Tolerância de precisão
)

# Garantir que os dados estejam limpos e prontos
print("Treinando o modelo com dados limpos:")
model.fit(X_train_cleaned, y_train)

# Fazer previsões no conjunto de treinamento
y_train_pred = model.predict(X_train_cleaned)
training_metrics = calculate_metrics(y_train, y_train_pred)
display_metrics("Training Metrics", training_metrics)

folds = 175  # Defina o número de folds para a validação cruzada
y_train_cv = cross_val_predict(model, X_train_cleaned, y_train, cv=folds)
cv_metrics = calculate_metrics(y_train, y_train_cv)
display_metrics("Cross-Validation Metrics", cv_metrics)

y_pred_val = model.predict(X_test)
validation_metrics = calculate_metrics(y_test, y_pred_val)
display_metrics("Validation Metrics", validation_metrics)


FileNotFoundError: [Errno 2] No such file or directory: 'seu_arquivo.csv'

In [34]:
param_grid = {
    'C': [1, 10, 50],        # Ajustando C para valores menores
    'epsilon': [0.1, 0.2, 0.3],  # Aumentando epsilon para permitir maior tolerância
    'gamma': [0.01, 0.1],    # Aumentando gamma para reduzir a sensibilidade do modelo
    'kernel': ['rbf'],
    'tol': [1e-08, 1e-06]    # Aumentando tol para uma otimização menos agressiva
}

grid_search = GridSearchCV(estimator=SVR(), param_grid=param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Treinando e avaliando o modelo otimizado
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
training_metrics = calculate_metrics(y_train, y_train_pred)
display_metrics("Optimized Training Metrics", training_metrics)


Best parameters found:  {'C': 10, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'rbf', 'tol': 1e-08}

=== Optimized Training Metrics ===
Correlation coefficient: 0.9998
Mean absolute error: 0.0978
Root mean squared error: 0.0986
Relative absolute error: 4.4309
Root relative squared error: 3.7124
Total Number of Instances: 175.0000
