# Imports

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_predict
import os
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Dados

In [4]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Definindo o caminho base para a pasta Pre-processamento
base_path = Path(parent_dir) / 'Partial Components Analysis'

file_path_raw_cal = base_path / 'RAW_calibration.xlsx'
file_path_msc_cal = base_path / 'MSC_calibration.xlsx'
file_path_snv_cal = base_path / 'SNV_calibration.xlsx'
file_path_sg_cal = base_path / 'SG_calibration.xlsx'

df_raw_cal = pd.read_excel(file_path_raw_cal)
df_msc_cal = pd.read_excel(file_path_msc_cal)
df_snv_cal = pd.read_excel(file_path_snv_cal)
df_sg_cal = pd.read_excel(file_path_sg_cal)

file_path_raw_val = base_path / 'RAW_validation.xlsx'
file_path_msc_val = base_path / 'MSC_validation.xlsx'
file_path_snv_val = base_path / 'SNV_validation.xlsx'
file_path_sg_val = base_path / 'SG_validation.xlsx'

df_raw_val = pd.read_excel(file_path_raw_val)
df_msc_val = pd.read_excel(file_path_msc_val)
df_snv_val = pd.read_excel(file_path_snv_val)
df_sg_val = pd.read_excel(file_path_sg_val)

# Testes

In [18]:
def calculate_metrics(y_true, y_pred):
    correlation_coefficient = np.corrcoef(y_true, y_pred)[0, 1]
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    relative_absolute_error = 100 * (mae / np.mean(np.abs(y_true - np.mean(y_true))))
    root_relative_squared_error = 100 * (rmse / np.std(y_true))
    
    return {
        "Correlation coefficient": correlation_coefficient,
        "Mean absolute error": mae,
        "Root mean squared error": rmse,
        "Relative absolute error": relative_absolute_error,
        "Root relative squared error": root_relative_squared_error,
        "Total Number of Instances": len(y_true)
    }

def display_metrics(title, metrics):
    print(f"\n=== {title} ===")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print("\n")

# Carregar dados de calibração
calibration_data = df_msc_cal
X_calibration = calibration_data.iloc[:, 6:]
y_calibration = calibration_data['SST']

# Normalizar os dados
scaler = StandardScaler()
X_calibration_scaled = scaler.fit_transform(X_calibration)

# Ajustar o modelo para refletir o Weka's SMOreg com PolyKernel
model = SVR(kernel='poly', degree=1, C=1.0, epsilon=0.001)
model.fit(X_calibration_scaled, y_calibration)

# Avaliar no conjunto de treinamento
y_pred_train = model.predict(X_calibration_scaled)
training_metrics = calculate_metrics(y_calibration, y_pred_train)
display_metrics("Training Metrics", training_metrics)

# Realizar validação cruzada
folds = len(X_calibration)
y_pred_cv = cross_val_predict(model, X_calibration_scaled, y_calibration, cv=folds)
cv_metrics = calculate_metrics(y_calibration, y_pred_cv)
display_metrics("Cross-Validation Metrics", cv_metrics)

# Carregar dados de validação
validation_data = df_msc_val
X_validation = validation_data.iloc[:, 6:]
y_validation = validation_data['SST']

# Normalizar os dados de validação com o mesmo scaler
X_validation_scaled = scaler.transform(X_validation)

# Avaliar no conjunto de validação
y_pred_validation = model.predict(X_validation_scaled)
validation_metrics = calculate_metrics(y_validation, y_pred_validation)
display_metrics("Validation Metrics", validation_metrics)


=== Training Metrics ===
Correlation coefficient: 0.8412
Mean absolute error: 1.1297
Root mean squared error: 1.4467
Relative absolute error: 51.1634
Root relative squared error: 54.4617
Total Number of Instances: 175.0000



=== Cross-Validation Metrics ===
Correlation coefficient: 0.8206
Mean absolute error: 1.2187
Root mean squared error: 1.5218
Relative absolute error: 55.1949
Root relative squared error: 57.2876
Total Number of Instances: 175.0000



=== Validation Metrics ===
Correlation coefficient: 0.7561
Mean absolute error: 1.1345
Root mean squared error: 1.3879
Relative absolute error: 65.8251
Root relative squared error: 65.9561
Total Number of Instances: 75.0000




In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_msc_cal, df_msc_cal['SST'], test_size=0.3, random_state=42)


In [16]:
from sklearn.svm import SVR

model = SVR(kernel='linear')  # Use SVR para regressão
model.fit(X_train.iloc[:, 6:], y_train)


In [21]:
y_pred = model.predict(X_test.iloc[:, 6:])
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Erro Quadrático Médio na Calibração:", mse)
print("Coeficiente de Determinação R²:", r2)


Erro Quadrático Médio na Calibração: 3.9859901401722153
Coeficiente de Determinação R²: 0.5271203263279545


In [22]:
y_val_pred = model.predict(df_msc_val.iloc[:, 6:])


In [27]:
# Criando um DataFrame com as predições
df_pred = pd.DataFrame({
    'Predição': y_val_pred,
    'SST Real': df_msc_val['SST'].values  # Pegando os valores reais de 'SST'
})

# Calculando a diferença entre a predição e o valor real
df_pred['Diferença'] = df_pred['Predição'] - df_pred['SST Real']

In [34]:
df_pred

Unnamed: 0,Predição,SST Real,Diferença
0,12.283071,11.4,0.883071
1,11.876465,11.6,0.276465
2,11.636407,12.0,-0.363593
3,11.653587,11.0,0.653587
4,12.015562,9.4,2.615562
...,...,...,...
70,14.027456,17.1,-3.072544
71,15.540082,16.8,-1.259918
72,15.845054,17.5,-1.654946
73,14.054512,15.9,-1.845488


# Testando JVM

In [23]:
import weka.core.jvm as jvm

# Inicie a JVM sem especificar o caminho para o weka.jar
jvm.start()


DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['c:\\Users\\Luyza\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\arpack_combined.jar', 'c:\\Users\\Luyza\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\core.jar', 'c:\\Users\\Luyza\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\mtj.jar', 'c:\\Users\\Luyza\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'c:\\Users\\Luyza\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


In [28]:
data_dir = "C:\\Users\\Luyza\\Documents\\Dados"


In [30]:
from weka.core.converters import Loader
# Cria uma instância do CSVLoader
loader = Loader(classname="weka.core.converters.CSVLoader")
# Carrega o dataset CSV (substitua pelo nome do seu arquivo CSV)
data = loader.load_file(f"{data_dir}/MSC_calibration.csv")
# Define a última coluna como a classe (se necessário)
data.class_is_last()

In [39]:
from weka.classifiers import Classifier
from weka.classifiers import Evaluation
import numpy as np

# Cria uma instância do classificador RandomForest
# As opções são ajustadas para refletir o que foi especificado no seu exemplo
cls = Classifier(classname="weka.classifiers.trees.RandomForest", options=[
    "-P", "100",  # Percentage of bagging (not used here, but set to 100)
    "-I", "100",  # Number of trees
    "-num-slots", "1",  # Number of threads to use
    "-K", "0",  # Number of attributes to consider (0 means all)
    "-M", "1.0",  # Minimum number of instances per leaf
    "-V", "0.001",  # Percentage of instances to leave out for the random forest model
    "-S", "1"  # Random seed
])

# Treina o classificador com o dataset
cls.build_classifier(data)

# Avalia o modelo com o próprio conjunto de dados de treinamento
evaluation = Evaluation(data)

# Gera previsões para o conjunto de dados de treinamento
predictions = [cls.classify_instance(inst) for inst in data]

# Obtém as classes reais
actual = [inst.get_value(data.class_index) for inst in data]

# Converte para numpy arrays para facilitar o cálculo das métricas
actual = np.array(actual)
predictions = np.array(predictions)

# Calcula as métricas manualmente
mae = np.mean(np.abs(predictions - actual))
rmse = np.sqrt(np.mean((predictions - actual) ** 2))
rae = np.sum(np.abs(predictions - actual)) / np.sum(np.abs(actual - np.mean(actual)))
rrmse = np.sqrt(np.mean((predictions - actual) ** 2)) / np.mean(actual)

# Coeficiente de Correlação
correlation_coefficient = np.corrcoef(predictions, actual)[0, 1]

# Número Total de Instâncias
total_instances = len(data)

# Imprime as métricas de avaliação
print(f"Coeficiente de Correlação: {correlation_coefficient}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"RAE: {rae}")
print(f"RRMSE: {rrmse}")
print(f"Número Total de Instâncias: {total_instances}")

Coeficiente de Correlação: 0.9789283120991824
MAE: 0.0005657088613731934
RMSE: 0.0007377036649620975
RAE: 0.23525020817412567
RRMSE: 0.03445389540483091
Número Total de Instâncias: 175


In [40]:
# Classifica cada instância e imprime os resultados
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(f"{index+1}: label index={pred}, class distribution={dist}") -> output das prediçoes
    
# Inicializa a avaliação
evaluation = Evaluation(data)

# Avalia o modelo com o próprio conjunto de dados (cross-validation)
evaluation.evaluate_model(cls, data)

mae = np.mean(np.abs(predictions - actual))
rmse = np.sqrt(np.mean((predictions - actual) ** 2))
rae = np.sum(np.abs(predictions - actual)) / np.sum(np.abs(actual - np.mean(actual)))
rrmse = np.sqrt(np.mean((predictions - actual) ** 2)) / np.mean(actual)

# Coeficiente de Correlação
correlation_coefficient = np.corrcoef(predictions, actual)[0, 1]

# Número Total de Instâncias
total_instances = len(data)

# Imprime as métricas de avaliação
print(f"Coeficiente de Correlação: {correlation_coefficient}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"RAE: {rae}")
print(f"RRMSE: {rrmse}")
print(f"Número Total de Instâncias: {total_instances}")

TypeError: No matching overloads found for *static* weka.classifiers.Evaluation.evaluateModel(weka.classifiers.trees.RandomForest,Instances), options are:
	public double[] weka.classifiers.Evaluation.evaluateModel(weka.classifiers.Classifier,weka.core.Instances,java.lang.Object[]) throws java.lang.Exception
	public static java.lang.String weka.classifiers.Evaluation.evaluateModel(weka.classifiers.Classifier,java.lang.String[]) throws java.lang.Exception
	public static java.lang.String weka.classifiers.Evaluation.evaluateModel(java.lang.String,java.lang.String[]) throws java.lang.Exception


In [34]:
print(cls)

import weka.plot.graph as graph  # NB: pygraphviz and PIL are required
graph.plot_dot_graph(cls.graph)

RandomForest

Bagging with 100 iterations and base learner

weka.classifiers.trees.RandomTree -K 0 -M 1.0 -V 0.001 -S 1 -do-not-check-capabilities


ERROR:weka.plot.graph:Pygraphviz is not installed, cannot generate graph plot!
