In [None]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from project_ml_course.data_process import filter_columns_by_correlation_threshold

In [2]:
raw_df = pd.read_csv("../data/dados.csv", index_col="Unnamed: 0")

df = filter_columns_by_correlation_threshold(
    df=raw_df,
    ref_col="class",
    method_type="pearson",
    lower_threshold=0.001,
    higher_threshold=0.999,
)

In [3]:
X = df.drop(columns=["class"])
y = df["class"]

# Separar treino e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [17]:
# Pipeline com SVM (kernel não linear) após PCA
pipe_svm = Pipeline(
    [
        ("scaler", StandardScaler(with_std=False)),
        ("pca", PCA(n_components=3, random_state=42)),
        (
            "svm",
            SVC(
                kernel="rbf", probability=True, class_weight="balanced", random_state=42
            ),
        ),
    ]
)

pipe_svm.fit(X_train, y_train)
y_proba_svm = pipe_svm.predict_proba(X_test)[:, 1]

# Otimização do ponto de corte para F1 para SVM
thresholds_svm = np.arange(0, 1.01, 0.01)
f1_scores_svm = [f1_score(y_test, y_proba_svm >= t) for t in thresholds_svm]
best_idx_svm = np.argmax(f1_scores_svm)
best_threshold_svm = thresholds_svm[best_idx_svm]
best_f1_svm = f1_scores_svm[best_idx_svm]

print(f"[SVM] Melhor ponto de corte: {best_threshold_svm:.2f}")
print(f"[SVM] Melhor F1 score: {best_f1_svm:.4f}")


[SVM] Melhor ponto de corte: 0.60
[SVM] Melhor F1 score: 0.9657


In [18]:
# Pipeline com SVM (kernel não linear) após PCA
pipe_svm = Pipeline(
    [
        ("scaler", StandardScaler(with_std=False)),
        ("pca", PCA(n_components=3, random_state=42)),
        ("svm", SVC(kernel="rbf", probability=True, random_state=42)),
    ]
)

pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)

f1_scores_svm = f1_score(y_test, y_pred)

print(f"[SVM] F1 score: {f1_scores_svm:.4f}")

[SVM] F1 score: 0.9429


In [13]:
# Visualização interativa 3D dos pontos de teste no espaço das três primeiras componentes principais (sem grid de decisão)
import plotly.graph_objs as go

# Obter as três primeiras componentes principais dos dados de teste
X_test_pca = pipe_svm.named_steps["pca"].transform(
    pipe_svm.named_steps["scaler"].transform(X_test)
)

X_test_pca_3d = X_test_pca

# Para cada ponto de teste, determinar a predição SVM (0 ou 1) usando o threshold ótimo
y_proba_test = pipe_svm.predict_proba(X_test)[:, 1]
y_pred_svm = (y_proba_test >= best_threshold_svm).astype(int)
y_true = y_test.values

# Mapear cor para predição SVM (0: azul, 1: vermelho)
color_map = np.array(["blue", "red"])
colors = color_map[y_pred_svm]

# Mapear símbolo para target (0: círculo, 1: diamante)
symbol_map = np.array(["circle", "diamond"])
symbols = symbol_map[y_true]

# Traço para pontos de teste, shape pelo target, cor pela predição SVM
test_trace = go.Scatter3d(
    x=X_test_pca_3d[:, 0],
    y=X_test_pca_3d[:, 1],
    z=X_test_pca_3d[:, 2],
    mode="markers",
    marker=dict(
        size=7, color=colors, symbol=symbols, line=dict(width=1, color="black")
    ),
    name="Test points (shape=target, color=SVM pred)",
)

layout = go.Layout(
    scene=dict(
        xaxis_title="PC1",
        yaxis_title="PC2",
        zaxis_title="PC3",
    ),
    title="Pontos de teste no espaço das três primeiras componentes principais (3D)",
    legend=dict(x=0.01, y=0.99),
)

fig = go.Figure(data=[test_trace], layout=layout)
fig.show()
