# Projeto de Classificação com PCA e SVM
Este notebook cria e avalia um modelo de classificação com SVM após redução de dimensionalidade com PCA.
Serão avaliadas variações de kernel, otimização de hiperparâmetros e diferentes métricas de avaliação.

## 1. Importação de bibliotecas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Single seed reused by the train/test split, PCA and the SVM estimators below
# so that reruns of the notebook are reproducible.
RANDOM_STATE = 42

## 2. Leitura e visualização inicial dos dados

In [2]:
# Load the dataset into a DataFrame and preview its first rows.
# Point the path below at the real location of the CSV if it lives elsewhere.
df = pd.read_csv(filepath_or_buffer='smart-contract-dataset.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,hash_id,label,bytecode_len,Weight bytecode_character_6,Weight bytecode_character_0,Weight bytecode_character_8,Weight bytecode_character_4,Weight bytecode_character_5,Weight bytecode_character_2,...,bytecode_character_k,bytecode_character_P,Weight bytecode_character_g,bytecode_character_g,Weight bytecode_character_I,Weight bytecode_character_m,bytecode_character_I,bytecode_character_m,Weight bytecode_character_x,bytecode_character_x
0,24345,3cef4261255f49f8ee35cc104becee8de0e98ecb979c64...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14128,76187d55675bfdc44d3a50de7e1bb425ca46f6141776d0...,0,240,0.079167,0.345833,0.070833,0.033333,0.058333,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12942,034cceceee0f4af4b83789d99fd2eca3c9b9d95cce0515...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7017,9d4a05454bba830a8e46f1afab8ad74a9c17aec7f6f684...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,21079,a505708c309efee083dd5f89dcac0f33273474f02330d4...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,label,bytecode_len,Weight bytecode_character_6,Weight bytecode_character_0,Weight bytecode_character_8,Weight bytecode_character_4,Weight bytecode_character_5,Weight bytecode_character_2,Weight bytecode_character_1,...,bytecode_character_k,bytecode_character_P,Weight bytecode_character_g,bytecode_character_g,Weight bytecode_character_I,Weight bytecode_character_m,bytecode_character_I,bytecode_character_m,Weight bytecode_character_x,bytecode_character_x
count,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,...,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0,36671.0
mean,18335.0,0.266041,4494.265769,0.054926,0.17213,0.037272,0.018298,0.054504,0.032398,0.05991,...,0.000491,2.7e-05,4.070075e-08,5.5e-05,5.270022e-08,5.270022e-08,0.000545,0.000545,3.051743e-08,0.000382
std,10586.150197,0.441892,6552.745838,0.034178,0.116114,0.024645,0.0135,0.03538,0.022089,0.039021,...,0.054267,0.005222,7.794061e-06,0.010444,4.514558e-06,4.514558e-06,0.046705,0.046705,4.132268e-06,0.051695
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9167.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,18335.0,0.0,1660.0,0.066755,0.187323,0.041914,0.02061,0.061487,0.040076,0.070336,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,27502.5,1.0,8104.0,0.077086,0.251205,0.054111,0.025904,0.079975,0.045181,0.090361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,36670.0,1.0,90356.0,0.185859,0.693885,0.115854,0.138139,0.147041,0.192966,0.196352,...,7.0,1.0,0.001492537,2.0,0.0003989627,0.0003989627,4.0,4.0,0.0005595524,7.0


## 3. Pré-processamento

In [4]:
# Target vector: the `label` column.
y = df['label']

# Feature matrix: every numeric column from position 3 onwards.
# NOTE(review): the positional slice presumably skips the index/identifier
# columns and the label itself -- confirm against the CSV column order.
X = df.iloc[:, 3:].select_dtypes(include='number')

# Stratified 80/20 hold-out split so both partitions keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Standardise the features; the scaling statistics come from the training
# partition only and are then applied unchanged to the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Redução de Dimensionalidade com PCA

In [5]:
# A fractional n_components asks PCA to keep the smallest number of components
# whose cumulative explained variance exceeds that fraction (here 65%).
pca = PCA(
    n_components=0.65,
    random_state=RANDOM_STATE,
)

# Learn the projection on the scaled training data, then reuse it on the test data.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Report how many components were kept.
print(f'Nº de componentes escolhidos: {X_train_pca.shape[1]}')

Nº de componentes escolhidos: 32


## 5. Modelo SVM com Kernel Polinomial

In [6]:
# Baseline: cubic polynomial-kernel SVM trained on the PCA-projected features.
svm_poly = SVC(
    kernel='poly',
    degree=3,
    gamma='scale',
    random_state=RANDOM_STATE,
).fit(X_train_pca, y_train)

# Evaluate on the held-out test partition (per-class precision/recall/F1).
y_pred_poly = svm_poly.predict(X_test_pca)
poly_report = classification_report(y_test, y_pred_poly)
print(poly_report)

              precision    recall  f1-score   support

           0       0.75      0.99      0.85      5384
           1       0.64      0.07      0.13      1951

    accuracy                           0.74      7335
   macro avg       0.70      0.53      0.49      7335
weighted avg       0.72      0.74      0.66      7335



## 7. Otimização de Hiperparâmetros com GridSearchCV (Kernel Polinomial)

In [None]:
# Candidate hyperparameters for the polynomial-kernel SVM (3 x 3 x 2 = 18 combinations).
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 0.01, 0.001],
    degree=[2, 4],
)

# Exhaustive 3-fold cross-validated search, ranking candidates by precision.
# NOTE(review): X_train_pca comes from a scaler and PCA fitted on the whole
# training split, so every validation fold has already influenced the
# preprocessing; wrap the steps in a pipeline if unbiased CV scores matter.
grid = GridSearchCV(
    estimator=SVC(kernel='poly', random_state=RANDOM_STATE),
    param_grid=param_grid,
    scoring='precision',
    cv=3,
    verbose=3,
)
grid.fit(X_train_pca, y_train)

print(f'Melhores parâmetros: {grid.best_params_}')

# Best estimator found by the search (refitted on the full training split by default).
best_model = grid.best_estimator_

# Score the selected model on the held-out test partition.
y_pred_grid = best_model.predict(X_test_pca)
print(classification_report(y_test, y_pred_grid))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3] END ......C=0.1, degree=2, gamma=scale;, score=0.744 total time=  16.4s
[CV 2/3] END ......C=0.1, degree=2, gamma=scale;, score=0.571 total time=  13.9s
[CV 3/3] END ......C=0.1, degree=2, gamma=scale;, score=0.696 total time=  18.6s
[CV 1/3] END .......C=0.1, degree=2, gamma=0.01;, score=0.698 total time=  14.1s
[CV 2/3] END .......C=0.1, degree=2, gamma=0.01;, score=0.591 total time=  14.1s
[CV 3/3] END .......C=0.1, degree=2, gamma=0.01;, score=0.658 total time=  14.6s


## 8. Conclusão
O uso de PCA reduziu significativamente a dimensionalidade mantendo boa performance.
A SVM com kernel polinomial e parâmetros C=0.1, degree=2 e gamma=0.001 apresentou os melhores resultados.