In [1]:
import polars as pl

df = pl.read_csv("./dados_tratados.csv")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(texts, ngram_range=(1, 2), max_features=2500, min_df=2, max_df=0.8):
    vectorizer = TfidfVectorizer(
        ngram_range=ngram_range,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df
    )
    X_tfidf = vectorizer.fit_transform(texts)
    return X_tfidf, vectorizer

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

df_pandas = df.to_pandas()

X = df_pandas["texto"]
y = df_pandas["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=777)

X_train, vectorizer = tfidf(X_train)
X_test = vectorizer.transform(X_test)

svm = SVC(kernel='linear', probability=True, random_state=777, C=1)

params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf']
}

grid = GridSearchCV(svm, params, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print("Melhores Parâmetros:", grid.best_params_)
print("Melhor Acurácia:", grid.best_score_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 3.2min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 3.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 3.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 3.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 3.3min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 4.2min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 4.2min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 4.2min
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time= 2.1min
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time= 2.1min
[CV] END .......................C=1, gamma=scale, kernel=rbf; total time= 2.1min
[CV] END .......................C=1, gamma=scale,

In [9]:
import numpy as np
import pandas as pd

model = grid.best_estimator_

model.fit(X_train, y_train)

Ein = 1 - model.score(X_train, y_train)
Eout = 1 - model.score(X_test, y_test)
n_sv = len(model.support_)
N = X_train.shape[0]
Eout_esperado = (n_sv / N)

print(f"Melhores Parâmetros: {grid.best_params_}")
print(f"E_in: {Ein:.4f}")
print(f"E_out: {Eout:.4f}")
print(f"E_out Esperado: {Eout_esperado:.4f}")

y_pred = model.predict(X_test)
print(classification_report(y_pred=y_pred, y_true=y_test, digits=3))

df_t = pd.DataFrame(y_pred)
df_t.value_counts()


Melhores Parâmetros: {'C': 1}
Ein (Erro no Treino): 0.0110
Eout (Erro no Teste): 0.0298
Eout Esperado (baseado em vetores de suporte): 0.1755
              precision    recall  f1-score   support

           0      0.977     0.975     0.976      3333
           1      0.958     0.961     0.959      1929

    accuracy                          0.970      5262
   macro avg      0.968     0.968     0.968      5262
weighted avg      0.970     0.970     0.970      5262



0
0    3326
1    1936
Name: count, dtype: int64