In [52]:
import pandas as pd
import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import joblib

In [53]:
input_csv = 'treino.csv'

In [54]:
df = pd.read_csv(input_csv, sep=';')

In [55]:
df.head(2)

Unnamed: 0,id,mensagem,tipo
0,1,bom dia,saudacao
1,2,boa tarde,saudacao


In [56]:
df['mensagem_tratada'] = df.mensagem.apply(utils.to_lower_remove_accents)

## Classificador

In [57]:
vectorizer = TfidfVectorizer()

In [58]:
corpus = df['mensagem_tratada'].values

In [59]:
vectorizer.fit(corpus)
joblib.dump(vectorizer, 'vectorizer.joblib')

X = vectorizer.transform(corpus)

In [60]:
y = df['tipo'].values

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Usando o modelo de classificação

In [62]:
clf = LogisticRegression(random_state=0, class_weight='balanced', multi_class='ovr', solver='lbfgs').fit(X_train, y_train)
clf

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
joblib.dump(clf, 'classifier.joblib')

['classifier.joblib']

In [64]:
y_pred = clf.predict(X_test)

In [65]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  avaliacoes       1.00      1.00      1.00         5
   pagamento       0.86      1.00      0.92         6
    saudacao       1.00      0.75      0.86         8
     tutoria       0.71      0.83      0.77         6

    accuracy                           0.88        25
   macro avg       0.89      0.90      0.89        25
weighted avg       0.90      0.88      0.88        25



In [66]:
print(confusion_matrix(y_test, y_pred))

[[5 0 0 0]
 [0 6 0 0]
 [0 0 6 2]
 [0 1 0 5]]
