# Classificação de texto na realidade, múltiplas avaliações ao mesmo tempo

Começamos pelos imports:

1.   Bibliotecas para manipulação de dados;
2.   Spacy para processamento de texto;
3.   sklearn e xgboost para classificação e extração de features.

In [None]:
import pandas as pd
import string
from collections import Counter
import spacy
import en_core_web_sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import cohen_kappa_score, make_scorer
from xgboost import XGBClassifier

# Load the spaCy pipeline from the en_core_web_sm package once; the
# pre-processing cells below reuse it for tokenization, stop-word flags and lemmas.
nlp = en_core_web_sm.load()

# Lendo os dados de uma planilha usando o pandas

In [None]:
# Read the dataset from 'spam.csv' into a DataFrame and display it.
df = pd.read_csv('spam.csv')
df

Unnamed: 0,Class,Text
0,ham,all write or wat..
1,ham,and picking them up from various points
2,ham,bot notes oredi... Cos i juz rem i got...
3,ham,called dad oredi...
4,ham,"came to look at the flat, seems ok, in his 50..."
...,...,...
5562,ham,Yup... How _ noe leh...
5563,ham,Yup... I havent been there before... You want ...
5564,ham,Yup... Ok i go home look at the timings then i...
5565,ham,Yupz... I've oredi booked slots 4 my weekends ...


In [None]:
# True if any cell of the DataFrame holds a missing value.
df.isnull().values.any()

NameError: ignored

# Pré-processamento

Remoção de pontuação e de números

In [None]:
# Rebuild each message character by character, dropping every character that
# is ASCII punctuation or numeric.
df['Text_no_ponctuation_number'] = df['Text'].apply(
    lambda text: ''.join(
        char for char in text
        if not (char in string.punctuation or char.isnumeric())
    )
)


Remoção de stopwords

In [None]:
# Tokenize the cleaned text with spaCy and keep, lower-cased, only the tokens
# that are not stop words and are longer than three characters.
df['Text_no_stopword'] = df['Text_no_ponctuation_number'].apply(
    lambda text: ' '.join(
        token.text.lower()
        for token in nlp(text)
        if not token.is_stop and len(token.text) > 3
    )
)

Lematização e remoção de stopwords

In [None]:
# Lemmatize the stop-word-free text: each token is replaced by its spaCy lemma
# and the lemmas are joined back into one space-separated string.
df['Text_lemma_no_stopword'] = df['Text_no_stopword'].apply(
    lambda text: ' '.join(token.lemma_ for token in nlp(text))
)

Lematização

In [None]:
# Lemmatize the punctuation/number-free text (stop words are kept): each token
# is replaced by its spaCy lemma and the lemmas are joined with single spaces.
df['Text_lemma'] = df['Text_no_ponctuation_number'].apply(
    lambda text: ' '.join(token.lemma_ for token in nlp(text))
)

In [None]:
# Display the DataFrame, now including the pre-processed text columns.
df

Unnamed: 0,Class,Text,Text_no_ponctuation_number,Text_no_stopword,Text_lemma_no_stopword,Text_lemma
0,ham,all write or wat..,all write or wat,write wat,write wat,all write or wat
1,ham,and picking them up from various points,and picking them up from various points,picking points,pick point,and pick -PRON- up from various point
2,ham,bot notes oredi... Cos i juz rem i got...,bot notes oredi Cos i juz rem i got,bot notes oredi cos juz rem got,bot note oredi cos juz rem get,bot note oredi Cos i juz rem i get
3,ham,called dad oredi...,called dad oredi,called dad oredi,call dad oredi,call dad oredi
4,ham,"came to look at the flat, seems ok, in his 50...",came to look at the flat seems ok in his s I...,came look flat away alot wiv work got woman co...,come look flat away alot wiv work get woman come,come to look at the flat seem ok in -PRON- s...
...,...,...,...,...,...,...
5562,ham,Yup... How _ noe leh...,Yup How noe leh,yup noe leh,yup noe leh,yup how noe leh
5563,ham,Yup... I havent been there before... You want ...,Yup I havent been there before You want to go ...,yup want yoga book,yup want yoga book,yup -PRON- have not be there before -PRON- wan...
5564,ham,Yup... Ok i go home look at the timings then i...,Yup Ok i go home look at the timings then i ms...,yup home look timings msg xuhui going learn le...,yup home look timing msg xuhui go learn lesson,yup ok i go home look at the timing then i msg...
5565,ham,Yupz... I've oredi booked slots 4 my weekends ...,Yupz Ive oredi booked slots my weekends liao,yupz oredi booked slots weekends liao,yupz oredi book slot weekends liao,Yupz -PRON- have oredi book slot -PRON- week...


Extração de features usando o TfidfVectorizer - neste exemplo iremos avaliar os textos com lemma e sem stopword

In [None]:
# Take the values of one DataFrame column (dataframe.column_name.values)
X = df.Text_lemma_no_stopword.values

# Feature extraction
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split done later, so the IDF weights also see the test documents.
vectorizer = TfidfVectorizer(use_idf=True)
tfidf_model = vectorizer.fit(X)

X_tfidf = tfidf_model.transform(X)

# Print the TF-IDF entries of the first document.
print(X_tfidf[0,:])

  (0, 6858)	0.8121978182207468
  (0, 6628)	0.583382125263929


# Função para treinamento e avaliação de vários modelos e métricas ao mesmo tempo usando treinamento e teste. Explicações dentro da função.

In [None]:
def run_exps_train_test(x_train: pd.DataFrame,
                        y_train: pd.DataFrame,
                        x_test: pd.DataFrame,
                        y_test: pd.DataFrame) -> pd.DataFrame:
    """
    Fit several classifiers on one train/test split and compare them.

    Every model is trained on the training split and used to predict the test
    split. For each model its name and the text ``classification_report`` are
    printed, and the test-set metrics are collected into the returned
    DataFrame so the models can be compared side by side.

    NOTE(review): the parameters are annotated as DataFrames, but the inputs
    are only handed to the estimators' ``fit``/``predict`` and to the metric
    functions; this file calls the function with the TF-IDF matrix and label
    arrays produced by ``train_test_split``.

    :param x_train: training feature matrix
    :param y_train: training target vector
    :param x_test: test feature matrix
    :param y_test: test target vector
    :return: DataFrame with one row per model and the columns ``model``,
        ``accuracy``, ``precision_weighted``, ``recall_weighted``,
        ``f1_weighted`` and ``kappa``, all computed on the test split
    """
    # Models to evaluate (add as many as you like).
    models = [
        ('LogReg', LogisticRegression()),
        ('RF', RandomForestClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('SVM', SVC(kernel="linear")),
        ('MNB', MultinomialNB()),
        ('Adaboost', AdaBoostClassifier()),
        ('XGB', XGBClassifier()),
    ]

    columns = ['model', 'accuracy', 'precision_weighted',
               'recall_weighted', 'f1_weighted', 'kappa']
    rows = []

    for name, model in models:
        # In some cases it is useful to train one classifier per class; if so,
        # wrap the model here: model = OneVsRestClassifier(model)
        clf = model.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        print(name)
        print(classification_report(y_test, y_pred))

        # Same report as a dict, so the headline numbers can be returned
        # instead of only being printed.
        report = classification_report(y_test, y_pred, output_dict=True)
        weighted = report['weighted avg']
        rows.append({
            'model': name,
            'accuracy': report['accuracy'],
            'precision_weighted': weighted['precision'],
            'recall_weighted': weighted['recall'],
            'f1_weighted': weighted['f1-score'],
            'kappa': cohen_kappa_score(y_test, y_pred),
        })

    return pd.DataFrame(rows, columns=columns)

# Recuperando classes das instâncias de treinamento e teste

In [None]:
# Class labels, taken from the same DataFrame the TF-IDF matrix was built from.
Y = df.Class.values
# Hold out 25% of the rows for testing. No random_state is passed, so the
# split is not reproducible between runs.
tfidf_train, tfidf_test, class_train, class_test = train_test_split(X_tfidf, Y, test_size=0.25)

# Rodar função para treinamento e avaliação descrita acima.

In [None]:
# Train every model on the training split and evaluate it on the test split.
run_exps_train_test(tfidf_train, class_train, tfidf_test, class_test)

LogReg
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1205
        spam       0.99      0.63      0.77       187

    accuracy                           0.95      1392
   macro avg       0.97      0.82      0.87      1392
weighted avg       0.95      0.95      0.94      1392

RF
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1205
        spam       1.00      0.83      0.91       187

    accuracy                           0.98      1392
   macro avg       0.99      0.91      0.95      1392
weighted avg       0.98      0.98      0.98      1392

KNN
              precision    recall  f1-score   support

         ham       0.90      1.00      0.95      1205
        spam       1.00      0.30      0.46       187

    accuracy                           0.91      1392
   macro avg       0.95      0.65      0.70      1392
weighted avg       0.92      0.91      0.88      1392

SVM
 

NameError: ignored

In [None]:
def run_exps_crossvalidation(x: pd.DataFrame,
                             y: pd.DataFrame) -> pd.DataFrame:
    """
    Cross-validate a fixed set of classifiers and collect their fold scores.

    Each classifier is evaluated with a shuffled 10-fold split using
    accuracy, weighted precision/recall/F1 and Cohen's kappa. Progress
    messages and the name of each model are printed as it runs.

    :param x: feature matrix
    :param y: target vector
    :return: DataFrame holding the ``cross_validate`` output of every fold
        (``fit_time``, ``score_time`` and one ``test_<metric>`` column per
        metric) plus a ``model`` column with the classifier's name
    """
    print("CARREGANDO MODELO")
    candidates = [
        ('LogReg', LogisticRegression()),
        ('RF', RandomForestClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('MNB', MultinomialNB()),
        ('Adaboost', AdaBoostClassifier()),
        ('XGB', XGBClassifier()),
    ]

    # Built-in scorers are referenced by name; kappa has no built-in name,
    # so it is wrapped with make_scorer.
    metrics = {
        'accuracy': 'accuracy',
        'precision_weighted': 'precision_weighted',
        'recall_weighted': 'recall_weighted',
        'f1_weighted': 'f1_weighted',
        'kappa': make_scorer(cohen_kappa_score),
    }

    print("RODANDO")
    per_model_frames = []
    for label, estimator in candidates:
        print(label)
        folds = model_selection.KFold(n_splits=10, shuffle=True)
        fold_scores = model_selection.cross_validate(
            estimator, x, y, cv=folds, scoring=metrics
        )
        # One row per fold; the scalar label is broadcast to every row.
        per_model_frames.append(pd.DataFrame({**fold_scores, 'model': label}))

    return pd.concat(per_model_frames, ignore_index=True)


In [None]:
# Cross-validate every model on the full TF-IDF matrix and show the per-fold scores.
final = run_exps_crossvalidation(X_tfidf, Y)
final

CARREGANDO MODELO
RODANDO
LogReg
RF
KNN
MNB
Adaboost
XGB


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_weighted,test_recall_weighted,test_f1_weighted,test_kappa,model
0,0.092357,0.013748,0.958707,0.960611,0.958707,0.955778,0.811678,LogReg
1,0.075462,0.01341,0.965889,0.966536,0.965889,0.964216,0.851402,LogReg
2,0.075993,0.014537,0.94614,0.948312,0.94614,0.942094,0.777544,LogReg
3,0.069625,0.014492,0.949731,0.951351,0.949731,0.945445,0.760614,LogReg
4,0.075157,0.01336,0.971275,0.971358,0.971275,0.969286,0.810826,LogReg
5,0.07968,0.01484,0.964093,0.965507,0.964093,0.961411,0.811071,LogReg
6,0.076525,0.013944,0.955117,0.956181,0.955117,0.951223,0.762327,LogReg
7,0.069207,0.013835,0.946043,0.947922,0.946043,0.940909,0.736776,LogReg
8,0.080944,0.014386,0.946043,0.947719,0.946043,0.940174,0.708789,LogReg
9,0.078066,0.013387,0.955036,0.957244,0.955036,0.950879,0.765978,LogReg


In [None]:
# Group the selected per-fold scores by model and show each metric's mean.
grouped = final[['test_accuracy','test_f1_weighted', 'test_kappa']].groupby(final['model'])
grouped.mean()

Unnamed: 0_level_0,test_accuracy,test_f1_weighted,test_kappa
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adaboost,0.959584,0.958684,0.814344
KNN,0.916476,0.899402,0.510812
LogReg,0.955807,0.952141,0.7797
MNB,0.965872,0.963668,0.834731
RF,0.976111,0.97498,0.888408
XGB,0.956711,0.953544,0.787644


In [None]:
# Standard deviation of the same metrics across folds, per model.
grouped.std()

Unnamed: 0_level_0,test_accuracy,test_f1_weighted,test_kappa
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adaboost,0.007143,0.007161,0.048159
KNN,0.010202,0.013702,0.054602
LogReg,0.009068,0.010309,0.041961
MNB,0.009267,0.010296,0.041998
RF,0.009726,0.010628,0.045142
XGB,0.008807,0.010374,0.046855


# Medindo a importância das features no classificador

> Indented block



Treinar o modelo Random Forest (um conjunto de árvores de decisão)

In [None]:
# Fit a random forest on the full TF-IDF matrix; its feature importances are
# read in the following cells.
model = RandomForestClassifier()
model.fit(X_tfidf, Y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Extraindo a importância das características

In [None]:
# Importance the fitted forest assigns to each TF-IDF feature.
mdg_features = model.feature_importances_
mdg_features

array([3.35502226e-11, 0.00000000e+00, 1.93376479e-10, ...,
       7.82331269e-07, 1.49157401e-04, 0.00000000e+00])

Nomes e índices das features

In [None]:
# scikit-learn 1.0 introduced get_feature_names_out() and deprecated
# get_feature_names(), which was removed in 1.2. Prefer the new accessor and
# fall back to the old one so this cell runs on either version.
if hasattr(tfidf_model, 'get_feature_names_out'):
    features_names = list(tfidf_model.get_feature_names_out())
else:
    features_names = tfidf_model.get_feature_names()

# Importances labelled by feature name (the token), most important first.
feature_importance = pd.DataFrame(mdg_features,
                                   index=features_names,
                                   columns=['importance']).sort_values('importance', ascending=False)

# The same importances labelled by the feature's position in the TF-IDF
# matrix, most important first.
index_feature_importance = pd.DataFrame(mdg_features,
                                   index=range(len(features_names)),
                                   columns=['importance']).sort_values('importance', ascending=False)

Montar o array de importância

In [None]:
# Both frames are already sorted by descending importance, so the first 30
# rows are the 30 most important features.
labels_features = feature_importance.index[:30]
indices_features = index_feature_importance.index[:30]
# NOTE: this rebinds mdg_features from the full importance array to the top 30 values.
mdg_features = feature_importance['importance'].values[:30]

# Table pairing each top feature name with its importance.
data = {"Variable": labels_features, "MDG": mdg_features}
df_feature_importance = pd.DataFrame(data)
df_feature_importance

Unnamed: 0,Variable,MDG
0,txt,0.039575
1,free,0.03013
2,win,0.025018
3,claim,0.023794
4,mobile,0.023119
5,service,0.020165
6,text,0.019697
7,stop,0.018541
8,prize,0.016123
9,tone,0.014968
