## Importação de bibliotecas e leitura de DataFrame

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score
from itertools import combinations 
from scipy.stats import ttest_rel
from sklearn import tree, svm
import pandas as pd
import numpy as np
import math

# O arquivo não usa vírgulas como separador, o argumento "sep" ajuda com isso definindo o espaço de tabulação como um tab
# O arquivo não tem uma linha de cabeçalho, o argumento "names" é usado para definir o cabeçalho de cada coluna 
df = pd.read_csv("smsspamcollection/SMSSpamCollection.csv", sep="\t", names=["Type", "Text"], header=None)

## DataFrame Analysis

In [2]:
# Da informações das colunas do Dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Type    5572 non-null   object
 1   Text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [3]:
# Printa os primeiros elementos do Dataframe
df.head()

Unnamed: 0,Type,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Da uma breve descricão dos dados do Dataframe
df.describe()

Unnamed: 0,Type,Text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
# A coluna "Type" tem valores "ham" e "spam", então mudei ela para categorica
# Mais informações sobre o pandas categorical são encontradas em = https://pandas.pydata.org/docs/reference/api/pandas.Categorical.html
df['Type'] = df['Type'].astype('category')

## TF-IDF (Term Frequency-Inverse Document Frequency)

a fórmula para o TF-IDF é: (frequência do termo no documento) x log((número total de documentos) / (número de documentos que contêm o termo)).  
Implementei o algoritmo para testar:

In [None]:
# Isso aqui pode ser simplificado, mas fiz apenas para teste (uma classe seria bom também)
tf_count = {}
num_documents = 0

for index, row in df.iterrows():
    line = row['Text'].strip().split()
    num_documents += 1
    seen = set()
    for word in line:
        if word not in seen:
            if word in tf_count:
                tf_count[word] += 1
            else:
                tf_count[word] = 1
        seen.add(word)

res = []
for index, row in df.iterrows():
    line = row['Text'].strip().split()
    local_res = []
    local_m = {}
    for word in line:
        if word in local_m:
            local_m[word] += 1
        else:
            local_m[word] = 1
    for word in line:
        local_res.append((word, local_m[word] * math.log(num_documents/1e-3 + tf_count[word])))
    res.append(local_res)

print(res[0])

[('Go', 15.53326712644149), ('until', 15.53326928005903), ('jurong', 15.53326479335059), ('point,', 15.53326479335059), ('crazy..', 15.53326479335059), ('Available', 15.533265152288006), ('only', 15.533290098122793), ('in', 15.533394720277643), ('bugis', 15.533265331756665), ('n', 15.533285252577684), ('great', 15.533277176616993), ('world', 15.53326766484631), ('la', 15.533264972819314), ('e', 15.533276279284001), ('buffet...', 15.53326479335059), ('Cine', 15.53326479335059), ('there', 15.533283816856105), ('got', 15.533299789142573), ('amore', 15.53326479335059), ('wat...', 15.533266588036382)]


Vou acabar usando a implementação de TF-IDF do Scikit-learn:

referência: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
vectorizer   = TfidfVectorizer()
tfidx_matrix = vectorizer.fit_transform(df['Text'])
tfidx_df     = pd.DataFrame(tfidx_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidx_df.describe()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.000402,0.001161,4.2e-05,9.4e-05,4.5e-05,5.5e-05,5.2e-05,8.2e-05,9.2e-05,0.000352,...,4.9e-05,5.8e-05,0.000104,6.4e-05,5.2e-05,4.9e-05,2.9e-05,3.3e-05,4e-05,5.5e-05
std,0.00951,0.018111,0.003123,0.004943,0.003352,0.004083,0.003888,0.004314,0.006843,0.009281,...,0.003669,0.004299,0.005467,0.004746,0.003886,0.00367,0.00215,0.002476,0.003011,0.004138
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.240128,0.654425,0.233155,0.265767,0.25021,0.304792,0.290193,0.227745,0.510769,0.256937,...,0.273889,0.320935,0.296818,0.354245,0.290067,0.273938,0.160467,0.184833,0.224784,0.308912


## Divisão do Dataset em treino/teste

In [8]:
X = tfidx_df
Y = df['Type']
x_train, x_test, y_train, y_test = train_test_split(X, Y)

## Classificação

### Decision Tree:

É um modelo semelhante a um fluxograma usado tanto para suporte à decisão quanto para aprendizado de máquina.  
É um diagrama em forma de árvore invertida que mapeia possíveis decisões, suas consequências e um resultado final por meio de uma série de nós e ramificações.

Vídeo do Statquest:
https://www.youtube.com/watch?v=_L39rN6gz7Y&t=429s

Esse próximo bloco de código foi usado para testar o GRID SEARCH com a Decision tree, mas ficou muito lerdo então comentei.  

Grid Search é um método para encontrar os melhores parâmetros de um modelo de Machine Learning.  
Ele faz isso testando todas as combinações possíveis de valores especificados, de forma exaustiva, até descobrir qual combinação gera o melhor desempenho.

In [9]:
model_dtree = tree.DecisionTreeClassifier()

"""
parameters = {
    'max_depth': [5, 10, 20, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

model_dtree = GridSearchCV(model_dtree, parameters)
model_dtree = model_dtree.fit(x_train, y_train)

y_pred = model_dtree.predict(x_test)
accuracy_score(y_pred, y_test)
f1_score(y_pred, y_test, average='macro')
"""

"\nparameters = {\n    'max_depth': [5, 10, 20, 50],\n    'min_samples_split': [2, 5, 10],\n    'min_samples_leaf': [1, 2, 4],\n}\n\nmodel_dtree = GridSearchCV(model_dtree, parameters)\nmodel_dtree = model_dtree.fit(x_train, y_train)\n\ny_pred = model_dtree.predict(x_test)\naccuracy_score(y_pred, y_test)\nf1_score(y_pred, y_test, average='macro')\n"

### Support Vector Machines:

Funciona encontrando o hiperplano ideal (uma linha em 2D, um plano em 3D) que melhor separa os pontos de dados em diferentes classes, maximizando a margem entre eles.

Vídeo(s) do Statquest: 
https://www.youtube.com/watch?v=efR1C6CvhmE  
https://www.youtube.com/watch?v=Toet3EiSFcM  
https://www.youtube.com/watch?v=Qc5IyLW_hns

In [10]:
model_svm = svm.SVC()
model_svm.fit(x_train, y_train)

y_pred = model_svm.predict(x_test)
print(accuracy_score(y_pred, y_test))
print(f1_score(y_pred, y_test, average='macro'))

0.9820531227566404
0.9598398437184672


### K Nearest Neighbours

Usa a proximidade para classificar ou prever o valor de um novo ponto de dados com base em seus ‘k’ vizinhos mais próximos em um conjunto de dados rotulado.

Vídeo do Statquest:
https://www.youtube.com/watch?v=HVXime0nQeI

In [11]:
model_knn = KNeighborsClassifier(n_neighbors = 4)
model_knn.fit(x_train, y_train)

y_pred = model_knn.predict(x_test)
accuracy_score(y_pred, y_test)
f1_score(y_pred, y_test, average='macro')

0.6900691336368678

#### Validação Cruzada K-fold

1. Embaralhe o conjunto de dados aleatoriamente.  
2. Divida o conjunto de dados em *k* grupos.  
3. Para cada grupo único:  
   1. Use o grupo como um conjunto de teste (*hold out*).  
   2. Use os grupos restantes como conjunto de treinamento.  
   3. Treine um modelo no conjunto de treinamento e avalie-o no conjunto de teste.  
   4. Guarde a pontuação da avaliação e descarte o modelo.  
4. Resuma o desempenho do modelo usando a amostra das pontuações de avaliação obtidas.

### K-fold

Fornece índices de treino/teste para dividir os dados em conjuntos de treino e teste.  
Divide o conjunto de dados em *k* dobras consecutivas (sem embaralhar por padrão).  
Cada dobra é usada uma vez como validação, enquanto as *k - 1* dobras restantes formam o conjunto de treinamento.

In [12]:
skf = KFold(n_splits=4)
skf.get_n_splits(X, Y)

for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):
    print(f'{i}: train={train_idx}, test={test_idx}')

0: train=[1393 1394 1395 ... 5569 5570 5571], test=[   0    1    2 ... 1390 1391 1392]
1: train=[   0    1    2 ... 5569 5570 5571], test=[1393 1394 1395 ... 2783 2784 2785]
2: train=[   0    1    2 ... 5569 5570 5571], test=[2786 2787 2788 ... 4176 4177 4178]
3: train=[   0    1    2 ... 4176 4177 4178], test=[4179 4180 4181 ... 5569 5570 5571]


### Stratified K-fold

Este objeto de validação cruzada é uma variação do KFold que retorna dobras estratificadas.  
As dobras são criadas preservando a porcentagem de amostras de cada classe em *y* em um cenário de classificação binária ou multiclasse.

In [13]:
skf = StratifiedKFold(n_splits=4)
skf.get_n_splits(X, Y)

for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):
    print(f'{i}: train={train_idx}, test={test_idx}')

0: train=[1227 1229 1252 ... 5569 5570 5571], test=[   0    1    2 ... 1406 1408 1409]
1: train=[   0    1    2 ... 5569 5570 5571], test=[1227 1229 1252 ... 2792 2793 2794]
2: train=[   0    1    2 ... 5569 5570 5571], test=[2719 2729 2730 ... 4181 4182 4184]
3: train=[   0    1    2 ... 4181 4182 4184], test=[4154 4156 4162 ... 5569 5570 5571]


### Teste T

O teste t é uma ferramenta estatística usada para determinar se há uma diferença significativa entre as médias de dois grupos, ou entre a média de um grupo e um valor conhecido.

Vídeo: https://www.youtube.com/watch?v=VekJxtk4BYM

#### Uma amostra 

t = (x̄ - μ) / (s / sqrt(n))

x̄ = média da amostra  
μ = média assumida (ou populacional)  
s = desvio padrão da amostra  
n = número de observações (tamanho da amostra)

#### Duas amostras

t = (x̄₁ - x̄₂) / sqrt((s₁² / n₁) + (s₂² / n₂))

x̄₁ = média observada da 1ª amostra  
x̄₂ = média observada da 2ª amostra  
s₁ = desvio padrão da 1ª amostra  
s₂ = desvio padrão da 2ª amostra  
n₁ = tamanho da 1ª amostra  
n₂ = tamanho da 2ª amostra  

#### Correção de Bonferroni

Método estatístico usado para controlar a taxa de falsos positivos (erros do Tipo I) ao realizar múltiplos testes estatísticos sobre o mesmo conjunto de dados.
Se você realizar *m* testes independentes com um nível de significância total *α*, o limite corrigido de Bonferroni é:  

alpha_corrigido = alpha / m  

Um teste é considerado significativo se:  

p_i < alpha_corrigido  

Alternativamente, você pode ajustar diretamente os valores de p:  

p_ajustado = p_i * m  

Em seguida, compare os valores de p ajustados com o nível de significância original (alpha).

### Teste final

In [14]:
kfold = StratifiedKFold(n_splits=10)
kfold.get_n_splits(X, Y)

model_dtree = tree.DecisionTreeClassifier()
model_svm = svm.SVC()
model_knear = KNeighborsClassifier()

acc_dtree, acc_svm, acc_knear = [], [], []

for i, (train_idx, test_idx) in enumerate(kfold.split(X, Y)):
    # Variáveis para os "blocos"
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Treino dos modelos
    model_dtree.fit(X_train, y_train)
    model_svm.fit(X_train, y_train)
    model_knear.fit(X_train, y_train)

    # Faz a previsão do y 
    y_pred_dtree = model_dtree.predict(X_test)
    y_pred_svm   = model_svm.predict(X_test)
    y_pred_knear = model_knear.predict(X_test)

    # Salva a accurracy
    acc_dtree.append(accuracy_score(y_test, y_pred_dtree))
    acc_svm.append(accuracy_score(y_test, y_pred_svm))
    acc_knear.append(accuracy_score(y_test, y_pred_knear))

Usando o t-test do Scipy sem correcão:

In [15]:
# Compara os modelos usando os t-test em pares
t_dtree_svm, p_dtree_svm     = ttest_rel(acc_dtree, acc_svm)
t_dtree_knear, p_dtree_knear = ttest_rel(acc_dtree, acc_knear)
t_svm_knear, p_svm_knear     = ttest_rel(acc_svm, acc_knear)

print("Decision Tree vs SVM: t = %.3f, p = %.3f" % (t_dtree_svm, p_dtree_svm))
print("Decision Tree vs KNN: t = %.3f, p = %.3f" % (t_dtree_knear, p_dtree_knear))
print("SVM vs KNN:           t = %.3f, p = %.3f" % (t_svm_knear, p_svm_knear))


Decision Tree vs SVM: t = -3.362, p = 0.008
Decision Tree vs KNN: t = 20.798, p = 0.000
SVM vs KNN:           t = 26.341, p = 0.000


com Correção:

In [16]:
alpha = 0.05
# Temos três comparações
alpha_bonf = alpha / 3

t_dtree_svm, p_dtree_svm     = ttest_rel(acc_dtree, acc_svm)
t_dtree_knear, p_dtree_knear = ttest_rel(acc_dtree, acc_knear)
t_svm_knear, p_svm_knear     = ttest_rel(acc_svm, acc_knear)

print("Decision Tree vs SVM: t = %.3f, p = %.3f, significant = %s" %
      (t_dtree_svm, p_dtree_svm, p_dtree_svm < alpha_bonf))
print("Decision Tree vs KNN: t = %.3f, p = %.3f, significant = %s" %
      (t_dtree_knear, p_dtree_knear, p_dtree_knear < alpha_bonf))
print("SVM vs KNN: t = %.3f, p = %.3f, significant = %s" %
      (t_svm_knear, p_svm_knear, p_svm_knear < alpha_bonf))


Decision Tree vs SVM: t = -3.362, p = 0.008, significant = True
Decision Tree vs KNN: t = 20.798, p = 0.000, significant = True
SVM vs KNN: t = 26.341, p = 0.000, significant = True
