In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from ucimlrepo import fetch_ucirepo 
import warnings
warnings.filterwarnings("ignore")

In [2]:
car_evaluation = fetch_ucirepo(id=19) 

In [3]:
X = car_evaluation.data.features 
y = car_evaluation.data.targets 

In [4]:
colunas = X.columns
for coluna in X.columns:
    target_name = dict()
    for i in range(len(X[coluna].value_counts().index)):
       target_name[X[coluna].value_counts().index[i]] = i
    X[coluna+str(1)] = X[coluna].map(target_name)
    X.drop(coluna, axis='columns', inplace=True)
X.columns = colunas
X

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,0,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,2
3,0,0,0,0,1,0
4,0,0,0,0,1,1
...,...,...,...,...,...,...
1723,3,3,3,2,1,1
1724,3,3,3,2,1,2
1725,3,3,3,2,2,0
1726,3,3,3,2,2,1


In [5]:
y['class'].value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [6]:
np.random.seed(42) 
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [7]:
# exibindo a distribuição de classes
print("Antes: ", Y_train['class'].value_counts())

sampling_strategy = {'unacc': 50, 
                     'acc': 50, 
                     'good': 50,
                     'vgood': 50}
undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)

# aplicando a transformação de reamsostragem
X_under, Y_under = undersample.fit_resample(X_train, Y_train)

# exibindo a distribuição de classes
print("Depois: ", Y_under['class'].value_counts())

Antes:  class
unacc    968
acc      307
good      55
vgood     52
Name: count, dtype: int64
Depois:  class
acc      50
good     50
unacc    50
vgood    50
Name: count, dtype: int64


In [8]:
model1 = LogisticRegression(solver='liblinear')

In [9]:
model1.fit(X_under, Y_under)

In [10]:
Y_predito1 = model1.predict(X_test)

In [11]:
print('Acurácia: %.2f' % accuracy_score(Y_test, Y_predito1))
print('Precisão: %.2f' % precision_score(Y_test, Y_predito1, pos_label='positive',average='micro'))
print('Recall: %.2f' % recall_score(Y_test, Y_predito1, pos_label='positive',average='micro'))
print('F-score: %.2f' % f1_score(Y_test, Y_predito1, pos_label='positive',average='micro'))

Acurácia: 0.65
Precisão: 0.65
Recall: 0.65
F-score: 0.65


In [12]:
print("Antes: ", Y_train.value_counts())
sampling_strategy1 = {'unacc': 968, 
                     'acc': 968, 
                     'good': 968,
                     'vgood': 968}
# definindo a estratégia de super-amostragem
oversample = SMOTE(sampling_strategy=sampling_strategy1)

# aplicando a transformação de reamsostragem
X_over, Y_over = oversample.fit_resample(X_train, Y_train)

# exibindo a distribuição de classes
print("Depois: ", Y_over['class'].value_counts())

Antes:  class
unacc    968
acc      307
good      55
vgood     52
Name: count, dtype: int64
Depois:  class
acc      968
unacc    968
good     968
vgood    968
Name: count, dtype: int64


In [13]:
model2 = LogisticRegression(solver='liblinear')

In [14]:
model2.fit(X_over, Y_over)

In [15]:
Y_predito2 = model2.predict(X_test)

In [16]:
print('Acurácia: %.2f' % accuracy_score(Y_test, Y_predito2))
print('Precisão: %.2f' % precision_score(Y_test, Y_predito2, pos_label='positive',average='micro'))
print('Recall: %.2f' % recall_score(Y_test, Y_predito2, pos_label='positive',average='micro'))
print('F-score: %.2f' % f1_score(Y_test, Y_predito2, pos_label='positive',average='micro'))

Acurácia: 0.74
Precisão: 0.74
Recall: 0.74
F-score: 0.74


In [17]:
# exibindo a distribuição de classes
print("Antes: ", Y_train.value_counts())
sampling_strategy3 = {'unacc': 968, 
                     'acc': 968, 
                     'good': 968,
                     'vgood': 968}

# definindo a estratégia de reamostragem
sample = SMOTEENN(sampling_strategy=sampling_strategy3)

# aplicando a transformação de reamsostragem
X_sample, Y_sample = sample.fit_resample(X_train, Y_train)

# exibindo a distribuição de classes
print("Depois: ", Y_sample['class'].value_counts())

Antes:  class
unacc    968
acc      307
good      55
vgood     52
Name: count, dtype: int64
Depois:  class
vgood    954
good     918
acc      764
unacc    641
Name: count, dtype: int64


In [18]:
model3 = LogisticRegression(solver='liblinear')

In [19]:
model3.fit(X_sample, Y_sample)

In [20]:
Y_predito3 = model3.predict(X_test)

In [21]:
print('Acurácia: %.2f' % accuracy_score(Y_test, Y_predito3))
print('Precisão: %.2f' % precision_score(Y_test, Y_predito3, pos_label='positive',average='micro'))
print('Recall: %.2f' % recall_score(Y_test, Y_predito3, pos_label='positive',average='micro'))
print('F-score: %.2f' % f1_score(Y_test, Y_predito3, pos_label='positive',average='micro'))

Acurácia: 0.67
Precisão: 0.67
Recall: 0.67
F-score: 0.67


In [22]:
model4 = LogisticRegression(solver='liblinear', class_weight='balanced')

In [23]:
model4.fit(X_train, Y_train)

In [24]:
Y_predito4 = model4.predict(X_test)

In [25]:
print('Acurácia: %.2f' % accuracy_score(Y_test, Y_predito4))
print('Precisão: %.2f' % precision_score(Y_test, Y_predito4, pos_label='positive',average='micro'))
print('Recall: %.2f' % recall_score(Y_test, Y_predito4, pos_label='positive',average='micro'))
print('F-score: %.2f' % f1_score(Y_test, Y_predito4, pos_label='positive',average='micro'))

Acurácia: 0.84
Precisão: 0.84
Recall: 0.84
F-score: 0.84
