# Machine Learning e Data Science com Python de A à Z (Classificação) - IA Expert Academy

# Importação das bibliotecas básicas

In [2]:
!pip -q install plotly --upgrade

In [3]:
!pip -q install yellowbrick

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Base de dados de crédito

- Fonte (adaptado): https://www.kaggle.com/laotse/credit-risk-dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

### Exploração dos dados

In [None]:
base_credit = pd.read_csv('/content/credit_data.csv')

In [None]:
base_credit # defaulted

In [None]:
base_credit.head(10)

In [None]:
base_credit.tail(8)

In [None]:
base_credit.describe()

In [None]:
base_credit[base_credit['income'] >= 69995.685578]

In [None]:
base_credit[base_credit['loan'] <= 1.377630]

### Visualização dos dados

In [None]:
np.unique(base_credit['default'], return_counts=True)

In [None]:
sns.countplot(x = base_credit['default']);

In [None]:
plt.hist(x = base_credit['age']);

In [None]:
plt.hist(x = base_credit['income']);

In [None]:
plt.hist(x = base_credit['loan']);

In [None]:
grafico = px.scatter_matrix(base_credit, dimensions=['age', 'income', 'loan'], color = 'default')
grafico.show()

### Tratamento de valores inconsistentes

In [None]:
base_credit.loc[base_credit['age'] < 0]

In [None]:
base_credit[base_credit['age'] < 0]

In [None]:
# Apagar a coluna inteira (de todos os registros da base de dados)
base_credit2 = base_credit.drop('age', axis = 1)
base_credit2

In [None]:
base_credit.index

In [None]:
base_credit[base_credit['age'] < 0].index

In [None]:
# Apagar somente os registros com valores inconsistentes
base_credit3 = base_credit.drop(base_credit[base_credit['age'] < 0].index)
base_credit3

In [None]:
base_credit3.loc[base_credit3['age'] < 0]

In [None]:
# Preencher os valores inconsistentes manualmente

In [None]:
# Preencher com a média

In [None]:
base_credit.mean()

In [None]:
base_credit['age'].mean()

In [None]:
base_credit['age'][base_credit['age'] > 0].mean()

In [None]:
base_credit.loc[base_credit['age'] < 0, 'age'] = 40.92

In [None]:
base_credit.loc[base_credit['age'] < 0]

In [None]:
base_credit.head(27)

### Tratamento de valores faltantes

In [None]:
base_credit.isnull()

In [None]:
base_credit.isnull().sum()

In [None]:
base_credit.loc[pd.isnull(base_credit['age'])]

In [None]:
# Replace missing ages with the column mean. Assign the result back instead of
# calling fillna(..., inplace=True) on the column selection: that chained
# in-place form emits a FutureWarning on pandas 2.x and does not modify
# base_credit once copy-on-write is enabled.
base_credit['age'] = base_credit['age'].fillna(base_credit['age'].mean())

In [None]:
base_credit.loc[pd.isnull(base_credit['age'])]

In [None]:
base_credit.loc[(base_credit['clientid'] == 29) | (base_credit['clientid'] == 31) | (base_credit['clientid'] == 32)]

In [None]:
base_credit.loc[base_credit['clientid'].isin([29, 31, 32])]

### Divisão entre previsores e classe

In [None]:
type(base_credit)

In [None]:
X_credit = base_credit.iloc[:, 1:4].values

In [None]:
X_credit

In [None]:
type(X_credit)

In [None]:
y_credit = base_credit.iloc[:, 4].values

In [None]:
y_credit

In [None]:
type(y_credit)

### Escalonamento dos valores

In [None]:
X_credit

In [None]:
X_credit[:,0].min(), X_credit[:,1].min(), X_credit[:,2].min()

In [None]:
X_credit[:,0].max(), X_credit[:,1].max(), X_credit[:,2].max()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler_credit = StandardScaler()
X_credit = scaler_credit.fit_transform(X_credit)

In [None]:
X_credit[:,0].min(), X_credit[:,1].min(), X_credit[:,2].min()

In [None]:
X_credit[:,0].max(), X_credit[:,1].max(), X_credit[:,2].max()

In [None]:
X_credit

# Base de dados do censo

- Fonte: https://archive.ics.uci.edu/ml/datasets/adult

## Exploração dos dados

In [None]:
base_census = pd.read_csv('/content/census.csv')

In [None]:
base_census.describe()

In [None]:
base_census.describe()

In [None]:
base_census.isnull().sum()

## Visualização dos dados

In [None]:
np.unique(base_census['income'], return_counts=True)

In [None]:
sns.countplot(x = base_census['income']);

In [None]:
plt.hist(x = base_census['age']);

In [None]:
plt.hist(x = base_census['education-num']);

In [None]:
plt.hist(x = base_census['hour-per-week']);

In [None]:
grafico = px.treemap(base_census, path=['workclass', 'age'])
grafico.show()

In [None]:
grafico = px.treemap(base_census, path=['occupation', 'relationship', 'age'])
grafico.show()

In [None]:
grafico = px.parallel_categories(base_census, dimensions=['occupation', 'relationship'])
grafico.show()

In [None]:
grafico = px.parallel_categories(base_census, dimensions=['workclass', 'occupation', 'income'])
grafico.show()

In [None]:
grafico = px.parallel_categories(base_census, dimensions=['education', 'income'])
grafico.show()

## Divisão entre previsores e classe

In [None]:
base_census.columns

In [None]:
X_census = base_census.iloc[:, 0:14].values

In [None]:
X_census

In [None]:
X_census[0]

In [None]:
y_census = base_census.iloc[:, 14].values

In [None]:
y_census

## Tratamento de atributos categóricos

### LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
label_encoder_teste = LabelEncoder()

In [None]:
X_census[:,1]

In [None]:
teste = label_encoder_teste.fit_transform(X_census[:,1])

In [None]:
teste

In [None]:
X_census[0]

In [None]:
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

In [None]:
X_census[:,1] = label_encoder_workclass.fit_transform(X_census[:,1])
X_census[:,3] = label_encoder_education.fit_transform(X_census[:,3])
X_census[:,5] = label_encoder_marital.fit_transform(X_census[:,5])
X_census[:,6] = label_encoder_occupation.fit_transform(X_census[:,6])
X_census[:,7] = label_encoder_relationship.fit_transform(X_census[:,7])
X_census[:,8] = label_encoder_race.fit_transform(X_census[:,8])
X_census[:,9] = label_encoder_sex.fit_transform(X_census[:,9])
X_census[:,13] = label_encoder_country.fit_transform(X_census[:,13])

In [None]:
X_census[0]

In [None]:
X_census

### OneHotEncoder

In [None]:
# Carro

# Gol Pálio Uno
#   1     2   3

# Gol   1 0 0
# Pálio 0 1 0
# Uno   0 0 1 # encode

In [None]:
len(np.unique(base_census['workclass'])) # 1 0 0 0 0 0 0 0, 0 0 0 0 1 0 0 0 0

In [None]:
len(np.unique(base_census['occupation']))

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
onehotencoder_census = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1,3,5,6,7,8,9,13])], remainder='passthrough')

In [None]:
X_census = onehotencoder_census.fit_transform(X_census).toarray()

In [None]:
X_census

In [None]:
X_census[0]

In [None]:
X_census.shape

## Escalonamento dos valores

In [None]:
from sklearn.preprocessing import StandardScaler
scaler_census = StandardScaler()
X_census = scaler_census.fit_transform(X_census)

In [None]:
X_census[0]

# Divisão das bases em treinamento e teste

In [None]:
from sklearn.model_selection import train_test_split

## Credit data

In [None]:
X_credit_treinamento, X_credit_teste, y_credit_treinamento, y_credit_teste = train_test_split(X_credit, y_credit, test_size = 0.25, random_state = 0)

In [None]:
X_credit_treinamento.shape

In [None]:
y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

## Census

In [None]:
X_census_treinamento, X_census_teste, y_census_treinamento, y_census_teste = train_test_split(X_census, y_census, test_size = 0.15, random_state = 0)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

## Salvar as variáveis

In [None]:
import pickle

In [None]:
with open('credit.pkl', mode = 'wb') as f:
  pickle.dump([X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste], f)

In [None]:
with open('census.pkl', mode = 'wb') as f:
  pickle.dump([X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste], f)

# Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

## Base risco de crédito

In [None]:
base_risco_credito = pd.read_csv('/content/risco_credito.csv')

In [None]:
base_risco_credito

In [None]:
X_risco_credito = base_risco_credito.iloc[:, 0:4].values
X_risco_credito

In [None]:
y_risco_credito = base_risco_credito.iloc[:, 4].values
y_risco_credito

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder_historia = LabelEncoder()
label_encoder_divida = LabelEncoder()
label_encoder_garantia = LabelEncoder()
label_encoder_renda = LabelEncoder()

In [None]:
X_risco_credito[:,0] = label_encoder_historia.fit_transform(X_risco_credito[:,0])
X_risco_credito[:,1] = label_encoder_divida.fit_transform(X_risco_credito[:,1])
X_risco_credito[:,2] = label_encoder_garantia.fit_transform(X_risco_credito[:,2])
X_risco_credito[:,3] = label_encoder_renda.fit_transform(X_risco_credito[:,3])

In [None]:
X_risco_credito

In [None]:
import pickle
with open('risco_credito.pkl', 'wb') as f:
  pickle.dump([X_risco_credito, y_risco_credito], f)

In [None]:
naive_risco_credito = GaussianNB()
naive_risco_credito.fit(X_risco_credito, y_risco_credito)

In [None]:
# história boa (0), dívida alta (0), garantias nenhuma (1), renda > 35 (2)
# história ruim (2), dívida alta (0), garantias adequada (0), renda < 15 (0)
previsao = naive_risco_credito.predict([[0,0,1,2], [2,0,0,0]])

In [None]:
previsao

In [None]:
naive_risco_credito.classes_

In [None]:
naive_risco_credito.class_count_

In [None]:
naive_risco_credito.class_prior_

## Base credit data - 93.80%

In [None]:
import pickle
# Load the credit train/test split written earlier to 'credit.pkl'.
# (The previous path pointed at the census pickle, which would silently put
# census data into the X_credit_*/y_credit_* variables used below.)
# The unpacking order matches the order used when the list was dumped:
# X train, y train, X test, y test.
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
naive_credit_data = GaussianNB()
naive_credit_data.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = naive_credit_data.predict(X_credit_teste)

In [None]:
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
accuracy_score(y_credit_teste, previsoes)

In [None]:
confusion_matrix(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix

In [None]:
cm = ConfusionMatrix(naive_credit_data)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

## Base census - 47.67%

In [None]:
with open('census.pkl', 'rb') as f:
  X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
naive_census = GaussianNB()
naive_census.fit(X_census_treinamento, y_census_treinamento)
previsoes = naive_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
accuracy_score(y_census_teste, previsoes) # não executar o escalonamento

In [None]:
cm = ConfusionMatrix(naive_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

# Árvores de decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier

## Base risco de crédito

In [None]:
import pickle
with open('risco_credito.pkl', 'rb') as f:
  X_risco_credito, y_risco_credito = pickle.load(f)

In [None]:
X_risco_credito

In [None]:
y_risco_credito

In [None]:
arvore_risco_credito = DecisionTreeClassifier(criterion='entropy')
arvore_risco_credito.fit(X_risco_credito, y_risco_credito)

In [None]:
arvore_risco_credito.feature_importances_

In [None]:
arvore_risco_credito.classes_

In [None]:
from sklearn import tree
# Feature labels shown on the split nodes, in the same column order as X_risco_credito.
previsores = ['história', 'dívida', 'garantias', 'renda']
figura, eixos = plt.subplots(nrows=1, ncols=1, figsize=(10,10))
# classes_ is a NumPy array; convert it to a plain list because some
# scikit-learn releases validate class_names strictly as a list.
tree.plot_tree(arvore_risco_credito, feature_names=previsores, class_names = list(arvore_risco_credito.classes_), filled=True);

In [None]:
# história boa, dívida alta, garantias nenhuma, renda > 35
# história ruim, dívida alta, garantias adequada, renda < 15
previsoes = arvore_risco_credito.predict([[0,0,1,2],[2,0,0,0]])
previsoes

## Base credit data - 98.20%

In [None]:
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
arvore_credit = DecisionTreeClassifier(criterion='entropy', random_state = 0)
arvore_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = arvore_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
accuracy_score(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(arvore_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

In [None]:
arvore_credit.classes_

In [None]:
from sklearn import tree
previsores = ['income', 'age', 'loan']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (20,20))
tree.plot_tree(arvore_credit, feature_names=previsores, class_names=['0','1'], filled=True);
fig.savefig('arvore_credit.png')

## Base census - 81.04%

In [None]:
with open('census.pkl', 'rb') as f:
  X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
arvore_census = DecisionTreeClassifier(criterion='entropy', random_state=0)
arvore_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = arvore_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix

In [None]:
from yellowbrick.classifier import ConfusionMatrix
#cm = ConfusionMatrix(arvore_credit) corrigido 10/04/2021
cm = ConfusionMatrix(arvore_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

## Base credit data - 98.40%

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
random_forest_credit = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state = 0)
random_forest_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = random_forest_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(random_forest_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

## Base census - 85.07%

In [None]:
with open('census.pkl', 'rb') as f:
  X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
y_census_treinamento

In [None]:
random_forest_census = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state = 0)
random_forest_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = random_forest_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(random_forest_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

# Regras

In [None]:
!pip install Orange3

In [None]:
import Orange

## Base risco de crédito

In [None]:
base_risco_credito = Orange.data.Table('risco_credito_regras.csv')

In [None]:
base_risco_credito

In [None]:
base_risco_credito.domain

In [None]:
cn2 = Orange.classification.rules.CN2Learner()
regras_risco_credito = cn2(base_risco_credito)

In [None]:
for regras in regras_risco_credito.rule_list:
  print(regras)

In [None]:
# história boa, dívida alta, garantias nenhuma, renda > 35
# história ruim, dívida alta, garantias adequada, renda < 15
previsoes = regras_risco_credito([['boa', 'alta', 'nenhuma', 'acima_35'], ['ruim', 'alta', 'adequada', '0_15']])
previsoes

In [None]:
base_risco_credito.domain.class_var.values

In [None]:
for i in previsoes:
  #print(i)
  print(base_risco_credito.domain.class_var.values[i])

## Base credit data - 97.40%

In [None]:
base_credit = Orange.data.Table('/content/credit_data_regras.csv')

In [None]:
base_credit.domain

In [None]:
base_dividida = Orange.evaluation.testing.sample(base_credit, n = 0.25)

In [None]:
base_dividida

In [None]:
base_dividida[0]

In [None]:
base_dividida[1]

In [None]:
base_treinamento = base_dividida[1]
base_teste = base_dividida[0]

In [None]:
len(base_treinamento), len(base_teste)

In [None]:
cn2 = Orange.classification.rules.CN2Learner()
regras_credit = cn2(base_treinamento)

In [None]:
for regras in regras_credit.rule_list:
  print(regras)

In [None]:
previsoes = Orange.evaluation.testing.TestOnTestData(base_treinamento, base_teste, [lambda testdata: regras_credit])

In [None]:
previsoes

In [None]:
Orange.evaluation.CA(previsoes)

## Base census - 78.90% (executado na interface gráfica do Orange)

# Classificador base - Majority learner

## Base credit data - 85.85%

In [None]:
base_credit = Orange.data.Table('credit_data_regras.csv')

In [None]:
base_credit.domain

In [None]:
majority = Orange.classification.MajorityLearner()

In [None]:
previsoes = Orange.evaluation.testing.TestOnTestData(base_credit, base_credit, [majority])

In [None]:
Orange.evaluation.CA(previsoes)

In [None]:
for registro in base_credit:
  print(registro.get_class())

In [None]:
from collections import Counter
Counter(str(registro.get_class()) for registro in base_credit)

In [None]:
1717 / 2000

## Base census - 75.91%

In [None]:
base_census = Orange.data.Table('census_regras.csv')

In [None]:
base_census.domain

In [None]:
majority = Orange.classification.MajorityLearner()
previsoes = Orange.evaluation.testing.TestOnTestData(base_census, base_census, [majority])
Orange.evaluation.CA(previsoes)

In [None]:
Counter(str(registro.get_class()) for registro in base_census)

In [None]:
24720 / (24720 + 7841)

# Aprendizagem baseada em instâncias - knn

In [None]:
from sklearn.neighbors import KNeighborsClassifier

## Base credit data - 98.60%

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
knn_credit = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p = 2)
knn_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = knn_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_credit_teste, previsoes) # padronização

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(knn_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

## Base census - 82.90%

In [None]:
with open('census.pkl', 'rb') as f:
  X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
knn_census = KNeighborsClassifier(n_neighbors=10)
knn_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = knn_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(knn_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

# Regressão logística

In [None]:
from sklearn.linear_model import LogisticRegression

## Base risco de crédito

In [None]:
import pickle
with open('risco_credito.pkl', 'rb') as f:
  X_risco_credito, y_risco_credito = pickle.load(f)

In [None]:
X_risco_credito

In [None]:
y_risco_credito # 2, 7, 11

In [None]:
X_risco_credito = np.delete(X_risco_credito, [2, 7, 11], axis = 0)
y_risco_credito = np.delete(y_risco_credito, [2, 7, 11], axis = 0)

In [None]:
X_risco_credito

In [None]:
y_risco_credito

In [None]:
logistic_risco_credito = LogisticRegression(random_state = 1)
logistic_risco_credito.fit(X_risco_credito, y_risco_credito)

In [None]:
logistic_risco_credito.intercept_

In [None]:
logistic_risco_credito.coef_

In [None]:
# história boa, dívida alta, garantias nenhuma, renda > 35
# história ruim, dívida alta, garantias adequada, renda < 15
previsoes1 = logistic_risco_credito.predict([[0,0,1,2], [2,0,0,0]])
previsoes1

## Base credit data - 94.60%

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
logistic_credit = LogisticRegression(random_state=1)
logistic_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
logistic_credit.intercept_

In [None]:
logistic_credit.coef_

In [None]:
previsoes = logistic_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(logistic_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

## Base census - 84.95%

In [None]:
with open('census.pkl', 'rb') as f:
  X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
logistic_census = LogisticRegression(random_state = 1)
logistic_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = logistic_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(logistic_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

# SVM

In [None]:
from sklearn.svm import SVC

## Base credit data - 98.80%

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
svm_credit = SVC(kernel='rbf', random_state=1, C = 2.0) # 2 -> 4
svm_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = svm_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(svm_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

## Base census - 85.07%

In [None]:
with open('census.pkl', 'rb') as f:
  X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
svm_census = SVC(kernel='linear', random_state=1)
svm_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = svm_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(svm_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

# Redes neurais artificiais

In [None]:
from sklearn.neural_network import MLPClassifier

## Base credit data - 99.60%

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
(3 + 1) / 2

In [None]:
# 3 -> 100 -> 100 -> 1
# 3 -> 2 -> 2 -> 1
rede_neural_credit = MLPClassifier(max_iter=1500, verbose=True, tol=0.0000100,
                                   solver = 'adam', activation = 'relu',
                                   hidden_layer_sizes = (20,20))
rede_neural_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = rede_neural_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(rede_neural_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

## Base census - 81.53%

In [None]:
with open('census.pkl', 'rb') as f:
  X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
(108 + 1) / 2

In [None]:
# 108 -> 55 -> 55 -> 1
rede_neural_census = MLPClassifier(verbose=True, max_iter = 1000, tol=0.000010,
                                  hidden_layer_sizes = (55,55))
rede_neural_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = rede_neural_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(rede_neural_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

# Avaliação dos algoritmos

- Naïve Bayes: 93.80
- Árvore de decisão: 98.20
- Random forest: 98.40
- Regras: 97.40
- Knn: 98.60
- Regressão logística: 94.60
- SVM: 98.80
- Redes neurais: 99.60

## Tuning dos parâmetros com GridSearch

### Preparação dos dados

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
X_credit = np.concatenate((X_credit_treinamento, X_credit_teste), axis = 0)
X_credit.shape

In [None]:
X_credit

In [None]:
y_credit = np.concatenate((y_credit_treinamento, y_credit_teste), axis = 0)
y_credit.shape

In [None]:
y_credit

### Árvore de decisão

In [None]:
parametros = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10]}

In [None]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### Random forest

In [None]:
parametros = {'criterion': ['gini', 'entropy'],
              'n_estimators': [10, 40, 100, 150],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10]}

In [None]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### Knn

In [None]:
parametros = {'n_neighbors': [3, 5, 10, 20],
              'p': [1, 2]}

In [None]:
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### Regressão logística

In [None]:
parametros = {'tol': [0.0001, 0.00001, 0.000001],
              'C': [1.0, 1.5, 2.0],
              'solver': ['lbfgs', 'sag', 'saga']}

In [None]:
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### SVM

In [None]:
parametros = {'tol': [0.001, 0.0001, 0.00001],
              'C': [1.0, 1.5, 2.0],
              'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}

In [None]:
grid_search = GridSearchCV(estimator=SVC(), param_grid=parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

### Redes neurais

In [None]:
# Hyperparameter grid for the MLPClassifier grid search.
# 'tanh' is the hyperbolic-tangent activation; the previous spelling 'tahn'
# is not a valid MLPClassifier activation and makes those candidates fail.
parametros = {'activation': ['relu', 'logistic', 'tanh'],
              'solver': ['adam', 'sgd'],
              'batch_size': [10, 56]}

In [None]:
grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_

In [None]:
print(melhores_parametros)
print(melhor_resultado)

## Validação cruzada

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
10 * 30

In [None]:
# Repeated cross-validation: 30 runs of 10-fold CV, each run shuffled with its
# own seed. For every algorithm the mean fold score of each run is stored.
resultados_arvore, resultados_random_forest, resultados_knn = [], [], []
resultados_logistica, resultados_svm, resultados_rede_neural = [], [], []

for i in range(30):
  print(i)  # progress indicator
  # An integer seed means every model in this run is scored on the same folds.
  kfold = KFold(shuffle=True, n_splits=10, random_state=i)

  arvore = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, splitter='best')
  random_forest = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=10)
  knn = KNeighborsClassifier()
  logistica = LogisticRegression(C=1.0, solver='lbfgs', tol=0.0001)
  svm = SVC(kernel='rbf', C=2.0)
  rede_neural = MLPClassifier(activation='relu', batch_size=56, solver='adam')

  # Evaluation order is kept: tree, forest, k-NN, logistic, SVM, MLP.
  avaliacoes = (
      (arvore, resultados_arvore),
      (random_forest, resultados_random_forest),
      (knn, resultados_knn),
      (logistica, resultados_logistica),
      (svm, resultados_svm),
      (rede_neural, resultados_rede_neural),
  )
  for modelo, medias in avaliacoes:
    medias.append(cross_val_score(modelo, X_credit, y_credit, cv=kfold).mean())



In [None]:
# One column per algorithm, built from the per-algorithm result lists.
nomes_colunas = ['Arvore', 'Random forest', 'KNN', 'Logistica', 'SVM', 'Rede neural']
listas_resultados = [resultados_arvore, resultados_random_forest, resultados_knn,
                     resultados_logistica, resultados_svm, resultados_rede_neural]
resultados = pd.DataFrame(dict(zip(nomes_colunas, listas_resultados)))
resultados

In [None]:
resultados.describe()

In [None]:
resultados.var()

In [None]:
# Coefficient of variation per algorithm: standard deviation expressed as a
# percentage of the mean.
(resultados.std() / resultados.mean()) * 100

## Teste de normalidade nos resultados

- Shapiro: https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test

In [None]:
alpha = 0.05

In [None]:
from scipy.stats import shapiro

In [None]:
# Shapiro-Wilk normality test on each algorithm's results, in the same order
# as before; the cell displays the tuple of test results.
tuple(shapiro(amostra) for amostra in (resultados_arvore, resultados_random_forest, resultados_knn,
                                       resultados_logistica, resultados_svm, resultados_rede_neural))

In [None]:
sns.displot(resultados_arvore, kind = 'kde');

In [None]:
sns.displot(resultados_random_forest, kind = 'kde');

In [None]:
sns.displot(resultados_knn, kind = 'kde');

In [None]:
sns.displot(resultados_logistica, kind = 'kde');

In [None]:
sns.displot(resultados_svm, kind = 'kde');

In [None]:
sns.displot(resultados_rede_neural, kind = 'kde');

## Teste de hipótese com ANOVA e Tukey

In [None]:
from scipy.stats import f_oneway

In [None]:
# One-way ANOVA across the six result samples; the F statistic is discarded
# and only the p-value is kept.
_, p = f_oneway(resultados_arvore, resultados_random_forest, resultados_knn, resultados_logistica, resultados_svm, resultados_rede_neural)
p

In [None]:
# Decision rule for the p-value `p` at the 5% significance level.
# NOTE(review): p > alpha strictly means the null hypothesis was not rejected
# (no evidence of a difference), which is weaker than the wording printed in
# the else branch.
alpha = 0.05
if p <= alpha:
  print('Hipótese nula rejeitada. Dados são diferentes')
else:
  print('Hipótese alternativa rejeitada. Resultados são iguais')

In [None]:
# Long-format data for the post-hoc test: one accuracy value per entry, with
# the name of the algorithm that produced it.
# The labels are derived from the result lists instead of being typed out 30
# times each, so both keys always have matching lengths even if the number of
# cross-validation repetitions changes.
_resultados_por_algoritmo = {
    'arvore': resultados_arvore,
    'random_forest': resultados_random_forest,
    'knn': resultados_knn,
    'logistica': resultados_logistica,
    'svm': resultados_svm,
    'rede_neural': resultados_rede_neural,
}
resultados_algoritmos = {
    'accuracy': np.concatenate(list(_resultados_por_algoritmo.values())),
    # Each name is repeated once per result of that algorithm, in dict order.
    'algoritmo': [nome
                  for nome, valores in _resultados_por_algoritmo.items()
                  for _ in valores],
}

In [None]:
resultados_df = pd.DataFrame(resultados_algoritmos)
resultados_df

In [None]:
from statsmodels.stats.multicomp import MultiComparison

In [None]:
compara_algoritmos = MultiComparison(resultados_df['accuracy'], resultados_df['algoritmo'])

In [None]:
# Tukey's HSD post-hoc test: pairwise comparison between the algorithm groups.
teste_estatistico = compara_algoritmos.tukeyhsd()
print(teste_estatistico)

In [None]:
resultados.mean()

In [None]:
teste_estatistico.plot_simultaneous();

# Salvar um classificador já treinado

In [None]:
# Load the credit dataset stored in credit.pkl; the file holds four objects
# that are unpacked as train features, train labels, test features, test labels.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles from a trusted source.
with open('credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
# Join the training and test partitions along axis 0 into a single dataset.
X_credit = np.concatenate((X_credit_treinamento, X_credit_teste), axis = 0)
y_credit = np.concatenate((y_credit_treinamento, y_credit_teste), axis = 0)

In [None]:
X_credit.shape, y_credit.shape

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [None]:
classificador_rede_neural = MLPClassifier(activation='relu', batch_size = 56, solver='adam')
classificador_rede_neural.fit(X_credit, y_credit)

In [None]:
classificador_arvore = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, splitter='best')
classificador_arvore.fit(X_credit, y_credit)

In [None]:
classificador_svm = SVC(C = 2.0, kernel='rbf', probability=True)
classificador_svm.fit(X_credit, y_credit)

In [None]:
import pickle

# Persist each trained classifier to its own file. The `with` block ensures
# every file is flushed and closed; the previous inline open() calls left the
# handles open.
for classificador, arquivo in ((classificador_rede_neural, 'rede_neural_finalizado.sav'),
                               (classificador_arvore, 'arvore_finalizado.sav'),
                               (classificador_svm, 'svm_finalizado.sav')):
  with open(arquivo, 'wb') as f:
    pickle.dump(classificador, f)

# Carregar um classificador já treinado

In [None]:
# Reload the persisted classifiers. Each file is opened inside `with` so the
# handle is closed after reading instead of being leaked.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load model files from a trusted source.
with open('rede_neural_finalizado.sav', 'rb') as f:
  rede_neural = pickle.load(f)
with open('arvore_finalizado.sav', 'rb') as f:
  arvore = pickle.load(f)
with open('svm_finalizado.sav', 'rb') as f:
  svm = pickle.load(f)

In [None]:
novo_registro = X_credit[1999]
novo_registro

In [None]:
novo_registro.shape

In [None]:
novo_registro = novo_registro.reshape(1, -1)
novo_registro.shape

In [None]:
novo_registro

In [None]:
rede_neural.predict(novo_registro)

In [None]:
arvore.predict(novo_registro)

In [None]:
svm.predict(novo_registro)

# Combinação de classificadores

In [None]:
novo_registro = X_credit[1999]
novo_registro = novo_registro.reshape(1, -1)
novo_registro, novo_registro.shape

In [None]:
resposta_rede_neural = rede_neural.predict(novo_registro)
resposta_arvore = arvore.predict(novo_registro)
resposta_svm = svm.predict(novo_registro)

In [None]:
resposta_rede_neural[0], resposta_arvore[0], resposta_svm[0]

In [None]:
# Majority vote over the three classifiers: a prediction equal to 1 counts as
# "will not repay", any other value counts as "will repay".
respostas = (resposta_rede_neural[0], resposta_arvore[0], resposta_svm[0])
nao_paga = sum(1 for resposta in respostas if resposta == 1)
paga = len(respostas) - nao_paga

if paga == nao_paga:
  print('Empate')
elif paga > nao_paga:
  print('Cliente pagará o empréstimo')
else:
  print('Cliente não pagará o empréstimo')

# Rejeição de classificadores

In [None]:
novo_registro = X_credit[1999]
novo_registro = novo_registro.reshape(1, -1)
novo_registro, novo_registro.shape

In [None]:
resposta_rede_neural = rede_neural.predict(novo_registro)
resposta_arvore = arvore.predict(novo_registro)
resposta_svm = svm.predict(novo_registro)

In [None]:
resposta_rede_neural[0], resposta_arvore[0], resposta_svm[0]

In [None]:
probabilidade_rede_neural = rede_neural.predict_proba(novo_registro)
probabilidade_rede_neural

In [None]:
confianca_rede_neural = probabilidade_rede_neural.max()
confianca_rede_neural

In [None]:
probabilidade_arvore = arvore.predict_proba(novo_registro)
confianca_arvore = probabilidade_arvore.max()
confianca_arvore

In [None]:
probabilidade_svm = svm.predict_proba(novo_registro)
confianca_svm = probabilidade_svm.max()
confianca_svm

In [None]:
# Vote with rejection: a classifier only takes part when its confidence
# reaches `confianca_minima`; `algoritmos` counts how many took part.
confianca_minima = 0.999999
paga = nao_paga = algoritmos = 0

votos = ((confianca_rede_neural, resposta_rede_neural),
         (confianca_arvore, resposta_arvore),
         (confianca_svm, resposta_svm))
for confianca, resposta in votos:
  # Written as a negated ">=" so the comparison is exactly the original one.
  if not confianca >= confianca_minima:
    continue
  algoritmos += 1
  if resposta[0] == 1:
    nao_paga += 1
  else:
    paga += 1

if paga == nao_paga:
  mensagem = 'Empate, baseado em {} algoritmos'
elif paga > nao_paga:
  mensagem = 'Cliente pagará o empréstimo, baseado em {} algoritmos'
else:
  mensagem = 'Cliente não pagará o empréstimo, baseado em {} algoritmos'
print(mensagem.format(algoritmos))

# Redução de dimensionalidade

## Preparação da base de dados

In [None]:
base_census = pd.read_csv('/content/census.csv')
base_census

In [None]:
X_census = base_census.iloc[:, 0:14].values
X_census

In [None]:
y_census = base_census.iloc[:, 14].values
y_census

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

In [None]:
# Replace each listed column (by position) with the integer codes produced by
# that column's own LabelEncoder.
codificadores_por_coluna = {
    1: label_encoder_workclass,
    3: label_encoder_education,
    5: label_encoder_marital,
    6: label_encoder_occupation,
    7: label_encoder_relationship,
    8: label_encoder_race,
    9: label_encoder_sex,
    13: label_encoder_country,
}
for coluna, codificador in codificadores_por_coluna.items():
  X_census[:, coluna] = codificador.fit_transform(X_census[:, coluna])

In [None]:
X_census[0]

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of X_census (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the whole dataset before the
# train/test split, so test-set statistics influence the training features;
# consider fitting it on the training split only.
scaler_census = StandardScaler()
X_census = scaler_census.fit_transform(X_census)

In [None]:
X_census

In [None]:
from sklearn.model_selection import train_test_split
X_census_treinamento, X_census_teste, y_census_treinamento, y_census_teste = train_test_split(X_census, y_census, test_size=0.15, random_state=0)

In [None]:
X_census_treinamento.shape, X_census_teste.shape

## PCA (Principal component analysis)

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=8)

In [None]:
# Fit the projection on the training split only, then apply the same fitted
# projection to the test split.
X_census_treinamento_pca = pca.fit_transform(X_census_treinamento)
X_census_testes_pca = pca.transform(X_census_teste)

In [None]:
X_census_treinamento_pca.shape, X_census_testes_pca.shape

In [None]:
X_census_treinamento

In [None]:
pca.explained_variance_ratio_

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
random_forest_census_pca = RandomForestClassifier(n_estimators=40, random_state=0, criterion = 'entropy')
random_forest_census_pca.fit(X_census_treinamento_pca, y_census_treinamento)

In [None]:
previsoes = random_forest_census_pca.predict(X_census_testes_pca)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_census_teste, previsoes)

## Kernel PCA

In [None]:
from sklearn.decomposition import KernelPCA

In [None]:
# Kernel PCA with an RBF kernel, keeping 8 components; fitted on the training
# split and reused to transform the test split.
kpca = KernelPCA(n_components=8, kernel='rbf')
X_census_treinamento_kpca = kpca.fit_transform(X_census_treinamento)
X_census_teste_kpca = kpca.transform(X_census_teste)

In [None]:
X_census_treinamento_kpca.shape, X_census_teste_kpca.shape

In [None]:
X_census_treinamento_kpca

In [None]:
from sklearn.ensemble import RandomForestClassifier
random_forest_census_kpca = RandomForestClassifier(n_estimators = 40, criterion = 'entropy', random_state = 0)
random_forest_census_kpca.fit(X_census_treinamento_kpca, y_census_treinamento)

In [None]:
previsoes = random_forest_census_kpca.predict(X_census_teste_kpca)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_census_teste, previsoes)

## LDA (Linear discriminant analysis)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# LDA can produce at most min(n_classes - 1, n_features) components, so a
# hard-coded 8 is rejected by scikit-learn (ValueError at fit time) unless the
# target has at least 9 classes. With n_components=None scikit-learn uses that
# maximum automatically.
lda = LinearDiscriminantAnalysis(n_components = None)

In [None]:
X_census_treinamento_lda = lda.fit_transform(X_census_treinamento, y_census_treinamento)
X_census_teste_lda = lda.transform(X_census_teste)

In [None]:
X_census_treinamento_lda.shape, X_census_teste_lda.shape

In [None]:
X_census_treinamento_lda

In [None]:
from sklearn.ensemble import RandomForestClassifier
random_forest_census_lda = RandomForestClassifier(n_estimators = 40, criterion = 'entropy', random_state = 0)
random_forest_census_lda.fit(X_census_treinamento_lda, y_census_treinamento)

In [None]:
previsoes = random_forest_census_lda.predict(X_census_teste_lda)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_census_teste, previsoes)

# Detecção de outliers

## Boxplot

In [None]:
base_credit = pd.read_csv('credit_data.csv')
base_credit

In [None]:
base_credit.isnull().sum()

In [None]:
base_credit.dropna(inplace=True)

In [None]:
base_credit.isnull().sum()

In [None]:
1997 / 2

In [None]:
# Outliers idade
grafico = px.box(base_credit, y = 'age')
grafico.show()

In [None]:
# Rows with a negative age, selected here as the age outliers.
outliers_age = base_credit[base_credit['age'] < 0]
outliers_age

In [None]:
# Outliers loan
grafico = px.box(base_credit, y='loan')
grafico.show()

In [None]:
# Rows whose loan exceeds 13300, selected here as the loan outliers.
# NOTE(review): the 13300 cutoff is presumably the upper fence read from the
# loan boxplot — TODO confirm.
outliers_loan = base_credit[base_credit['loan'] > 13300]
outliers_loan

## Gráfico de dispersão

In [None]:
# Income x age
grafico = px.scatter(x = base_credit['income'], y = base_credit['age'])
grafico.show()

In [None]:
# Income x loan
grafico = px.scatter(x = base_credit['income'], y = base_credit['loan'])
grafico.show()

In [None]:
# Age x loan
grafico = px.scatter(x = base_credit['age'], y = base_credit['loan'])
grafico.show()

In [None]:
base_census = pd.read_csv('census.csv')
base_census

In [None]:
# Age x final weight
grafico = px.scatter(x = base_census['age'], y = base_census['final-weight'])
grafico.show()

## Biblioteca PyOD

- Documentação: https://pyod.readthedocs.io/en/latest/#

In [None]:
!pip install pyod

In [None]:
from pyod.models.knn import KNN

In [None]:
base_credit.head(1)

In [None]:
# PyOD k-nearest-neighbours outlier detector with default settings, fitted on
# the columns at positions 1 to 3 of base_credit.
# NOTE(review): presumably income, age and loan (skipping the id column) —
# confirm against the CSV header.
detector = KNN()
detector.fit(base_credit.iloc[:,1:4])

In [None]:
previsoes = detector.labels_
previsoes

In [None]:
np.unique(previsoes, return_counts=True)

In [None]:
confianca_previsoes = detector.decision_scores_
confianca_previsoes

In [None]:
# Positions of the samples whose label in `previsoes` is 1.
outliers = [posicao for posicao, rotulo in enumerate(previsoes) if rotulo == 1]

In [None]:
print(outliers)

In [None]:
# `outliers` holds row positions (not index labels), hence positional
# selection with iloc.
lista_outliers = base_credit.iloc[outliers,:]
lista_outliers