# Modelagem de Dados

## Dependências

In [5]:
import pandas as pd


#Libs de Preprocessamento
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#Libs de Seleção
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#Libs de ML
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier

#Libs de Métricas
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score


#Libs auxiliares
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import category_encoders as ce

### Coleta de Dados

In [56]:
df = pd.read_csv('../Data/adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Pré-Processamento

In [57]:
#Remove colunas desnecessárias para análise
df.drop(['education'], axis=1, inplace=True)

#Separa os atribulos dos rotulos
X = df.drop(['income'], axis=1, inplace=False)
y = df['income']

## Método Tradicional

In [65]:
#Aplica o One Hot Encoder
ohe = ce.OneHotEncoder(use_cat_names=True)

X = ohe.fit_transform(X)


#Aplica a Padronização
scaler = StandardScaler().fit(X)

X = scaler.transform(X)

  elif pd.api.types.is_categorical(cols):


In [66]:
# Separa os dados em treino e teste

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2)

In [67]:
model = tree.DecisionTreeClassifier()

model.fit(X_train,y_train)

#Retorna com os valores de predição do conjunto de treinamento
pred = model.predict(X_train)

#Media de acerto do modelo para o conjunto de treinamento
media = accuracy_score(pred, y_train)
media

1.0

In [68]:
#Algoritmos utilizados na análise

classifiers = [
    KNeighborsClassifier(),
    SVC(gamma='scale'),
    LogisticRegression(solver='lbfgs'),
    BaggingClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(),
    AdaBoostClassifier(),
    MultinomialNB()]

#Nomes dos algoritmos

names = ["Nearest Neighbors", "SVM", "LogisticRegression", "BaggingClassifier", "Decision Tree", "Random Forest", "MLP", "AdaBoost", "Naive Bayes"]

In [69]:
#Salva as informações
scores = []

#Aplicação dos modelos
for name, model in zip(names, classifiers):
    
    #Treina
    model.fit(X_train, y_train)
    
    #Prediz
    predict = model.predict(X_test)
    
    #Métrica de avalização
    #score = f1_score(y_test, predict)
    
    #Salva a métrica de avaliação
    #scores.append(score)



ValueError: Negative values in data passed to MultinomialNB (input X)

## Pipelines

In [9]:
pip_1 = Pipeline([
    ('ohe', ce.OneHotEncoder()),
    ('scaler', StandardScaler()),
    ('model', tree.DecisionTreeClassifier())
])

pip_1.steps

[('ohe', OneHotEncoder()),
 ('scaler', StandardScaler()),
 ('model', DecisionTreeClassifier())]

In [10]:
# Separa em treino e teste

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2)

In [11]:
# Aplica as etapas de modelagem

pip_1.fit(X_train,y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('ohe',
                 OneHotEncoder(cols=['workclass', 'marital-status',
                                     'occupation', 'relationship', 'race',
                                     'gender', 'native-country'])),
                ('scaler', StandardScaler()),
                ('model', DecisionTreeClassifier())])

In [13]:
# Média

media = pip_1.score(X_test, y_test)
media

0.8083611654957964

In [14]:
# Pipelines

pip_2 = Pipeline([
    ('ohe', ce.OneHotEncoder()),
    ('scaler', MinMaxScaler()),
    ('model', tree.DecisionTreeClassifier())
])

pip_3 = Pipeline([
    ('ohe', ce.OneHotEncoder()),
    ('scaler', MinMaxScaler()),
    ('model', tree.DecisionTreeClassifier(max_depth=3))
])

pip_4 = Pipeline([
    ('ohe', ce.OneHotEncoder()),
    ('scaler', StandardScaler()),
    ('model', tree.DecisionTreeClassifier(max_depth=3))
])

In [15]:
pip_2.fit(X_train,y_train)
media = pip_2.score(X_test, y_test)
media

  elif pd.api.types.is_categorical(cols):


0.8069023762908365

In [16]:
pip_3.fit(X_train,y_train)
media = pip_3.score(X_test, y_test)
media

  elif pd.api.types.is_categorical(cols):


0.8419900955890821

In [17]:
pip_4.fit(X_train,y_train)
media = pip_4.score(X_test, y_test)
media

  elif pd.api.types.is_categorical(cols):


0.8419900955890821

In [18]:
# Aplicando funções

frequente = Pipeline(steps=[
    ('frequente', SimpleImputer(strategy='most_frequent'))
])

mediana = Pipeline(steps=[
    ('mediana', SimpleImputer(strategy='median'))
])

data_cleaning = ColumnTransformer(transformers=[
    ('mediana', mediana, ['educational-num']),
    ('frequente', frequente, ['race']),
])

In [19]:
# Buscando melhorias

pip_5 = Pipeline([
    ('datacleaning', data_cleaning),
    ('ohe', ce.OneHotEncoder()),
    ('scaler', StandardScaler()),
    ('model', tree.DecisionTreeClassifier()),
])

pip_5.fit(X_train, y_train)

pip_5.predict(X_test)

  elif pd.api.types.is_categorical(cols):


array([' >50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype=object)

In [20]:
media = pip_5.score(X_test, y_test)
media

0.7781488732772851

In [22]:
# Aplicação de Pipelines com GridSearch

parametros_grid = dict(model__max_depth=[3,4,5,6,7,8,9,10])

grid = GridSearchCV(pip_5, param_grid = parametros_grid, cv = 5, scoring = 'accuracy')

grid.fit(X, y)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('datacleaning',
                                        ColumnTransformer(transformers=[('mediana',
                                                                         Pipeline(steps=[('mediana',
                                                                                          SimpleImputer(strategy='median'))]),
                                                                         ['educational-num']),
                                                                        ('frequente',
                                                                         Pipeline(steps=[('frequente',
                                                                                          SimpleImputer(strategy='most_frequent'))]),
                                                                         ['race'])])),
                                       ('ohe', OneHotEncoder(cols=[0, 1])),
                                

In [23]:
#Comando importante para encontrar os parâmetros

sorted(pip_5.get_params().keys())

['datacleaning',
 'datacleaning__frequente',
 'datacleaning__frequente__frequente',
 'datacleaning__frequente__frequente__add_indicator',
 'datacleaning__frequente__frequente__copy',
 'datacleaning__frequente__frequente__fill_value',
 'datacleaning__frequente__frequente__missing_values',
 'datacleaning__frequente__frequente__strategy',
 'datacleaning__frequente__frequente__verbose',
 'datacleaning__frequente__memory',
 'datacleaning__frequente__steps',
 'datacleaning__frequente__verbose',
 'datacleaning__mediana',
 'datacleaning__mediana__mediana',
 'datacleaning__mediana__mediana__add_indicator',
 'datacleaning__mediana__mediana__copy',
 'datacleaning__mediana__mediana__fill_value',
 'datacleaning__mediana__mediana__missing_values',
 'datacleaning__mediana__mediana__strategy',
 'datacleaning__mediana__mediana__verbose',
 'datacleaning__mediana__memory',
 'datacleaning__mediana__steps',
 'datacleaning__mediana__verbose',
 'datacleaning__n_jobs',
 'datacleaning__remainder',
 'datacleani

In [24]:
#Resultados

grid.cv_results_

{'mean_fit_time': array([0.86320019, 0.47820029, 0.3994    , 0.44979978, 0.42659926,
        0.42419915, 0.42940097, 0.44300041]),
 'std_fit_time': array([0.65269459, 0.1167353 , 0.01078451, 0.10687549, 0.01675637,
        0.01599456, 0.01544603, 0.01316159]),
 'mean_score_time': array([0.1242044 , 0.07039943, 0.07299876, 0.06960077, 0.07319899,
        0.07560248, 0.07140036, 0.07680292]),
 'std_score_time': array([0.08533994, 0.00265487, 0.00468808, 0.00508314, 0.00716821,
        0.00422233, 0.00250251, 0.01772702]),
 'param_model__max_depth': masked_array(data=[3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model__max_depth': 3},
  {'model__max_depth': 4},
  {'model__max_depth': 5},
  {'model__max_depth': 6},
  {'model__max_depth': 7},
  {'model__max_depth': 8},
  {'model__max_depth': 9},
  {'model__max_depth': 10}],
 'split0_test_score': array([0.77199447, 0.77