## Rascunho - Regressão Logística

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from adspy_shared_utilities import (plot_class_regions_for_classifier_subplot)

In [5]:
# Load the German credit dataset from the notebook's working directory.
# pandas is imported in the import cell at the top of the notebook.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV — confirm, and consider
# index_col=0 if it carries no information.
credit_df = pd.read_csv('german_credit_data.csv')
credit_df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [6]:
# Binary target: 1 for rows whose Risk is 'good', 0 for every other row.
credit_df['Risk_Num'] = (credit_df['Risk'] == 'good').astype(int)
credit_df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Risk_Num
0,0,67,male,2,own,,little,1169,6,radio/TV,good,1
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,0
2,2,49,male,1,own,little,,2096,12,education,good,1
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good,1
4,4,53,male,2,free,little,little,4870,24,car,bad,0


In [7]:
# Put missing values into their own 'Null' category, but only in the
# non-numeric columns: a frame-wide fillna('Null') would also write the string
# into any numeric column that has gaps, turning it into an object column that
# a later get_dummies call would expand into one indicator per distinct value.
categorical_cols = credit_df.select_dtypes(exclude='number').columns
credit_df = credit_df.fillna({col: 'Null' for col in categorical_cols})

In [8]:
# Inspect how often each level occurs in the categorical variables.
for column in ['Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']:
    print(credit_df[column].value_counts())

2    630
1    200
3    148
0     22
Name: Job, dtype: int64
own     713
rent    179
free    108
Name: Housing, dtype: int64
little        603
Null          183
moderate      103
quite rich     63
rich           48
Name: Saving accounts, dtype: int64
Null        394
little      274
moderate    269
rich         63
Name: Checking account, dtype: int64
car                    337
radio/TV               280
furniture/equipment    181
business                97
education               59
repairs                 22
domestic appliances     12
vacation/others         12
Name: Purpose, dtype: int64


In [9]:
# One-hot encode the categorical columns; credit_df is rebound to the
# encoded frame, so the original text columns are no longer available.
credit_df = pd.get_dummies(data=credit_df)

In [10]:
# Preview the first five rows of the encoded frame.
credit_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Risk_Num,Sex_female,Sex_male,Housing_free,Housing_own,...,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good
0,0,67,2,1169,6,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,1
1,1,22,2,5951,48,0,1,0,0,1,...,0,0,0,0,0,1,0,0,1,0
2,2,49,1,2096,12,1,0,1,0,1,...,0,0,0,1,0,0,0,0,0,1
3,3,45,2,7882,42,1,0,1,1,0,...,0,0,0,0,1,0,0,0,0,1
4,4,53,2,4870,24,0,0,1,1,0,...,0,1,0,0,0,0,0,0,1,0


In [11]:
# get_dummies leaves numeric columns such as Job untouched when applied to the
# whole frame, so the Job indicators are built here in a separate frame and
# added to the main frame afterwards.
job_labels = {level: "Job {}".format(level) for level in range(4)}
dummy_job = pd.get_dummies(credit_df['Job']).rename(columns=job_labels)
dummy_job.head()

Unnamed: 0,Job 0,Job 1,Job 2,Job 3
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


In [12]:
# Copy the four Job indicators into the main frame under underscore-style
# names ("Job 0" -> "Job_0", ...), matching the other dummy columns.
for level in range(4):
    credit_df["Job_{}".format(level)] = dummy_job["Job {}".format(level)]
credit_df.head()

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Risk_Num,Sex_female,Sex_male,Housing_free,Housing_own,...,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good,Job_0,Job_1,Job_2,Job_3
0,0,67,2,1169,6,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0
1,1,22,2,5951,48,0,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0
2,2,49,1,2096,12,1,0,1,0,1,...,0,0,0,0,0,1,0,1,0,0
3,3,45,2,7882,42,1,0,1,1,0,...,1,0,0,0,0,1,0,0,1,0
4,4,53,2,4870,24,0,0,1,1,0,...,0,0,0,0,1,0,0,0,1,0


In [13]:
# Predictors for the model. For each dummy-encoded variable one level is left
# out of the list (Sex_male, Job_3, Housing_rent, Saving accounts_rich,
# Checking account_rich, Purpose_vacation/others).
feature_cols = [
    'Age', 'Sex_female', 'Job_0', 'Job_1', 'Job_2',
    'Credit amount', 'Duration',
    'Housing_free', 'Housing_own',
    'Saving accounts_Null', 'Saving accounts_little',
    'Saving accounts_moderate',
    'Saving accounts_quite rich',
    'Checking account_Null',
    'Checking account_little',
    'Checking account_moderate',
    'Purpose_business',
    'Purpose_car',
    'Purpose_domestic appliances',
    'Purpose_education',
    'Purpose_furniture/equipment',
    'Purpose_radio/TV',
    'Purpose_repairs',
]

# Split with a fixed seed; the target is the binary 'Risk_Num' column.
X_train, X_test, y_train, y_test = train_test_split(
    credit_df[feature_cols],
    credit_df['Risk_Num'],
    random_state=0,
)

In [14]:
# Preview the first five rows of the training predictors.
X_train.head(n=5)

Unnamed: 0,Age,Sex_female,Job_0,Job_1,Job_2,Credit amount,Duration,Housing_free,Housing_own,Saving accounts_Null,...,Checking account_Null,Checking account_little,Checking account_moderate,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs
253,35,0,0,0,1,4151,24,0,1,0,...,1,0,0,0,0,0,0,1,0,0
667,27,1,0,0,1,3609,48,0,1,0,...,1,0,0,1,0,0,0,0,0,0
85,29,1,0,0,0,1412,12,0,1,0,...,1,0,0,1,0,0,0,0,0,0
969,40,0,0,1,0,3939,11,0,1,0,...,0,1,0,0,1,0,0,0,0,0
75,66,0,0,0,0,1526,12,1,0,0,...,0,1,0,0,1,0,0,0,0,0


In [15]:
# Fit a logistic regression with C=100.
# max_iter is raised above the library default of 100: the predictors are not
# scaled (e.g. 'Credit amount' sits next to 0/1 indicators), and on such data
# the default solver of current scikit-learn releases can stop before it
# converges and emit a ConvergenceWarning.
# NOTE(review): scaling the numeric predictors is the more thorough remedy —
# consider it if coefficients are going to be interpreted.
clf = LogisticRegression(C=100, max_iter=1000).fit(X_train, y_train)

# Report mean accuracy on the training and the held-out split.
for split_name, X_split, y_split in (('training', X_train, y_train),
                                     ('test', X_test, y_test)):
    print('Accuracy of Logistic regression classifier on {} set: {:.2f}'
          .format(split_name, clf.score(X_split, y_split)))

Accuracy of Logistic regression classifier on training set: 0.74
Accuracy of Logistic regression classifier on test set: 0.75


