In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



In [2]:
df = pd.read_csv('c://bd092_data/BankPersonalLoan.csv')
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [4]:
df.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [5]:
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [6]:
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [7]:
df.drop(['ID','Experience', 'ZIP Code'], axis=1, inplace=True)

In [9]:
df.head()
df.describe()

Unnamed: 0,Age,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,45.3384,73.7742,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,11.463166,46.033729,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,23.0,8.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35.0,39.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,45.0,64.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,55.0,98.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,67.0,224.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 5000 non-null   int64  
 1   Income              5000 non-null   int64  
 2   Family              5000 non-null   int64  
 3   CCAvg               5000 non-null   float64
 4   Education           5000 non-null   int64  
 5   Mortgage            5000 non-null   int64  
 6   Personal Loan       5000 non-null   int64  
 7   Securities Account  5000 non-null   int64  
 8   CD Account          5000 non-null   int64  
 9   Online              5000 non-null   int64  
 10  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(10)
memory usage: 429.8 KB


In [14]:
x = df.drop('Personal Loan', axis=1)
y = df['Personal Loan']
x.head()
import statsmodels.api as sm
x1 = sm.add_constant(x, has_constant='add')


In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1, stratify=y)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
stdsc = StandardScaler()

x_train.iloc[:,:] = stdsc.fit_transform(x_train.iloc[:,:])
x_test.iloc[:,:] = stdsc.transform(x_test.iloc[:,:])


(3500, 10) (1500, 10) (3500,) (1500,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
logistic_m = LogisticRegression(solver='liblinear', penalty='l2', C=0.001, random_state=1 ) # 모형 출력
logistic_m.fit(x_train, y_train)

LogisticRegression(C=0.001, random_state=1, solver='liblinear')

In [24]:
y_pred = logistic_m.predict(x_test)
logistic_m.score(x_test, y_test)

0.9326666666666666

In [27]:
y_pred_p = logistic_m.predict_proba(x_test)
y_pred_p
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [30]:
from sklearn.metrics import confusion_matrix, classification_report
confmat = pd.DataFrame(confusion_matrix(y_test, y_pred))
confmat

Unnamed: 0,0,1
0,1337,19
1,82,62


In [31]:
print('오분류 갯수 : %d ' %(y_test!=y_pred).sum())
print('정확도 : %.3f' %accuracy_score(y_test, y_pred))
print('f1 점수 : %.3f' %f1_score(y_test, y_pred))

오분류 갯수 : 101 
정확도 : 0.933
f1 점수 : 0.551


In [35]:
# 최적화 하기 
from sklearn.pipeline import make_pipeline, Pipeline
model1 = make_pipeline(LogisticRegression())
model1.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'logisticregression', 'logisticregression__C', 'logisticregression__class_weight', 'logisticregression__dual', 'logisticregression__fit_intercept', 'logisticregression__intercept_scaling', 'logisticregression__l1_ratio', 'logisticregression__max_iter', 'logisticregression__multi_class', 'logisticregression__n_jobs', 'logisticregression__penalty', 'logisticregression__random_state', 'logisticregression__solver', 'logisticregression__tol', 'logisticregression__verbose', 'logisticregression__warm_start'])

In [37]:
from sklearn.model_selection import GridSearchCV

param_range1 = ['l1', 'l2', 'elasticnet', 'none']
param_range2 = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
param_range3 = range(1,6)

param_grid = [{'logisticregression__penalty' : param_range1, 
              'logisticregression__solver' : param_range2, 
              'logisticregression__C' : param_range3}]
clf_cv = GridSearchCV(estimator=model1, 
                     param_grid=param_grid, 
                     scoring='accuracy',
                     n_jobs=-1,
                     cv=10)
clf_cv.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('logisticregression',
                                        LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'logisticregression__C': range(1, 6),
                          'logisticregression__penalty': ['l1', 'l2',
                                                          'elasticnet',
                                                          'none'],
                          'logisticregression__solver': ['newton-cg', 'lbfgs',
                                                         'liblinear', 'sag',
                                                         'saga']}],
             scoring='accuracy')

In [40]:
print(clf_cv.best_score_)
print(clf_cv.best_params_)

0.9522857142857143
{'logisticregression__C': 1, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}


In [41]:
logistic_m_new = LogisticRegression(solver='newton-cg', penalty='l2', C=1, random_state=1)
logistic_m_new.fit(x_train, y_train)

LogisticRegression(C=1, random_state=1, solver='newton-cg')

In [43]:
y_pred = logistic_m_new.predict(x_test)
y_pred
logistic_m_new.score(x_test, y_test)

0.9473333333333334

In [44]:
print('오분류 갯수 : %d' %(y_pred!=y_test).sum())
print('정확도 : %.3f' %accuracy_score(y_test, y_pred))
print('f1점수 : %.3f' %f1_score(y_test, y_pred))

오분류 갯수 : 79
정확도 : 0.947
f1점수 : 0.683
