### 지도학습

#### 분류 - 이진분류

In [2]:
import pandas as pd

In [3]:
# Load the Titanic training data and preview the first two rows.
df_TFD = pd.read_csv('../../../datasets/TitanicFromDisaster_train.csv')
df_TFD.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 전처리

In [4]:
# Inspect the column names available in the loaded frame.
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Keep only the target (Survived) and the two columns used as features,
# then count missing values per column.
selected_columns = ['Survived', 'Pclass', 'Age']
df_TFD_extract = df_TFD[selected_columns]
df_TFD_extract.isna().sum()

Survived      0
Pclass        0
Age         177
dtype: int64

In [6]:
# Drop every row containing a missing value (how='any' is the dropna default)
# and preview the first two remaining rows.
df_TFD_extract_preprocess = df_TFD_extract.dropna(how='any')
df_TFD_extract_preprocess.head(2)

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0


In [7]:
# Row/column count after dropping rows with missing values.
df_TFD_extract_preprocess.shape

(714, 3)

#### Scaling & Encoding

In [8]:
##### Encoding with OneHotEncoding

In [9]:
# Frequency of each Pclass value (the column that will be one-hot encoded).
df_TFD_extract_preprocess['Pclass'].value_counts()

3    355
1    186
2    173
Name: Pclass, dtype: int64

In [10]:
# Shape of the frame before any encoded columns are added.
df_TFD_extract_preprocess.shape

(714, 3)

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Create the encoder and fit it on the Pclass column (passed as a one-column
# DataFrame) so it learns the categories present in the data.
oneHotEncoder = OneHotEncoder()
oneHotEncoder.fit(df_TFD_extract_preprocess[['Pclass']])  # learns the categories of this column

OneHotEncoder()

In [13]:
# Categories learned by the encoder during fit.
# NOTE(review): columns_name is not referenced by any later cell visible here.
columns_name = oneHotEncoder.categories_

In [14]:
# oneHotEncoder.transform(df_TFD_extract_preprocess[['Pclass']]).toarray() # for inspecting the actual values
# Encode Pclass with the fitted encoder and convert the result to an array.
encoded_data = oneHotEncoder.transform(df_TFD_extract_preprocess[['Pclass']]).toarray()

In [15]:
# Shape of the encoded array: one row per sample, one column per Pclass category.
encoded_data.shape

(714, 3)

In [16]:
# Wrap the encoded NumPy array in a DataFrame so it can be merged with the
# preprocessed frame; column names come from the fitted encoder.
encoded_column_names = oneHotEncoder.get_feature_names_out(['Pclass'])
df_encoded_data = pd.DataFrame(encoded_data, columns=encoded_column_names)
df_encoded_data.head(2)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0.0,0.0,1.0
1,1.0,0.0,0.0


In [17]:
# Index and shape of the encoded frame, checked before merging it with the
# preprocessed frame.
df_encoded_data.index, df_encoded_data.shape

(RangeIndex(start=0, stop=714, step=1), (714, 3))

In [18]:
# Index and shape of the preprocessed frame. Its index keeps the original row
# labels (rows removed by dropna leave gaps), so it must be compared with the
# encoded frame's index before concatenating.
df_TFD_extract_preprocess.index, df_TFD_extract_preprocess.shape

(Int64Index([  0,   1,   2,   3,   4,   6,   7,   8,   9,  10,
             ...
             880, 881, 882, 883, 884, 885, 886, 887, 889, 890],
            dtype='int64', length=714),
 (714, 3))

In [19]:
# Confirm that no missing values remain in any column.
df_TFD_extract_preprocess.isnull().sum()

Survived    0
Pclass      0
Age         0
dtype: int64

In [20]:
# Disabled alternative: build the dummy columns with pandas instead of OneHotEncoder.
# df_encoded_data = pd.get_dummies(df_TFD_extract_preprocess['Pclass'], prefix='Pclass')

In [21]:
# Merge the one-hot columns onto the preprocessed frame.
# Both indexes are reset so rows are aligned by position: the preprocessed
# frame still carries the original row labels (with gaps left by dropna),
# while the encoded frame has a fresh RangeIndex.
# Encoded columns that are already present are dropped first, so re-running
# this cell does not append duplicate Pclass_* columns.
df_TFD_extract_preprocess = pd.concat(
    [
        df_TFD_extract_preprocess
        .drop(columns=df_encoded_data.columns, errors='ignore')
        .reset_index(drop=True),
        df_encoded_data.reset_index(drop=True),
    ],
    axis=1,
)
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age,Pclass_1,Pclass_2,Pclass_3
0,0,3,22.0,0.0,0.0,1.0
1,1,1,38.0,1.0,0.0,0.0


In [22]:
# Shape after appending the one-hot encoded columns.
df_TFD_extract_preprocess.shape

(714, 6)

In [21]:
# Column names after appending the one-hot encoded columns.
df_TFD_extract_preprocess.columns

Index(['Survived', 'Pclass', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

In [23]:
# Target variable: the Survived column.
target = df_TFD_extract_preprocess['Survived']

In [24]:
# Feature matrix: everything except the target and the raw Pclass column
# (Pclass is represented by its one-hot columns instead).
features = df_TFD_extract_preprocess.drop(columns=['Survived', 'Pclass',])

In [25]:
# Feature column order; inputs built by hand for prediction must follow it.
features.columns

Index(['Age', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

#### MinMaxScaler

In [26]:
from sklearn.preprocessing import MinMaxScaler

In [27]:
minMaxScaler = MinMaxScaler() # instantiate the scaler
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split performed in a later cell, so test rows influence the scaling
# parameters. Consider fitting on the training split only.
# `features` is rebound to the scaled result returned by fit_transform.
features = minMaxScaler.fit_transform(features)
features.shape

(714, 4)

#### 정형화 단계

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Split features and target into train and test sets with a fixed random_state
# so the split is reproducible; no test_size is passed, so the library default
# ratio applies.
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=111)
features_train.shape, target_train.shape, features_test.shape, target_test.shape

((535, 4), (535,), (179, 4), (179,))

#### 모델학습

In [30]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier (default hyperparameters) on the
# training split.
model = LogisticRegression()
model.fit(features_train, target_train)

LogisticRegression()

In [31]:
# Learned coefficients (one per feature column) and the intercept.
model.coef_, model.intercept_
# (array([[-1.25703902, -0.04521969]]), array([3.78621796])) before scaling

(array([[-2.59255724,  1.11141847,  0.02997489, -1.14146766]]),
 array([0.86447724]))

#### 예측

In [32]:
# NOTE(review): these rows do NOT line up with features_train[10:15].
# train_test_split shuffles rows by default, so position 10 of features_train
# is not row 10 of this frame (the original comment claimed the indexes match).
df_TFD_extract_preprocess[10:15]

Unnamed: 0,Survived,Pclass,Age,Pclass_1,Pclass_2,Pclass_3
10,1,1,58.0,1.0,0.0,0.0
11,0,3,20.0,0.0,0.0,1.0
12,0,3,39.0,0.0,0.0,1.0
13,0,3,14.0,0.0,0.0,1.0
14,1,2,55.0,0.0,1.0,0.0


In [33]:
# Predicted class labels for the training samples at positions 10-14.
model.predict(features_train[10:15])

array([0, 0, 0, 1, 0], dtype=int64)

In [34]:
# Per-class probabilities for the training samples at positions 10-14
# (one row per sample, one column per class).
model.predict_proba(features_train[10:15])

array([[0.59715375, 0.40284625],
       [0.63564456, 0.36435544],
       [0.75219168, 0.24780832],
       [0.47500723, 0.52499277],
       [0.73984819, 0.26015181]])

#### 평가

In [35]:
# Predictions on the full training split, used for the training metrics below.
target_train_predict = model.predict(features_train)
target_train_predict.shape # same as target_train.shape

(535,)

In [36]:
from sklearn.metrics import accuracy_score # 정확도

In [37]:
accuracy_score(target_train, target_train_predict) # accuracy on the training set (the "in-school exam")
# 0.7065420560747664 before scaling

0.708411214953271

In [38]:
# Predictions on the held-out test split.
# NOTE(review): "perdict" is a misspelling of "predict"; the name is kept
# because later cells reference it.
target_test_perdict = model.predict(features_test)
target_test_perdict.shape # same as target_test.shape

(179,)

In [39]:
accuracy_score(target_test, target_test_perdict) # accuracy on the test set (the "external exam")
# 0.659217877094972 before scaling

0.6480446927374302

In [40]:
from sklearn.metrics import classification_report

In [41]:
# Per-class precision/recall/F1 on the training split.
print(classification_report(target_train, target_train_predict)) 

              precision    recall  f1-score   support

           0       0.72      0.82      0.77       312
           1       0.69      0.55      0.61       223

    accuracy                           0.71       535
   macro avg       0.70      0.69      0.69       535
weighted avg       0.71      0.71      0.70       535



In [42]:
# Before scaling (kept for reference, disabled)
# print(classification_report(target_test, target_test_perdict))

In [43]:
# Per-class precision/recall/F1 on the test split.
print(classification_report(target_test, target_test_perdict))

              precision    recall  f1-score   support

           0       0.72      0.71      0.71       112
           1       0.53      0.55      0.54        67

    accuracy                           0.65       179
   macro avg       0.63      0.63      0.63       179
weighted avg       0.65      0.65      0.65       179



#### 오차 행렬

In [44]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [45]:
# Confusion matrix on the training split (rows: true class, columns: predicted class).
confusion_matrix(target_train, target_train_predict)

array([[257,  55],
       [101, 122]], dtype=int64)

In [46]:
# Precision for the positive class (Survived == 1) on the training split.
precision_score(target_train, target_train_predict)

0.6892655367231638

In [47]:
# Recall for the positive class (Survived == 1) on the training split.
recall_score(target_train, target_train_predict)

0.547085201793722

In [48]:
# F1 score for the positive class (Survived == 1) on the training split.
f1_score(target_train, target_train_predict)

0.6100000000000001

#### 서비스

In [49]:
# Look at a few raw rows to pick example values for a prediction request.
df_TFD_extract[100:103]

Unnamed: 0,Survived,Pclass,Age
100,0,3,28.0
101,0,3,
102,0,1,21.0


In [53]:
# Input values for the passenger to predict: Pclass 3, Age 28.0.
# The encoder was fitted on a DataFrame with a 'Pclass' column, so the input is
# passed as a DataFrame with the same column name rather than a bare nested
# list, which makes scikit-learn warn about missing feature names.
encoder_pclass_ = oneHotEncoder.transform(pd.DataFrame({'Pclass': [3]})).toarray()
encoder_pclass_, encoder_pclass_.flatten()



(array([[0., 0., 1.]]), array([0., 0., 1.]))

In [55]:
# Build the flat feature vector in training column order:
# [Age, Pclass_1, Pclass_2, Pclass_3]
import numpy as np
age_part = [28.0]
pclass_part = encoder_pclass_.ravel()
inputs_data = np.concatenate((age_part, pclass_part))
inputs_data

array([28.,  0.,  0.,  1.])

In [57]:
# The model was trained on MinMax-scaled features, so the raw input (Age in
# years) must go through the same fitted scaler before prediction; feeding the
# unscaled vector puts Age on a different scale from the training data.
model.predict(minMaxScaler.transform([inputs_data]))  # predicted target value

array([0], dtype=int64)