### 지도학습

#### 분류 - 이진분류

In [1]:
import pandas as pd

In [2]:
df_TFD = pd.read_csv('../../../datasets/TitanicFromDisaster_train.csv')
df_TFD[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 전처리

In [3]:
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
df_TFD_extract = df_TFD[['Survived', 'Pclass', 'Age']]
df_TFD_extract.isnull().sum()

Survived      0
Pclass        0
Age         177
dtype: int64

In [5]:
df_TFD_extract_preprocess = df_TFD_extract.dropna()
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0


#### 정형화 단계

In [6]:
from sklearn.model_selection import train_test_split
target = df_TFD_extract_preprocess['Survived']
features = df_TFD_extract_preprocess[['Pclass', 'Age']]

In [7]:
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=111)
features_train.shape, target_train.shape, features_test.shape, target_test.shape

((535, 2), (535,), (179, 2), (179,))

#### 모델학습

In [8]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(features_train, target_train)

LogisticRegression()

In [9]:
model.coef_, model.intercept_

(array([[-1.25703902, -0.04521969]]), array([3.78621796]))

#### 예측

In [10]:
df_TFD_extract_preprocess[10:15] # index가 features_train 같다.

Unnamed: 0,Survived,Pclass,Age
11,1,1,58.0
12,0,3,20.0
13,0,3,39.0
14,0,3,14.0
15,1,2,55.0


In [11]:
model.predict(features_train[10:15])

array([0, 0, 0, 0, 0], dtype=int64)

In [12]:
model.predict_proba(features_train[10:15])

array([[0.54134981, 0.45865019],
       [0.5967343 , 0.4032657 ],
       [0.76144372, 0.23855628],
       [0.52337371, 0.47662629],
       [0.74462956, 0.25537044]])

#### 평가

In [13]:
target_train_predict = model.predict(features_train)
target_train_predict.shape # target_train.shape 동일

(535,)

In [14]:
from sklearn.metrics import accuracy_score # 정확도

In [15]:
accuracy_score(target_train, target_train_predict) # 교내 시험

0.7065420560747664

In [16]:
target_test_perdict = model.predict(features_test)
target_test_perdict.shape # target_test.shape 동일

(179,)

In [17]:
accuracy_score(target_test, target_test_perdict) # 교외 시험

0.659217877094972

In [18]:
from sklearn.metrics import classification_report

In [19]:
print(classification_report(target_train, target_train_predict))

              precision    recall  f1-score   support

           0       0.71      0.83      0.77       312
           1       0.69      0.54      0.60       223

    accuracy                           0.71       535
   macro avg       0.70      0.68      0.69       535
weighted avg       0.70      0.71      0.70       535



In [20]:
print(classification_report(target_test, target_test_perdict))

              precision    recall  f1-score   support

           0       0.73      0.72      0.73       112
           1       0.54      0.55      0.55        67

    accuracy                           0.66       179
   macro avg       0.64      0.64      0.64       179
weighted avg       0.66      0.66      0.66       179



#### 오차 행렬

In [24]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [23]:
confusion_matrix(target_train, target_train_predict)

array([[258,  54],
       [103, 120]], dtype=int64)

In [25]:
precision_score(target_train, target_train_predict)

0.6896551724137931

In [26]:
recall_score(target_train, target_train_predict)

0.5381165919282511

In [27]:
f1_score(target_train, target_train_predict)

0.6045340050377833