### 지도학습

#### 분류 - 이진분류

In [1]:
import pandas as pd

In [2]:
df_TFD = pd.read_csv('../../../datasets/TitanicFromDisaster_train.csv')
df_TFD[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 전처리

In [3]:
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
df_TFD_extract = df_TFD[['Survived', 'Pclass', 'Age']]
df_TFD_extract.isnull().sum()

Survived      0
Pclass        0
Age         177
dtype: int64

In [5]:
df_TFD_extract_preprocess = df_TFD_extract.dropna()
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0


In [6]:
df_TFD_extract_preprocess.shape

(714, 3)

#### Scaling & Encoding

In [7]:
##### Encoding with OneHotEncoding

In [8]:
df_TFD_extract_preprocess['Pclass'].value_counts()

3    355
1    186
2    173
Name: Pclass, dtype: int64

In [9]:
df_TFD_extract_preprocess.shape

(714, 3)

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
oneHotEncoder = OneHotEncoder()
oneHotEncoder.fit(df_TFD_extract_preprocess[['Pclass']])  # 해당 항목 학습한 것임.

OneHotEncoder()

In [12]:
columns_name = oneHotEncoder.categories_

In [13]:
# oneHotEncoder.transform(df_TFD_extract_preprocess[['Pclass']]).toarray() # 실제값 확인용
encoded_data = oneHotEncoder.transform(df_TFD_extract_preprocess[['Pclass']]).toarray()

In [14]:
encoded_data.shape

(714, 3)

In [15]:
# 병합 위해 numpy array to DataFrame
df_encoded_data = pd.DataFrame(data=encoded_data, columns=oneHotEncoder.get_feature_names_out(['Pclass']))
df_encoded_data[:2]

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0.0,0.0,1.0
1,1.0,0.0,0.0


In [16]:
df_encoded_data.index, df_encoded_data.shape

(RangeIndex(start=0, stop=714, step=1), (714, 3))

In [17]:
df_TFD_extract_preprocess.index, df_TFD_extract_preprocess.shape

(Int64Index([  0,   1,   2,   3,   4,   6,   7,   8,   9,  10,
             ...
             880, 881, 882, 883, 884, 885, 886, 887, 889, 890],
            dtype='int64', length=714),
 (714, 3))

In [18]:
df_TFD_extract_preprocess.isnull().sum()

Survived    0
Pclass      0
Age         0
dtype: int64

In [19]:
# df_encoded_data = pd.get_dummies(df_TFD_extract_preprocess['Pclass'], prefix='Pclass')

In [20]:
df_TFD_extract_preprocess = pd.concat([df_TFD_extract_preprocess, df_encoded_data], axis=1)
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age,Pclass_1,Pclass_2,Pclass_3
0,0.0,3.0,22.0,0.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,0.0


In [21]:
df_TFD_extract_preprocess.columns

Index(['Survived', 'Pclass', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

In [22]:
target = df_TFD_extract_preprocess['Survived']

In [23]:
features = df_TFD_extract_preprocess.drop(columns=['Survived', 'Pclass',])

In [24]:
features.columns

Index(['Age', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

#### MinMaxScaler

In [25]:
from sklearn.preprocessing import MinMaxScaler

In [26]:
minMaxScaler = MinMaxScaler() # 인스턴스화
features = minMaxScaler.fit_transform(features)
features.shape

(861, 4)

#### 정형화 단계

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=111)
features_train.shape, target_train.shape, features_test.shape, target_test.shape

((645, 4), (645,), (216, 4), (216,))

#### 모델학습

In [29]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(features_train, target_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
model.coef_, model.intercept_
# (array([[-1.25703902, -0.04521969]]), array([3.78621796])) 정규화 이전

#### 예측

In [None]:
df_TFD_extract_preprocess[10:15] # index가 features_train 같다.

In [None]:
model.predict(features_train[10:15])

In [None]:
model.predict_proba(features_train[10:15])

#### 평가

In [None]:
target_train_predict = model.predict(features_train)
target_train_predict.shape # target_train.shape 동일

In [None]:
from sklearn.metrics import accuracy_score # 정확도

In [None]:
accuracy_score(target_train, target_train_predict) # 교내 시험
# 0.7065420560747664 정규화 이전

In [None]:
target_test_perdict = model.predict(features_test)
target_test_perdict.shape # target_test.shape 동일

In [None]:
accuracy_score(target_test, target_test_perdict) # 교외 시험
# 0.659217877094972 정규화 이전

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(target_train, target_train_predict)) 

In [None]:
# 정규화 이전
# print(classification_report(target_test, target_test_perdict))

In [None]:
print(classification_report(target_test, target_test_perdict))

#### 오차 행렬

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [None]:
confusion_matrix(target_train, target_train_predict)

In [None]:
precision_score(target_train, target_train_predict)

In [None]:
recall_score(target_train, target_train_predict)

In [None]:
f1_score(target_train, target_train_predict)