# Machine Learning: Classification
### 작성: 고우주 | kubwa 쿱와

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## 1. 데이터 불러오기

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv('../dataset/spaceship-titanic.csv')
# Preview the first rows.
df.head()

## 2. 데이터 탐색

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics as reported by DataFrame.describe().
df.describe()

In [None]:
# Number of missing values per column before any preprocessing.
df.isnull().sum()

## 3. 데이터 전처리

In [None]:
# Seed passed to train_test_split so the split is repeatable.
RANDOM_STATE = 100
# NOTE(review): FOLDS is not referenced anywhere in the visible notebook.
FOLDS = 5
# Statistic SimpleImputer uses to fill missing numeric values.
STRATEGY = 'median'

In [None]:
# Remove the PassengerId column from the frame in place.
df.drop(columns=["PassengerId"], inplace=True)

# TARGET names the label column; FEATURES is every other column, in order.
TARGET = 'Transported'
FEATURES = [name for name in df if name != TARGET]

In [None]:
# Display the list of feature column names.
FEATURES

### 3.1 Numerical Features

#### 3.1.1 결측치 처리: Imputing Missing Values

In [None]:
from sklearn.impute import SimpleImputer

# Numeric columns whose missing values are replaced by the STRATEGY statistic.
imputer_cols = ["Age", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "RoomService"]

# Learn each column's fill value and apply it in one pass.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split, so test rows influence the fill values.
imputer = SimpleImputer(strategy=STRATEGY)
df[imputer_cols] = imputer.fit_transform(df[imputer_cols])

# Missing HomePlanet values get the placeholder category 'Z'.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: that chained in-place call acts on an intermediate object
# and does not update df when pandas Copy-on-Write is active.
df["HomePlanet"] = df["HomePlanet"].fillna('Z')

In [None]:
# Re-check missing values per column after imputation.
df.isnull().sum()

#### 3.1.2 TotalPayment 추출

In [None]:
# Row-wise total of the five spending columns, added left to right.
spend_cols = ["FoodCourt", "ShoppingMall", "Spa", "VRDeck", "RoomService"]
total = df[spend_cols[0]]
for name in spend_cols[1:]:
    total = total + df[name]
df['TotalPayment'] = total
df.head()

### 3.2 Categorical Features

#### 3.2.1 AgeGroup 추출

In [None]:
# Bucket Age into named groups; the first upper bound that matches wins.
age = df['Age']
age_conditions = [age <= 3, age <= 12, age <= 19, age <= 50]
age_labels = ['Baby', 'Child', 'Adolescent', 'Adult']
# Rows matching no condition fall through to the default label. The
# 'Eldery' spelling is kept as-is because the label later becomes a
# column name when AgeGroup is one-hot encoded.
df['AgeGroup'] = np.select(age_conditions, age_labels, default='Eldery')
df.head()

#### 3.2.2 Cabin > Deck, Num, Side 추출

In [None]:
# Cabin is a '/'-separated string; its three parts become separate columns.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[['Deck', 'Num', 'Side']] = cabin_parts
df.head()

#### 3.2.3 FamilyName 추출

In [None]:
# FamilyName is whatever follows the last space in Name.
name_tail = df["Name"].str.rsplit(' ', n=1)
df["FamilyName"] = name_tail.str[-1]
df.head()

In [None]:
# List the columns after feature extraction.
df.columns

### 3.3 Encoding Categorical Features

#### 3.3.1 Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to convert to integer codes. The list includes the target
# ("Transported") as well as "Cabin".
# NOTE(review): "Cabin" is encoded here but dropped again in a later cell.
label_cols = ["HomePlanet", "CryoSleep","Cabin", "Destination" ,"VIP", "Transported", "Deck", "Num", "Side", "FamilyName"]

def label_encoder(df, columns):
    """Replace each listed column with integer codes from a fresh LabelEncoder.

    Values are cast to ``str`` before encoding, so every column is encoded
    from its text representation. A new encoder is fitted per column and
    discarded afterwards.

    Args:
        df: DataFrame to encode; it is modified in place.
        columns: Iterable of column names to encode.

    Returns:
        The same DataFrame object, with the listed columns replaced.
    """
    for name in columns:
        as_text = df[name].astype(str)
        df[name] = LabelEncoder().fit_transform(as_text)
    return df

# Encode every column in label_cols and preview the result.
df = label_encoder(df ,label_cols)
df.head()

#### 3.3.2 One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns expanded into indicator (dummy) columns.
onehot_cols = ["AgeGroup"]

def one_hot_encoder(df, columns, *, prefix=False):
    """Replace each listed column with float indicator (dummy) columns.

    The first category of every encoded column is dropped
    (``drop_first=True``). The dummy columns are appended on the right and
    the source column is removed.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to encode.
        prefix: When True, dummy columns are named ``<column>_<category>``.
            The default (False) names them after the bare category value,
            which can produce duplicate column names if two encoded columns
            share a category label.

    Returns:
        A new DataFrame with the encoded columns.
    """
    for col in columns:
        dummies = pd.get_dummies(
            df[col],
            prefix=col if prefix else None,
            drop_first=True,
            dtype=float,
        )
        # Drop the source column before concatenating so a dummy that happens
        # to share its name is not removed with it; drop() and concat() both
        # return new frames, leaving the caller's DataFrame untouched.
        df = pd.concat([df.drop(columns=col), dummies], axis=1)
    return df

# One-hot encode the columns in onehot_cols and preview the result.
df = one_hot_encoder(df ,onehot_cols)
df.head()

In [None]:
# Name and Cabin have been split into derived columns above; remove them.
df.drop(columns=["Name", "Cabin"], inplace=True)
df.head()

### 3.4 전처리 완료 저장

In [None]:
# Save the preprocessed frame to CSV, without the index column.
df.to_csv('../dataset/spaceship-preprocessing.csv', index=False)

## 4. 상관관계

In [None]:
import seaborn as sns

# Pairwise column correlations drawn as an annotated 12x12-inch heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
plt.show()

## 5. Feature Selection: Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors.
y = df[TARGET]
X = df.drop(columns=TARGET)

# Keep 80% of the rows for training; the fixed seed makes the split repeatable.
split = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split

X_train.shape, X_test.shape

## 6. Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply the same scaling to
# both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## 7. Modeling

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, RocCurveDisplay

def plot_confusion_matrix(model, X_test, y_test):
    """Draw and show the confusion matrix of a fitted classifier.

    Args:
        model: Fitted estimator handed to
            ``ConfusionMatrixDisplay.from_estimator``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.
    """
    ConfusionMatrixDisplay.from_estimator(model,
                                          X_test, y_test,
                                          cmap=plt.cm.Blues)
    # Render immediately, as plot_roc_curve does, so the figure also appears
    # when this helper is called on its own.
    plt.show()
def plot_roc_curve(model, X_test, y_test):
    """Draw and show the ROC curve of a fitted classifier.

    Args:
        model: Fitted estimator handed to ``RocCurveDisplay.from_estimator``.
        X_test: Feature matrix to score.
        y_test: True labels for ``X_test``.
    """
    RocCurveDisplay.from_estimator(model, X_test, y_test)
    plt.show()
    
def evaluate(model, y_pred):
    """Show the confusion matrix, classification report and ROC curve.

    Reads the module-level ``X_test`` and ``y_test``; they are not passed in.

    Args:
        model: Fitted estimator to plot.
        y_pred: Predicted labels for ``X_test``, used for the text report.
    """
    plot_confusion_matrix(model, X_test, y_test)
    report = classification_report(y_test, y_pred)
    print(report)
    plot_roc_curve(model, X_test, y_test)

### 7.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings (fit returns the estimator),
# then predict labels for the held-out rows.
lgr = LogisticRegression().fit(X_train, y_train)
lgr_preds = lgr.predict(X_test)

In [None]:
# Predicted labels for the test rows.
lgr_preds

In [None]:
# Per-class probabilities for every test row.
lgr_pred_proba = lgr.predict_proba(X_test)
lgr_pred_proba

In [None]:
# Probabilities for the second-to-last test row.
lgr_pred_proba[-2]

In [None]:
# Index of the highest probability in that row.
lgr_pred_proba[-2].argmax()

In [None]:
# Predicted labels again, for comparison with the probabilities above.
lgr_preds

In [None]:
# Confusion matrix, classification report and ROC curve for logistic regression.
evaluate(lgr, lgr_preds)

### 7.2 RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest with the notebook-wide RANDOM_STATE. Tree construction is
# randomized, so an unseeded forest gives different scores on every run.
rf = RandomForestClassifier(n_estimators=1000,
                            max_leaf_nodes=16,
                            n_jobs=-1,
                            random_state=RANDOM_STATE)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

In [None]:
# Confusion matrix, classification report and ROC curve for the random forest.
evaluate(rf, rf_preds)

### 7.3 XGBoost

In [None]:
#!pip install xgboost

In [None]:
from xgboost import XGBClassifier

# Fit XGBoost with default hyperparameters (fit returns the estimator), then
# predict labels for the held-out rows.
xgb = XGBClassifier().fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

In [None]:
# Confusion matrix, classification report and ROC curve for XGBoost.
evaluate(xgb, xgb_preds)

### 7.4 CatBoost

In [None]:
#%pip install catboost

In [None]:
from catboost import CatBoostClassifier

# Fit CatBoost with default hyperparameters and verbose=0 (fit returns the
# estimator), then predict labels for the held-out rows.
cbc = CatBoostClassifier().fit(X_train, y_train, verbose=0)
cbc_preds = cbc.predict(X_test)

In [None]:
# Confusion matrix, classification report and ROC curve for CatBoost.
evaluate(cbc, cbc_preds)

### 7.5 MLP (Multi-Layer Perceptron)
> `MLPClassifier`(hidden_layer_sizes=(100,), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)

In [None]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 512 units. With early stopping, 20% of the training
# rows are held out for validation.
mlp_params = {
    "hidden_layer_sizes": (512,),
    "max_iter": 300,
    "early_stopping": True,
    "validation_fraction": 0.2,
    "random_state": 1,
}
mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train, y_train)
mlp_preds = mlp.predict(X_test)

In [None]:
# Confusion matrix, classification report and ROC curve for the MLP.
evaluate(mlp, mlp_preds)