In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
device=('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
df = pd.read_csv('data/train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:

df.drop(['PassengerId','Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df=pd.get_dummies(df) #one-hot encoding 방식으로 펼쳐주니까 col개수 증가

In [5]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
imputed = imputer.fit_transform(df)
df_imputed=pd.DataFrame(imputed, columns = df.columns)
df_imputed

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.9250,1.0,0.0
3,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,0.0
4,0.0,3.0,35.0,0.0,0.0,8.0500,0.0,1.0
...,...,...,...,...,...,...,...,...
886,0.0,2.0,27.0,0.0,0.0,13.0000,0.0,1.0
887,1.0,1.0,19.0,0.0,0.0,30.0000,1.0,0.0
888,0.0,3.0,26.8,1.0,2.0,23.4500,1.0,0.0
889,1.0,1.0,26.0,0.0,0.0,30.0000,0.0,1.0


In [6]:
X=df_imputed.iloc[:,1:]
y=df_imputed.iloc[:, 0]

In [7]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,3.0,22.0,1.0,0.0,7.2500,0.0,1.0
1,1.0,38.0,1.0,0.0,71.2833,1.0,0.0
2,3.0,26.0,0.0,0.0,7.9250,1.0,0.0
3,1.0,35.0,1.0,0.0,53.1000,1.0,0.0
4,3.0,35.0,0.0,0.0,8.0500,0.0,1.0
...,...,...,...,...,...,...,...
886,2.0,27.0,0.0,0.0,13.0000,0.0,1.0
887,1.0,19.0,0.0,0.0,30.0000,1.0,0.0
888,3.0,26.8,1.0,2.0,23.4500,1.0,0.0
889,1.0,26.0,0.0,0.0,30.0000,0.0,1.0


In [8]:
y

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [9]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, shuffle=True)

In [10]:
def df_to_tensor(df):
    return torch.from_numpy(df.values).float().to(device)

In [11]:
X_train=df_to_tensor(X_train)
X_test=df_to_tensor(X_test)
y_train=df_to_tensor(y_train)
y_test=df_to_tensor(y_test)

In [12]:
class Mineral(nn.Module):
    def __init__(self):
        super(Mineral, self).__init__()
        
        self.hidden_linear1=nn.Linear(7, 30)
        self.hidden_linear2=nn.Linear(30, 12)
        self.hidden_linear3=nn.Linear(12, 8)
        self.output_linear=nn.Linear(8, 1)
    
    def forward(self, input):
        out = torch.relu(self.hidden_linear1(input))
        out = torch.relu(self.hidden_linear2(out))
        out = torch.relu(self.hidden_linear3(out))
        out = torch.sigmoid(self.output_linear(out))
        
        return out

In [13]:
model=Mineral().to(device)
model

Mineral(
  (hidden_linear1): Linear(in_features=7, out_features=30, bias=True)
  (hidden_linear2): Linear(in_features=30, out_features=12, bias=True)
  (hidden_linear3): Linear(in_features=12, out_features=8, bias=True)
  (output_linear): Linear(in_features=8, out_features=1, bias=True)
)

In [14]:
class EarlyStopping:
    def __init__(self, patience=3, delta=0.0, mode='min', verbose=True):
        """
        patience (int): loss or score가 개선된 후 기다리는 기간. default: 3
        delta  (float): 개선시 인정되는 최소 변화 수치. default: 0.0
        mode     (str): 개선시 최소/최대값 기준 선정('min' or 'max'). default: 'min'.-- loss 가 min
        verbose (bool): 메시지 출력. default: True
        """
        self.early_stop = False
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        
        self.best_score = np.Inf if mode == 'min' else 0
        self.mode = mode
        self.delta = delta
        

    def __call__(self, score):
        
        #__init__ 메서드는 클래스의 인스턴스를 초기화하기 위해 사용됩니다. 이는 새로운 객체, 즉 인스턴스를 생성할 때 호출되는 메서드입니다. 반면에,
        #__call__ 메서드는 인스턴스를 함수처럼 호출 가능하게 만듭니다.
        #기억해야 할 것은 __init__은 객체가 생성될 때 한 번 호출된다는 것입니다. 그러나 __call__은 인스턴스가 호출될 때마다 여러 번 호출될 수 있습니다.

        if self.best_score is None: # X 왜냐면 infinity or 0
            self.best_score = score
            self.counter = 0 #patience counting 
        elif self.mode == 'min':
            if score < (self.best_score - self.delta): #loss < infinity - 0
                self.counter = 0
                self.best_score = score
                if self.verbose: #verbose= True
                    print(f'[EarlyStopping] (Update) Best Score: {self.best_score:.5f}')
            else: #loss 값이 작아지지 않을 경우
                self.counter += 1 #patience 증가
                # if self.verbose:
                #     print(f'[EarlyStopping] (Patience) {self.counter}/{self.patience}, ' \
                #           f'Best: {self.best_score:.5f}' \
                #           f', Current: {score:.5f}, Delta: {np.abs(self.best_score - score):.5f}')
                
        elif self.mode == 'max': #accuracy 셀 때 max
            if score > (self.best_score + self.delta): #loss < 0+ 0
                self.counter = 0
                self.best_score = score
                if self.verbose:
                    print(f'[EarlyStopping] (Update) Best Score: {self.best_score:.5f}')
            else:
                self.counter += 1
                # if self.verbose:
                #     print(f'[EarlyStopping] (Patience) {self.counter}/{self.patience}, ' \
                #           f'Best: {self.best_score:.5f}' \
                #           f', Current: {score:.5f}, Delta: {np.abs(self.best_score - score):.5f}')
                
            
        if self.counter >= self.patience:
            if self.verbose:
                print(f'[EarlyStop Triggered] Best Score: {self.best_score:.5f}')
            # Early Stop
            self.early_stop = True
        else:
            # Continue
            self.early_stop = False

In [15]:
def model_validation(model, dataloader):
    correct = 0
    with torch.no_grad():
        for data, label in dataloader:
            pred = model(data.to(device))
            result = pred.squeeze(1).ge(torch.tensor(0.5).to(device))
            correct += result.long().eq(label.to(device)).sum().item()
    return (correct / len(dataloader.dataset))

In [16]:
#model 전체 저장
torch.save(model.state_dict(), 'model/titanic.pth')

In [17]:
def model_check_point(loss, model_path, boundary):
    if loss.item() < boundary: #boundary: 이전 loss 값
        torch.save( model.state_dict(), model_path) #state_dict 는 간단히 말해 각 계층을 매개변수 텐서로 매핑되는 Python 사전(dict) 객체 -- 가중치와 bias parameter
        return loss.item()
    else:
        return boundary

In [18]:
early_stopping=EarlyStopping(patience=3, delta=0, mode='min', verbose=True) #default 값이니까 굳이 parameter 넣어주지 않아도 됨

In [19]:
batch_size=300
ds=TensorDataset(X_train, y_train)
dataloader=DataLoader(ds, batch_size=batch_size)

val_ds=TensorDataset(X_test, y_test)
val_loader=DataLoader(val_ds)

optimizer=optim.Adam(model.parameters())

loss_fn=nn.BCELoss()

n_epochs=2000
loss=0.0

save_loss=1.0

list_accuracy=[]
list_loss=[]

for epoch in range(n_epochs+1):
    model.train()
    for data, label in dataloader:
        #data = data.type(torch.FloatTensor)
        out = model(data.to(device))
        loss = loss_fn(out, label.unsqueeze(1).float().to(device))
                
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    accuracy = model_validation(model, val_loader)
    
    model_path = "model/titanic.pth"
    
    save_loss = model_check_point(loss, model_path, save_loss) #저장된 손실 return
    
    list_loss.append(loss.detach().cpu()) #loss가 gpu에 있을 때 cpu로 옮김 (.detach()) 역전파 사슬을 끊겠다 -- 끊은 상태에서만 옮겨짐 
    list_accuracy.append(accuracy)
    
    #if epoch % 20 == 0:
    #        print(f"Epoch {epoch}, Training loss {loss:.4f},")
    #        print("Validation Accuracy: %f" % accuracy)
            
    early_stopping(loss.item()) #__call__
    if early_stopping.early_stop:
        break

[EarlyStopping] (Update) Best Score: 0.71407
[EarlyStopping] (Update) Best Score: 0.70122
[EarlyStopping] (Update) Best Score: 0.68957
[EarlyStopping] (Update) Best Score: 0.67930
[EarlyStopping] (Update) Best Score: 0.67256
[EarlyStopping] (Update) Best Score: 0.66683
[EarlyStopping] (Update) Best Score: 0.66086
[EarlyStopping] (Update) Best Score: 0.65498
[EarlyStopping] (Update) Best Score: 0.64861
[EarlyStopping] (Update) Best Score: 0.64133
[EarlyStopping] (Update) Best Score: 0.63428
[EarlyStopping] (Update) Best Score: 0.62721
[EarlyStopping] (Update) Best Score: 0.62006
[EarlyStopping] (Update) Best Score: 0.61327
[EarlyStopping] (Update) Best Score: 0.60676
[EarlyStopping] (Update) Best Score: 0.60090
[EarlyStopping] (Update) Best Score: 0.59572
[EarlyStopping] (Update) Best Score: 0.59115
[EarlyStopping] (Update) Best Score: 0.58739
[EarlyStopping] (Update) Best Score: 0.58395
[EarlyStopping] (Update) Best Score: 0.58113
[EarlyStopping] (Update) Best Score: 0.57898
[EarlyStop

In [20]:
model.load_state_dict(torch.load('model/titanic.pth', map_location=device))
test_ds=TensorDataset(X_test, y_test)
test_loader=DataLoader(test_ds, batch_size=batch_size)

correct=0

with torch.no_grad():
    for data, label in test_loader:
        pred=model(data.to(device))
        result=pred.squeeze(1).ge(torch.tensor(0.5).to(device)) #1차원으로 변경, greater than equal, batch size 만큼의 TF
        correct+=result.long().eq(label.to(device)).sum().item() #T=1, F=0
        
print("Accuracy: %.4f" % (correct / len(test_loader.dataset)))

Accuracy: 0.8101
