Logistic Regression

In [73]:
# 이진 분류로 진행, Logistic Regression
# 이진 분류를 위한 활성화 함수 sigmoid 함수 사용
# 최적의 W, b를 찾기 위한 함수 cost 함수 사용
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [74]:
# Fix the global PyTorch RNG seed so weight initialization is reproducible
torch.manual_seed(seed=1)

<torch._C.Generator at 0x1c54862e7d0>

In [75]:
# Load the white-wine quality data set (semicolon-separated CSV)
import numpy as np
import pandas as pd

dataDF = pd.read_csv('winequality-white.csv', sep=';')

# info() prints its report and returns None, so call it as a statement
# instead of packing it into a tuple (which displayed as "(None, ...)" and
# forced describe() into a plain-text repr).
dataDF.info()

# Summary statistics; as the last expression it is rendered as a rich table.
dataDF.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


(None,
        fixed acidity  volatile acidity  citric acid  residual sugar  \
 count    4898.000000       4898.000000  4898.000000     4898.000000   
 mean        6.854788          0.278241     0.334192        6.391415   
 std         0.843868          0.100795     0.121020        5.072058   
 min         3.800000          0.080000     0.000000        0.600000   
 25%         6.300000          0.210000     0.270000        1.700000   
 50%         6.800000          0.260000     0.320000        5.200000   
 75%         7.300000          0.320000     0.390000        9.900000   
 max        14.200000          1.100000     1.660000       65.800000   
 
          chlorides  free sulfur dioxide  total sulfur dioxide      density  \
 count  4898.000000          4898.000000           4898.000000  4898.000000   
 mean      0.045772            35.308085            138.360657     0.994027   
 std       0.021848            17.007137             42.498065     0.002991   
 min       0.009000        

In [88]:
# Training data: one input column (alcohol) and the raw quality score
feature = dataDF['alcohol']
target = dataDF['quality']

# Convert to tensors and add a trailing dimension so each sample is one row
x_train = torch.tensor(feature.values, dtype=torch.float32).unsqueeze(1)
y_train_init = torch.tensor(target.values).unsqueeze(1)

# Transform into a binary value: quality >= 6 -> 1.0, otherwise 0.0
y_train = torch.where(y_train_init >= 6, 1., 0.)

# Show the inputs and the fraction of positive labels
x_train, y_train.mean()

(tensor([[ 8.8000],
         [ 9.5000],
         [10.1000],
         ...,
         [ 9.4000],
         [12.8000],
         [11.8000]]),
 tensor(0.6652))

In [77]:
# Sanity check: inputs and labels must have matching shapes
x_train.size(), y_train.size()

(torch.Size([4898, 1]), torch.Size([4898, 1]))

In [92]:
# Logistic-regression model: a single linear unit followed by a sigmoid
model = nn.Sequential(
    nn.Linear(in_features=1, out_features=1),
    nn.Sigmoid(),
)

# Optimizer: plain SGD over the model's parameters
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

In [109]:
def training(nb_epochs=1000, log_interval=20):
    """Train the global ``model`` on ``x_train`` / ``y_train``.

    Every epoch performs one forward pass over the whole training set,
    computes the binary cross-entropy cost and applies one ``optimizer``
    step. ``model``, ``optimizer``, ``x_train`` and ``y_train`` are read from
    the notebook's global namespace.

    Args:
        nb_epochs: Number of epochs to run. Defaults to 1000, the value that
            used to be hard-coded.
        log_interval: Print cost and training accuracy every this many
            epochs. Defaults to 20. ``0`` or ``None`` turns logging off.

    Returns:
        None. Progress is reported with ``print``.
    """
    for epoch in range(1, nb_epochs + 1):

        # H(x): predicted probability of class 1
        hypothesis = model(x_train)

        # Cost: binary cross-entropy, the loss for binary classification
        cost = F.binary_cross_entropy(hypothesis, y_train)

        # Update W, b
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Print log. The cost and accuracy shown are the ones computed
        # before this epoch's parameter update.
        if log_interval and epoch % log_interval == 0:
            # A probability of at least 0.5 counts as predicting class 1
            prediction = hypothesis.detach() >= 0.5
            # True only where the prediction matches the label
            correct_prediction = prediction.float() == y_train
            # Fraction of matching elements
            accuracy = correct_prediction.sum().item() / correct_prediction.numel()

            print(f'Epoch {epoch:4d}/{nb_epochs} Cost: {cost.item():.6f} Accuracy {accuracy * 100:2.2f}%')
            

In [110]:
training()

Epoch   20/1000 Cost: 0.688145 Accuracy 53.86%
Epoch   40/1000 Cost: 0.687514 Accuracy 53.86%
Epoch   60/1000 Cost: 0.686887 Accuracy 53.88%
Epoch   80/1000 Cost: 0.686263 Accuracy 53.90%
Epoch  100/1000 Cost: 0.685643 Accuracy 53.96%
Epoch  120/1000 Cost: 0.685027 Accuracy 55.66%
Epoch  140/1000 Cost: 0.684414 Accuracy 55.66%
Epoch  160/1000 Cost: 0.683804 Accuracy 55.66%
Epoch  180/1000 Cost: 0.683198 Accuracy 55.68%
Epoch  200/1000 Cost: 0.682596 Accuracy 55.68%
Epoch  220/1000 Cost: 0.681996 Accuracy 55.68%
Epoch  240/1000 Cost: 0.681400 Accuracy 55.68%
Epoch  260/1000 Cost: 0.680807 Accuracy 57.19%
Epoch  280/1000 Cost: 0.680218 Accuracy 57.19%
Epoch  300/1000 Cost: 0.679631 Accuracy 57.19%
Epoch  320/1000 Cost: 0.679048 Accuracy 57.19%
Epoch  340/1000 Cost: 0.678468 Accuracy 57.19%
Epoch  360/1000 Cost: 0.677891 Accuracy 57.19%
Epoch  380/1000 Cost: 0.677317 Accuracy 57.19%
Epoch  400/1000 Cost: 0.676746 Accuracy 57.19%
Epoch  420/1000 Cost: 0.676178 Accuracy 58.11%
Epoch  440/10

In [111]:
# Inspect the learned parameters: the linear layer's weight, then its bias
params = [*model.parameters()]
print('W :', params[0].item())
print('b :', params[1].item())

W : 0.5161390900611877
b : -3.2900876998901367


In [112]:
# Predict by hand for a single input value of 11.0
val = torch.tensor([[11.0]], dtype=torch.float32)
W, b = params[0].item(), params[1].item()

# Linear part only: this is the score before the sigmoid, not a probability
y_pred = val * W + b
y_pred

tensor([[2.3874]])

In [113]:
# Run the same input through the full model (linear layer followed by sigmoid)
model(val)

tensor([[0.9159]], grad_fn=<SigmoidBackward0>)

검색 결과 : sigmoid() 출력 0.9159는 모델이 91.59%의 확신도로 클래스 1이라고 판단한다는 의미이다.
           => 알코올 도수가 11.0인 와인이 클래스 1(quality 6 이상)일 확률이 91.59%이다.