In [2]:
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# ======================================
# Prepare Data
def make_batch(data, batch_size, window_size, shuffle=True):
    window_list = []
    for i in range(len(data) - window_size - 1):
        window = data[i: i + window_size]
        window_list.append(window)
#데이터에서 사이즈씩 뽑아서 리스트를 만들어줌...

    if shuffle:
        random.shuffle(window_list)
#셔플은 기본적으로 필요함

#윈도우 리스트는?
#윈도우 사이즈 - 열 숫자? 
#배치 사이즈 - 한번에 넘겨주는 데이터 사이즈-몇개의윈도우를 한번에 넘겨줄것인가.
    n_batch = math.ceil(len(window_list) / batch_size)
    batch_list = []
    for i in range(n_batch): #100에 10개씩이면 10번만
        batch = window_list[i*batch_size: (i+1)*batch_size] #0~9
        batch_list.append(batch)
    batch_list = np.array(batch_list)

    return batch_list

data = pd.read_csv("./nomal-data_1.csv")
data = data.to_numpy()

scaler = MinMaxScaler()
dscaler = scaler.fit(data)
data = dscaler.transform(data)

# ========================================
# Modeling
class SequenceModel(nn.Module): # 아웃풋이 4라는건 4개의 값을 모두 예측? #히든 사이즈의 정의 # 넘레이어..? 보통 사용하는 숫자..찾아봐야 함..
    def __init__(self, input_size=4, output_dim=4, hidden_size=256, num_layers=1):
        super(SequenceModel, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)
        self.scaler_bias = nn.Parameter(torch.ones(input_size, requires_grad=True))
        self.scaler = nn.Parameter(torch.ones(input_size, requires_grad=True))
        self.linear = nn.Linear(hidden_size, output_dim)

    def forward(self, x):
        x = (x + self.scaler_bias) * self.scaler #식은 어디서 확인 가능..?
        zs, hidden = self.lstm(x)
        z = zs[:, -1]
        v = self.linear(zs)
        return v, z


# ======================================
# Training
window_size = 5
batch_size = 64
hidden_size = 64
use_cuda = True

model = SequenceModel(input_size=data.shape[-1],
                      output_dim=1,
                      hidden_size=hidden_size,
                      num_layers=7)

if use_cuda:
    model.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

n_epoch = 10000
ema_loss = None
alpha = 0.1
verbose_interval = 50

for epoch_i in range(n_epoch):

    batch_list = make_batch(data, batch_size, window_size+1)
    for batch_i, batch in enumerate(batch_list):
        optimizer.zero_grad()

        batch = np.array(batch) #batch는 하나의 batch덩어리. 윈도우들의 모임
        batch_input = batch[:, :-1, :] #3차? 윈도우, 타임스탬프, 피처
        batch_output = batch[:, 1:, :]

        batch_input = torch.tensor(batch_input, dtype=torch.float32)
        batch_output = torch.tensor(batch_output, dtype=torch.float32)

        if use_cuda:
            batch_input = batch_input.cuda()
            batch_output = batch_output.cuda()


        v, _ = model(batch_input)

        loss = loss_fn(v, batch_output) #output, Target 로스값 계산. 해서 다시넣는 과정..

        loss.backward()
        optimizer.step()

        if ema_loss is None:
            ema_loss = loss.item()
        ema_loss = loss.item() * alpha + (1.-alpha) * ema_loss

    if epoch_i % verbose_interval == 0:
        print(f"{epoch_i}th epoch, loss: {ema_loss}")



# ======================================
# Inference
model.eval()
model.cpu()


# Prepare train data distribution
#Z는 학습 데이터를 모델에 태워서 Z를 만들어냄 레이턴트 벡터 .. 레이턴트 백터.. 학습데이터 
Z = []
reconstruction_error = []

batch_list = make_batch(data, batch_size, window_size, False)
for batch_i, batch in enumerate(batch_list):
    batch = np.array(batch)
    batch_input = batch

    batch_input = torch.tensor(batch_input, dtype=torch.float32)
    batch_output = torch.tensor(batch_output, dtype=torch.float32)

    v, z = model(batch_input) #레이턴트 백터 

    Z.extend(z.tolist())  #리스트로 변환해서 Z에 추가/
    reconstruction_error.extend(torch.sum(torch.abs(v-batch_input), dim=[1,2]).detach().tolist())

Z = np.array(Z)
reconstruction_error = np.array(reconstruction_error)


# Samples for quering
sample_pos = [[5.1, 20.5,  1.0,  4.9],
              [4.1, 16.3,  1.0,  6.1],
              [9.1, 36.5,  1.0,  2.7],
              [2.3,  9.2,  1.0, 10.9],
              [1.6,  6.4,  1.0, 15.7],
              [6.6, 26.3,  1.0,  3.8],
              [8.0, 31.9,  1.0,  3.1],
              [7.8, 31.1,  1.0,  3.2],
              [7.0,  28.,  1.0,  3.6],
              [7.0,  28.,  1.0,  3.6]]


sample_neg = [[ 66, 267,   5,   0],
                [ 74, 298,   5,   0],
                [ 88, 354,   5,   0],
              [ 83, 335,   5,   0],
              [ 78, 315,   5,   0],
              [ 96, 385,   5,   0],
              [ 15,  59,   5,   1],
              [ 67, 267,   5,   0],
              [ 75, 303,   5,   0],
              [ 60, 242,   5,   0]]

# pos
sample_pos = np.array(sample_pos)  # sequence_length x feature size
sample_pos = torch.tensor(sample_pos, dtype=torch.float32)  # sequence_length x feature size
sample_pos = sample_pos.unsqueeze(0)  # 1 x sequence_length x feature size
prediction_pos, z_prime_pos = model(sample_pos)

# neg
sample_neg = np.array(sample_neg)  # sequence_length x feature size
sample_neg = torch.tensor(sample_neg, dtype=torch.float32)  # sequence_length x feature size
sample_neg = sample_neg.unsqueeze(0)  # 1 x sequence_length x feature size
prediction_neg, z_prime_neg = model(sample_neg)

z_prime_pos = z_prime_pos.detach().numpy()
z_prime_neg = z_prime_neg.detach().numpy()

reconstruction_error_pos = torch.sum(torch.abs(prediction_pos - sample_pos), dim=[1,2]).detach().tolist()
reconstruction_error_neg = torch.sum(torch.abs(prediction_neg - sample_neg), dim=[1,2]).detach().tolist()


# ======================================
# Visualize latent space
pca = PCA(n_components=2)
pca.fit(Z)

Z_2d = pca.transform(Z)

z_prime_pos_2d = pca.transform(z_prime_pos)
z_prime_neg_2d = pca.transform(z_prime_neg)

plt.scatter(Z_2d[:, 0], Z_2d[:, 1], color='k')
plt.scatter(z_prime_pos_2d[:, 0],z_prime_pos_2d[:, 1] , color='g', label='normal')
plt.scatter(z_prime_neg_2d[:, 0],z_prime_neg_2d[:, 1] , color='r', label='abnormal')
plt.legend()
plt.show()


# ======================================
# Plot Reconstruction Error 
neg_height = 50
min_val = min(min(reconstruction_error), min(reconstruction_error_neg))
max_val = max(max(reconstruction_error), max(reconstruction_error_neg))
bins = np.linspace(min_val, 
                   max_val,
                   100)

plt.hist(reconstruction_error_neg * neg_height, bins=bins, alpha=0.5, color='red', label='abnormal')
plt.hist(reconstruction_error, bins=bins, alpha=0.5,color='k', label='normal')
plt.legend()
plt.show()


#LSTM으로 그룹2개를 생성함.. 정상과 비정상. 
#새로운 데이터가 정상 범주인지 비정상범주인지 확인하려면?

#out_dim 값이랑 스케일링 range 설정좀 바꿔보자.

  return F.mse_loss(input, target, reduction=self.reduction)


0th epoch, loss: 0.06696524372521967


KeyboardInterrupt: 