## Setup

In [None]:
# Mount Google Drive at /content/drive so that the data directory under
# MyDrive can be reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Data directory on the mounted Drive. It is also made the working directory,
# so files opened or written with a relative name resolve against it.
path = '/content/drive/MyDrive/python/공공데이터 공모전/data/'
os.chdir(path)
import pandas as pd

In [None]:
# Read the three input files from the data directory.
hsk_final = pd.read_json(path+'hs_transformer_embedding.json')
data = pd.read_excel(path+'비식별된 해외기업별 영문 텍스트데이터.xlsx')
global_biz = pd.read_json(path+'mapping30.json')
# Left join on 'DSC': every row of `data` is kept, and the columns of the
# mapping file are attached where the key matches.
global_biz = data.merge(global_biz, on='DSC', how='left')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers, Model

In [None]:
# Restrict CUDA to device 0, then ask TensorFlow to grow GPU memory on demand
# for the first GPU it reports (nothing is done when no GPU is listed).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    first_gpu = gpus[0]
    try:
        tf.config.experimental.set_memory_growth(first_gpu, True)
    except RuntimeError as err:
        # Report the problem and carry on instead of aborting the notebook.
        print(err)

## VAE recommendation system

In [None]:
# Seed configuration
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Besides seeding, cuDNN is switched to deterministic mode and its
    benchmark mode is turned off.

    Args:
        seed: Integer seed handed to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed = 42
set_seed(seed)

# Load the embedding values
# Each 'transformer_embedding' column is turned into a list and then an array.
# assumes every cell holds an equal-length list of numbers, giving a 2-D
# array with one row per record — TODO confirm
corp_embeddings = np.array(global_biz['transformer_embedding'].tolist())
hs_embeddings = np.array(hsk_final['transformer_embedding'].tolist())

# Normalize the data
# NOTE(review): the same scaler object is re-fitted by each fit_transform
# call, so the company and HS matrices are each standardized with their own
# statistics, and `scaler` ends up holding the HS statistics only — confirm
# that this is intended.
scaler = StandardScaler()
corp_embeddings = scaler.fit_transform(corp_embeddings)
hs_embeddings = scaler.fit_transform(hs_embeddings)

# Compute the cosine similarity
# Rows follow corp_embeddings, columns follow hs_embeddings.
cosine_similarities = cosine_similarity(corp_embeddings, hs_embeddings)

# Improved VAE model definition
class ImprovedVAE(nn.Module):
    """Fully connected variational autoencoder with batch normalization.

    The encoder maps an input vector through 1024 -> 512 -> 256 hidden units
    to ``2 * latent_dim`` values (mean and log-variance). The decoder mirrors
    it back to ``input_dim``.

    Args:
        input_dim: Size of each input / reconstructed vector.
        latent_dim: Size of the latent code.
        output_activation: Optional module appended after the last decoder
            layer. The default ``None`` leaves the output linear, so the
            reconstruction can take any real value. This is required when the
            inputs are standardized (zero mean, negative values present): a
            sigmoid output is confined to (0, 1) and can never match them.
            Pass ``nn.Sigmoid()`` to get the bounded output back for inputs
            that really live in [0, 1].
    """

    def __init__(self, input_dim, latent_dim, output_activation=None):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, latent_dim * 2)  # mu and logvar, concatenated
        )
        decoder_layers = [
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Linear(1024, input_dim),
        ]
        # The optional activation carries no parameters, so the parameter
        # names of the decoder layers are the same with or without it.
        if output_activation is not None:
            decoder_layers.append(output_activation)
        self.decoder = nn.Sequential(*decoder_layers)

    def encode(self, x):
        """Return ``(mu, logvar)``, the two halves of the encoder output."""
        mu_logvar = self.encoder(x)
        mu, logvar = mu_logvar.chunk(2, dim=-1)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps`` standard normal noise."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map a latent code ``z`` back to input space."""
        return self.decoder(z)

    def forward(self, x):
        """Encode ``x``, sample a latent code and decode it.

        Returns:
            Tuple ``(recon_x, mu, logvar)``.
        """
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        recon_x = self.decode(z)
        return recon_x, mu, logvar

# Loss function definition
def vae_loss(recon_x, x, mu, logvar):
    """Return the summed reconstruction error plus the KL term.

    Args:
        recon_x: Reconstruction produced by the decoder.
        x: Original input the reconstruction is compared with.
        mu: Latent means.
        logvar: Latent log-variances.

    Returns:
        Scalar tensor: squared error summed over all elements, plus
        ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``.
    """
    reconstruction = nn.functional.mse_loss(recon_x, x, reduction='sum')
    kl_inner = 1 + logvar - mu.pow(2) - logvar.exp()
    divergence = -0.5 * kl_inner.sum()
    return reconstruction + divergence

# Hyperparameter settings
input_dim = corp_embeddings.shape[1]
latent_dim = 50
lr = 1e-4
batch_size = 64
epochs = 100

# Build the dataset
corp_embeddings_tensor = torch.tensor(corp_embeddings, dtype=torch.float32)
dataset = torch.utils.data.TensorDataset(corp_embeddings_tensor)
# The model contains BatchNorm1d layers, which raise in training mode when a
# batch holds a single sample. Drop the trailing batch only in the one case
# where it would contain exactly one row; otherwise every sample is used.
drop_singleton_batch = len(dataset) % batch_size == 1
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=drop_singleton_batch,
)

# Initialize the model
vae = ImprovedVAE(input_dim, latent_dim)
optimizer = optim.Adam(vae.parameters(), lr=lr)
# Halve the learning rate every 20 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

# Train the model
for epoch in range(epochs):
    vae.train()
    total_loss = 0
    samples_seen = 0
    for (x_batch,) in dataloader:
        optimizer.zero_grad()
        recon_x, mu, logvar = vae(x_batch)
        loss = vae_loss(recon_x, x_batch, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        samples_seen += x_batch.size(0)
    scheduler.step()
    # The loss is a sum over samples, so average over the samples actually
    # processed this epoch (equal to the dataset size unless a batch was
    # dropped above).
    avg_loss = total_loss / max(samples_seen, 1)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

In [None]:
# Get recommendations from the reconstructed data
vae.eval()
with torch.no_grad():
    # Decode from the latent mean instead of calling the model directly:
    # the forward pass draws random latent noise even in eval mode, which
    # would make the recommendations change from one run to the next.
    latent_mean, _ = vae.encode(corp_embeddings_tensor)
    reconstructed_data = vae.decode(latent_mean).numpy()

# Similarity between the reconstructed data and the HSK embeddings
reconstructed_similarities = cosine_similarity(reconstructed_data, hs_embeddings)

# Find the 10 most similar HS codes
top_n = 10
# Never ask for more codes than there are HS embeddings.
n_ranked = min(top_n, reconstructed_similarities.shape[1])
most_similar_indices = np.argsort(-reconstructed_similarities, axis=1)[:, :n_ranked]

# Similarities of those codes, gathered with the same indices so that each
# value is guaranteed to belong to the code it is paired with (and no second
# sort of the full matrix is needed).
top_similarities = np.take_along_axis(
    reconstructed_similarities, most_similar_indices, axis=1
)

# Store the top HS codes and their similarities in the VAE_top_10 column.
# NOTE(review): each row is a dict keyed by the stringified similarity with
# the HS code as value (schema kept as-is). Two codes whose similarity prints
# identically would collapse into a single entry — confirm this is acceptable.
hs_code_column = hsk_final['HS부호']
vae_top_list = [
    {
        str(similarity): hs_code_column.iloc[int(hs_idx)]
        for similarity, hs_idx in zip(similarity_row, index_row)
    }
    for similarity_row, index_row in zip(top_similarities, most_similar_indices)
]

global_biz['VAE_top_10'] = vae_top_list

In [None]:
# Serialize global_biz to 'VAE.json'. The path is relative, so the file is
# created in the current working directory.
global_biz.to_json('VAE.json')