## Metrics

In [None]:
# Import necessary libraries
import os
import torch
import numpy as np
import matplotlib.pyplot as plt

from model.sasrec import SASRecModel
from trainers import Trainer
from utils import EarlyStopping, check_path, set_seed, set_logger
from dataset import get_seq_dic, get_dataloder, get_rating_matrix

# Set up arguments
class Args:
    """Static configuration namespace for this notebook.

    Every setting is a plain class attribute. How each value is consumed by
    the model/trainer code is not visible in this notebook.
    """

    # --- Paths and run identity ---
    data_dir = "./data/"
    output_dir = "output/"
    data_name = "cleaned_final_20241123"
    train_name = "1123_model"
    load_model = None
    do_eval = False

    # --- Dataset sizes ---
    num_items = 10
    num_users = 10
    max_seq_length = 10

    # --- Optimisation ---
    lr = 0.001
    weight_decay = 0.0
    adam_beta1 = 0.9
    adam_beta2 = 0.999
    batch_size = 1024
    epochs = 10
    patience = 2
    log_freq = 1
    seed = 42

    # --- Hardware / data loading ---
    no_cuda = True
    gpu_id = "0,1,2,3"
    num_workers = 0  # Set num_workers to 0 to avoid BrokenPipeError on Windows

    # --- Model architecture ---
    model_type = 'sasrec'
    hidden_size = 128
    num_hidden_layers = 2
    num_attention_heads = 2
    hidden_act = "gelu"
    attention_probs_dropout_prob = 0.5
    hidden_dropout_prob = 0.5
    initializer_range = 0.02
    variance = 5


# Single shared settings object used by the cells below.
args = Args()

In [None]:
from collections import Counter
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

# Inference runs on CPU. Switch to the commented line to use a GPU when one
# is available.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# map_location remaps any tensors that were saved from a GPU onto `device`;
# without it, loading a GPU-saved checkpoint on a CPU-only machine fails.
# weights_only=False is required because the file holds a whole pickled model
# object rather than a state_dict. Unpickling executes arbitrary code, so only
# load checkpoints from a trusted source.
model = torch.load("./llmeb_1123.pt", map_location=device, weights_only=False)

In [None]:
# Standalone predict function
# Standalone predict function
def predict(model, input_ids, device, top_k=40):
    """Return the highest-scoring item indices for each input sequence.

    The model is moved to ``device`` and put in eval mode. The hidden state at
    the last sequence position is scored against every row of
    ``model.item_embeddings.weight`` with a dot product, and the best
    ``top_k`` item indices are returned in descending score order.

    No candidate is masked out: index 0 and items already present in the
    input sequence can appear in the result.

    Args:
        model: Module whose forward pass accepts
            ``(input_ids, all_sequence_output=False)`` and returns a
            3-D tensor indexed as ``[batch, position, hidden]``, and which
            exposes an ``item_embeddings`` embedding table.
        input_ids: Batch of equal-length item-id sequences (nested list,
            array or tensor).
        device: ``torch.device`` on which to run inference.
        top_k: Number of items to return per sequence. Defaults to 40, the
            value that was previously hard-coded. It is clamped to the number
            of rows in the item embedding table.

    Returns:
        ``numpy.ndarray`` of shape ``(batch, min(top_k, num_items))`` holding
        item indices, best first. An empty batch yields shape ``(0, top_k)``.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        raise ValueError("top_k must be a positive integer")
    # An empty batch cannot be run through the model; return an empty result
    # so that callers using ``.tolist()`` simply get ``[]``.
    if len(input_ids) == 0:
        return np.empty((0, top_k), dtype=np.int64)

    model.to(device)
    model.eval()
    input_ids = torch.as_tensor(input_ids, dtype=torch.long, device=device)
    with torch.no_grad():
        sequence_output = model(input_ids, all_sequence_output=False)
        last_hidden = sequence_output[:, -1, :]  # Last item in the sequence

        item_emb = model.item_embeddings.weight.to(device)
        rating_pred = torch.matmul(last_hidden, item_emb.transpose(0, 1))
        rating_pred = rating_pred.cpu().numpy()

    # argpartition raises if asked for more elements than exist, so never
    # request more candidates than there are items.
    k = min(top_k, rating_pred.shape[1])
    rows = np.arange(rating_pred.shape[0])[:, None]

    # Select the k best candidates in linear time (unordered), then sort only
    # those k by score, highest first.
    candidates = np.argpartition(rating_pred, -k, axis=1)[:, -k:]
    order = np.argsort(rating_pred[rows, candidates], axis=1)[:, ::-1]
    return candidates[rows, order]

In [None]:
# Each line of the input file is one whitespace-separated sequence of integer
# item ids.
input_file_path = '/data/log-data-2024/yh/LLM_EB/src/data/cleaned_final_20241123.txt'
with open(input_file_path, 'r') as f:
    input_data = f.readlines()

# Convert every line into a fixed-length sequence: keep only the most recent
# max_seq_length items and left-pad shorter sequences with 0.
# (A first, non-truncating pass used to run here; its result was discarded
# immediately by this loop, so it has been removed.)
input_ids = []
for line in input_data:
    items = list(map(int, line.strip().split()))
    # Truncate sequences longer than max_seq_length
    if len(items) > args.max_seq_length:
        items = items[-args.max_seq_length:]
    pad_len = args.max_seq_length - len(items)
    input_ids.append([0] * pad_len + items)

# Sanity check: strip the padding again and report length statistics.
confirm = [[w for w in seq if w != 0] for seq in tqdm(input_ids)]

length = [len(i) for i in confirm]
if length:
    print(np.mean(length), np.std(length), np.min(length), np.max(length))
else:
    # np.min / np.max raise on an empty list; report the empty file instead.
    print("no sequences found in", input_file_path)

In [None]:
# Hold out the final element of every padded sequence as the target and keep
# the remaining prefix as the model input.
label = [seq[-1] for seq in input_ids]
input_ids = [seq[:-1] for seq in input_ids]

# Only the first 1000 sequences are scored.
lab, ids = label[:1000], input_ids[:1000]

pred = predict(model, ids, device).tolist()  # SASRec predictions

In [None]:
def hr_at_k(recommendations, true_labels, k=20):
    """Hit rate at ``k``.

    Returns the fraction of entries in ``true_labels`` whose value appears
    among the first ``k`` items of the recommendation list at the same
    position.
    """
    truncated = [ranked[:k] for ranked in recommendations]
    hits = sum(
        1
        for position, target in enumerate(true_labels)
        if target in truncated[position]
    )
    return hits / len(true_labels)

def precision_at_k(recommendations, true_labels, k=20):
    """Mean precision at ``k`` with a single correct item per user.

    For each (recommendation list, true label) pair the precision is ``1 / k``
    when the label is among the first ``k`` recommendations and ``0``
    otherwise. The mean over all pairs is returned.
    """
    per_user = [
        # One possible hit out of k recommended slots.
        (1 if target in ranked[:k] else 0) / k
        for ranked, target in zip(recommendations, true_labels)
    ]
    # Average precision over all users.
    return sum(per_user) / len(per_user)

def recall_at_k(recommendations, true_labels, k=20):
    """Mean recall at ``k`` with a single correct item per user.

    Each (recommendation list, true label) pair scores ``1`` when the label is
    among the first ``k`` recommendations and ``0`` otherwise. The mean over
    all pairs is returned.
    """
    per_user = [
        # With one relevant item, recall is simply hit (1) or miss (0).
        int(target in ranked[:k])
        for ranked, target in zip(recommendations, true_labels)
    ]
    # Average recall over all users.
    return sum(per_user) / len(per_user)

def total_print(k):
    """Print HR@k and precision@k, rounded to 3 decimals.

    Reads the notebook-level ``pred`` (recommendations) and ``lab`` (true
    labels). Returns nothing.
    """
    hit_rate = round(hr_at_k(pred, lab, k=k), 3)
    print(f"LLM for Embedding HR@{k}: ", hit_rate)
    precision = round(precision_at_k(pred, lab, k=k), 3)
    print(f"LLM for Embedding precision@{k}: ", precision)

In [None]:
# total_print prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after every report.
for k in (1, 3, 5, 10, 15, 20):
    total_print(k)

## Prediction

In [None]:
import pandas as pd
import pickle
from tqdm import tqdm

# Load the device table and the tab-separated, header-less sequence file, then
# place their columns side by side. concat(axis=1) aligns rows on the index,
# so this assumes both files list the same rows in the same order -- TODO confirm.
df = pd.read_csv("/data/log-data-2024/2.sequence_generate_ksc/data/sequence_device_match_241123.csv")
cf = pd.read_csv("/data/log-data-2024/20241123_Final/input_search_final_20241123.txt", sep = "\t", header = None)
df = pd.concat([df, cf], axis = 1)

# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load these dictionaries from a trusted location.
with open(file= '/data/log-data-2024/20241123_Final/match_dict_final.pickle', mode='rb') as f:
    dic1 = pickle.load(f)

with open(file= '/data/log-data-2024/20241123_Final/match_dict_final2.pickle', mode='rb') as f:
    dic2 = pickle.load(f)
    
# Values of the "treatment2" column of the sample file, as a plain list.
samp = list(pd.read_csv("/data/log-data-2024/20241123_Final/8man_sample_20241123")["treatment2"])

In [None]:
# Invert dic1 (value -> key). If a value occurs more than once, the last key
# wins, exactly as with dict(zip(values, keys)).
rev = {value: key for key, value in dic1.items()}

# Re-key dic2 through the inverted dic1. Keys of dic2 that have no counterpart
# in dic1's values are deliberately skipped.
dic = {rev[key]: value for key, value in dic2.items() if key in rev}

# Keep only devices that belong to the sample, one row per device.
# A set makes each membership test O(1) instead of scanning the whole list.
sample_ids = set(samp)
df["use"] = [1 if device in sample_ids else 0 for device in tqdm(df["device_id"])]
df = df[df["use"] == 1]
df = df.drop_duplicates(subset='device_id').reset_index(drop=True)

In [None]:
# input_file_path = "/data/log-data-2024/SASRec/BSARec/src/data/input_search_prediction_final.txt"
# with open(input_file_path, 'r') as f:
#     input_data = f.readlines()

# Translate every raw sequence in column 0 into the id space of `dic`.
input_data = []

for row in df[0]:
    mapped = []
    # The first whitespace-separated token is dropped -- presumably a row or
    # device identifier rather than an item; verify against the input file.
    for token in row.split()[1:]:
        try:
            mapped.append(dic[int(token)])
        except (KeyError, ValueError):
            # Token is not an integer or has no mapping: skip it on purpose.
            continue
    input_data.append(" ".join(mapped))

# Build fixed-length model inputs the same way as in the metrics section: keep
# only the most recent max_seq_length items and left-pad with 0. Without the
# truncation, longer sequences produce rows of unequal length, which cannot be
# stacked into a single tensor by predict().
input_ids = []
for line in input_data:
    items = list(map(int, line.strip().split()))
    if len(items) > args.max_seq_length:
        items = items[-args.max_seq_length:]
    pad_len = args.max_seq_length - len(items)
    input_ids.append([0] * pad_len + items)
    
# confirm = []
# for i in tqdm(input_ids):
#     temp = []
#     for w in i:
#         if w != 0:
#             temp.append(w)
#     confirm.append(temp)

# length = [len(i) for i in confirm]
# print(np.mean(length), np.std(length), np.min(length), np.max(length))

In [None]:
# Cap every sequence at the configured maximum length, keeping the most recent
# items. A single slice replaces the previous drop-one-item-per-iteration
# loop, which copied the list on every step.
# The cap was a hard-coded 50, which disagreed with args.max_seq_length used
# for padding and left rows of unequal length. Using the shared setting keeps
# every row the same length.
# NOTE(review): confirm args.max_seq_length matches the value the loaded model
# was trained with.
ii = [seq[-args.max_seq_length:] if len(seq) > args.max_seq_length else seq
      for seq in input_ids]

In [None]:
# Run inference in chunks to bound memory use. Iterating over the real length
# of `ii` covers every row: the previous fixed 20 x 1000 loop silently dropped
# rows beyond 20,000 and passed empty batches to predict() when there were
# fewer.
predict_batch_size = 1000
pred = []
for start in range(0, len(ii), predict_batch_size):
    batch = ii[start:start + predict_batch_size]
    # extend() appends in place instead of rebuilding the list every step.
    pred.extend(predict(model, batch, device).tolist())

In [None]:
# Pair each device id with its list of predicted item indices. This rebinds
# `samp`, which previously held the list of sample ids, to a DataFrame.
# `pred` must contain exactly one entry per row of df, otherwise the
# DataFrame constructor raises.
samp = pd.DataFrame({"treatment2" : df["device_id"], "treatment2 prediction" : pred}).reset_index(drop = True)
# to_csv is called with default options, so the row index is written as an
# extra leading column.
samp.to_csv("/data/log-data-2024/20241123_Final/8man_sample_20241123_predicted_treatment2")