In [4]:
import pickle
from typing import Optional
import pandas as pd
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import torch
from transformers import LlamaTokenizer, LlamaModel, AutoModelForCausalLM, LlamaForCausalLM, GenerationConfig, LlamaConfig, AutoTokenizer
# 全局加载LLaMA-2-7B模型

In [None]:
model_name_or_path = "/mnt/bn/data-tns-live-llm/leon/datasets/Llama-2-7b-hf"

print("Loading tokenizer...")
tokenizer: Optional[LlamaTokenizer] = AutoTokenizer.from_pretrained(model_name_or_path)
# 设置pad_token为eos_token
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

print("Loading model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model: Optional[LlamaForCausalLM] = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, output_hidden_states=True)
try: model.to(device)
except : pass
print("Model loaded.")

In [2]:
# def calculate_dtw_distance(embedding_seq1, embedding_seq2):
#     # 将向量调整为二维数组，以便 fastdtw 正确处理
#     embedding_seq1 = embedding_seq1.reshape(-1, 1)
#     embedding_seq2 = embedding_seq2.reshape(-1, 1)
#     distance, path = fastdtw(embedding_seq1, embedding_seq2, dist=euclidean)
#     return distance

# def calculate_similarity(df):
#     movie_embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
#     df['movie_embeddings'] = list(movie_embeddings)
#     embeddings = np.stack(df['movie_embeddings'].values)
    
#     most_similar_indices = []
#     for i, embedding_seq1 in enumerate(embeddings):
#         min_distance = float('inf')
#         most_similar_index = -1
#         for j, embedding_seq2 in enumerate(embeddings):
#             if i != j:
#                 distance = calculate_dtw_distance(embedding_seq1, embedding_seq2)
#                 if distance < min_distance:
#                     min_distance = distance
#                     most_similar_index = j
#         most_similar_indices.append(most_similar_index)
    
#     df['most_similar_seq_index'] = most_similar_indices
#     df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idx: df.at[idx, 'seq'])
#     return df

# def add_most_similar_seq_next(df, movie_dict):
#     df['most_similar_seq_next'] = df['next'].iloc[df['most_similar_seq_indexs']].values
#     df['most_similar_seq_name'] = df['most_similar_seqs'].apply(lambda x: [movie_dict.get(item[0], "Unknown") for item in x])
#     df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(lambda x: movie_dict.get(x[0], "Unknown"))
#     return df

In [5]:
from tqdm import tqdm
import json

def load_data(file_path):
    return pd.read_pickle(file_path)

def load_movie_dict(item_file):
    item_df = pd.read_csv(item_file, sep='|', header=None, encoding='latin-1', usecols=[0, 1])
    item_df.columns = ['movie_id', 'movie_title']
    movie_dict = dict(zip(item_df['movie_id'], item_df['movie_title']))
    return movie_dict

def map_movie_names_only(seq, movie_dict):
    return [movie_dict[id] if id in movie_dict else id for (id, rating) in seq]

def extract_sequences(df, movie_dict):
    df['movie_names_only'] = df['seq'].apply(lambda x: map_movie_names_only(x, movie_dict))
    df['seq_only'] = df['seq'].apply(lambda x: [id for (id, rating) in x])
    return df

def get_movie_embeddings(movie_list):
    embeddings = []
    max_length = 512  # 设定一个合理的最大长度
    for movies in tqdm(movie_list):
        movie_string = " ".join(str(movie) for movie in movies)
        inputs = tokenizer(movie_string, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            movie_embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu()
        embeddings.append(movie_embedding)
    return torch.stack(embeddings)

def get_topk_similar_indices(similarity_scores, topK):
    print(similarity_scores.shape)
    indices = np.argsort(-np.array(similarity_scores.to(torch.float32)))
    print(indices.shape)
    print(indices[-5:])
    topk_indices = np.ones((indices.shape[0], topK))
    for i,indice in enumerate(indices):
        tmp = indice[indice!=i]
        topk_indices[i] = tmp[:topK] # 获取每个向量最相似的topK个索引, 不包含他自己
    # topk_indices = topk_indices.to(torch.int)
    print(topk_indices.shape)
    return topk_indices

def get_topK_candidate(df, topK=10):
    embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
    # df['movie_embeddings'] = list(movie_embeddings)
    # embeddings = np.stack(df['movie_embeddings'].values)
    similarity_scores = embeddings @ embeddings.T
    # 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
    most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
    print(type(most_similar_indices))
    # 将索引信息添加到DataFrame中
    df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
    # 根据索引获取最相似的序列
    df['most_similar_seq'] = df['most_similar_seq_indexs'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])
    return df

def add_most_similar_seq_next(df, movie_dict):
    df['most_similar_seq_next'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'next'] for idx in json.loads(idxs)])
    df['most_similar_seq_name'] = df['most_similar_seq'].apply(lambda x: [[movie_dict.get(item[0], "Unknown") for item in items] for items in x])
    df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(lambda x: [movie_dict.get(item[0], "Unknown") for item in x])
    return df

def save_data(df, output_file_path):
    df.to_pickle(output_file_path)

def process_data(file_path, item_file, output_file_path):
    data = load_data(file_path)
    movie_dict = load_movie_dict(item_file)
    df = extract_sequences(data, movie_dict)
    df = get_topK_candidate(df)
    df = add_most_similar_seq_next(df, movie_dict)
    save_data(df, output_file_path)
    return df


In [8]:
file_path = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/train_data.df'
item_file = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/u.item'
output_file_path = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/similar_train_data.df'

In [9]:
data = load_data(file_path)
movie_dict = load_movie_dict(item_file)
df = extract_sequences(data, movie_dict)

In [13]:
topK = 10
embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
# df['movie_embeddings'] = list(movie_embeddings)
# embeddings = np.stack(df['movie_embeddings'].values)
similarity_scores = embeddings @ embeddings.T
# 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
print(type(most_similar_indices))
# 将索引信息添加到DataFrame中
df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
# 根据索引获取最相似的序列
df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])

  0%|          | 0/68388 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 68388/68388 [19:08<00:00, 59.56it/s]


torch.Size([68388, 68388])
(68388, 68388)
[[43369 60202 16659 ... 28484 33981 45316]
 [35108 43369 43368 ... 28484 33981 45316]
 [35108  3430 43371 ... 33981 28484 45316]
 [43368 43369 43371 ... 28484 33981 45316]
 [27329 35108 42728 ... 28484 33981 45316]]
(68388, 10)
<class 'list'>


In [14]:
movie_dict = load_movie_dict(item_file)
tmp = add_most_similar_seq_next(df, movie_dict)
save_data(tmp, output_file_path)

In [None]:
file_path = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/Val_data.df'
item_file = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/u.item'
output_file_path = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/similar_val_data.df'
data = load_data(file_path)
movie_dict = load_movie_dict(item_file)
df = extract_sequences(data, movie_dict)
topK = 10
embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
# df['movie_embeddings'] = list(movie_embeddings)
# embeddings = np.stack(df['movie_embeddings'].values)
similarity_scores = embeddings @ embeddings.T
# 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
print(type(most_similar_indices))
# 将索引信息添加到DataFrame中
df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
# 根据索引获取最相似的序列
df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])
movie_dict = load_movie_dict(item_file)
tmp = add_most_similar_seq_next(df, movie_dict)
save_data(tmp, output_file_path)

In [None]:
file_path = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/Test_data.df'
item_file = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/u.item'
output_file_path = '/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/similar_test_data.df'
data = load_data(file_path)
movie_dict = load_movie_dict(item_file)
df = extract_sequences(data, movie_dict)
topK = 10
embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
# df['movie_embeddings'] = list(movie_embeddings)
# embeddings = np.stack(df['movie_embeddings'].values)
similarity_scores = embeddings @ embeddings.T
# 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
print(type(most_similar_indices))
# 将索引信息添加到DataFrame中
df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
# 根据索引获取最相似的序列
df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])
movie_dict = load_movie_dict(item_file)
tmp = add_most_similar_seq_next(df, movie_dict)
save_data(tmp, output_file_path)

In [None]:
print(tmp)

In [7]:
i=10
print(tmp["movie_names_only"][i][:tmp["len_seq"][i]])

['Hoodlum (1997)', 'Mother (1996)', 'Fierce Creatures (1997)', "Ulee's Gold (1997)", 'Hellraiser: Bloodline (1996)', 'Last Supper, The (1995)', 'Snow White and the Seven Dwarfs (1937)', 'Maximum Risk (1996)', 'White Squall (1996)', 'Home for the Holidays (1995)']


In [None]:
print(tmp["most_similar_seq_name"][10])

In [None]:
movie_list = [" ".join(names) for names in tmp["most_similar_seq_name"][10][:10]]
print(len(movie_list))
movie_lists = ""
for i,name in enumerate(movie_list):
    movie_lists += f"Watch History {i+1}: {name} \n"
print(movie_lists)

In [10]:
target_movie = " ".join(tmp["movie_names_only"][i][:tmp["len_seq"][i]])

In [11]:
prompt = f"""You are a system that recommends movies based on viewing history. Please evaluate the similarity between each watch history in the candidate list and the single target watch history. Rate the similarity on a scale from 1 to 10 between , where 1 is not similar at all and 10 is very similar.

Candidate Watch History:
{movie_lists}

Target Watch History:
{target_movie}

Please output the similarity ratings in JSON format. The output should only contain the JSON object with similarity scores, without any additional text. Output:"""

In [None]:
prompt = f"""You are an intelligent movie recommendation assistant. The sequences below represent the watching histories of users. Ranking the candidate sequences based on their similarity to the target sequence. Similarity is defined by both the semantic content of the movies and the order in which they are watched.

[target sequence: {target_movie}]

[candidate sequences:
{movie_lists}]
Rank the 10 sequences above based on their similarity to the target sequence, considering both the semantic content of the movies and their order. The sequences should be listed in descending order using identifiers. The most similar sequences should be listed first. The output format should be [] > [], e.g., [user 1] > [user 2]. Only respond with the ranking results, do not say any word or explain.
"""
print(prompt)

In [None]:
input = tokenizer(prompt, return_tensors="pt")
try: output = model.generate(input["input_ids"].cuda(), temperature=0.1).cpu()[0]
except: output = model.generate(input["input_ids"], temperature=0.1).cpu()[0]
output = tokenizer.decode(output)
print(output)

# 查看 similar_train_data

In [2]:
import pandas as pd

In [6]:
data = load_data("/mnt/bn/data-tns-live-llm/leon/recom/LLaRA-similar_seq_as_demo-/data/ref/movielens/similar_train_data.df")
print(data)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f1783281c70>>
Traceback (most recent call last):
  File "/home/tiger/.local/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


                                                     seq  len_seq       next  \
0      [(1682, 0), (1682, 0), (1682, 0), (1682, 0), (...        1   (299, 5)   
1      [(299, 5), (1682, 0), (1682, 0), (1682, 0), (1...        1   (321, 5)   
2      [(299, 5), (321, 5), (1682, 0), (1682, 0), (16...        2   (290, 4)   
3      [(299, 5), (321, 5), (290, 4), (1682, 0), (168...        3   (297, 3)   
4      [(299, 5), (321, 5), (290, 4), (297, 3), (1682...        4   (590, 4)   
...                                                  ...      ...        ...   
68383  [(180, 5), (918, 5), (297, 5), (256, 4), (6, 4...       10   (454, 4)   
68384  [(918, 5), (297, 5), (256, 4), (6, 4), (123, 5...       10   (146, 4)   
68385  [(297, 5), (256, 4), (6, 4), (123, 5), (992, 4...       10  (1006, 4)   
68386  [(256, 4), (6, 4), (123, 5), (992, 4), (762, 3...       10     (0, 5)   
68387  [(6, 4), (123, 5), (992, 4), (762, 3), (272, 3...       10    (14, 4)   

                                       