In [246]:
import pickle
from typing import Optional
import pandas as pd
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import torch
from transformers import LlamaTokenizer, LlamaModel, AutoModelForCausalLM, LlamaForCausalLM, GenerationConfig, LlamaConfig, AutoTokenizer
# 全局加载LLaMA-2-7B模型
model_name_or_path = "/mnt/bn/data-tns-live-llm/leon/datasets/Meta-Llama-3-8B"

print("Loading tokenizer...")
tokenizer: Optional[LlamaTokenizer] = AutoTokenizer.from_pretrained(model_name_or_path)
# 设置pad_token为eos_token
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

print("Loading model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model: Optional[LlamaForCausalLM] = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, output_hidden_states=True)
try: model.to(device)
except : pass
print("Model loaded.")

Loading tokenizer...
Tokenizer loaded.
Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded.


In [239]:
# def calculate_dtw_distance(embedding_seq1, embedding_seq2):
#     # 将向量调整为二维数组，以便 fastdtw 正确处理
#     embedding_seq1 = embedding_seq1.reshape(-1, 1)
#     embedding_seq2 = embedding_seq2.reshape(-1, 1)
#     distance, path = fastdtw(embedding_seq1, embedding_seq2, dist=euclidean)
#     return distance

# def calculate_similarity(df):
#     movie_embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
#     df['movie_embeddings'] = list(movie_embeddings)
#     embeddings = np.stack(df['movie_embeddings'].values)
    
#     most_similar_indices = []
#     for i, embedding_seq1 in enumerate(embeddings):
#         min_distance = float('inf')
#         most_similar_index = -1
#         for j, embedding_seq2 in enumerate(embeddings):
#             if i != j:
#                 distance = calculate_dtw_distance(embedding_seq1, embedding_seq2)
#                 if distance < min_distance:
#                     min_distance = distance
#                     most_similar_index = j
#         most_similar_indices.append(most_similar_index)
    
#     df['most_similar_seq_index'] = most_similar_indices
#     df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idx: df.at[idx, 'seq'])
#     return df

# def add_most_similar_seq_next(df, movie_dict):
#     df['most_similar_seq_next'] = df['next'].iloc[df['most_similar_seq_indexs']].values
#     df['most_similar_seq_name'] = df['most_similar_seqs'].apply(lambda x: [movie_dict.get(item[0], "Unknown") for item in x])
#     df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(lambda x: movie_dict.get(x[0], "Unknown"))
#     return df

In [167]:
from tqdm import tqdm
import json

def load_data(file_path):
    return pd.read_pickle(file_path)

def load_movie_dict(item_file):
    item_df = pd.read_csv(item_file, sep='|', header=None, encoding='latin-1', usecols=[0, 1])
    item_df.columns = ['movie_id', 'movie_title']
    movie_dict = dict(zip(item_df['movie_id'], item_df['movie_title']))
    return movie_dict

def map_movie_names_only(seq, movie_dict):
    return [movie_dict[id] if id in movie_dict else id for (id, rating) in seq]

def extract_sequences(df, movie_dict):
    df['movie_names_only'] = df['seq'].apply(lambda x: map_movie_names_only(x, movie_dict))
    df['seq_only'] = df['seq'].apply(lambda x: [id for (id, rating) in x])
    return df

def get_movie_embeddings(movie_list):
    embeddings = []
    max_length = 512  # 设定一个合理的最大长度
    for movies in tqdm(movie_list):
        movie_string = " ".join(str(movie) for movie in movies)
        inputs = tokenizer(movie_string, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            movie_embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu()
        embeddings.append(movie_embedding)
    return torch.stack(embeddings)

def get_topk_similar_indices(similarity_scores, topK):
    print(similarity_scores.shape)
    indices = np.argsort(-np.array(similarity_scores.to(torch.float32)))
    print(indices.shape)
    print(indices[-5:])
    topk_indices = np.ones((indices.shape[0], topK))
    for i,indice in enumerate(indices):
        tmp = indice[indice!=i]
        topk_indices[i] = tmp[:topK] # 获取每个向量最相似的topK个索引, 不包含他自己
    # topk_indices = topk_indices.to(torch.int)
    print(topk_indices.shape)
    return topk_indices

def get_topK_candidate(df, topK=50):
    embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
    # df['movie_embeddings'] = list(movie_embeddings)
    # embeddings = np.stack(df['movie_embeddings'].values)
    similarity_scores = embeddings @ embeddings.T
    # 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
    most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
    print(type(most_similar_indices))
    # 将索引信息添加到DataFrame中
    df['most_similar_seq_indexs'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
    # 根据索引获取最相似的序列
    df['most_similar_seqs'] = df['most_similar_seq_indexs'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])
    return df

def add_most_similar_seq_next(df, movie_dict):
    df['most_similar_seq_next'] = df['most_similar_seq_indexs'].apply(lambda idxs: [df.at[idx, 'next'] for idx in json.loads(idxs)])
    df['most_similar_seq_name'] = df['most_similar_seqs'].apply(lambda x: [[movie_dict.get(item[0], "Unknown") for item in items] for items in x])
    df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(lambda x: [movie_dict.get(item[0], "Unknown") for item in x])
    return df

def save_data(df, output_file_path):
    df.to_pickle(output_file_path)

def process_data(file_path, item_file, output_file_path):
    data = load_data(file_path).head(1000)
    movie_dict = load_movie_dict(item_file)
    df = extract_sequences(data, movie_dict)
    df = get_topK_candidate(df)
    df = add_most_similar_seq_next(df, movie_dict)
    save_data(df, output_file_path)
    return df


In [125]:
file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/train_data.df'
item_file = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/u.item'
output_file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/similar_val_data.df'

In [160]:
data = load_data(file_path).head(1000)
movie_dict = load_movie_dict(item_file)
df = extract_sequences(data, movie_dict)

In [168]:
topK = 50
embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
# df['movie_embeddings'] = list(movie_embeddings)
# embeddings = np.stack(df['movie_embeddings'].values)
similarity_scores = embeddings @ embeddings.T
# 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
print(type(most_similar_indices))
# 将索引信息添加到DataFrame中
df['most_similar_seq_indexs'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
print("hello")
# 根据索引获取最相似的序列
df['most_similar_seqs'] = df['most_similar_seq_indexs'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])

100%|██████████| 1000/1000 [00:17<00:00, 56.93it/s]


torch.Size([1000, 1000])
(1000, 1000)
[[488 233 437 ... 573 571 572]
 [488 233 533 ... 570 571 572]
 [488 233 533 ... 570 571 572]
 [488 233 533 ... 570 571 572]
 [488 233 533 ... 570 571 572]]
(1000, 50)
<class 'list'>
hello


In [170]:
movie_dict = load_movie_dict(item_file)
tmp = add_most_similar_seq_next(df, movie_dict)
save_data(tmp, output_file_path)

In [171]:
print(tmp)

                                                   seq  len_seq       next  \
0    [(1682, 0), (1682, 0), (1682, 0), (1682, 0), (...        1   (299, 5)   
1    [(299, 5), (1682, 0), (1682, 0), (1682, 0), (1...        1   (321, 5)   
2    [(299, 5), (321, 5), (1682, 0), (1682, 0), (16...        2   (290, 4)   
3    [(299, 5), (321, 5), (290, 4), (1682, 0), (168...        3   (297, 3)   
4    [(299, 5), (321, 5), (290, 4), (297, 3), (1682...        4   (590, 4)   
..                                                 ...      ...        ...   
995  [(541, 3), (767, 4), (811, 4), (664, 4), (559,...       10   (131, 5)   
996  [(767, 4), (811, 4), (664, 4), (559, 4), (230,...       10   (290, 3)   
997  [(811, 4), (664, 4), (559, 4), (230, 4), (1418...       10   (865, 4)   
998  [(664, 4), (559, 4), (230, 4), (1418, 5), (122...       10  (1033, 3)   
999  [(559, 4), (230, 4), (1418, 5), (1227, 4), (77...       10   (830, 4)   

                                      movie_names_only  \
0    

In [226]:
i=10
print(tmp["movie_names_only"][i][:tmp["len_seq"][i]])

['Hoodlum (1997)', 'Mother (1996)', 'Fierce Creatures (1997)', "Ulee's Gold (1997)", 'Hellraiser: Bloodline (1996)', 'Last Supper, The (1995)', 'Snow White and the Seven Dwarfs (1937)', 'Maximum Risk (1996)', 'White Squall (1996)', 'Home for the Holidays (1995)']


In [183]:
print(tmp["most_similar_seq_name"][10])

[['Braveheart (1995)', 'Full Metal Jacket (1987)', "Miller's Crossing (1990)", 'Right Stuff, The (1983)', 'Notorious (1946)', 'His Girl Friday (1940)', 'Magnificent Seven, The (1954)', 'Army of Darkness (1993)', 'Crossing Guard, The (1995)', 'Raiders of the Lost Ark (1981)'], ['Dances with Wolves (1990)', 'In the Name of the Father (1993)', 'Magnificent Seven, The (1954)', 'Terminator 2: Judgment Day (1991)', 'Gone with the Wind (1939)', 'Princess Bride, The (1987)', 'Ruby in Paradise (1993)', 'Aladdin and the King of Thieves (1996)', 'Empire Strikes Back, The (1980)', 'Fish Called Wanda, A (1988)'], ['GoodFellas (1990)', 'Crimson Tide (1995)', 'Braveheart (1995)', 'Full Metal Jacket (1987)', "Miller's Crossing (1990)", 'Right Stuff, The (1983)', 'Notorious (1946)', 'His Girl Friday (1940)', 'Magnificent Seven, The (1954)', 'Army of Darkness (1993)'], ['Crimson Tide (1995)', 'Braveheart (1995)', 'Full Metal Jacket (1987)', "Miller's Crossing (1990)", 'Right Stuff, The (1983)', 'Notorio

In [227]:
movie_list = [" ".join(names) for names in tmp["most_similar_seq_name"][10][:10]]
print(len(movie_list))
movie_lists = ""
for i,name in enumerate(movie_list):
    movie_lists += f"Watch History {i}: {name} \n"
print(movie_lists)

10
Watch History 0: Braveheart (1995) Full Metal Jacket (1987) Miller's Crossing (1990) Right Stuff, The (1983) Notorious (1946) His Girl Friday (1940) Magnificent Seven, The (1954) Army of Darkness (1993) Crossing Guard, The (1995) Raiders of the Lost Ark (1981) 
Watch History 1: Dances with Wolves (1990) In the Name of the Father (1993) Magnificent Seven, The (1954) Terminator 2: Judgment Day (1991) Gone with the Wind (1939) Princess Bride, The (1987) Ruby in Paradise (1993) Aladdin and the King of Thieves (1996) Empire Strikes Back, The (1980) Fish Called Wanda, A (1988) 
Watch History 2: GoodFellas (1990) Crimson Tide (1995) Braveheart (1995) Full Metal Jacket (1987) Miller's Crossing (1990) Right Stuff, The (1983) Notorious (1946) His Girl Friday (1940) Magnificent Seven, The (1954) Army of Darkness (1993) 
Watch History 3: Crimson Tide (1995) Braveheart (1995) Full Metal Jacket (1987) Miller's Crossing (1990) Right Stuff, The (1983) Notorious (1946) His Girl Friday (1940) Magnifi

In [228]:
target_movie = " ".join(tmp["movie_names_only"][i][:tmp["len_seq"][i]])

In [249]:
prompt = f"""
You are a system that recommends movies based on viewing history. Please evaluate the similarity between each watch history in the candidate list and the single target watch history, and the reason. Rate the similarity on a scale from 1 to 10 between , where 1 is not similar at all and 10 is very similar.

Candidate Watch History:
{movie_lists}

Target Watch History:
{target_movie}

Please output the similarity ratings between each Candidate Watch History and Target Watch History in JSON format. The output should only contain the JSON object with similarity scores and reasons, without any additional text. Output:
"""
print(prompt)


You are a system that recommends movies based on viewing history. Please evaluate the similarity between each watch history in the candidate list and the single target watch history, and the reason. Rate the similarity on a scale from 1 to 10 between , where 1 is not similar at all and 10 is very similar.

Candidate Watch History:
Watch History 0: Braveheart (1995) Full Metal Jacket (1987) Miller's Crossing (1990) Right Stuff, The (1983) Notorious (1946) His Girl Friday (1940) Magnificent Seven, The (1954) Army of Darkness (1993) Crossing Guard, The (1995) Raiders of the Lost Ark (1981) 
Watch History 1: Dances with Wolves (1990) In the Name of the Father (1993) Magnificent Seven, The (1954) Terminator 2: Judgment Day (1991) Gone with the Wind (1939) Princess Bride, The (1987) Ruby in Paradise (1993) Aladdin and the King of Thieves (1996) Empire Strikes Back, The (1980) Fish Called Wanda, A (1988) 
Watch History 2: GoodFellas (1990) Crimson Tide (1995) Braveheart (1995) Full Metal Jac

In [250]:
input = tokenizer(prompt, return_tensors="pt")
try: output = model.generate(input["input_ids"].cuda(), temperature=0.1, max_new_tokens=1024).cpu()[0]
except: output = model.generate(input["input_ids"], temperature=0.1, max_new_tokens=1024).cpu()[0]
output = tokenizer.decode(output)
print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>
You are a system that recommends movies based on viewing history. Please evaluate the similarity between each watch history in the candidate list and the single target watch history, and the reason. Rate the similarity on a scale from 1 to 10 between, where 1 is not similar at all and 10 is very similar.

Candidate Watch History:
Watch History 0: Braveheart (1995) Full Metal Jacket (1987) Miller's Crossing (1990) Right Stuff, The (1983) Notorious (1946) His Girl Friday (1940) Magnificent Seven, The (1954) Army of Darkness (1993) Crossing Guard, The (1995) Raiders of the Lost Ark (1981) 
Watch History 1: Dances with Wolves (1990) In the Name of the Father (1993) Magnificent Seven, The (1954) Terminator 2: Judgment Day (1991) Gone with the Wind (1939) Princess Bride, The (1987) Ruby in Paradise (1993) Aladdin and the King of Thieves (1996) Empire Strikes Back, The (1980) Fish Called Wanda, A (1988) 
Watch History 2: GoodFellas (1990) Crimson Tide (1995) Braveheart (1995