In [22]:
import pickle
from typing import Optional
import pandas as pd
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import torch
from transformers import LlamaTokenizer, LlamaModel, AutoModelForCausalLM, LlamaForCausalLM, GenerationConfig, LlamaConfig, AutoTokenizer
# 全局加载LLaMA-2-7B模型
model_name_or_path = "/mnt/bn/data-tns-live-llm/leon/datasets/Llama-2-7b-hf"

print("Loading tokenizer...")
tokenizer: Optional[LlamaTokenizer] = AutoTokenizer.from_pretrained(model_name_or_path)
# 设置pad_token为eos_token
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

print("Loading model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model: Optional[LlamaForCausalLM] = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, output_hidden_states=True)
try: model.to(device)
except : pass
print("Model loaded.")

Loading tokenizer...
Tokenizer loaded.
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.


In [2]:
# def calculate_dtw_distance(embedding_seq1, embedding_seq2):
#     # 将向量调整为二维数组，以便 fastdtw 正确处理
#     embedding_seq1 = embedding_seq1.reshape(-1, 1)
#     embedding_seq2 = embedding_seq2.reshape(-1, 1)
#     distance, path = fastdtw(embedding_seq1, embedding_seq2, dist=euclidean)
#     return distance

# def calculate_similarity(df):
#     movie_embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
#     df['movie_embeddings'] = list(movie_embeddings)
#     embeddings = np.stack(df['movie_embeddings'].values)
    
#     most_similar_indices = []
#     for i, embedding_seq1 in enumerate(embeddings):
#         min_distance = float('inf')
#         most_similar_index = -1
#         for j, embedding_seq2 in enumerate(embeddings):
#             if i != j:
#                 distance = calculate_dtw_distance(embedding_seq1, embedding_seq2)
#                 if distance < min_distance:
#                     min_distance = distance
#                     most_similar_index = j
#         most_similar_indices.append(most_similar_index)
    
#     df['most_similar_seq_index'] = most_similar_indices
#     df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idx: df.at[idx, 'seq'])
#     return df

# def add_most_similar_seq_next(df, movie_dict):
#     df['most_similar_seq_next'] = df['next'].iloc[df['most_similar_seq_indexs']].values
#     df['most_similar_seq_name'] = df['most_similar_seqs'].apply(lambda x: [movie_dict.get(item[0], "Unknown") for item in x])
#     df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(lambda x: movie_dict.get(x[0], "Unknown"))
#     return df

In [23]:
from tqdm import tqdm
import json

def load_data(file_path):
    return pd.read_pickle(file_path)

def load_movie_dict(item_file):
    item_df = pd.read_csv(item_file, sep='|', header=None, encoding='latin-1', usecols=[0, 1])
    item_df.columns = ['movie_id', 'movie_title']
    movie_dict = dict(zip(item_df['movie_id'], item_df['movie_title']))
    return movie_dict

def map_movie_names_only(seq, movie_dict):
    return [movie_dict[id] if id in movie_dict else id for (id, rating) in seq]

def extract_sequences(df, movie_dict):
    df['movie_names_only'] = df['seq'].apply(lambda x: map_movie_names_only(x, movie_dict))
    df['seq_only'] = df['seq'].apply(lambda x: [id for (id, rating) in x])
    return df

def get_movie_embeddings(movie_list):
    embeddings = []
    max_length = 512  # 设定一个合理的最大长度
    for movies in tqdm(movie_list):
        movie_string = " ".join(str(movie) for movie in movies)
        inputs = tokenizer(movie_string, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            movie_embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu()
        embeddings.append(movie_embedding)
    return torch.stack(embeddings)

def get_topk_similar_indices(similarity_scores, topK):
    print(similarity_scores.shape)
    indices = np.argsort(-np.array(similarity_scores.to(torch.float32)))
    print(indices.shape)
    print(indices[-5:])
    topk_indices = np.ones((indices.shape[0], topK))
    for i,indice in enumerate(indices):
        tmp = indice[indice!=i]
        topk_indices[i] = tmp[:topK] # 获取每个向量最相似的topK个索引, 不包含他自己
    # topk_indices = topk_indices.to(torch.int)
    print(topk_indices.shape)
    return topk_indices

def get_topK_candidate(df, topK=10):
    embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
    # df['movie_embeddings'] = list(movie_embeddings)
    # embeddings = np.stack(df['movie_embeddings'].values)
    similarity_scores = embeddings @ embeddings.T
    # 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
    most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
    print(type(most_similar_indices))
    # 将索引信息添加到DataFrame中
    df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
    # 根据索引获取最相似的序列
    df['most_similar_seq'] = df['most_similar_seq_indexs'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])
    return df

def add_most_similar_seq_next(df, movie_dict):
    df['most_similar_seq_next'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'next'] for idx in json.loads(idxs)])
    df['most_similar_seq_name'] = df['most_similar_seq'].apply(lambda x: [[movie_dict.get(item[0], "Unknown") for item in items] for items in x])
    df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(lambda x: [movie_dict.get(item[0], "Unknown") for item in x])
    return df

def save_data(df, output_file_path):
    df.to_pickle(output_file_path)

def process_data(file_path, item_file, output_file_path):
    data = load_data(file_path)
    movie_dict = load_movie_dict(item_file)
    df = extract_sequences(data, movie_dict)
    df = get_topK_candidate(df)
    df = add_most_similar_seq_next(df, movie_dict)
    save_data(df, output_file_path)
    return df


In [24]:
file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/train_data.df'
item_file = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/u.item'
output_file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/similar_train_data.df'

In [27]:
data = load_data(file_path)
movie_dict = load_movie_dict(item_file)
df = extract_sequences(data, movie_dict)

In [28]:
topK = 10
embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
# df['movie_embeddings'] = list(movie_embeddings)
# embeddings = np.stack(df['movie_embeddings'].values)
similarity_scores = embeddings @ embeddings.T
# 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
print(type(most_similar_indices))
# 将索引信息添加到DataFrame中
df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
# 根据索引获取最相似的序列
df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])

100%|██████████| 68388/68388 [23:08<00:00, 49.25it/s]


torch.Size([68388, 68388])
(68388, 68388)
[[35108 60202 43368 ... 28484 33981 45316]
 [43369 35108 43368 ... 28484 33981 45316]
 [35108 43369 43368 ... 33981 28484 45316]
 [43369 16659 35108 ... 28484 33981 45316]
 [35108 27329 16659 ... 28484 45316 33981]]
(68388, 10)
<class 'list'>


In [29]:
movie_dict = load_movie_dict(item_file)
tmp = add_most_similar_seq_next(df, movie_dict)
save_data(tmp, output_file_path)

In [30]:
file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/Val_data.df'
item_file = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/u.item'
output_file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/similar_val_data.df'
data = load_data(file_path)
movie_dict = load_movie_dict(item_file)
df = extract_sequences(data, movie_dict)
topK = 10
embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
# df['movie_embeddings'] = list(movie_embeddings)
# embeddings = np.stack(df['movie_embeddings'].values)
similarity_scores = embeddings @ embeddings.T
# 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
print(type(most_similar_indices))
# 将索引信息添加到DataFrame中
df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
# 根据索引获取最相似的序列
df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])
movie_dict = load_movie_dict(item_file)
tmp = add_most_similar_seq_next(df, movie_dict)
save_data(tmp, output_file_path)

100%|██████████| 94/94 [00:01<00:00, 49.03it/s]


torch.Size([94, 94])
(94, 94)
[[89 18  8 93 72 56 19 11 42 63  4 74 80  1  5 14 41 85 79 37 67 77 86 90
  34 25 23 20 81 51 27 26 53  6 43 84 47 92 66 57 78 40 64 65 28 24 75 82
  35 68 38 49 55 83 88  7 58 13 73 15 60 39 59 69 33 48 16 36 32 21 70 22
  87  9 52 17 45  0  3 31  2 30 46 61 54 29 50 10 12 76 44 62 91 71]
 [18 93 19 90  8 56 89 11 72 63 14  4 25  1 42 41 85 74 80 37 81 75 57 43
  66 67 28 27 26 79 23 78 34 92  5 86 51 64 20 77 47  6  7 24 82 53 65 35
  88 84 15 49 58 40 38 83 55 73 60 22 33 13 87 59 68 32 70 16 39 69 48 36
  21  9 45 52  2 17 31 46  3 61  0 30 50 54 29 12 10 62 76 44 91 71]
 [93 18 72  1 25 67  4 56  8 63 19 42 26 21 41 68 51 11 14 55 66 80 24 38
  81 89  7 20 43 82 86 28 92 88 47 35 64 15 75 79 37 34 74 52 91 57 85 83
  87  9 45 69 61 59 40 77 22 53 90 17 23 58 65  2 30 78 27 36 39 60 31  0
  70 48  5 32 46 16  6 13 10 73  3 29 84 44 49 54 33 62 50 76 12 71]
 [18 93 72  8 56 19  4 42 63  1 25 92 89 11 67 14 41 82 51 34 81 66 80 20
  28 86 43 26 64 74 75 

In [31]:
file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/Test_data.df'
item_file = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/u.item'
output_file_path = '/mnt/bn/data-tns-live-llm/leon/LLaRA-similar_seq_as_demo-/data/LLaRA/movielens/similar_test_data.df'
data = load_data(file_path)
movie_dict = load_movie_dict(item_file)
df = extract_sequences(data, movie_dict)
topK = 10
embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
# df['movie_embeddings'] = list(movie_embeddings)
# embeddings = np.stack(df['movie_embeddings'].values)
similarity_scores = embeddings @ embeddings.T
# 对于每个嵌入向量，找到最相似的topK个嵌入向量的索引
most_similar_indices = np.array(get_topk_similar_indices(similarity_scores, topK)).tolist()
print(type(most_similar_indices))
# 将索引信息添加到DataFrame中
df['most_similar_seq_index'] = [json.dumps(most_similar_idxs) for most_similar_idxs in most_similar_indices]
# 根据索引获取最相似的序列
df['most_similar_seq'] = df['most_similar_seq_index'].apply(lambda idxs: [df.at[idx, 'seq'] for idx in json.loads(idxs)])
movie_dict = load_movie_dict(item_file)
tmp = add_most_similar_seq_next(df, movie_dict)
save_data(tmp, output_file_path)

100%|██████████| 95/95 [00:01<00:00, 49.15it/s]

torch.Size([95, 95])
(95, 95)
[[48 22 23 21 70 13 84 90 59 80  5 11  8 75 33  3 89 37 71  1 27 18 88  2
  56 65 14 68 87  4 35 39  7 41 34 36 40 49 55 15 19 24 79 17 28  6 63 62
  94 51 45 46 54 53 12 67 77 74 43 86 25 64 58 29 83  9  0 16 81 20 73 31
  60 82 30 26 85 93 61 52 44 57 91 78 38 76 69 47 72 32 42 50 92 10 66]
 [48 21 22  3 70 84 13 23 59 91 75 11 71  8 88 18 37 33 87  4  5 80 89 39
  35 63 36 28 14 43 64  2  6 12  1 79 56 40 67 53 19 68  7 51 27 94 17 83
  62 24 29 74 41 49 86 90 55 34 16 31 15  0 65 54 20 73 58 77 57 60 44  9
  52 25 78 76 26 69 45 38 82 30 81 85 61 93 47 46 72 32 42 10 50 92 66]
 [48 21 22 13 70 59 84 75 23  5 37 11 71  3 89 18 88  8 80 33 39 87  2 14
  27  4 56 36 35 63 62 41  7  6  1 40 34 28 29 49 55 43 65 90 15 94 53 51
  86 24 83 16 68 64 79 12 67 74 73 19 20 26 77 44 54 17 25 31 92 60 57 58
   0 82 81 52 38  9 91 30 46 45 85 93 78 69 47 61 72 76 32 42 10 50 66]
 [48 22 21 13 84 23 59 70  5 11 37  3 71 75 88  8 18 89 80 87  2 33 39 14
  36 79 40 27 




In [None]:
print(tmp)

In [7]:
i=10
print(tmp["movie_names_only"][i][:tmp["len_seq"][i]])

['Hoodlum (1997)', 'Mother (1996)', 'Fierce Creatures (1997)', "Ulee's Gold (1997)", 'Hellraiser: Bloodline (1996)', 'Last Supper, The (1995)', 'Snow White and the Seven Dwarfs (1937)', 'Maximum Risk (1996)', 'White Squall (1996)', 'Home for the Holidays (1995)']


In [8]:
print(tmp["most_similar_seq_name"][10])

[['Braveheart (1995)', 'Full Metal Jacket (1987)', "Miller's Crossing (1990)", 'Right Stuff, The (1983)', 'Notorious (1946)', 'His Girl Friday (1940)', 'Magnificent Seven, The (1954)', 'Army of Darkness (1993)', 'Crossing Guard, The (1995)', 'Raiders of the Lost Ark (1981)'], ['Dances with Wolves (1990)', 'In the Name of the Father (1993)', 'Magnificent Seven, The (1954)', 'Terminator 2: Judgment Day (1991)', 'Gone with the Wind (1939)', 'Princess Bride, The (1987)', 'Ruby in Paradise (1993)', 'Aladdin and the King of Thieves (1996)', 'Empire Strikes Back, The (1980)', 'Fish Called Wanda, A (1988)'], ['Home Alone (1990)', 'Glengarry Glen Ross (1992)', 'Die Hard 2 (1990)', 'Duck Soup (1933)', 'Star Trek V: The Final Frontier (1989)', 'Brady Bunch Movie, The (1995)', '12 Angry Men (1957)', 'His Girl Friday (1940)', 'Vanya on 42nd Street (1994)', 'Godfather: Part II, The (1974)'], ['Citizen Ruth (1996)', 'Fan, The (1996)', 'Rainmaker, The (1997)', 'Kiss the Girls (1997)', 'Event Horizon (

In [9]:
movie_list = [" ".join(names) for names in tmp["most_similar_seq_name"][10][:10]]
print(len(movie_list))
movie_lists = ""
for i,name in enumerate(movie_list):
    movie_lists += f"Watch History {i+1}: {name} \n"
print(movie_lists)

10
Watch History 1: Braveheart (1995) Full Metal Jacket (1987) Miller's Crossing (1990) Right Stuff, The (1983) Notorious (1946) His Girl Friday (1940) Magnificent Seven, The (1954) Army of Darkness (1993) Crossing Guard, The (1995) Raiders of the Lost Ark (1981) 
Watch History 2: Dances with Wolves (1990) In the Name of the Father (1993) Magnificent Seven, The (1954) Terminator 2: Judgment Day (1991) Gone with the Wind (1939) Princess Bride, The (1987) Ruby in Paradise (1993) Aladdin and the King of Thieves (1996) Empire Strikes Back, The (1980) Fish Called Wanda, A (1988) 
Watch History 3: Home Alone (1990) Glengarry Glen Ross (1992) Die Hard 2 (1990) Duck Soup (1933) Star Trek V: The Final Frontier (1989) Brady Bunch Movie, The (1995) 12 Angry Men (1957) His Girl Friday (1940) Vanya on 42nd Street (1994) Godfather: Part II, The (1974) 
Watch History 4: Citizen Ruth (1996) Fan, The (1996) Rainmaker, The (1997) Kiss the Girls (1997) Event Horizon (1997) Silence of the Lambs, The (1991

In [10]:
target_movie = " ".join(tmp["movie_names_only"][i][:tmp["len_seq"][i]])

In [11]:
prompt = f"""You are a system that recommends movies based on viewing history. Please evaluate the similarity between each watch history in the candidate list and the single target watch history. Rate the similarity on a scale from 1 to 10 between , where 1 is not similar at all and 10 is very similar.

Candidate Watch History:
{movie_lists}

Target Watch History:
{target_movie}

Please output the similarity ratings in JSON format. The output should only contain the JSON object with similarity scores, without any additional text. Output:"""

In [26]:
prompt = f"""You are an intelligent movie recommendation assistant. The sequences below represent the watching histories of users. Ranking the candidate sequences based on their similarity to the target sequence. Similarity is defined by both the semantic content of the movies and the order in which they are watched.

[target sequence: {target_movie}]

[candidate sequences:
{movie_lists}]
Rank the 10 sequences above based on their similarity to the target sequence, considering both the semantic content of the movies and their order. The sequences should be listed in descending order using identifiers. The most similar sequences should be listed first. The output format should be [] > [], e.g., [user 1] > [user 2]. Only respond with the ranking results, do not say any word or explain.
"""
print(prompt)

You are an intelligent movie recommendation assistant. The sequences below represent the watching histories of users. Ranking the candidate sequences based on their similarity to the target sequence. Similarity is defined by both the semantic content of the movies and the order in which they are watched.

[target sequence: Hoodlum (1997) Mother (1996) Fierce Creatures (1997) Ulee's Gold (1997) Hellraiser: Bloodline (1996) Last Supper, The (1995) Snow White and the Seven Dwarfs (1937) Maximum Risk (1996) White Squall (1996)]

[candidate sequences:
[user 1]: [Devil's Advocate, The (1997) Santa Clause, The (1994) Cool Hand Luke (1967) Third Man, The (1949) Twelve Monkeys (1995) Dances with Wolves (1990) Transformers: The Movie, The (1986) Citizen Kane (1941) Abyss, The (1989) Star Trek: First Contact (1996)] 
[user 2]: [Private Benjamin (1980) Tin Men (1987) Unknown Abyss, The (1989) Sleeper (1973) Raising Arizona (1987) Citizen Kane (1941) Client, The (1994) Ben-Hur (1959) Spy Hard (1996

In [12]:
input = tokenizer(prompt, return_tensors="pt")
try: output = model.generate(input["input_ids"].cuda(), temperature=0.1).cpu()[0]
except: output = model.generate(input["input_ids"], temperature=0.1).cpu()[0]
output = tokenizer.decode(output)
print(output)

<s> You are a system that recommends movies based on viewing history. Please evaluate the similarity between each watch history in the candidate list and the single target watch history. Rate the similarity on a scale from 1 to 10 between , where 1 is not similar at all and 10 is very similar.

Candidate Watch History:
Watch History 1: Braveheart (1995) Full Metal Jacket (1987) Miller's Crossing (1990) Right Stuff, The (1983) Notorious (1946) His Girl Friday (1940) Magnificent Seven, The (1954) Army of Darkness (1993) Crossing Guard, The (1995) Raiders of the Lost Ark (1981) 
Watch History 2: Dances with Wolves (1990) In the Name of the Father (1993) Magnificent Seven, The (1954) Terminator 2: Judgment Day (1991) Gone with the Wind (1939) Princess Bride, The (1987) Ruby in Paradise (1993) Aladdin and the King of Thieves (1996) Empire Strikes Back, The (1980) Fish Called Wanda, A (1988) 
Watch History 3: Home Alone (1990) Glengarry Glen Ross (1992) Die Hard 2 (1990) Duck Soup (1933) Sta