In [2]:
import pickle
import pandas as pd
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import torch
from transformers import LlamaTokenizer, LlamaModel, AutoModelForCausalLM, LlamaForCausalLM, GenerationConfig, LlamaConfig

In [3]:
# Load the LLaMA-2-7B tokenizer and model once, at notebook (global) scope,
# so that later cells and helper functions can reuse them.
model_name_or_path = "/workspace/llama/models_hf/Llama-2-7b-hf"

print("Loading tokenizer...")
# local_files_only=True: read the checkpoint from the local path above instead of downloading.
tokenizer: LlamaTokenizer = LlamaTokenizer.from_pretrained(model_name_or_path, local_files_only=True, model_max_length=512)
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

print("Loading model...")
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE: this is the causal-LM wrapper (language-model head included), not the bare LlamaModel.
model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(model_name_or_path, local_files_only=True)
model.to(device)
print("Model loaded.")

Loading tokenizer...
Tokenizer loaded.
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.


In [4]:
# Smoke test: generate a continuation for a fixed prompt.
# - `prompt_inputs` replaces the former name `input`, which shadowed the Python builtin.
# - Tensors are moved to `device` (chosen when the model was loaded) instead of
#   calling `.cuda()` unconditionally, so the cell also runs on CPU-only hosts.
# - The attention mask is passed explicitly; because pad_token == eos_token,
#   `generate` cannot reliably infer it from the input ids.
prompt_inputs = tokenizer("introduce the film Marriage story", return_tensors="pt")
output = model.generate(
    prompt_inputs["input_ids"].to(device),
    attention_mask=prompt_inputs["attention_mask"].to(device),
    max_length=512,
).cpu()

In [5]:
# Decode the first generated sequence into text.
# The result is bound to a new name so that `output` keeps holding the token
# ids; rebinding `output` to a string made this cell fail when re-run.
generated_text = tokenizer.decode(output[0])
print(generated_text)

<s> introduce the film Marriage story.
Marriage Story is a 2019 American drama film written, produced and directed by Noah Baumbach, starring Adam Driver and Scarlett Johansson. The film follows a married couple (Driver and Johansson) who go through a divorce.
The film premiered at the 76th Venice International Film Festival on August 30, 2019, and was theatrically released in the United States on November 6, 2019. It received widespread critical acclaim, with many critics praising its direction, screenplay, acting, and musical score.
Marriage Story is a 2019 American drama film written, produced and directed by Noah Baumbach, starring Adam Driver and Scarlett Johansson. The film follows a married couple (Driver and Johansson) who go through a divorce. The film premiered at the 76th Venice International Film Festival on August 30, 2019, and was theatrically released in the United States on November 6, 2019. It received widespread critical acclaim, with many critics praising its directi

In [1]:
def load_data(file_path):
    """Read the pickled pandas object stored at ``file_path`` and return it.

    NOTE: unpickling can execute arbitrary code, so only load files from a
    trusted source.
    """
    frame = pd.read_pickle(file_path)
    return frame

def load_movie_dict(item_file):
    """Build an ``{id: title}`` lookup from a ``|``-separated item file.

    Only the first two columns of the header-less, latin-1 encoded file are
    read: column 0 becomes the key and column 1 the value. When an id occurs
    more than once, the last row wins.
    """
    table = pd.read_csv(item_file, sep='|', header=None, encoding='latin-1', usecols=[0, 1])
    table.columns = ['movie_id', 'movie_title']

    lookup = {}
    for movie_id, title in zip(table['movie_id'], table['movie_title']):
        lookup[movie_id] = title
    return lookup

def map_movie_names_only(seq, movie_dict):
    """Translate a sequence of ``(id, rating)`` pairs into a list of titles.

    Each id is looked up in ``movie_dict``; ids without an entry are kept
    unchanged. The rating half of every pair is ignored.
    """
    names = []
    for item_id, _rating in seq:
        names.append(movie_dict.get(item_id, item_id))
    return names

def extract_sequences(df, movie_dict):
    """Add name-only and id-only views of the ``seq`` column to ``df``.

    Two columns are written onto ``df`` in place:
      * ``movie_names_only`` - each ``(id, rating)`` pair mapped through
        ``map_movie_names_only``;
      * ``seq_only`` - just the ids of each pair.

    Returns the same (mutated) DataFrame.
    """
    sequences = df['seq']
    df['movie_names_only'] = sequences.apply(map_movie_names_only, args=(movie_dict,))
    df['seq_only'] = sequences.apply(lambda pairs: [item_id for item_id, _rating in pairs])
    return df

def get_movie_embeddings(movie_list, max_length=512):
    """Embed each title sequence as one mean-pooled hidden-state vector.

    Every inner sequence is joined into a single space-separated string,
    tokenized (truncated to ``max_length`` tokens), passed through the
    notebook-level ``model`` and reduced to one vector by averaging the final
    layer's hidden states over the token axis.

    Args:
        movie_list: Iterable of sequences; each element is converted with ``str``.
        max_length: Token limit handed to the tokenizer (default 512).

    Returns:
        ``np.ndarray`` holding one embedding per input sequence (an empty
        array when ``movie_list`` is empty).

    Note:
        Uses the notebook-level ``tokenizer``, ``model`` and ``device``.
    """
    embeddings = []
    for movies in movie_list:
        movie_string = " ".join(str(movie) for movie in movies)
        inputs = tokenizer(movie_string, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            # A causal-LM head model returns logits, not `last_hidden_state`,
            # so the per-layer hidden states have to be requested explicitly.
            outputs = model(**inputs, output_hidden_states=True)
        # Bare encoder models expose `last_hidden_state`; causal-LM outputs
        # only carry the `hidden_states` tuple, whose last entry is the final layer.
        token_states = getattr(outputs, "last_hidden_state", None)
        if token_states is None:
            token_states = outputs.hidden_states[-1]
        # Average over tokens, then drop only the batch axis (batch size is 1).
        movie_embedding = token_states.mean(dim=1).squeeze(0).cpu().numpy()
        embeddings.append(movie_embedding)
    return np.array(embeddings)

def calculate_dtw_distance(embedding_seq1, embedding_seq2):
    """Return the fastdtw distance between two embedding arrays.

    Both inputs are reshaped into column form (one value per row) so that
    fastdtw receives two-dimensional arrays, then compared using the
    Euclidean point distance. Only the distance is returned; the warping
    path is discarded.

    NOTE(review): after the reshape every embedding component is treated as
    one step of a series - confirm that warping across components is intended.
    """
    series_a, series_b = (vec.reshape(-1, 1) for vec in (embedding_seq1, embedding_seq2))
    return fastdtw(series_a, series_b, dist=euclidean)[0]

def calculate_similarity(df):
    """Find, for every row, the other row whose embedding is closest under DTW.

    Columns written onto ``df`` in place:
      * ``movie_embeddings`` - embedding of each row's ``movie_names_only``;
      * ``most_similar_seq_index`` - POSITION (0-based row number, not index
        label) of the nearest other row, or ``-1`` when there is no other row;
      * ``most_similar_seq`` - the ``seq`` value at that position, or ``None``
        when the position is ``-1``.

    Ties are resolved in favour of the earliest row. The search compares every
    ordered pair of rows, so the cost grows quadratically with ``len(df)``.

    Returns the same (mutated) DataFrame.
    """
    movie_embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
    df['movie_embeddings'] = list(movie_embeddings)
    # Iterate the embedding array directly: re-stacking the column would raise
    # on an empty frame and only copies data that is already contiguous.
    embeddings = movie_embeddings

    most_similar_indices = []
    for i, embedding_seq1 in enumerate(embeddings):
        min_distance = float('inf')
        most_similar_index = -1
        for j, embedding_seq2 in enumerate(embeddings):
            if i != j:
                distance = calculate_dtw_distance(embedding_seq1, embedding_seq2)
                if distance < min_distance:
                    min_distance = distance
                    most_similar_index = j
        most_similar_indices.append(most_similar_index)

    df['most_similar_seq_index'] = most_similar_indices
    # The stored values are row positions produced by enumerate(), so they must
    # be resolved with .iloc; a label lookup (.at) breaks on any non-default
    # index and raises for the -1 "no neighbour" sentinel.
    seq_column = df['seq']
    df['most_similar_seq'] = df['most_similar_seq_index'].apply(
        lambda idx: seq_column.iloc[idx] if idx >= 0 else None
    )
    return df

def add_most_similar_seq_next(df, movie_dict):
    """Attach the nearest neighbour's ``next`` item and readable titles to ``df``.

    Columns written onto ``df`` in place:
      * ``most_similar_seq_next`` - the ``next`` value of the row at position
        ``most_similar_seq_index`` (``None`` when that position is negative);
      * ``most_similar_seq_name`` - titles for every item of
        ``most_similar_seq`` (empty list when the sequence is missing);
      * ``most_similar_seq_next_name`` - title of ``most_similar_seq_next``.

    Ids absent from ``movie_dict``, and missing values, map to ``"Unknown"``.

    Returns the same (mutated) DataFrame.
    """
    def _is_missing(value):
        # None, or a float NaN that pandas may have substituted for None.
        return value is None or (isinstance(value, float) and value != value)

    def _title(entry):
        # `entry` is indexable; its first element is the item id.
        if _is_missing(entry):
            return "Unknown"
        return movie_dict.get(entry[0], "Unknown")

    next_column = df['next']
    # A negative position is the "no neighbour" sentinel; passing it to .iloc
    # would silently wrap around to the last row.
    df['most_similar_seq_next'] = df['most_similar_seq_index'].apply(
        lambda pos: next_column.iloc[pos] if pos >= 0 else None
    )
    df['most_similar_seq_name'] = df['most_similar_seq'].apply(
        lambda seq: [] if _is_missing(seq) else [_title(item) for item in seq]
    )
    df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(_title)
    return df

def save_data(df, output_file_path):
    """Pickle ``df`` to ``output_file_path`` via its ``to_pickle`` method."""
    df.to_pickle(output_file_path)
    return None

def process_data(file_path, item_file, output_file_path):
    """Run the whole pipeline: load, enrich with nearest-neighbour info, save.

    Steps, in order: read the pickled frame, read the id-to-title lookup,
    derive the name/id sequences, compute the most similar row per row,
    attach that neighbour's next item and titles, and pickle the result to
    ``output_file_path``. Nothing is returned.
    """
    frame = load_data(file_path)
    titles = load_movie_dict(item_file)

    with_sequences = extract_sequences(frame, titles)
    with_neighbours = calculate_similarity(with_sequences)
    enriched = add_most_similar_seq_next(with_neighbours, titles)

    save_data(enriched, output_file_path)


In [2]:
# Process the data with the pipeline function defined above.
# NOTE(review): the input and output paths point at the `lastfm` dataset while
# `item_file` points at `movielens` - confirm the id-to-name file matches the data.
# NOTE: the recorded run of this cell failed with "NameError: name 'pd' is not
# defined", presumably because the imports cell had not been executed in that
# kernel session - run the imports cell first.
file_path = '/workspace/LLaRA/data/ref/lastfm/train_data.df'
item_file = '/workspace/LLaRA/data/ref/movielens/id2name.txt'
output_file_path = '/workspace/LLaRA/data/ref/lastfm/similar_train_data.df'

process_data(file_path, item_file, output_file_path)

NameError: name 'pd' is not defined