In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once at module level so that
# get_movie_embeddings reuses the same instance on every call.
model = SentenceTransformer(model_name_or_path="/data/projects/wsx/LLaRA/all-MiniLM-L6-v2/")

def load_data(file_path):
    """Read a pickled pandas object from disk.

    Args:
        file_path: Location of the pickle file to read.

    Returns:
        The object that ``pd.read_pickle`` deserialises from ``file_path``.
    """
    # NOTE: unpickling can execute code embedded in the file; only load
    # files that come from a trusted source.
    loaded = pd.read_pickle(file_path)
    return loaded

def load_movie_dict(item_file):
    """Build a movie-id -> movie-title lookup from a pipe-delimited item file.

    Only the first two columns of the file are read (id and title). The file
    is parsed without a header row and decoded as latin-1.

    Args:
        item_file: Path to the '|'-separated item file.

    Returns:
        dict mapping each movie id to its title. If an id occurs more than
        once, the last occurrence wins.
    """
    items = pd.read_csv(
        item_file,
        sep='|',
        header=None,
        encoding='latin-1',
        usecols=[0, 1],
    )
    items.columns = ['movie_id', 'movie_title']
    ids = items['movie_id'].tolist()
    titles = items['movie_title'].tolist()
    return {movie_id: title for movie_id, title in zip(ids, titles)}

def map_movie_names_only(seq, movie_dict):
    """Translate a sequence of (movie_id, rating) pairs into movie titles.

    Ratings are discarded. An id that has no entry in ``movie_dict`` is kept
    unchanged in the output.

    Args:
        seq: Iterable of 2-item (movie_id, rating) pairs.
        movie_dict: Mapping from movie id to title.

    Returns:
        list with one entry per pair: the title when known, else the raw id.
    """
    names = []
    for movie_id, _rating in seq:
        if movie_id in movie_dict:
            names.append(movie_dict[movie_id])
        else:
            names.append(movie_id)
    return names

def extract_sequences(df, movie_dict):
    """Derive title-only and id-only views of each row's ``seq`` column.

    Two columns are added to ``df`` in place:
      * ``movie_names_only`` - output of ``map_movie_names_only`` for the row.
      * ``seq_only`` - the movie ids of the row's (id, rating) pairs.

    Args:
        df: DataFrame with a ``seq`` column of (movie_id, rating) pair lists.
        movie_dict: Mapping from movie id to title.

    Returns:
        The same ``df`` object, with the two columns added.
    """
    def to_names(pairs):
        return map_movie_names_only(pairs, movie_dict)

    def to_ids(pairs):
        return [movie_id for movie_id, _rating in pairs]

    sequences = df['seq']
    df['movie_names_only'] = sequences.apply(to_names)
    df['seq_only'] = sequences.apply(to_ids)
    return df

def get_movie_embeddings(movie_list):
    """Encode each movie sequence as one embedding vector.

    The entries of each sequence are converted to strings, joined with "。"
    and passed to the module-level ``model``'s ``encode`` method, one call
    per sequence.

    Args:
        movie_list: Iterable of sequences (each an iterable of titles/ids).

    Returns:
        numpy array built from the per-sequence embeddings, in input order.
    """
    def encode_titles(movies):
        joined = "。".join(map(str, movies))
        return model.encode(joined)

    return np.array([encode_titles(movies) for movies in movie_list])

def calculate_similarity(df):
    """Find, for every row, the most similar *other* row by cosine similarity.

    Embeddings are computed from ``df['movie_names_only']`` via
    ``get_movie_embeddings``. Three columns are added to ``df`` in place:
      * ``movie_embeddings`` - the embedding vector of the row.
      * ``most_similar_seq_index`` - POSITIONAL index (0-based row number,
        not an index label) of the most similar other row.
      * ``most_similar_seq`` - the ``seq`` value of that row.

    Args:
        df: DataFrame with ``movie_names_only`` and ``seq`` columns.

    Returns:
        The same ``df`` object, with the three columns added. For a
        single-row frame the only candidate is the row itself, so it is
        reported as its own match. An empty frame gets empty columns.
    """
    movie_embeddings = get_movie_embeddings(df['movie_names_only'].tolist())
    embedding_rows = list(movie_embeddings)
    df['movie_embeddings'] = embedding_rows

    if not embedding_rows:
        # Nothing to compare; np.stack would raise on an empty list.
        df['most_similar_seq_index'] = np.empty(0, dtype=np.intp)
        df['most_similar_seq'] = df['seq'].iloc[:0].values
        return df

    # Copy into a float array so the diagonal can be overwritten safely.
    similarity_matrix = np.array(
        cosine_similarity(np.stack(embedding_rows)), dtype=float
    )
    # Exclude self-matches outright. Subtracting the identity only lowers the
    # diagonal to ~0, which still wins whenever every other similarity is <= 0.
    np.fill_diagonal(similarity_matrix, -np.inf)
    most_similar_indices = np.argmax(similarity_matrix, axis=1)

    df['most_similar_seq_index'] = most_similar_indices
    # argmax yields row positions, so look rows up positionally (.iloc)
    # rather than by label; this stays correct for non-default indexes.
    df['most_similar_seq'] = df['seq'].iloc[most_similar_indices].values
    return df

def add_most_similar_seq_next(df, movie_dict):
    """Attach the 'next' item and readable titles of each row's best match.

    Three columns are added to ``df`` in place:
      * ``most_similar_seq_next`` - the ``next`` value of the row found at
        position ``most_similar_seq_index``.
      * ``most_similar_seq_name`` - titles for the first element of every
        item in ``most_similar_seq``; "Unknown" when the id is not mapped.
      * ``most_similar_seq_next_name`` - title for the first element of
        ``most_similar_seq_next``; "Unknown" when the id is not mapped.

    Args:
        df: DataFrame with ``next``, ``most_similar_seq_index`` and
            ``most_similar_seq`` columns.
        movie_dict: Mapping from movie id to title.

    Returns:
        The same ``df`` object, with the three columns added.
    """
    def titles_of(pairs):
        return [movie_dict.get(pair[0], "Unknown") for pair in pairs]

    def title_of(pair):
        return movie_dict.get(pair[0], "Unknown")

    neighbour_positions = df['most_similar_seq_index']
    df['most_similar_seq_next'] = df['next'].iloc[neighbour_positions].values
    df['most_similar_seq_name'] = df['most_similar_seq'].apply(titles_of)
    df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(title_of)
    return df

def save_data(df, output_file_path):
    """Persist ``df`` to ``output_file_path`` as a pickle file.

    Args:
        df: The pandas object to serialise.
        output_file_path: Destination path of the pickle file.
    """
    pd.to_pickle(df, output_file_path)

def process_data(file_path, item_file, output_file_path):
    """Run the whole pipeline: load, enrich with similarity info, save.

    Steps, in order: load the pickled frame, load the id -> title mapping,
    derive the name/id sequences, compute the most similar row for each row,
    attach the match's 'next' item and titles, then pickle the result.

    Args:
        file_path: Path of the input pickle.
        item_file: Path of the '|'-separated item file.
        output_file_path: Path where the enriched frame is pickled.
    """
    raw = load_data(file_path)
    titles = load_movie_dict(item_file)
    with_sequences = extract_sequences(raw, titles)
    with_matches = calculate_similarity(with_sequences)
    enriched = add_most_similar_seq_next(with_matches, titles)
    save_data(enriched, output_file_path)


In [None]:
# Run the pipeline on Val_data.df and write the result to similar_val_data.df.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory.
file_path = '/data/projects/wsx/LLaRA/data/ref/movielens/Val_data.df'
item_file = '/data/projects/wsx/LLaRA/data/ref/movielens/u.item'
output_file_path = '/data/projects/wsx/LLaRA/data/ref/movielens/similar_val_data.df'

process_data(file_path, item_file, output_file_path)

In [1]:
import pandas as pd

In [2]:
# Load a saved result frame for inspection.
# NOTE(review): this reads similar_test_data.df under /workspace, while the
# pipeline cell in this notebook writes similar_val_data.df under
# /data/projects/wsx - presumably produced by a separate run; confirm.
df=pd.read_pickle("/workspace/LLaRA/data/ref/movielens/similar_test_data.df")

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,seq,len_seq,next,movie_names_only,seq_only,movie_embeddings,most_similar_seq_index,most_similar_seq,most_similar_seq_next,most_similar_seq_name,most_similar_seq_next_name
0,"[(704, 5), (155, 3), (80, 4), (698, 4), (745, ...",10,"(3, 4)","[House of the Spirits, The (1993), Dirty Danci...","[704, 155, 80, 698, 745, 95, 1403, 202, 731, 432]","[0.01653142, 0.06018769, 0.12848003, -0.023819...",4,"[(11, 5), (198, 5), (192, 5), (184, 5), (640, ...","(340, 3)","[Seven (Se7en) (1995), Nikita (La Femme Nikita...",Boogie Nights (1997)
1,"[(1305, 5), (135, 4), (1125, 5), (1124, 5), (2...",10,"(132, 5)","[National Lampoon's Senior Trip (1995), 2001: ...","[1305, 135, 1125, 1124, 240, 605, 477, 485, 52...","[0.002688799, -0.0696835, 0.14252476, -0.01397...",17,"[(801, 4), (529, 3), (448, 5), (569, 4), (230,...","(585, 3)","[Air Up There, The (1994), My Life as a Dog (M...",Son in Law (1993)
2,"[(1030, 5), (394, 3), (104, 3), (779, 4), (103...",10,"(671, 3)","[Beverly Hillbillies, The (1993), Radioland Mu...","[1030, 394, 104, 779, 1036, 66, 446, 1029, 451...","[-0.014814284, 0.00812782, 0.038343, 0.0050779...",29,"[(258, 3), (69, 5), (735, 5), (1021, 4), (268,...","(257, 5)","[Contact (1997), Forrest Gump (1994), Philadel...",Men in Black (1997)
3,"[(537, 4), (293, 4), (244, 3), (287, 5), (524,...",10,"(21, 5)","[My Own Private Idaho (1991), Donnie Brasco (1...","[537, 293, 244, 287, 524, 514, 922, 602, 172, 97]","[0.0041796635, -0.13311706, 0.06252247, 0.0116...",36,"[(149, 4), (509, 4), (323, 4), (461, 4), (960,...","(1524, 4)","[Jude (1996), My Left Foot (1989), Dante's Pea...",Kaspar Hauser (1993)
4,"[(11, 5), (198, 5), (192, 5), (184, 5), (640, ...",10,"(340, 3)","[Seven (Se7en) (1995), Nikita (La Femme Nikita...","[11, 198, 192, 184, 640, 581, 653, 432, 356, 666]","[-0.16389114, -0.076835975, 0.05094645, 0.1335...",50,"[(6, 3), (673, 5), (217, 4), (163, 4), (671, 3...","(86, 4)",[Shanghai Triad (Yao a yao yao dao waipo qiao)...,"Remains of the Day, The (1993)"
