In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

In [8]:
# Load the tokenizer and the model once, at notebook scope, so later cells reuse them.
# NOTE(review): the checkpoint name carries a `zh` tag while the movie titles
# handled in this notebook are English -- confirm this checkpoint is intended.
MODEL_NAME = 'BAAI/bge-large-zh-v1.5'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [9]:
import pandas as pd
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_data(file_path):
    """Read the pickled pandas object stored at ``file_path`` and return it.

    NOTE(review): unpickling can execute arbitrary code, so only point this at
    files from a trusted source.
    """
    loaded = pd.read_pickle(file_path)
    return loaded

def load_movie_dict(item_file):
    """Build a ``{movie_id: movie_title}`` lookup from a pipe-delimited item file.

    The file is read as latin-1 text without a header row; only its first two
    columns are used (column 0 supplies the keys, column 1 the values).
    """
    items = pd.read_csv(
        item_file,
        sep='|',
        header=None,
        encoding='latin-1',
        usecols=[0, 1],
    )
    items.columns = ['movie_id', 'movie_title']

    lookup = {}
    for movie_id, movie_title in zip(items['movie_id'], items['movie_title']):
        lookup[movie_id] = movie_title
    return lookup

def map_movie_names_only(seq, movie_dict):
    """Translate a sequence of ``(movie_id, rating)`` pairs into movie titles.

    The rating of each pair is discarded. An id that is not a key of
    ``movie_dict`` is kept in the result unchanged.
    """
    names = []
    for movie_id, _rating in seq:
        if movie_id in movie_dict:
            names.append(movie_dict[movie_id])
        else:
            names.append(movie_id)
    return names

def extract_sequences(df, movie_dict):
    """Add title-only and id-only views of every row's ``seq`` to ``df``.

    Two columns are written onto ``df`` itself (the frame is modified in place
    and the same object is returned):

    * ``movie_names_only`` -- ``seq`` passed through ``map_movie_names_only``.
    * ``seq_only`` -- the first element of each pair in ``seq``.
    """
    def to_titles(pairs):
        return map_movie_names_only(pairs, movie_dict)

    def to_ids(pairs):
        return [movie_id for (movie_id, _rating) in pairs]

    df['movie_names_only'] = df['seq'].apply(to_titles)
    df['seq_only'] = df['seq'].apply(to_ids)
    return df

def get_movie_embeddings(movie_list, tokenizer, model):
    """Embed each movie sequence as one L2-normalised vector.

    Every inner sequence is joined into a single string with "。" as the
    separator, tokenised (truncated to at most 512 tokens) and passed through
    ``model``. The vector at position 0 of the model's first output (the [CLS]
    token) is L2-normalised and collected.

    Args:
        movie_list: Iterable of sequences; each item is converted with ``str``.
        tokenizer: Callable invoked with ``return_tensors='pt'`` that returns a
            mapping of tensors accepted by ``model`` as keyword arguments.
        model: ``torch.nn.Module`` whose first output is indexed as ``[:, 0]``.

    Returns:
        ``np.ndarray`` with one row per input sequence (an empty array when
        ``movie_list`` is empty).
    """
    # Follow the model's device so the function also works when the model has
    # been moved to a GPU; the original fed CPU tensors to the model and called
    # .numpy() directly, both of which fail for a CUDA model.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    embeddings = []
    with torch.no_grad():
        for movies in movie_list:
            movie_string = "。".join(str(movie) for movie in movies)
            encoded_input = tokenizer(
                movie_string,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=512,
            )
            encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
            model_output = model(**encoded_input)
            movie_embedding = model_output[0][:, 0]  # [CLS] token's embedding
            movie_embedding = torch.nn.functional.normalize(movie_embedding, p=2, dim=1)
            # .cpu() is a no-op for CPU tensors and required before .numpy() otherwise.
            embeddings.append(movie_embedding[0].cpu().numpy())
    return np.array(embeddings)

def calculate_similarity(df, tokenizer, model):
    """Attach each row's embedding and its nearest neighbouring row to ``df``.

    Reads the ``movie_names_only`` and ``seq`` columns and writes, in place:

    * ``movie_embeddings`` -- the row's vector from ``get_movie_embeddings``.
    * ``most_similar_seq_index`` -- 0-based *position* (not index label) of the
      other row with the highest cosine similarity.
    * ``most_similar_seq`` -- the ``seq`` value of that row.

    Args:
        df: DataFrame with ``movie_names_only`` and ``seq`` columns.
        tokenizer: Forwarded to ``get_movie_embeddings``.
        model: Forwarded to ``get_movie_embeddings``.

    Returns:
        The same ``df`` object, with the three columns added.
    """
    movie_embeddings = get_movie_embeddings(df['movie_names_only'].tolist(), tokenizer, model)
    df['movie_embeddings'] = list(movie_embeddings)
    embeddings = np.stack(df['movie_embeddings'].values)
    similarity_matrix = cosine_similarity(embeddings)
    # Exclude self-matches outright. Subtracting the identity only lowers the
    # diagonal to about 0, so a row whose similarities to all other rows are
    # negative would still select itself.
    np.fill_diagonal(similarity_matrix, -np.inf)
    most_similar_indices = np.argmax(similarity_matrix, axis=1)
    df['most_similar_seq_index'] = most_similar_indices
    # argmax yields positions, so look rows up positionally; a label lookup
    # (df.at) is only correct when the index happens to be 0..n-1.
    df['most_similar_seq'] = df['seq'].iloc[most_similar_indices].values
    return df

def add_most_similar_seq_next(df, movie_dict):
    """Add the neighbour's ``next`` entry and human-readable titles to ``df``.

    Reads ``next``, ``most_similar_seq_index`` and ``most_similar_seq`` and
    writes, in place:

    * ``most_similar_seq_next`` -- the ``next`` value at the position stored in
      ``most_similar_seq_index``.
    * ``most_similar_seq_name`` -- titles for the first element of every entry
      in ``most_similar_seq``.
    * ``most_similar_seq_next_name`` -- title for the first element of
      ``most_similar_seq_next``.

    Ids missing from ``movie_dict`` are rendered as ``"Unknown"``. The same
    ``df`` object is returned.
    """
    def title_of(movie_id):
        return movie_dict.get(movie_id, "Unknown")

    def titles_of(entries):
        return [title_of(entry[0]) for entry in entries]

    neighbour_positions = df['most_similar_seq_index']
    df['most_similar_seq_next'] = df['next'].iloc[neighbour_positions].values
    df['most_similar_seq_name'] = df['most_similar_seq'].apply(titles_of)
    df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(
        lambda following: title_of(following[0])
    )
    return df

def save_data(df, output_file_path):
    """Pickle ``df`` to ``output_file_path``; nothing is returned."""
    df.to_pickle(path=output_file_path)

def process_data(file_path, item_file, output_file_path, tokenizer, model):
    """Run the whole pipeline: load, enrich, find neighbours, save.

    Steps, in order: ``load_data(file_path)``, ``load_movie_dict(item_file)``,
    ``extract_sequences``, ``calculate_similarity`` (using ``tokenizer`` and
    ``model``), ``add_most_similar_seq_next`` and finally
    ``save_data(..., output_file_path)``. Nothing is returned.
    """
    raw_frame = load_data(file_path)
    titles_by_id = load_movie_dict(item_file)

    with_sequences = extract_sequences(raw_frame, titles_by_id)
    with_neighbours = calculate_similarity(with_sequences, tokenizer, model)
    enriched = add_most_similar_seq_next(with_neighbours, titles_by_id)

    save_data(enriched, output_file_path)


In [11]:
# Run the pipeline functions on train_data.df and write the enriched frame next to it.
item_file = '/workspace/LLaRA/data/ref/movielens/u.item'
file_path = '/workspace/LLaRA/data/ref/movielens/train_data.df'
output_file_path = '/workspace/LLaRA/data/ref/movielens/train_data_flagembedding.df'

process_data(
    file_path=file_path,
    item_file=item_file,
    output_file_path=output_file_path,
    tokenizer=tokenizer,
    model=model,
)

In [10]:
# Reload a saved result frame for inspection.
# NOTE(review): this reads the test_ output, whereas the pipeline cell in this
# notebook writes the train_ output -- presumably from a separate run; confirm.
result_path = '/workspace/LLaRA/data/ref/movielens/test_data_flagembedding.df'
dt = pd.read_pickle(result_path)

In [11]:
# Preview the first five rows of the reloaded frame.
dt.head(n=5)

Unnamed: 0,seq,len_seq,next,movie_names_only,seq_only,movie_embeddings,most_similar_seq_index,most_similar_seq,most_similar_seq_next,most_similar_seq_name,most_similar_seq_next_name
0,"[(704, 5), (155, 3), (80, 4), (698, 4), (745, ...",10,"(3, 4)","[House of the Spirits, The (1993), Dirty Danci...","[704, 155, 80, 698, 745, 95, 1403, 202, 731, 432]","[0.0034087566, 0.024378177, -0.020450857, 0.00...",80,"[(3, 3), (163, 4), (209, 5), (510, 5), (495, 4...","(207, 3)","[Four Rooms (1995), Return of the Pink Panther...",Cyrano de Bergerac (1990)
1,"[(1305, 5), (135, 4), (1125, 5), (1124, 5), (2...",10,"(132, 5)","[National Lampoon's Senior Trip (1995), 2001: ...","[1305, 135, 1125, 1124, 240, 605, 477, 485, 52...","[0.035009373, 0.043504782, -0.012305785, 0.003...",17,"[(801, 4), (529, 3), (448, 5), (569, 4), (230,...","(585, 3)","[Air Up There, The (1994), My Life as a Dog (M...",Son in Law (1993)
2,"[(1030, 5), (394, 3), (104, 3), (779, 4), (103...",10,"(671, 3)","[Beverly Hillbillies, The (1993), Radioland Mu...","[1030, 394, 104, 779, 1036, 66, 446, 1029, 451...","[0.0023376667, 0.039744396, -0.0061233104, 0.0...",90,"[(1059, 4), (547, 3), (382, 3), (202, 4), (442...","(196, 4)",[Don't Be a Menace to South Central While Drin...,Dead Poets Society (1989)
3,"[(537, 4), (293, 4), (244, 3), (287, 5), (524,...",10,"(21, 5)","[My Own Private Idaho (1991), Donnie Brasco (1...","[537, 293, 244, 287, 524, 514, 922, 602, 172, 97]","[0.011190959, 0.026255367, -0.014226674, 0.004...",27,"[(8, 5), (99, 5), (148, 5), (474, 5), (13, 4),...","(123, 5)","[Babe (1995), Snow White and the Seven Dwarfs ...","Frighteners, The (1996)"
4,"[(11, 5), (198, 5), (192, 5), (184, 5), (640, ...",10,"(340, 3)","[Seven (Se7en) (1995), Nikita (La Femme Nikita...","[11, 198, 192, 184, 640, 581, 653, 432, 356, 666]","[-0.022858193, -0.011461976, -0.0041177417, -0...",80,"[(3, 3), (163, 4), (209, 5), (510, 5), (495, 4...","(207, 3)","[Four Rooms (1995), Return of the Pink Panther...",Cyrano de Bergerac (1990)


In [14]:
# Titles of the sequence stored in the row labelled 32.
dt.loc[32, 'movie_names_only']

['Deep Rising (1998)',
 'Ma vie en rose (My Life in Pink) (1997)',
 'Deceiver (1997)',
 'Kiss the Girls (1997)',
 'Wishmaster (1997)',
 'Inventing the Abbotts (1997)',
 "Marvin's Room (1996)",
 'Deconstructing Harry (1997)',
 'Air Force One (1997)',
 'Mrs. Brown (Her Majesty, Mrs. Brown) (1997)']

In [15]:
# Titles of the most similar sequence found for the row labelled 32.
dt.loc[32, 'most_similar_seq_name']

['Ma vie en rose (My Life in Pink) (1997)',
 'Secrets & Lies (1996)',
 'Apt Pupil (1998)',
 'Starship Troopers (1997)',
 'Edge, The (1997)',
 'Men in Black (1997)',
 'Deconstructing Harry (1997)',
 '3 Ninjas: High Noon At Mega Mountain (1998)',
 'Kiss the Girls (1997)',
 'MatchMaker, The (1997)']