In [28]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [29]:
# Global configuration
vocab_size = 10000  # Embedding input_dim; assumed upper bound on the number of movie IDs
embedding_dim = 128  # embedding dimension
max_length = 10  # maximum sequence length
lstm_units = 64  # LSTM units; also reused as the width of the model's final Dense layer


In [30]:
# Custom attention-pooling layer
class SelfAttention(tf.keras.layers.Layer):
    """Attention pooling that collapses axis 1 of its input.

    Each position along axis 1 is scored with a learned projection of the last
    axis plus a learned per-position bias, squashed with tanh. The scores are
    softmax-normalised along axis 1 and used to form a weighted sum of the
    input over that axis. In this notebook the layer is applied to the output
    of an LSTM with ``return_sequences=True``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight and the per-position bias."""
        # Projection from the last (feature) axis down to one score per position.
        self.W = self.add_weight(
            name='attention_weight',
            shape=(input_shape[-1], 1),
            initializer='random_normal',
            trainable=True,
        )
        # One bias per position on axis 1, so that axis must have a known size.
        self.b = self.add_weight(
            name='attention_bias',
            shape=(input_shape[1], 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        scores = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)
        weights = tf.nn.softmax(scores, axis=1)
        pooled = tf.reduce_sum(inputs * weights, axis=1)
        return pooled


In [31]:
# Build the model
def create_model(vocab_size, embedding_dim, max_length, lstm_units):
    """Build the sequence encoder: Embedding -> LSTM -> SelfAttention -> Dense.

    Args:
        vocab_size: Passed to the Embedding layer as ``input_dim``.
        embedding_dim: Passed to the Embedding layer as ``output_dim``.
        max_length: Number of IDs in each input sequence (the Input shape).
        lstm_units: Number of LSTM units; also the width of the final Dense layer.

    Returns:
        A Keras ``Model`` taking ``(batch, max_length)`` ID sequences and
        producing ``(batch, lstm_units)`` vectors. The model is returned
        as built; this function does not call ``compile``.
    """
    input_seq = Input(shape=(max_length,))
    # The sequence length is already fixed by the Input layer, so the
    # deprecated ``input_length`` argument of Embedding is not passed.
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_seq)
    # return_sequences=True keeps every timestep so the attention layer can pool them.
    encoded = LSTM(lstm_units, return_sequences=True)(embedded)
    pooled = SelfAttention()(encoded)
    output_layer = Dense(lstm_units)(pooled)
    return Model(inputs=input_seq, outputs=output_layer)

# Create the model (module-level instance that process_data reads as a global).
# NOTE(review): no compile()/fit() or weight-loading call is visible in this
# notebook, so the weights appear to be the random initial values — confirm.
model = create_model(vocab_size, embedding_dim, max_length, lstm_units)



In [32]:
def load_data(file_path):
    """Read the pickled object stored at ``file_path`` and return it.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    loaded = pd.read_pickle(file_path)
    return loaded

def load_movie_dict(item_file):
    """Build a ``{movie_id: movie_title}`` lookup from a pipe-delimited item file.

    Only the first two columns of the header-less, latin-1 encoded file are
    read: column 0 becomes the key and column 1 the value.
    """
    items = pd.read_csv(
        item_file,
        sep='|',
        header=None,
        encoding='latin-1',
        usecols=[0, 1],
    )
    items.columns = ['movie_id', 'movie_title']
    id_title_pairs = zip(items['movie_id'], items['movie_title'])
    return {movie_id: title for movie_id, title in id_title_pairs}

def map_movie_names_only(seq, movie_dict):
    """Translate a sequence of ``(movie_id, rating)`` pairs into titles.

    The rating is ignored. An ID missing from ``movie_dict`` is kept as-is in
    the output instead of being replaced by a title.
    """
    names = []
    for movie_id, _rating in seq:
        names.append(movie_dict.get(movie_id, movie_id))
    return names

def extract_sequences(df, movie_dict):
    """Add title-only and ID-only views of the ``seq`` column to ``df``.

    Writes two new columns in place and returns the same frame:
      * ``movie_names_only`` - titles for each ``(movie_id, rating)`` pair,
        produced by ``map_movie_names_only``.
      * ``seq_only`` - just the movie IDs, ratings dropped.
    """
    def to_names(pairs):
        return map_movie_names_only(pairs, movie_dict)

    def to_ids(pairs):
        return [movie_id for movie_id, _rating in pairs]

    df['movie_names_only'] = df['seq'].apply(to_names)
    df['seq_only'] = df['seq'].apply(to_ids)
    return df


In [33]:
def get_movie_embeddings(sequences, model, max_length):
    """Encode ID sequences with ``model`` and return the result as a NumPy array.

    Args:
        sequences: List of movie-ID sequences.
        model: Object exposing ``predict``; called on the padded batch.
        max_length: Length every sequence is padded to (``maxlen``).

    Returns:
        ``np.ndarray`` holding the output of ``model.predict`` for the batch.
    """
    # Pad the sequences at the end ('post') to a common length.
    padded_batch = pad_sequences(sequences, maxlen=max_length, padding='post')
    # Run the model to obtain the embedding representation.
    predicted = model.predict(padded_batch)
    return np.array(predicted)

def calculate_similarity(df, model, max_length):
    """Attach each row's embedding and its most similar *other* row.

    Embeds ``df['seq_only']`` with ``get_movie_embeddings``, computes pairwise
    cosine similarity, and for every row picks the most similar row other than
    itself. With a single row the only candidate is the row itself, so it is
    returned.

    Columns written in place (the same frame is returned):
      * ``movie_embeddings`` - embedding vector of the row's sequence.
      * ``most_similar_seq_index`` - *positional* index of the best match.
      * ``most_similar_seq`` - the ``seq`` value of the best match.

    Args:
        df: Frame with ``seq`` and ``seq_only`` columns.
        model: Model passed through to ``get_movie_embeddings``.
        max_length: Padding length passed through to ``get_movie_embeddings``.

    Returns:
        ``df`` with the three columns above added.
    """
    sequences = df['seq_only'].tolist()
    movie_embeddings = get_movie_embeddings(sequences, model, max_length)
    df['movie_embeddings'] = list(movie_embeddings)

    # NOTE: this is a dense n x n matrix, so memory grows quadratically with
    # the number of rows.
    similarity_matrix = cosine_similarity(movie_embeddings)

    # Exclude self-matches outright. Subtracting the identity only lowers the
    # diagonal to about 0, which still wins whenever every other similarity in
    # the row is negative.
    np.fill_diagonal(similarity_matrix, -np.inf)
    most_similar_indices = np.argmax(similarity_matrix, axis=1)

    df['most_similar_seq_index'] = most_similar_indices
    # argmax yields positions, not index labels, so look the rows up with
    # .iloc; label-based access would be wrong on a non-default index.
    df['most_similar_seq'] = df['seq'].iloc[most_similar_indices].to_numpy()
    return df


In [34]:
def add_most_similar_seq_next(df, movie_dict):
    """Add the matched sequence's ``next`` entry and readable titles to ``df``.

    Columns written in place (the same frame is returned):
      * ``most_similar_seq_next`` - the ``next`` value of the row at position
        ``most_similar_seq_index``.
      * ``most_similar_seq_name`` - titles for every item of
        ``most_similar_seq`` (``"Unknown"`` when the ID has no title).
      * ``most_similar_seq_next_name`` - title for ``most_similar_seq_next``
        (``"Unknown"`` when the ID has no title).
    """
    def title_of(entry):
        # Each entry is indexable and its first element is the movie ID.
        return movie_dict.get(entry[0], "Unknown")

    def titles_of(entries):
        return [title_of(entry) for entry in entries]

    neighbour_positions = df['most_similar_seq_index']
    df['most_similar_seq_next'] = df['next'].iloc[neighbour_positions].values
    df['most_similar_seq_name'] = df['most_similar_seq'].apply(titles_of)
    df['most_similar_seq_next_name'] = df['most_similar_seq_next'].apply(title_of)
    return df

def save_data(df, output_file_path):
    """Pickle ``df`` to ``output_file_path``."""
    pd.to_pickle(df, output_file_path)


In [51]:
def process_data(file_path, item_file, output_file_path):
    """Run the full pipeline: load, embed, match, annotate, and save.

    Uses the module-level ``model`` and ``max_length`` for the embedding step.

    Args:
        file_path: Pickled input frame, read with ``load_data``.
        item_file: Item file giving the ID-to-title lookup (``load_movie_dict``).
        output_file_path: Where the annotated frame is pickled.
    """
    frame = load_data(file_path)
    titles_by_id = load_movie_dict(item_file)

    # Each stage adds columns to the frame and hands it to the next one.
    frame = extract_sequences(frame, titles_by_id)
    frame = calculate_similarity(frame, model, max_length)
    frame = add_most_similar_seq_next(frame, titles_by_id)

    save_data(frame, output_file_path)

# Process the data with the pipeline function (here: Val_data.df -> lstm_val_data.df).
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory.
file_path = '/workspace/LLaRA/data/ref/movielens/Val_data.df'
item_file = '/workspace/LLaRA/data/ref/movielens/u.item'
output_file_path = '/workspace/LLaRA/data/ref/movielens/lstm_val_data.df'

process_data(file_path, item_file, output_file_path)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


In [36]:
# Reload a saved result frame for inspection.
# NOTE(review): this reads lstm_train_data.df, whereas the process_data call
# above writes lstm_val_data.df — confirm which file is meant.
df=pd.read_pickle('/workspace/LLaRA/data/ref/movielens/lstm_train_data.df')

In [37]:
# Preview the first rows of the loaded frame, including the columns added by the pipeline.
df.head()

Unnamed: 0,seq,len_seq,next,movie_names_only,seq_only,movie_embeddings,most_similar_seq_index,most_similar_seq,most_similar_seq_next,most_similar_seq_name,most_similar_seq_next_name
0,"[(1682, 0), (1682, 0), (1682, 0), (1682, 0), (...",1,"(299, 5)","[Scream of Stone (Schrei aus Stein) (1991), Sc...","[1682, 1682, 1682, 1682, 1682, 1682, 1682, 168...","[-0.0123769175, 0.011153697, 0.0014485666, 0.0...",798,"[(1682, 0), (1682, 0), (1682, 0), (1682, 0), (...","(303, 5)","[Scream of Stone (Schrei aus Stein) (1991), Sc...",Ulee's Gold (1997)
1,"[(299, 5), (1682, 0), (1682, 0), (1682, 0), (1...",1,"(321, 5)","[Hoodlum (1997), Scream of Stone (Schrei aus S...","[299, 1682, 1682, 1682, 1682, 1682, 1682, 1682...","[-0.011450895, 0.0058849547, 0.0026951947, 0.0...",1003,"[(299, 4), (1682, 0), (1682, 0), (1682, 0), (1...","(22, 5)","[Hoodlum (1997), Scream of Stone (Schrei aus S...",Braveheart (1995)
2,"[(299, 5), (321, 5), (1682, 0), (1682, 0), (16...",2,"(290, 4)","[Hoodlum (1997), Mother (1996), Scream of Ston...","[299, 321, 1682, 1682, 1682, 1682, 1682, 1682,...","[-0.005407514, 0.008567907, 0.0028236068, 0.01...",48729,"[(299, 4), (321, 3), (1682, 0), (1682, 0), (16...","(258, 3)","[Hoodlum (1997), Mother (1996), Scream of Ston...",Contact (1997)
3,"[(299, 5), (321, 5), (290, 4), (1682, 0), (168...",3,"(297, 3)","[Hoodlum (1997), Mother (1996), Fierce Creatur...","[299, 321, 290, 1682, 1682, 1682, 1682, 1682, ...","[-0.0048835003, 0.0066323597, 0.0013526137, 0....",48729,"[(299, 4), (321, 3), (1682, 0), (1682, 0), (16...","(258, 3)","[Hoodlum (1997), Mother (1996), Scream of Ston...",Contact (1997)
4,"[(299, 5), (321, 5), (290, 4), (297, 3), (1682...",4,"(590, 4)","[Hoodlum (1997), Mother (1996), Fierce Creatur...","[299, 321, 290, 297, 1682, 1682, 1682, 1682, 1...","[-0.0025054945, 0.0028942805, 0.0025338004, 0....",5,"[(299, 5), (321, 5), (290, 4), (297, 3), (590,...","(741, 3)","[Hoodlum (1997), Mother (1996), Fierce Creatur...","Last Supper, The (1995)"


In [45]:
# Spot check on row 40: its input sequence rendered as titles.
df['movie_names_only'][40]

['Short Cuts (1993)',
 "Carlito's Way (1993)",
 "Weekend at Bernie's (1989)",
 'Nadja (1994)',
 'Fearless (1993)',
 'Flesh and Bone (1993)',
 'Naked Gun 33 1/3: The Final Insult (1994)',
 'Funeral, The (1996)',
 'Rob Roy (1995)',
 "Wes Craven's New Nightmare (1994)"]

In [47]:
# The 'next' entry recorded for row 40.
df['next'][40]

(539, 4)

In [48]:
# Title of the 'next' entry belonging to row 40's most similar sequence.
df['most_similar_seq_next_name'][40]

'Donnie Brasco (1997)'

In [46]:
# Titles of the sequence selected as most similar to row 40's.
df['most_similar_seq_name'][40]

["Carlito's Way (1993)",
 "Weekend at Bernie's (1989)",
 'Nadja (1994)',
 'Fearless (1993)',
 'Flesh and Bone (1993)',
 'Naked Gun 33 1/3: The Final Insult (1994)',
 'Funeral, The (1996)',
 'Rob Roy (1995)',
 "Wes Craven's New Nightmare (1994)",
 'Mouse Hunt (1997)']