In [15]:
import pandas as pd
import numpy as np
from dtw import dtw

# 定义 safe_eval 函数
def safe_eval(val):
    """Parse a stringified Python literal back into a Python object.

    Args:
        val: Any value. Strings are parsed; every other type is passed
            through unchanged.

    Returns:
        The parsed object when ``val`` is a string, otherwise ``val`` itself.

    Raises:
        ValueError, SyntaxError: If ``val`` is a string that is not a plain
            Python literal (numbers, strings, tuples, lists, dicts, sets,
            booleans, ``None``).
    """
    # SECURITY: the previous implementation used eval(), which executes any
    # expression embedded in the data file. literal_eval only accepts literal
    # syntax, so a crafted cell value can no longer run code.
    import ast  # local import so the function works without touching the import cell

    if isinstance(val, str):
        return ast.literal_eval(val)
    return val

# 定义 LCS 函数
def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y.

    Elements are compared with ``==``. Both arguments must support ``len()``
    and integer indexing.
    """
    rows, cols = len(X), len(Y)
    # table[i][j] holds the LCS length of X[:i] and Y[:j]; row 0 and column 0
    # describe an empty prefix and therefore keep their initial value of 0.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    for i in range(1, rows + 1):
        above = table[i - 1]
        current = table[i]
        for j in range(1, cols + 1):
            if X[i - 1] == Y[j - 1]:
                # Matching elements extend the diagonal subsequence by one.
                current[j] = above[j - 1] + 1
            else:
                # Otherwise carry over the better of "skip X[i-1]" / "skip Y[j-1]".
                current[j] = max(above[j], current[j - 1])

    return table[rows][cols]

# 定义 Sequence Order Preservation 函数
def sequence_order_preservation(seq, most_similar_seq):
    """Score how much of the ordering of one sequence survives in the other.

    Only the first element of each item is compared (the ID). The score is the
    LCS length of the two ID lists divided by the length of the shorter list.

    Returns:
        The ratio described above, or ``0`` when either sequence is empty.
    """
    ids_a = [entry[0] for entry in seq]
    ids_b = [entry[0] for entry in most_similar_seq]

    common = lcs(ids_a, ids_b)
    shorter = min(len(ids_a), len(ids_b))
    # Guard against division by zero for empty sequences.
    return common / shorter if shorter else 0

# 定义 DCG 和 nDCG 函数
def dcg_at_k(r, k):
    """Compute the discounted cumulative gain over the first ``k`` positions.

    The first relevance is taken undiscounted; the relevance at 1-based
    position ``i >= 2`` is divided by ``log2(i)``.

    Args:
        r: Sequence of relevance scores in ranked order.
        k: Number of leading positions to include.

    Returns:
        The DCG value, or ``0.0`` when no scores remain after truncation.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the documented replacement and works on older versions too.
    gains = np.asarray(r, dtype=float)[:k]
    if gains.size:
        discounts = np.log2(np.arange(2, gains.size + 1))
        return gains[0] + np.sum(gains[1:] / discounts)
    return 0.0

def ndcg_at_k(r, k):
    """Compute the normalised DCG over the first ``k`` positions.

    The normaliser is the DCG of the same scores sorted in descending order.
    Returns ``0.0`` when that ideal DCG is zero.
    """
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    # A zero ideal DCG would make the ratio undefined.
    return dcg_at_k(r, k) / ideal if ideal else 0.0

# 解析 seq 和 most_similar_seq 列
def parse_sequences(df):
    """Run ``safe_eval`` over the 'seq' and 'most_similar_seq' columns.

    The columns are overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    for column in ('seq', 'most_similar_seq'):
        df[column] = df[column].apply(safe_eval)
    return df

# 计算 Sequence Order Preservation
def calculate_sequence_order_preservation(df):
    """Add a 'sequence_order_preservation' column computed row by row.

    Each row's 'seq' and 'most_similar_seq' values are passed to
    ``sequence_order_preservation``. The frame is modified in place and returned.
    """
    def _row_score(row):
        return sequence_order_preservation(row['seq'], row['most_similar_seq'])

    df['sequence_order_preservation'] = df.apply(_row_score, axis=1)
    return df

# 计算 nDCG@1, nDCG@2, nDCG@3
def calculate_ndcg(df):
    """Add 'ndcg@1', 'ndcg@2' and 'ndcg@3' columns to the frame.

    For every row, the second element of each item in 'most_similar_seq' is
    used as the relevance score list passed to ``ndcg_at_k``. The frame is
    modified in place and returned.
    """
    def _ratings(row):
        # Each item is unpacked as a 2-tuple; only the second value is kept.
        return [rating for _, rating in row['most_similar_seq']]

    for cutoff in (1, 2, 3):
        df[f'ndcg@{cutoff}'] = df.apply(
            lambda row: ndcg_at_k(_ratings(row), cutoff), axis=1)
    return df

# 计算 DTW
def calculate_dtw(seq, most_similar_seq):
    """Return the DTW distance between the ID columns of two sequences.

    The first element of every item is collected into a column vector of
    shape (n, 1) and handed to ``dtw`` together with an L1-norm point distance.
    """
    def _id_column(items):
        return np.array([item[0] for item in items]).reshape(-1, 1)

    def _l1_distance(x, y):
        # L1 norm of the difference (ord=1).
        # NOTE(review): presumably dtw passes single rows here, in which case
        # this equals the absolute difference -- confirm against the dtw package.
        return np.linalg.norm(x - y, ord=1)

    # The dtw call is unpacked as four values; only the first is returned.
    distance, _, _, _ = dtw(_id_column(seq), _id_column(most_similar_seq), dist=_l1_distance)
    return distance

def calculate_dtw_for_df(df):
    """Add a 'dtw_distance' column computed row by row with ``calculate_dtw``.

    The frame is modified in place and returned.
    """
    def _row_distance(row):
        return calculate_dtw(row['seq'], row['most_similar_seq'])

    df['dtw_distance'] = df.apply(_row_distance, axis=1)
    return df

# 主函数
def main(input_path='/workspace/LLaRA/Sequence_analysis/All-Mini-VS-Llama/AllMini_train_data.df',
         output_path='/workspace/LLaRA/Sequence_analysis/All-Mini-VS-Llama/AllMini_train.csv'):
    """Compute all sequence metrics for one pickled frame and save them as CSV.

    Args:
        input_path: Pickled DataFrame to load. Defaults to the path this cell
            originally hard-coded, so ``main()`` behaves as before.
        output_path: Destination CSV for the frame with the metric columns
            added. Defaults to the original hard-coded path.
    """
    # NOTE: pickle files can execute code when loaded; only read trusted files.
    df = pd.read_pickle(input_path)

    # Each step adds its columns to the frame and returns it.
    df = parse_sequences(df)
    df = calculate_sequence_order_preservation(df)
    df = calculate_ndcg(df)
    df = calculate_dtw_for_df(df)

    # Show the computed metric columns.
    print(df[['sequence_order_preservation', 'ndcg@1', 'ndcg@2', 'ndcg@3', 'dtw_distance']])

    # Persist the updated frame without the index column.
    df.to_csv(output_path, index=False)


# Run the pipeline
if __name__ == "__main__":
    main()


In [6]:
#AllMini_test.csv
def main(csv_path='/workspace/LLaRA/Sequence_analysis/All-Mini-VS-Llama/AllMini_test.csv'):
    """Print the column mean of every evaluation metric stored in a CSV file.

    Args:
        csv_path: CSV file containing the metric columns. Defaults to the path
            this cell originally hard-coded, so ``main()`` behaves as before.

    Raises:
        KeyError: If one of the metric columns is missing from the file.
    """
    df = pd.read_csv(csv_path)

    # All averages are computed before anything is printed, so a missing
    # column fails without leaving partial output.
    metrics = ['sequence_order_preservation', 'ndcg@1', 'ndcg@2', 'ndcg@3', 'dtw_distance']
    averages = {metric: df[metric].mean() for metric in metrics}

    print("Average Metrics:")
    for metric, avg in averages.items():
        print(f"{metric}: {avg}")


# Run the summary
if __name__ == "__main__":
    main()


Average Metrics:
sequence_order_preservation: 0.12315789473684209
ndcg@1: 0.834736842105263
ndcg@2: 0.8658479532163742
ndcg@3: 0.8742713136891715
dtw_distance: 2243.7368421052633


In [8]:
#AllMini_val.csv
def main(csv_path='/workspace/LLaRA/Sequence_analysis/All-Mini-VS-Llama/AllMini_val.csv'):
    """Print the column mean of every evaluation metric stored in a CSV file.

    Args:
        csv_path: CSV file containing the metric columns. Defaults to the path
            this cell originally hard-coded, so ``main()`` behaves as before.

    Raises:
        KeyError: If one of the metric columns is missing from the file.
    """
    df = pd.read_csv(csv_path)

    # All averages are computed before anything is printed, so a missing
    # column fails without leaving partial output.
    metrics = ['sequence_order_preservation', 'ndcg@1', 'ndcg@2', 'ndcg@3', 'dtw_distance']
    averages = {metric: df[metric].mean() for metric in metrics}

    print("Average Metrics:")
    for metric, avg in averages.items():
        print(f"{metric}: {avg}")


# Run the summary
if __name__ == "__main__":
    main()

Average Metrics:
sequence_order_preservation: 0.11276595744680849
ndcg@1: 0.7936170212765954
ndcg@2: 0.8229609929078014
ndcg@3: 0.838496200418662
dtw_distance: 2410.053191489362


In [10]:
#llama_val.csv
def main(csv_path='/workspace/LLaRA/Sequence_analysis/All-Mini-VS-Llama/llama_val.csv'):
    """Print the column mean of every evaluation metric stored in a CSV file.

    Args:
        csv_path: CSV file containing the metric columns. Defaults to the path
            this cell originally hard-coded, so ``main()`` behaves as before.

    Raises:
        KeyError: If one of the metric columns is missing from the file.
    """
    df = pd.read_csv(csv_path)

    # All averages are computed before anything is printed, so a missing
    # column fails without leaving partial output.
    metrics = ['sequence_order_preservation', 'ndcg@1', 'ndcg@2', 'ndcg@3', 'dtw_distance']
    averages = {metric: df[metric].mean() for metric in metrics}

    print("Average Metrics:")
    for metric, avg in averages.items():
        print(f"{metric}: {avg}")


# Run the summary
if __name__ == "__main__":
    main()

Average Metrics:
sequence_order_preservation: 0.09468085106382977
ndcg@1: 0.8239361702127657
ndcg@2: 0.8360520094562648
ndcg@3: 0.8587564553661037
dtw_distance: 2489.031914893617


In [12]:
#llama_test.csv
def main(csv_path='/workspace/LLaRA/Sequence_analysis/All-Mini-VS-Llama/llama_test.csv'):
    """Print the column mean of every evaluation metric stored in a CSV file.

    Args:
        csv_path: CSV file containing the metric columns. Defaults to the path
            this cell originally hard-coded, so ``main()`` behaves as before.

    Raises:
        KeyError: If one of the metric columns is missing from the file.
    """
    df = pd.read_csv(csv_path)

    # All averages are computed before anything is printed, so a missing
    # column fails without leaving partial output.
    metrics = ['sequence_order_preservation', 'ndcg@1', 'ndcg@2', 'ndcg@3', 'dtw_distance']
    averages = {metric: df[metric].mean() for metric in metrics}

    print("Average Metrics:")
    for metric, avg in averages.items():
        print(f"{metric}: {avg}")


# Run the summary
if __name__ == "__main__":
    main()

Average Metrics:
sequence_order_preservation: 0.12105263157894738
ndcg@1: 0.8305263157894734
ndcg@2: 0.8510818713450291
ndcg@3: 0.8622960886468778
dtw_distance: 2386.7894736842104


In [14]:
#llama_train.csv
def main(csv_path='/workspace/LLaRA/Sequence_analysis/All-Mini-VS-Llama/llama_train.csv'):
    """Print the column mean of every evaluation metric stored in a CSV file.

    Args:
        csv_path: CSV file containing the metric columns. Defaults to the path
            this cell originally hard-coded, so ``main()`` behaves as before.

    Raises:
        KeyError: If one of the metric columns is missing from the file.
    """
    df = pd.read_csv(csv_path)

    # All averages are computed before anything is printed, so a missing
    # column fails without leaving partial output.
    metrics = ['sequence_order_preservation', 'ndcg@1', 'ndcg@2', 'ndcg@3', 'dtw_distance']
    averages = {metric: df[metric].mean() for metric in metrics}

    print("Average Metrics:")
    for metric, avg in averages.items():
        print(f"{metric}: {avg}")


# Run the summary
if __name__ == "__main__":
    main()

Average Metrics:
sequence_order_preservation: 0.9007530560917122
ndcg@1: 0.8191488272796398
ndcg@2: 0.8363906783980481
ndcg@3: 0.8498570717353356
dtw_distance: 602.7047727671521
