# Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from dotenv import load_dotenv
from scipy.stats import spearmanr
import os
load_dotenv()


In [None]:
# Load the score table from the Excel workbook; the analysis classes below are all built from this frame.
origindf = pd.read_excel('score.xlsx')

In [None]:
class CountScore:
  """Contrast plain citation counts with relevance-weighted citation scores.

  Every input row is treated as one citation. ``parse()`` derives a binary
  ``count`` and a weighted ``score`` per row, rescales the scores so that their
  mean equals the mean of ``count``, and aggregates both per cited paper.
  ``plot()``, ``topk()`` and ``detail()`` read columns created by ``parse()``,
  so ``parse()`` has to be called first.
  """

  _df: pd.DataFrame  # per-paper aggregate; only exists after parse()

  def __init__(self, df, model=None):
    """Copy the input table and resolve the LLM relevance column.

    Args:
      df: citation-level table. The methods read ``cited_id``,
        ``cited_title``, ``relevance_cross``, ``author_independence`` and the
        LLM column.
      model: name of the LLM relevance column. When omitted, the value of the
        ``LLM_MODEL`` environment variable is used (the original behaviour).
    """
    self._origin = df.copy()  # parse() adds columns, so never touch the caller's frame
    self._model = os.getenv('LLM_MODEL') if model is None else model

  def _model_col(self):
    """Return the LLM column name, raising a readable KeyError if unusable."""
    if not self._model:
      raise KeyError(
        "no LLM column configured: pass model=... or set the LLM_MODEL environment variable"
      )
    if self._model not in self._origin.columns:
      raise KeyError(f"LLM column {self._model!r} not found in the input data")
    return self._model

  def get_df(self):
    """Return the per-paper aggregate built by ``parse()``."""
    return self._df

  def parse(self, min_count=10):
    """Compute per-citation scores and aggregate them per cited paper.

    Args:
      min_count: only papers whose summed ``count`` is strictly greater than
        this value are kept in the aggregate.

    Raises:
      KeyError: if the LLM column is not configured or a required column is
        missing from the data.
    """
    model = self._model_col()
    data = self._origin
    cross = data['relevance_cross']
    llm = data[model]
    # Author-independence penalty with weight 0.5: equals 1 when independence is 1.
    penalty = (1 - 0.5) + 0.5 * data['author_independence']

    # A citation counts as 1 only when author independence exceeds 0.5.
    data['count'] = np.where(data['author_independence'] > 0.5, 1, 0)
    data['score'] = (cross * 0.5 + llm * 0.5) * penalty
    data['score_raw'] = (cross + llm) * penalty

    # Rescale each score column so that its mean equals the mean of `count`,
    # which makes summed scores directly comparable to citation counts.
    mean_count = data['count'].mean()
    mean_score = data['score'].mean()
    mean_cross = cross.mean()
    mean_llm = llm.mean()
    data['score_0_to_2_mm'] = data['score'] * mean_count / mean_score
    data['cross_0_to_2'] = cross * (mean_count / mean_cross)
    data['llm_0_to_2'] = llm * (mean_count / mean_llm)

    grouped = data.groupby(['cited_id', 'cited_title']).agg(
      count=('count', 'sum'),  # number of counted citations
      total_score=('score_0_to_2_mm', 'sum'),  # sum of rescaled scores
    )
    self._df = grouped[grouped['count'] > min_count].reset_index()
    # Relative gap between summed score and citation count.
    self._df['diff'] = (self._df['total_score'] - self._df['count']) / self._df['count']

  def plot(self):
    """Draw count vs. summed score per paper, with the relative gap underneath."""
    width = 0.35
    df = self._df
    _, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 5), gridspec_kw={'height_ratios': [2, 1]})

    x = np.arange(len(df['count']))

    # Upper panel: citation count next to the summed score.
    ax1.bar(x - width/2, df['count'], width, label='Count')
    ax1.bar(x + width/2, df['total_score'], width, label='Score Sum')
    ax1.set_ylabel("Value")
    ax1.set_xticks(x)
    ax1.set_xticklabels([])
    ax1.grid(True, alpha=0.3)
    ax1.legend()

    # Lower panel: `diff` drives both the bar height and the bar colour; the
    # colour scale is symmetric around zero.
    val = max(abs(df['diff'].min()), abs(df['diff'].max()))
    norm = plt.Normalize(-val, val)
    cmap = plt.cm.coolwarm
    ax2.bar(x, df['diff'], width, color=cmap(norm(df['diff'])), edgecolor='black')
    ax2.set_xlabel("Sampled Papers")
    ax2.set_ylabel("Normalized Count-Score Gap")
    ax2.set_xticks(x)
    ax2.set_xticklabels(df.index, rotation=45)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

  def topk(self, tol=0.02):
    """Pick representative papers from the aggregate.

    Args:
      tol: papers with ``|diff|`` strictly below this value count as balanced.

    Returns:
      A tuple ``(pos_case, neg_case, bal_case)``: the row with the largest
      ``diff``, the row with the smallest ``diff``, and a frame of all
      balanced rows.
    """
    pos_case = self._df.loc[self._df['diff'].idxmax()]
    neg_case = self._df.loc[self._df['diff'].idxmin()]
    bal_case = self._df[self._df['diff'].abs() < tol]
    return pos_case, neg_case, bal_case

  def detail(self, id:str):
    """Plot the per-citation breakdown for one cited paper.

    Args:
      id: value matched against the ``cited_id`` column. (The name shadows the
        builtin but is kept so keyword callers keep working.)
    """
    model = self._model_col()
    wanted = ['relevance_cross', model, 'author_independence', 'score_raw', 'count',
              'score_0_to_2_mm', 'cross_0_to_2', 'llm_0_to_2']
    # Explicit copy: columns are added below and must not be written through a
    # slice of self._origin (avoids pandas' SettingWithCopyWarning).
    df = self._origin.loc[self._origin['cited_id'] == id, wanted].copy()
    df['llm'] = df[model]
    df['count_sum'] = df['count'].cumsum()
    df['score_sum'] = df['score_0_to_2_mm'].cumsum()
    df['cross_sum'] = df['cross_0_to_2'].cumsum()
    df['llm_sum'] = df['llm_0_to_2'].cumsum()
    df["delta"] = df["score_0_to_2_mm"] - df["count"]

    _, axes = plt.subplots(3, 1, figsize=(8, 5), gridspec_kw={'height_ratios': [2, 1, 1]})
    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    x = np.arange(len(df))

    # (a) running totals of the count and of each rescaled score.
    axes[0].step(x, df['count_sum'], label='Count', color='#9467bd')
    axes[0].plot(x, df['score_sum'], label='Score', color='#d62728')
    axes[0].plot(x, df['cross_sum'], label='Cross-only', color='#1f77b4')
    axes[0].plot(x, df['llm_sum'], label='LLM-only', color='#ff7f0e')
    axes[0].set_xticklabels([])
    axes[0].set_ylabel("Count & Score")
    axes[0].set_title("(a) Cumulative Citation Count & Score")
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # (b) per-citation difference between rescaled score and count.
    norm = plt.Normalize(-1, 1)
    cmap = plt.cm.coolwarm
    axes[1].bar(x, df["delta"], color=cmap(norm(df['delta'])), edgecolor='black')
    axes[1].axhline(0, linestyle="--", color="black", alpha=0.6)
    axes[1].set_title("(b) Per-citation Score Deviation")
    axes[1].set_xticklabels([])
    axes[1].set_ylabel(r"$\tilde{S}_{ij} - N_{ij}$")
    axes[1].grid(True, alpha=0.3)

    # (c) cross and LLM relevance stacked upwards; the author term
    # (independence - 1, i.e. <= 0 for independence <= 1) is drawn from zero.
    axes[2].bar(x, df["relevance_cross"], label="Cross", color='#1f77b4')
    axes[2].bar(x, df["llm"], bottom=df["relevance_cross"], label="LLM", color='#ff7f0e')
    axes[2].bar(
        x,
        -1 + df["author_independence"],
        label="Author Penalty", color='#2ca02c'
    )
    axes[2].axhline(1, linestyle="--", color="black", alpha=0.6)
    axes[2].set_ylabel("Score")
    axes[2].set_title("(c) Score Components")
    axes[2].legend(fontsize=7, loc='upper center', bbox_to_anchor=(0.16, 1.71))
    axes[2].grid(True, alpha=0.3)

    plt.show()


In [None]:
# Build the count-vs-score analysis on the loaded table.
count_score = CountScore(origindf)

In [None]:
# parse() must run before plot(): it creates the per-paper aggregate that plot() draws.
count_score.parse()
count_score.plot()

In [None]:
# Show the papers with the largest and smallest relative gap, plus the nearly balanced ones.
count_score.topk()

In [None]:
# Per-citation breakdown for a single cited paper, selected by its cited_id.
count_score.detail('a9a654ea503386cbfd8bb119fc650cc3d08dc206')

In [None]:
# Per-citation breakdown for a second cited paper, selected by its cited_id.
count_score.detail('10eab4b2feec2c1ec1ecb0107aac91b974445a69')

In [None]:
# Per-citation breakdown for a third cited paper, selected by its cited_id.
count_score.detail('10dde76b297ae90451246138f00e92c832ecf14a')

In [None]:
class ComparisonRelevance:
  """Compare cosine relevance with cross-encoder relevance on the same rows."""

  def __init__(self, df):
    """Keep only the two relevance columns, under shorter names.

    Args:
      df: table providing ``relevance_cosine`` and ``relevance_cross``.
    """
    self._df = pd.DataFrame({
      'cosine': df['relevance_cosine'],
      'cross_encoder': df['relevance_cross']
    })

  def get_df(self):
    """Return the two-column frame (``cosine``, ``cross_encoder``)."""
    return self._df

  def plot(self):
    """Draw a 2x2 figure: histograms, box plots, scatter and difference histogram."""
    df = self._df
    # Spacing is left to tight_layout() at the end, which overrides any
    # earlier subplots_adjust() call.
    _, axes = plt.subplots(2, 2, figsize=(8, 7))

    # (a) overlaid, density-normalised histograms of both scores.
    ax = axes[0, 0]
    ax.hist(df['cosine'], bins=30, alpha=0.5, label='Cosine', color='blue', density=True)
    ax.hist(df['cross_encoder'], bins=30, alpha=0.5, label='Cross-Encoder', color='red', density=True)
    ax.set_title('(a) Distribution of Relevance Scores')
    ax.set_xlabel('Relevance Score')
    ax.set_ylabel('Probability Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # (b) box plots of both scores side by side.
    ax = axes[0, 1]
    ax.boxplot([df['cosine'], df['cross_encoder']], tick_labels=['Cosine', 'Cross-Encoder'])
    ax.set_title('(b) Statistical Summary')
    ax.set_ylabel('Relevance Score')
    ax.grid(True, alpha=0.3)

    # (c) scatter of one score against the other, with the y = x reference line.
    ax = axes[1, 0]
    ax.scatter(df['cosine'], df['cross_encoder'], alpha=0.5, s=10)
    ax.plot([0, 1], [0, 1], 'r--', alpha=0.5)
    ax.set_xlabel('Cosine Relevance Score')
    ax.set_ylabel('Cross-Encoder Relevance Score')
    ax.set_title('(c) Correlation between Methods')
    ax.grid(True, alpha=0.3)

    # (d) histogram of the row-wise difference. density=True so that the
    # y-axis really is the probability density its label announces.
    ax = axes[1, 1]
    ax.hist(df['cosine'] - df['cross_encoder'], bins=30, alpha=0.7, color='purple', density=True)
    ax.axvline(x=0, color='red', linestyle='--', alpha=0.5)
    ax.set_title('(d) Score Differences Distribution')
    ax.set_xlabel('Difference (Cosine - Cross-Encoder)')
    ax.set_ylabel('Probability Density')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Compare the cosine and cross-encoder relevance columns of the loaded table.
relevance = ComparisonRelevance(origindf)
relevance.plot()

In [None]:
class RelevancePenalty:
  """Sweep a factor lambda that scales abstract-level scores towards context-level scores."""

  def __init__(self, df:pd.DataFrame, context_col='gpt-5-mini', abstract_col='gpt-5-mini-abstract'):
    """Copy the input and add the two blended score columns.

    Args:
      df: table providing ``relevance_cross``, ``relevance_cross_abstract``
        and the two LLM columns named below.
      context_col: LLM relevance column for the citation context.
      abstract_col: LLM relevance column for the abstract.
    """
    self._lambdas = np.arange(0.01, 0.95, 0.01)
    self._df = df.copy()
    # Each score is the equal-weight blend of cross-encoder and LLM relevance.
    self._df['score_context'] = self._df['relevance_cross'] * 0.5 + self._df[context_col] * 0.5
    self._df['score_abstract'] = self._df['relevance_cross_abstract'] * 0.5 + self._df[abstract_col] * 0.5

  def get_df(self):
    """Return the working frame including ``score_context`` and ``score_abstract``."""
    return self._df

  def _metrics(self):
    """Evaluate every lambda once and return the series the figure needs.

    Returns:
      dict with ``mae``, ``rho`` and ``mean_gap`` (one entry per lambda),
      ``ratio`` (abstract / context with infinite and missing values removed)
      and ``best_lambda`` (the lambda with the smallest MAE).
    """
    context = self._df['score_context']
    abstract = self._df['score_abstract']

    mae, rho, mean_gap = [], [], []
    for lam in self._lambdas:
      scaled = lam * abstract
      mae.append(np.mean(np.abs(context - scaled)))
      # NOTE(review): Spearman's rho depends only on ranks, so a positive
      # scale factor cannot change it; this curve is expected to be flat.
      coef, _ = spearmanr(context, scaled)
      rho.append(coef)
      mean_gap.append(np.mean(scaled - context))

    ratio = (abstract / context).replace([np.inf, -np.inf], np.nan).dropna()
    return {
      'mae': mae,
      'rho': rho,
      'mean_gap': mean_gap,
      'ratio': ratio,
      'best_lambda': self._lambdas[np.argmin(mae)],
    }

  def plot(self):
    """Draw the 2x2 calibration figure (MAE, Spearman, ratio histogram, mean gap)."""
    lambdas = self._lambdas
    stats = self._metrics()

    _, axes = plt.subplots(2, 2, figsize=(8, 7))

    # (a) mean absolute error per lambda, with the minimiser marked.
    ax = axes[0, 0]
    ax.plot(lambdas, stats['mae'], marker='o')
    ax.axvline(stats['best_lambda'], linestyle='--')
    ax.set_title('(a) MAE minimization')
    ax.set_xlabel(r'$\lambda$')
    ax.set_ylabel('MAE')
    ax.grid(True)

    # (b) rank correlation between context score and scaled abstract score.
    ax = axes[0, 1]
    ax.plot(lambdas, stats['rho'], marker='s')
    ax.set_title('(b) Rank consistency (Spearman)')
    ax.set_xlabel(r'$\lambda$')
    ax.set_ylabel(r'$\rho$')
    ax.grid(True)

    # (c) distribution of the abstract/context ratio, with 0.8 as reference.
    ax = axes[1, 0]
    ax.hist(stats['ratio'], bins=40)
    ax.axvline(0.8, linestyle='--')
    ax.set_title('(c) Abstract/context relevance ratio')
    ax.set_xlabel('Ratio')
    ax.set_ylabel('Frequency')

    # (d) mean signed gap (scaled abstract minus context) per lambda.
    ax = axes[1, 1]
    ax.plot(lambdas, stats['mean_gap'])
    ax.axhline(-0.1, linestyle='--', label='-10% threshold', color='red')
    ax.axvline(0.8, linestyle='--', label=r'$\lambda=0.8$')
    ax.set_title('(d) Mean penalized gap')
    ax.set_xlabel(r'$\lambda$')
    ax.set_ylabel('Mean gap')
    ax.legend()
    ax.grid(True)

    plt.tight_layout()
    plt.show()


In [None]:
# Build the abstract-vs-context penalty sweep from the loaded table.
instance_penalty = RelevancePenalty(origindf)

In [None]:
# Draw the four-panel lambda calibration figure.
instance_penalty.plot()

In [None]:
class RelevanceLLM:
  """Compare cross-encoder relevance with relevance scores from several LLMs."""

  def __init__(self, df:pd.DataFrame):
    """Copy the input and expose ``relevance_cross`` under the label ``Cross-encoder``.

    Args:
      df: table providing ``relevance_cross`` and one column per entry of the
        hard-coded model list below.
    """
    self._df = df.copy()
    self._llm_cols = ["Cross-encoder", "gpt-5-mini", "qwen3-max", "deepseek-chat", "gemini-2.5-flash"]
    self._df['Cross-encoder'] = df['relevance_cross']

  def density(self):
    """Overlay a kernel density estimate of every score column."""
    plt.figure(figsize=(8, 5))
    for col in self._llm_cols:
      sns.kdeplot(self._df[col], label=col, fill=True, alpha=0.3)

    plt.xlabel("Relevance Score")
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

  def heatmap(self):
    """Draw the pairwise Spearman correlation matrix of the score columns."""
    cols = self._llm_cols
    corr = pd.DataFrame(index=cols, columns=cols)

    for first in cols:
      for second in cols:
        rho, _ = spearmanr(self._df[first], self._df[second])
        corr.loc[first, second] = rho

    # The frame starts out with object dtype; the heatmap needs numbers.
    corr = corr.astype(float)

    plt.figure(figsize=(5, 4))
    sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=0, vmax=1)
    plt.xticks(rotation=45, ha='right')
    plt.show()

  def scatter(self, col:str):
    """Scatter cross-encoder relevance against ``col`` and report Spearman's rho.

    Args:
      col: name of the score column plotted on the y-axis.
    """
    cross = self._df["Cross-encoder"]
    other = self._df[col]
    rho, _ = spearmanr(cross, other)

    _, ax = plt.subplots(figsize=(4, 3.5))
    ax.scatter(cross, other, alpha=0.4, s=10)
    ax.plot([0,1], [0,1], 'r--', alpha=0.5)
    ax.set_xlabel("Cross-Encoder Relevance")
    ax.set_ylabel("LLM-based Relevance")
    # Raw f-string instead of a backslash inside a replacement field, which is
    # a SyntaxError before Python 3.12; the rendered title is unchanged.
    ax.set_title(rf"Spearman $\rho$ = {rho:.2f}")
    ax.grid(True)
    plt.show()

  def _delta_cases(self, col):
    """Return 15 rows: 5 largest, 5 smallest and 5 closest-to-zero ``col - relevance_cross``."""
    # Explicit copy so that adding `delta` never writes into self._df.
    df = self._df[[col, 'relevance_cross']].copy()
    df["delta"] = df[col] - df["relevance_cross"]

    top_pos = df.nlargest(5, "delta")
    top_neg = df.nsmallest(5, "delta")
    mid = df.iloc[(df["delta"].abs()).argsort()[:5]]
    return pd.concat([top_pos, top_neg, mid])

  def delta(self, col='llm'):
    """Bar chart of the strongest disagreements and closest agreements.

    Args:
      col: LLM score column compared with ``relevance_cross``. Defaults to
        ``'llm'``, the column the original implementation hard-coded.
    """
    subdf = self._delta_cases(col)
    x = np.arange(len(subdf))

    _, ax = plt.subplots(figsize=(7.2,3))

    # Vertical separators between the three groups of five bars.
    ax.axvline(x=4.5, color='gray', linestyle='--', alpha=0.7, linewidth=1)
    ax.axvline(x=9.5, color='gray', linestyle='--', alpha=0.7, linewidth=1)

    # Group captions. They are positioned before any bar is drawn, so the
    # y-limit read here is presumably still the axes default — confirm if the
    # caption height matters.
    captions = [(2, 'Cross < LLM', '#2E86AB'), (7, 'Cross > LLM', '#6B8F71'), (12, 'Cross = LLM', '#A23B72')]
    for xpos, text, colour in captions:
      ax.text(xpos, ax.get_ylim()[1] * 1.1, text,
              ha='center', fontsize=11, fontweight='bold',
              bbox=dict(boxstyle='round', facecolor=colour, alpha=0.2))

    ax.bar(x - 0.15, subdf["relevance_cross"], width=0.3, label="Cross")
    ax.bar(x + 0.15, subdf[col], width=0.3, label="LLM")
    ax.set_ylabel("Score")
    ax.legend(loc='lower right')
    plt.tight_layout()
    plt.show()

  def stability(self, col:str):
    """Plot how the ranking of a cross/LLM blend tracks the cross-encoder ranking.

    Args:
      col: LLM score column blended with ``relevance_cross``.
    """
    alphas = np.linspace(0, 1, 11)
    cross = self._df["relevance_cross"]
    rhos = []

    for a in alphas:
      fused = a * cross + (1 - a) * self._df[col]
      rho, _ = spearmanr(cross, fused)
      rhos.append(rho)

    plt.figure(figsize=(4,3.5))
    plt.plot(alphas, rhos, marker='o')
    plt.xlabel(r"Weight $\alpha$ for Cross")
    plt.ylabel("Spearman Correlation w.r.t Cross")
    plt.grid(True)
    plt.show()


In [None]:
# Build the cross-encoder vs. LLM comparison from the loaded table.
instance_llm = RelevanceLLM(origindf)

In [None]:
# Overlaid density curves of every score column.
instance_llm.density()

In [None]:
# Pairwise Spearman correlation between the score columns.
instance_llm.heatmap()

In [None]:
# Cross-encoder relevance against the gpt-5-mini column.
instance_llm.scatter('gpt-5-mini')

In [None]:
# Rank correlation with the cross-encoder as the blend weight moves from 0 to 1.
instance_llm.stability('gpt-5-mini')

In [None]:
class BetaCalibration:
  """Study how the author-independence weight beta affects the final score."""

  def __init__(self, df: pd.DataFrame, model='gpt-5-mini'):
    """Keep the three columns the calibration needs.

    Args:
      df: table providing ``relevance_cross``, ``author_independence`` and
        the LLM column.
      model: name of the LLM relevance column.
    """
    # Only the columns are narrowed here; rows are filtered (optionally) in
    # evaluate().
    self._model = model
    self._df = df[['relevance_cross', model, 'author_independence']].copy()
    self._betas = np.linspace(0, 1, 51)

  def distribution(self):
    """Bar chart of how many citations fall on each author-independence level."""
    order = [0, 0.1, 0.5, 0.6, 0.8, 1.0]
    sns.countplot(x='author_independence', data=self._df, order=order)
    plt.xlabel("Author Independence Level")
    plt.ylabel("Number of Citations")
    plt.show()

  def evaluate(self, extract = None):
    """Compute the average relative drop and rank stability for every beta.

    Args:
      extract: when truthy, only rows with ``author_independence < 1`` are
        evaluated, i.e. the rows the penalty can actually lower.

    Returns:
      DataFrame with one row per beta and the columns ``beta``,
      ``avg_relative_drop`` and ``spearman_rho``.
    """
    df = self._df
    if extract:
      df = df[df['author_independence'] < 1]

    # Kept as stand-alone Series rather than new columns, so nothing is ever
    # assigned onto a filtered slice (avoids pandas' SettingWithCopyWarning).
    independence = df["author_independence"]
    base = df["relevance_cross"] * 0.5 + df[self._model] * 0.5

    avg_drop = []
    rank_rho = []
    for beta in self._betas:
      penalty = (1 - beta) + beta * independence
      final = base * penalty

      # Relative change against the unpenalised score (<= 0 when independence <= 1).
      drop = (final - base) / base
      avg_drop.append(drop.mean())

      # Rank stability: correlation between unpenalised and penalised scores.
      rho, _ = spearmanr(base, final)
      rank_rho.append(rho)

    return pd.DataFrame({
      "beta": self._betas,
      "avg_relative_drop": avg_drop,
      "spearman_rho": rank_rho
    })

  def plot(self, res):
    """Plot penalty magnitude and rank stability against beta.

    Args:
      res: frame as returned by ``evaluate()``.
    """
    _, (ax_drop, ax_rho) = plt.subplots(1, 2, figsize=(8, 3.5))

    # (a) average penalty magnitude; the sign is flipped so that it plots as a
    # positive quantity.
    ax_drop.plot(res["beta"], -res["avg_relative_drop"], marker="o")
    ax_drop.axhline(0.3, linestyle="--", color="gray", label="30% drop")
    ax_drop.set_xlabel(r"$\beta$")
    ax_drop.set_ylim(0, 1)
    ax_drop.set_ylabel("Average Relative Penalty")
    ax_drop.set_title("(a) Penalty Magnitude")
    ax_drop.legend()
    ax_drop.grid(True)

    # (b) rank stability.
    ax_rho.plot(res["beta"], res["spearman_rho"], marker="s")
    ax_rho.axhline(0.9, linestyle="--", color="gray", label=r"$\rho=0.9$")
    ax_rho.set_xlabel(r"$\beta$")
    ax_rho.set_ylim(0, 1)
    ax_rho.set_ylabel("Spearman Correlation")
    ax_rho.set_title("(b) Rank Stability")
    ax_rho.legend()
    ax_rho.grid(True)

    plt.tight_layout()
    plt.show()



In [None]:
# Build the beta calibration from the loaded table.
beta_instance = BetaCalibration(origindf)

In [None]:
# Calibration curves computed on all rows.
beta_instance.plot( beta_instance.evaluate() )

In [None]:
# Same curves restricted to rows with author_independence < 1 (extract=True).
beta_instance.plot( beta_instance.evaluate(True) )