# LLMs

In [None]:
from dotenv import load_dotenv
import os
from time import sleep, time
import json
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
load_dotenv()

## Class

In [None]:
class LLMScorer:
    """
    LLM-based semantic relevance scorer.

    Implements L_ij = F_LLM(c_ij, a_j): a chat-completion model is asked to
    rate, as a number in [0, 1], how relevant a cited abstract a_j is to a
    citing context c_ij. The API key, base URL and model name are read from
    the environment variables LLM_KEY, LLM_URL and LLM_MODEL.
    """

    # Justification fields requested by each prompt (besides "score"). They are
    # used to build a schema-matching placeholder when a reply is unusable.
    _CONTEXT_FIELDS = ('methodology', 'theory', 'evidence', 'dependency', 'summary')
    _ABSTRACT_FIELDS = ('topic_overlap', 'methodological_similarity',
                        'conceptual_relation', 'summary')

    def __init__(self):
        self.client = OpenAI(api_key=os.getenv('LLM_KEY'), base_url=os.getenv('LLM_URL'))
        self.model = os.getenv('LLM_MODEL')

    def get_model(self):
        """Return the configured model name (the value read from LLM_MODEL)."""
        return self.model

    def _request(self, prompt: str, **options):
        """Send *prompt* as the user message and return the raw API response.

        JSON-object output is always requested; *options* are forwarded
        unchanged to ``chat.completions.create`` (e.g. ``temperature``).
        Errors raised by the client are not caught here.
        """
        return self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a careful and neutral academic reviewer."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},
            **options
        )

    @staticmethod
    def _clamp_score(value) -> float:
        """Coerce *value* to a float in [0, 1]; non-numeric or NaN becomes 0.0."""
        try:
            number = float(value)
        except OverflowError:
            # Integer too large for a float: clamp by sign.
            return 1.0 if value > 0 else 0.0
        except (TypeError, ValueError):
            # None, lists, non-numeric strings, ...
            return 0.0
        if number != number:
            # NaN is the only float unequal to itself; min/max would let it through.
            return 0.0
        return min(max(number, 0.0), 1.0)

    @classmethod
    def _parse_reply(cls, response, fields) -> dict:
        """Turn an API response into a result dict with a valid ``score``.

        If the reply cannot be read as a JSON object, a placeholder is
        returned instead: ``score`` 0.0, every name in *fields* set to ``''``
        and the error text stored under ``summary``.
        """
        try:
            result = json.loads(response.choices[0].message.content)
            if not isinstance(result, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(result).__name__}"
                )
        except Exception as e:
            # Deliberate best-effort: one malformed reply must not abort the
            # caller, so any failure to read the reply becomes a placeholder.
            result = {'score': 0, **{name: '' for name in fields}}
            result['summary'] = str(e)

        # Safety net: guarantee the score is a float within [0, 1].
        result["score"] = cls._clamp_score(result.get("score", 0.0))
        return result

    def score(self, context: str, cited_abstract: str) -> dict:
        """
        Rate how relevant the cited paper is to one citing context.

        Args:
            context: Text of the citing context.
            cited_abstract: Abstract of the cited paper.

        Returns:
            The model's JSON reply as a dict. The prompt requests the keys
            score, methodology, theory, evidence, dependency and summary;
            ``score`` is always a float clamped to [0, 1]. If the reply is
            unusable, a placeholder with score 0.0 and the error text in
            ``summary`` is returned.

        Raises:
            Whatever the API client raises when the request itself fails.
        """

        prompt = f"""
You are an expert academic reviewer.

Your task is to evaluate the semantic relevance of a citation between two papers.

Citing context:
{context}

Abstract of the cited paper:
{cited_abstract}

Please assess how strongly the cited paper supports, informs, or is essential
to the citing paper in this context.

Focus on semantic relevance rather than surface similarity.
Do NOT consider citation counts, venue prestige, or author identity.

Return your answer strictly in the following JSON format:

{{
  "score": 0.0,
  "methodology": "Brief justification on whether the cited paper is used as a method or tool.",
  "theory": "Brief justification on theoretical or conceptual grounding.",
  "evidence": "Brief justification on empirical support or comparison.",
  "dependency": "Brief justification on how essential the cited work is to the citing paper.",
  "summary": "One-sentence overall explanation of the relevance."
}}

The score must be a real number between 0 and 1.
Each field must contain a concise but concrete justification.
Do not include any text outside the JSON object.
"""

        # NOTE: no temperature is passed here (the API default applies),
        # unlike abstract(), which pins it to 0.0.
        response = self._request(prompt)
        return self._parse_reply(response, self._CONTEXT_FIELDS)

    def abstract(self, citing_abstract:str, cited_abstract:str) -> dict:
        """
        Rate the overall relatedness of two papers when no citing context
        is available, using only their abstracts.

        Args:
            citing_abstract: Abstract of the citing paper.
            cited_abstract: Abstract of the cited paper.

        Returns:
            The model's JSON reply as a dict. The prompt requests the keys
            score, topic_overlap, methodological_similarity,
            conceptual_relation and summary; ``score`` is always a float
            clamped to [0, 1]. If the reply is unusable, a placeholder with
            score 0.0 and the error text in ``summary`` is returned.

        Raises:
            Whatever the API client raises when the request itself fails.
        """
        prompt = f"""
You are an expert academic reviewer.

Your task is to assess the potential semantic relatedness between two research papers,
in the absence of an explicit citation context.

Abstract of the citing paper:
{citing_abstract}

Abstract of the cited paper:
{cited_abstract}

Please evaluate to what extent the cited paper is thematically, methodologically,
or conceptually related to the citing paper at a global level.

Note that you are NOT asked to infer citation intent.
Instead, assess whether the two papers are meaningfully connected in terms of
research topic, methods, or theoretical foundations.

Return your answer strictly in the following JSON format:

{{
  "score": 0.0,
  "topic_overlap": "...",
  "methodological_similarity": "...",
  "conceptual_relation": "...",
  "summary": "..."
}}

The score must be a real number between 0 and 1.

A higher score indicates stronger overall semantic relatedness,
but does NOT imply that the citation is essential or central.

Do not include any text outside the JSON object.
        """
        # temperature=0.0 is intended to make repeated runs reproducible.
        response = self._request(prompt, temperature=0.0)
        return self._parse_reply(response, self._ABSTRACT_FIELDS)

## init

In [None]:
# Create the scorer used by the cells below; its constructor reads
# LLM_KEY, LLM_URL and LLM_MODEL from the environment.
scorer = LLMScorer()

## data definition

In [None]:
# Load the working table from score.xlsx. The scoring cells below read
# columns such as contexts, abstract, cited_abstract and title from it and
# write their results back to this same file.
scoredf = pd.read_excel('score.xlsx')

## single

In [None]:
# Score a single row (position 1) from its citation context, e.g. as a quick
# check before running the full batch, and save the table.
col = scorer.get_model()  # the score column is named after the model
iterator = 1
data = scoredf.iloc[iterator]
res = scorer.score(data.contexts, data.cited_abstract)

# Store the score together with the model's per-aspect justifications.
# NOTE: the row is read by position and written by label; this assumes
# scoredf still has the default RangeIndex produced by pd.read_excel.
detail_cols = ['methodology', 'theory', 'evidence', 'dependency', 'summary']
scoredf.loc[iterator, [col, *detail_cols]] = [
    res.get('score', 0),
    *(res.get(name, '') for name in detail_cols),
]

# index=False: otherwise every read/write round trip of score.xlsx adds
# another "Unnamed: 0" column holding the old index.
scoredf.to_excel('score.xlsx', index=False)

## context

In [None]:
# Score every row from its citation context, storing the score (in a column
# named after the model) plus the model's per-aspect justifications.
col = scorer.get_model()
detail_cols = ['methodology', 'theory', 'evidence', 'dependency', 'summary']
scoredf[col] = 0.0  # (re)initialise the score column before the run
end = len(scoredf)

with tqdm(total=end, desc="处理中") as pbar:
    # NOTE: rows are read by position and written by label; this assumes
    # scoredf still has the default RangeIndex produced by pd.read_excel.
    for iterator in range(end):
        row = scoredf.iloc[iterator]
        res = scorer.score(row['contexts'], row['cited_abstract'])
        scoredf.loc[iterator, [col, *detail_cols]] = [
            res.get('score', 0),
            *(res.get(name, '') for name in detail_cols),
        ]
        # Show extra per-row information next to the progress bar.
        pbar.set_postfix({"counter": iterator, 'title':row['title'], 'score':res.get('score', 0)})
        pbar.update(1)  # advance by one row
        # Checkpoint after every row so an interrupted run keeps its results.
        # index=False: otherwise every read/write round trip of score.xlsx
        # adds another "Unnamed: 0" column holding the old index.
        scoredf.to_excel('score.xlsx', index=False)
# The with-block has already closed the progress bar.
print("处理完成")


## abstract

In [None]:
# Score every row from the two abstracts alone (no citation context) and
# store the result in a "<model>-abstract" column.
col = scorer.get_model() + '-abstract'
scoredf[col] = 0.0  # (re)initialise the score column before the run
end = len(scoredf)

with tqdm(total=end, desc="处理中") as pbar:
    # NOTE: rows are read by position and written by label; this assumes
    # scoredf still has the default RangeIndex produced by pd.read_excel.
    for iterator in range(end):
        row = scoredf.iloc[iterator]
        res = scorer.abstract(row['abstract'], row['cited_abstract'])
        scoredf.loc[iterator, [col]] = [res.get('score', 0)]
        # Show extra per-row information next to the progress bar.
        pbar.set_postfix({"counter": iterator, 'title':row['title'], 'score':res.get('score', 0)})
        pbar.update(1)  # advance by one row
        # Checkpoint after every row so an interrupted run keeps its results.
        # index=False: otherwise every read/write round trip of score.xlsx
        # adds another "Unnamed: 0" column holding the old index.
        scoredf.to_excel('score.xlsx', index=False)
# The with-block has already closed the progress bar.
print("处理完成")