In [2]:
import os
print(os.getcwd())

/Users/jiwenyu/Dev/jwgen/mytinyagent/test


In [3]:
# 处理元数据
import pandas as pd

# 读取 CSV 文件
csv_file = "../data/papers_metadata.csv" 
metadata_df = pd.read_csv(csv_file)

# 打印数据表格
# print(df.head())  # 显示前几行

# CSV标题：Index,Paper Title,Author Names,URL,Research Area,Venue,TLDR,Abstract

# 遍历并解析数据
for index, row in metadata_df.iterrows():
    print(f"Index: {row['Index']}")
    print(f"Title: {row['Paper Title']}")
    print(f"Authors: {row['Author Names']}")
    print(f"URL: {row['URL']}")
    print(f"Research Area: {row['Research Area']}")
    print(f"Venue: {row['Venue']}")
    print(f"TLDR: {row['TLDR']}")
    print(f"Abstract: {row['Abstract']}")
    print("-" * 80)

Index: 1
Title: Stress-Testing Capability Elicitation With Password-Locked Models
Authors: Ryan Greenblatt, Fabien Roger, Dmitrii Krasheninnikov, David Krueger
URL: https://openreview.net/pdf/060fc5a68cf9e8cd99067fa71d86b9b2407c68af.pdf
Research Area: safety_in_machine_learning
Venue: NeurIPS 2024 poster
TLDR: We train models to behave poorly except when the prompt contains a password, and study when supervised fine-tuning and RL can recover high performance.
Abstract: To determine the safety of large language models (LLMs), AI developers must be able to assess their dangerous capabilities. But simple prompting strategies often fail to elicit an LLM’s full capabilities. One way to elicit capabilities more robustly is to fine-tune the LLM to complete the task. In this paper, we investigate the conditions under which fine-tuning-based elicitation suffices to elicit capabilities. To do this, we introduce password-locked models, LLMs fine-tuned such that some of their capabilities are deli

In [4]:
from markitdown import MarkItDown
md = MarkItDown() 
# 获取纯文本
pdf_text = md.convert("../data/pdfs/1.pdf") 
print(len(pdf_text.text_content))

106547


读取PDF并处理思路：
1. 第一次循环： 去除空白行
2. 第二次，记录标题行，目的是后续可以按照标题行来把文本内容分为多个chunks
    2.1 首先按标题来摘分
    2.2 标题内的文本，考虑（1） 最大行数 max_line_per_chunk
                        (2) 考虑要文本重叠， 重叠行术 overlap_lines
3. 关键标题： abstract -- abstract 之前的内容，默认是含有作者信息，用于解析作者信息
    提取作者信息，借助metadata 以及 chatgpt 综合来提取，尽可能提取
4. 关键标题： appendix -- appendix 之后的信息，丢弃
5. 关键标题： reference --- 从reference 到appenix 之间的信息默认是全部reference, 如果没有appendix 那就文末
    可以按x 行（5-10）行来处理，调用大模型来分析，中间结果存储起来，以名字作为key, 可以避免重复

In [5]:
text = pdf_text.text_content
lines = [line.strip() for line in text.split("\n") if line.strip()]  # 去除空白行
print(lines[:10])

['Stress-Testing Capability Elicitation With', 'Password-Locked Models', 'Ryan Greenblatt∗', 'Redwood Research', 'ryan@rdwrs.com', 'Fabien Roger∗', 'Redwood Research', 'fabien.d.roger@gmail.com', 'Dmitrii Krasheninnikov', 'University of Cambridge']


In [6]:
# 第二次，记录标题行，目的是后续可以按照标题行来把文本内容分为多个chunks
# 可以继续扩展
SECTION_TITLES = {
    "abstract": {"abstract"},
    "introduction": {"introduction", "background", "preliminaries", "preliminary"},
    "related_work": {"related work", "prior work", "literature review", "related studies"},
    "methodology": {"methodology", "methods", "method", "approach", "proposed method", 
                    "model", "architecture", "framework", "algorithm", "system design"},
    "experiment": {"experiment", "experiments", "experimental setup", "experiment setup",
                   "setup", "implementation details", "evaluation", "evaluation setup"},
    "ablation":{"ablation","ablation study"},
    "results": {"results", "performance", "findings", "observations", "empirical results"},
    "discussion": {"discussion", "analysis", "interpretation"},
    "summary":{"summary"},
    "conclusion": {"conclusion", "final remarks", "closing remarks"},
    "limitations":{"limitations"},
    "acknowledgments": {"acknowledgments", "acknowledgements", "funding", "author contributions"},
    "references": {"references", "bibliography", "cited works"},
    "appendix": {"appendix", "supplementary material", "supplementary", "additional materials"}
}

In [7]:
# 论文拆分参数
MAX_LINE_PER_CHUNK = 25  # 每个 chunk 最大行数
OVERLAP_LINES = 5  # 上下文重叠行数
REFERENCE_BLOCK_SIZE = 8  # 参考文献块大小（5~10行）

In [8]:
import re

def detect_section_titles(lines):
    """检测标题行，返回标题索引"""
    title_indices = {}
    section_num = 0 # 对于 summary、limitation 这类标题，可能在多个位置出现，需要全部保留。加一个编码，保证独立
    for i, line in enumerate(lines):
        clean_line = re.sub(r"[^a-zA-Z\s]", "", line).strip().lower()
        for section, keywords in SECTION_TITLES.items():
            if any(re.match(rf"^\s*(\d+([\.\-）]\d+)*[\.\-）]*\s*)?{re.escape(kw)}\s*$", clean_line, re.IGNORECASE) for kw in keywords):
                if (section in title_indices):
                    title_indices[section+"_"+str(section_num)] = i
                    section_num += 1
                else:
                    title_indices[section] = i
                break
    return title_indices


In [9]:
title_indices = detect_section_titles(lines)  # 2. 识别标题
print(title_indices)

abstract_index = title_indices.get("abstract", None)  # 获取 Abstract 标题的索引
print(abstract_index)

{'abstract': 14, 'introduction': 36, 'related_work': 145, 'experiment': 182, 'methodology': 388, 'limitations': 468, 'conclusion': 509, 'references': 522, 'limitations_0': 1120}
14


In [10]:
from collections import defaultdict

def split_chunks_by_title(lines, title_indices):
    """按照标题行分块"""
    chunks = defaultdict(list)
    sorted_sections = sorted(title_indices.items(), key=lambda x: x[1])  # 按出现顺序排序

    for idx, (section, start_idx) in enumerate(sorted_sections):
        end_idx = sorted_sections[idx + 1][1] if idx + 1 < len(sorted_sections) else len(lines)
        chunks[section] = lines[start_idx:end_idx]
    
    return chunks

In [11]:
section_chunks = split_chunks_by_title(lines, title_indices)  # 3. 按标题拆分
print(section_chunks)

defaultdict(<class 'list'>, {'abstract': ['Abstract', 'To determine the safety of large language models (LLMs), AI developers must', 'be able to assess their dangerous capabilities. But simple prompting strategies', 'often fail to elicit an LLM’s full capabilities. One way to elicit capabilities more', 'robustly is to fine-tune the LLM to complete the task. In this paper, we inves-', 'tigate the conditions under which fine-tuning-based elicitation suffices to elicit', 'capabilities. To do this, we introduce password-locked models, LLMs fine-tuned', 'such that some of their capabilities are deliberately hidden. Specifically, these', 'LLMs are trained to exhibit these capabilities only when a password is present', 'in the prompt, and to imitate a much weaker LLM otherwise. Password-locked', 'models enable a novel method of evaluating capabilities elicitation methods, by', 'testing whether these password-locked capabilities can be elicited without using', 'the password. We find that a few

In [12]:
# 获取 Abstract 之前的所有内容（作为作者信息）
abstract_index = title_indices.get("abstract", None)  # 获取 Abstract 标题的索引
if abstract_index is not None:
    author_info_lines = lines[:abstract_index]  # 从文档开头到 Abstract 之前的所有行
else:
    author_info_lines = lines[:10]  # 兜底逻辑，取前10行作为作者信息
print(author_info_lines)

#todo call llm to extract the author info

['Stress-Testing Capability Elicitation With', 'Password-Locked Models', 'Ryan Greenblatt∗', 'Redwood Research', 'ryan@rdwrs.com', 'Fabien Roger∗', 'Redwood Research', 'fabien.d.roger@gmail.com', 'Dmitrii Krasheninnikov', 'University of Cambridge', 'dk655@cam.ac.uk', 'David Krueger', 'University of Cambridge', 'david.scott.krueger@gmail.com']


In [13]:
import json
from ollama import Client

# 初始化 Ollama 客户端，确保连接到本地 Ollama 服务器
client = Client(host='http://localhost:11434')

def extract_author_info(text_before_abstract, default_name_list, model_name="llama3.2"):
    """
    使用 Ollama 提取论文作者信息，包括姓名、机构、邮箱和贡献顺序。

    :param text_before_abstract: str, 论文中摘要之前的文本内容
    :meta_data: str, 元数据，可以辅助推理
    :param model_name: str, 选择的 Ollama 语言模型
    :return: dict, 解析后的作者信息
    """
    prompt = f"""
    下面是论文摘要前的文本内容，请提取作者信息，并返回 JSON 格式的数据，包含以下字段：
    - "authors": 一个列表，每个元素是一个字典，包含：
        - "name": 作者姓名
        - "affiliation": 作者所属机构
        - "email": 作者邮箱（如无可省略）
        - "contribution_order": 作者的贡献顺序（从 1 开始，第一作者为 1，依次递增）
    元数据中获取到作者列表名字列表如下，可以辅助你推理：
    {default_name_list}
    
    请确保返回的 JSON 格式正确，不要包含额外的解释或前缀。

    论文信息：
    {text_before_abstract}
    """

    # 调用 Ollama API 进行对话
    response = client.chat(model=model_name, messages=[{"role": "user", "content": prompt}])
    # 如果是r1模型，移除 <think> 标签及其内容
    cleaned_text = re.sub(r'<think>.*?</think>', '', response.message.content, flags=re.DOTALL).strip()
    
    # 使用正则表达式查找 JSON 对象
    json_match = re.search(r'\{.*\}', cleaned_text, re.DOTALL)
    if json_match:
        json_str = json_match.group(0)
        try:
            # 尝试解析 JSON 字符串
            json_data = json.loads(json_str)
            return json_data
        except json.JSONDecodeError:
            print("JSON 解析失败")
            return None
    else:
        print("未找到作者的数据")
        return None


In [14]:
default_name_list = "Ryan Greenblatt, Fabien Roger, Dmitrii Krasheninnikov, David Krueger"
author_info = extract_author_info(author_info_lines,default_name_list, model_name="deepseek-r1:7b")

# 打印解析结果
print(json.dumps(author_info, ensure_ascii=False, indent=2))

{
  "authors": [
    {
      "name": "Ryan Greenblatt",
      "affiliation": "Redwood Research",
      "email": "ryan@rdwrs.com",
      "contribution_order": 1
    },
    {
      "name": "Fabien Roger",
      "affiliation": "Redwood Research",
      "email": "fabien.d.roger@gmail.com",
      "contribution_order": 2
    },
    {
      "name": "Dmitrii Krasheninnikov",
      "affiliation": "University of Cambridge",
      "email": "dk655@cam.ac.uk",
      "contribution_order": 3
    },
    {
      "name": "David Krueger",
      "affiliation": "University of Cambridge",
      "email": "david.scott.krueger@gmail.com",
      "contribution_order": 4
    }
  ]
}


In [15]:
print(len(section_chunks.get("abstract", [])))
section_chunks.get("abstract", [])[:5]


22


['Abstract',
 'To determine the safety of large language models (LLMs), AI developers must',
 'be able to assess their dangerous capabilities. But simple prompting strategies',
 'often fail to elicit an LLM’s full capabilities. One way to elicit capabilities more',
 'robustly is to fine-tune the LLM to complete the task. In this paper, we inves-']

In [16]:
def split_text_with_titles(section_chunks, max_line_per_chunk=25, overlap_lines=5):
    """
    按 Section 拆分文本，并进行 Chunk 切分，考虑标题标记、最大行数和重叠行数。

    :param section_chunks: dict, 以 section title 为 key，存储对应的文本行
    :param max_line_per_chunk: int, 每个 chunk 最大行数
    :param overlap_lines: int, 允许的上下文重叠行数
    :return: List[str]，每个 chunk 作为字符串存入列表
    """
    text_chunks = []
    
    for section, content_lines in section_chunks.items():
        print(f'=debug=processing section: {section}, with line amount: {len(content_lines)}\n')
        if section in {"authors", "acknowledgments","references","appendix"}:  # 跳过作者, 感谢，附录和参考文献， 只提取正文，用于RAG 
            print("=debug=skiped")
            continue  
        
        current_chunk = []
        for i, line in enumerate(content_lines):
            current_chunk.append(line)

            # 当达到 max_line_per_chunk 时，存储当前 chunk，并留出 overlap
            if len(current_chunk) >= max_line_per_chunk or i == len(content_lines) - 1:
                text_chunks.append("\n".join(current_chunk).strip())

                # 重叠部分：保留最后 overlap_lines 行作为下一个 chunk 的起点
                current_chunk = current_chunk[-overlap_lines:] if overlap_lines > 0 else []
    
    return text_chunks

In [17]:
processed_chunks = split_text_with_titles(section_chunks,25,5)
print(len(processed_chunks))


=debug=processing section: abstract, with line amount: 22

=debug=processing section: introduction, with line amount: 109

=debug=processing section: related_work, with line amount: 37

=debug=processing section: experiment, with line amount: 206

=debug=processing section: methodology, with line amount: 80

=debug=processing section: limitations, with line amount: 41

=debug=processing section: conclusion, with line amount: 13

=debug=processing section: references, with line amount: 598

=debug=skiped
=debug=processing section: limitations_0, with line amount: 282

41


In [18]:
print(processed_chunks[0])

Abstract
To determine the safety of large language models (LLMs), AI developers must
be able to assess their dangerous capabilities. But simple prompting strategies
often fail to elicit an LLM’s full capabilities. One way to elicit capabilities more
robustly is to fine-tune the LLM to complete the task. In this paper, we inves-
tigate the conditions under which fine-tuning-based elicitation suffices to elicit
capabilities. To do this, we introduce password-locked models, LLMs fine-tuned
such that some of their capabilities are deliberately hidden. Specifically, these
LLMs are trained to exhibit these capabilities only when a password is present
in the prompt, and to imitate a much weaker LLM otherwise. Password-locked
models enable a novel method of evaluating capabilities elicitation methods, by
testing whether these password-locked capabilities can be elicited without using
the password. We find that a few high-quality demonstrations are often sufficient
to fully elicit password-lock

In [19]:
test_chunks = [
    "This is the first chunk of text.",
    "Here is another important paragraph for retrieval.",
    "The final chunk contains critical information."
]  # 示例文本，你可以用你的实际数据替换

In [23]:
# 3. 生成嵌入向量（调用 Ollama）
def get_embedding(text):
    response = client.embeddings(model='nomic-embed-text', prompt=text)
    return response.get('embedding', [])

# 4. 生成 {原文: 向量} 字典
embeddings_list =[]
for chunk in processed_chunks:
    embeddings_list.append([chunk, get_embedding(chunk)])

In [25]:
print(embeddings_list[0][1])
print(len(embeddings_list[0][1]))

[-0.13561296463012695, 1.5017563104629517, -2.9744138717651367, -1.0196518898010254, 1.5531883239746094, -0.22718578577041626, 0.46872732043266296, 0.8949084877967834, 0.3559233844280243, 0.16102436184883118, -1.3834023475646973, 0.27278149127960205, 1.0322133302688599, 0.5698762536048889, 0.4491991102695465, 0.51213538646698, 0.9852634072303772, -1.5781731605529785, -0.23819969594478607, -1.3396438360214233, -0.027469592168927193, -0.6058204174041748, 0.10013654828071594, 0.09776880592107773, 0.7610929012298584, -0.04904220253229141, -0.19126957654953003, -0.07246822118759155, -0.22783362865447998, 0.19742411375045776, 0.9589989185333252, -1.370108962059021, 0.28271612524986267, -0.7195121049880981, -0.6590293049812317, -1.0664746761322021, 0.38836097717285156, 0.8560978174209595, -0.8767804503440857, 0.88774573802948, 0.8770408034324646, 1.2926000356674194, 0.06577807664871216, 0.08470109850168228, 0.19004082679748535, 0.08534689247608185, 1.0368403196334839, -0.42944076657295227, 0.

In [None]:
import os
import sys

# 将mytinyagent目录添加到sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))

In [36]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from datetime import date
from models import *

# 配置本地 PostgreSQL 数据库 URL
DATABASE_URL = "postgresql://admin:admin123@localhost/test_db_0213"
engine = create_engine(DATABASE_URL, echo=True)

# 创建会话
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
session = SessionLocal()

In [37]:
# 插入 Conference（会议）
conference1 = Conference(name="NeurIPS", type="ML Conference", description="Neural Information Processing Systems")
conference2 = Conference(name="ICML", type="ML Conference", description="International Conference on Machine Learning")

session.add_all([conference1, conference2])
session.commit()

# 插入 ConferenceInstance（会议届次）
instance1 = ConferenceInstance(name="NeurIPS 2025", conference_id=conference1.conference_id, year=2025, start_date="2025-12-01", end_date="2025-12-07", location="New Orleans", website="https://neurips.cc/2025")
instance2 = ConferenceInstance(name="ICML 2025", conference_id=conference2.conference_id, year=2025, start_date="2025-07-01", end_date="2025-07-05", location="Paris", website="https://icml.cc/2025")

session.add_all([instance1, instance2])
session.commit()

2025-02-14 00:09:30,593 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-02-14 00:09:30,593 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-02-14 00:09:30,595 INFO sqlalchemy.engine.Engine select current_schema()
2025-02-14 00:09:30,595 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-02-14 00:09:30,596 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-02-14 00:09:30,596 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-02-14 00:09:30,598 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-14 00:09:30,599 INFO sqlalchemy.engine.Engine INSERT INTO conference (name, type, description) SELECT p0::VARCHAR, p1::VARCHAR, p2::TEXT FROM (VALUES (%(name__0)s, %(type__0)s, %(description__0)s, 0), (%(name__1)s, %(type__1)s, %(description__1)s, 1)) AS imp_sen(p0, p1, p2, sen_counter) ORDER BY sen_counter RETURNING conference.conference_id, conference.conference_id AS conference_id__1
2025-02-14 00:09:30,599 INFO sqlalchemy.engine.Engine [generated in 0.00004s (inser

In [39]:
instance1 = session.query(ConferenceInstance).filter_by(name="NeurIPS 2025").first()

2025-02-14 00:12:37,863 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-14 00:12:37,865 INFO sqlalchemy.engine.Engine SELECT conference_instance.instance_id AS conference_instance_instance_id, conference_instance.name AS conference_instance_name, conference_instance.conference_id AS conference_instance_conference_id, conference_instance.year AS conference_instance_year, conference_instance.start_date AS conference_instance_start_date, conference_instance.end_date AS conference_instance_end_date, conference_instance.location AS conference_instance_location, conference_instance.website AS conference_instance_website 
FROM conference_instance 
WHERE conference_instance.name = %(name_1)s 
 LIMIT %(param_1)s
2025-02-14 00:12:37,865 INFO sqlalchemy.engine.Engine [generated in 0.00056s] {'name_1': 'NeurIPS 2025', 'param_1': 1}


In [41]:
# 插入 Paper（论文）
paper1 = Paper(title="Stress-Testing Capability Elicitation With Password-Locked Models", year=2024, instance_to_paper=instance1)

session.add_all([paper1])
session.commit()

2025-02-14 00:14:11,591 INFO sqlalchemy.engine.Engine INSERT INTO paper (instance_id, title, type, year, publish_date, tldr, abstract, content, pdf_path, citation_count, award, doi, code_url, supplementary_material_url) VALUES (%(instance_id)s, %(title)s, %(type)s, %(year)s, %(publish_date)s, %(tldr)s, %(abstract)s, %(content)s, %(pdf_path)s, %(citation_count)s, %(award)s, %(doi)s, %(code_url)s, %(supplementary_material_url)s) RETURNING paper.paper_id
2025-02-14 00:14:11,592 INFO sqlalchemy.engine.Engine [generated in 0.00088s] {'instance_id': 1, 'title': 'Stress-Testing Capability Elicitation With Password-Locked Models', 'type': None, 'year': 2024, 'publish_date': None, 'tldr': None, 'abstract': None, 'content': None, 'pdf_path': None, 'citation_count': 0, 'award': None, 'doi': None, 'code_url': None, 'supplementary_material_url': None}
2025-02-14 00:14:11,597 INFO sqlalchemy.engine.Engine COMMIT


In [51]:
paper_test = session.query(Paper).first()
print(paper_test)

2025-02-14 00:27:00,826 INFO sqlalchemy.engine.Engine SELECT paper.paper_id AS paper_paper_id, paper.instance_id AS paper_instance_id, paper.title AS paper_title, paper.type AS paper_type, paper.year AS paper_year, paper.publish_date AS paper_publish_date, paper.tldr AS paper_tldr, paper.abstract AS paper_abstract, paper.content AS paper_content, paper.pdf_path AS paper_pdf_path, paper.citation_count AS paper_citation_count, paper.award AS paper_award, paper.doi AS paper_doi, paper.code_url AS paper_code_url, paper.supplementary_material_url AS paper_supplementary_material_url 
FROM paper 
 LIMIT %(param_1)s
2025-02-14 00:27:00,827 INFO sqlalchemy.engine.Engine [cached since 232.7s ago] {'param_1': 1}
<Paper(id=1,title=Stress-Testing Capability Elicitation With Password-Locked Models, year=2024, tldr=None)>


In [58]:
print(processed_chunks[0])

Abstract
To determine the safety of large language models (LLMs), AI developers must
be able to assess their dangerous capabilities. But simple prompting strategies
often fail to elicit an LLM’s full capabilities. One way to elicit capabilities more
robustly is to fine-tune the LLM to complete the task. In this paper, we inves-
tigate the conditions under which fine-tuning-based elicitation suffices to elicit
capabilities. To do this, we introduce password-locked models, LLMs fine-tuned
such that some of their capabilities are deliberately hidden. Specifically, these
LLMs are trained to exhibit these capabilities only when a password is present
in the prompt, and to imitate a much weaker LLM otherwise. Password-locked
models enable a novel method of evaluating capabilities elicitation methods, by
testing whether these password-locked capabilities can be elicited without using
the password. We find that a few high-quality demonstrations are often sufficient
to fully elicit password-lock

In [59]:
try:
    # 初始化数据库会话
    session = SessionLocal()
    paper_test = session.merge(paper_test)  # 合并到当前会话中，避免重复绑定到不同会话

    # 用于存储所有插入的内容embedding
    new_embedding_list = []
    count =0
    # 遍历处理后的chunk并构造Embedding对象
    for embedding_item in embeddings_list:
        count +=1
        # 确保每个chunk有有效的内容和嵌入向量
        if len(embedding_item) >= 2:
            embedding = ContentEmbedding(
                # paper_id 请确保正确传递
                paper_to_embedding=[paper_test],
                # 这里假设 chunk[0] 是文本内容，chunk[1] 是embedding向量
                text=embedding_item[0], 
                embedding=embedding_item[1]
            )
            print(count)
            new_embedding_list.append(embedding)
    # 批量插入所有embedding
    if new_embedding_list:
        session.add_all(new_embedding_list)
        session.commit()
        print(f"✔️ {len(new_embedding_list)} 个 embedding 已成功插入数据库！")
    else:
        print("❌ 没有有效的chunk，未进行任何插入。")

except Exception as e:
    # 出现异常时回滚事务
    session.rollback()
    print(f"❌ 发生错误: {e}")
finally:
    # 关闭会话
    if session:
        session.close()

2025-02-14 00:37:26,215 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-14 00:37:26,216 INFO sqlalchemy.engine.Engine SELECT paper.paper_id AS paper_paper_id, paper.instance_id AS paper_instance_id, paper.title AS paper_title, paper.type AS paper_type, paper.year AS paper_year, paper.publish_date AS paper_publish_date, paper.tldr AS paper_tldr, paper.abstract AS paper_abstract, paper.content AS paper_content, paper.pdf_path AS paper_pdf_path, paper.citation_count AS paper_citation_count, paper.award AS paper_award, paper.doi AS paper_doi, paper.code_url AS paper_code_url, paper.supplementary_material_url AS paper_supplementary_material_url 
FROM paper 
WHERE paper.paper_id = %(pk_1)s
2025-02-14 00:37:26,217 INFO sqlalchemy.engine.Engine [cached since 316.2s ago] {'pk_1': 1}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
2025-02-14 00:37:26,222 INFO sqlalchemy.engine.Engine INSERT INTO content_embedding (text, emb

In [68]:
# 查询：SELECT text FROM content_embedding 
#    ORDER BY embedding <=> ARRAY[-0.13561296,1.5017563,-2.9744139,-1.0196519,1.5531883,-0.22718579,0.46872732,0.8949085,0.35592338,0.16102436,-1.3834023,0.2727815,1.0322133,0.56987625,0.4491991,0.5121354,0.9852634,-1.5781732,-0.2381997,-1.3396438,-0.027469592,-0.6058204,0.10013655,0.097768806,0.7610929,-0.049042203,-0.19126958,-0.07246822,-0.22783363,0.19742411,0.9589989,-1.370109,0.28271613,-0.7195121,-0.6590293,-1.0664747,0.38836098,0.8560978,-0.87678045,0.88774574,0.8770408,1.2926,0.06577808,0.0847011,0.19004083,0.08534689,1.0368403,-0.42944077,0.9815187,-1.2138832,1.5540758,-1.3314387,0.31885803,-0.06321413,2.4474118,0.66378456,-0.8006341,-0.29067582,0.6999114,-1.6609373,1.5493946,0.8365586,-0.8786267,0.68815184,-0.37724942,0.47637182,-0.69274676,0.9940658,0.2726352,-1.2479028,-0.32494363,0.21707627,-0.7150006,-0.87104404,-0.72287774,0.4798115,-0.52016854,0.14996614,-0.26305747,1.083716,0.2389049,-0.75544083,0.82188994,-0.24648996,1.3402271,0.16795821,-0.5956495,-0.057138186,-0.3377309,1.70912,0.026561154,0.14881384,0.56827694,0.17479643,-0.5252813,0.6905764,-0.017821986,-0.09922462,-1.0704235,0.042437803,-0.45826283,-0.05335579,-0.14501642,-0.54574496,-0.07206851,0.6687359,0.45600918,-0.58331555,-0.110712245,0.43511453,0.20581137,0.7250899,-0.6534619,0.16909222,-0.9003124,-0.9342614,0.57233393,-0.021118121,0.35808665,0.4302504,-1.0323719,0.12764637,-0.9264041,-0.3573629,0.5764371,1.125186,-1.0085613,0.19418159,-0.032525737,-0.8840226,0.047448162,0.20342444,-0.6938971,0.07084571,0.3981577,2.3237462,-0.7572732,-0.06344352,0.49228635,-0.24062847,-0.17988613,-0.0048016966,-0.57420206,-0.32114977,0.3232884,-1.3383747,-0.43446258,0.5458188,-1.2177154,0.074160546,0.018047307,0.74439335,-0.091014326,0.91928744,0.56381536,0.48414013,0.09056019,0.083118975,-0.49788654,0.9707557,0.3446273,0.25016668,-0.47305447,0.74697495,0.16377653,-0.06601019,0.29478094,0.4776287,0.09268592,0.8370164,-1.3276769,-0.33837003,0.34134334,-0.11817916,-0.15523332,-0.23651548,0.9822138,-2.0384276,0.52377963,-0.86042035,1.1807474,-1.5395293,0.807631,0.6646461,-0.43894848,-0.3215059,0.13102186,0.2504091,0.21063453,-1.2490939,0.08711585,0.2906309,-1.6152889,-0.7539059,-0.30709678,-1.6029377,0.83624446,-0.3098201,1.076838,-0.039855707,-1.0040072,-0.20904085,-0.59441096,-0.09501315,-1.5762455,0.7009403,-0.21525566,0.54303443,-0.6274487,0.36952686,1.5327305,-0.1636104,0.28939036,-0.33691633,0.38759023,-0.14086694,0.26677597,-0.18439515,-0.07384329,-0.41441566,0.884318,-0.16438961,0.587363,0.5677581,0.06779453,0.5080315,-1.7188598,0.7234528,-0.7996076,0.658543,0.14697869,-0.9606673,0.6786908,0.14244191,-0.2088133,0.5834236,0.061045997,1.0972482,-0.24604136,0.052401084,0.3577533,-0.216431,-0.76122344,0.5302488,-0.8230072,-0.3724107,-1.4629495,-0.48908895,0.12639579,0.8755677,-1.2113411,0.46460992,0.3700706,-0.7021511,0.12566306,-0.06418102,0.101759456,0.15399343,0.3592841,0.3174061,0.53223985,-0.18699016,-0.17730431,0.515732,0.21376897,-0.91292953,0.4328725,-0.22528495,-0.088418715,0.10336012,0.21462232,1.1693398,0.86265844,0.9912206,-0.036026165,-0.07425868,-0.6680796,-0.12239826,-0.07271585,0.3841984,-1.2852165,-0.594589,-1.2720195,-0.07558761,0.33150408,-0.23035356,-0.34350273,0.050042618,0.23038325,-0.9899636,-0.16067164,0.5997553,-0.5923111,0.15467614,-0.8071854,0.2991817,0.05202095,-0.47341198,-0.049800273,-0.85348594,0.7245121,0.2887399,0.7718728,0.784659,-0.70467323,-0.3507001,-0.05512715,0.87394667,0.9828761,0.4928561,-1.4089521,0.560446,-0.6364077,-0.2511412,-0.30640233,0.096995585,0.22655909,0.8221145,0.3062682,-1.0874423,0.026963012,-0.7444116,-0.15071142,-0.6957183,0.20097658,0.55059534,-0.078564994,-0.17950907,-1.5965937,-1.2083244,1.9470279,1.1757464,0.6436321,-0.9802346,-0.67092407,-0.29553273,0.2007508,0.021452554,0.52544385,-0.699081,0.902966,-0.5178418,0.34145758,-0.2204699,-0.7489664,0.47331685,-1.2236987,-0.71952194,1.7143967,0.5996457,-0.7611792,0.34421632,-0.5829861,-0.7831398,1.0932041,0.07045657,0.38445365,0.8611564,0.71240234,-0.01219392,1.2103226,0.08209409,0.036168437,-1.5655768,0.20857394,0.47563794,1.0905435,0.08341913,-0.13109395,-0.16468439,0.24363346,-0.71810335,-0.25182843,0.25043353,-0.50272137,0.3955116,-1.3929534,-0.4709283,-0.562517,0.0071876375,-0.3665077,0.13261138,-0.38521764,-0.10490952,0.70492893,0.0221922,1.125618,-0.2373796,0.28767076,0.74819136,0.009925928,-0.92862993,-0.7060484,1.2134959,0.7685116,-0.67659783,-0.5363889,-0.4214305,0.3008266,0.4648735,-1.140739,-0.050075002,0.16555777,0.17715804,-0.16544643,-0.4204353,-0.40815315,0.43655443,0.6445322,-0.49459964,0.65938824,0.31124112,-0.9430619,-0.93266314,-0.278903,-0.17908534,0.62063706,0.2925061,0.08540778,-0.16532135,0.07895324,0.42908895,0.45603615,0.53457266,0.78491384,-0.082737386,-0.051088326,1.2512803,0.38990313,-1.9684211,0.88599145,-0.015149659,0.9800353,-0.8341051,0.35699686,-0.27618915,-0.52707773,0.6955694,-0.20348777,1.2764932,-0.07854563,-0.8299981,-0.026586488,-0.61127067,0.5304645,0.8372102,1.0407574,0.3080412,-1.1344213,0.5377379,-0.6682362,0.03827014,0.79807264,0.3689655,2.0714092,-0.4426303,-0.02989789,0.34261036,0.63761944,0.054521456,0.3301608,0.1017905,-0.94042015,0.8355256,0.24080418,-0.3885649,0.303522,-1.1332839,1.4328179,0.57130474,-0.42475528,0.12022413,0.99488926,0.2837296,-0.6751746,-0.8526672,-0.2803715,0.13520153,0.026160348,1.1849811,0.29661018,0.015343885,-0.4281399,-0.6308137,0.13578965,0.8002763,0.70997477,0.2473217,-0.90095013,0.26245016,0.9101815,0.6304963,-0.3138246,0.24360074,-0.9295683,-0.9744208,-0.18237907,0.84809506,0.4962638,0.4034726,0.21043731,0.7157763,0.50361097,0.117701545,-0.42851847,-1.0903413,0.51767755,-1.3850951,-1.0813844,0.20196497,-0.40189627,0.22042975,-0.52173966,0.51759034,1.4234484,-0.65971345,-0.066651,0.4070358,-0.97771543,-0.6380205,0.3174188,-0.18236999,-0.09089194,-0.8581924,-2.3816593,0.3587069,0.41998973,-0.8044953,-0.09891405,0.010586959,0.32601324,0.83087355,-1.2909701,-0.5909637,0.5870047,0.7011223,0.038599756,-0.364426,-0.64961433,0.8364301,1.18984,0.12059794,0.09526384,-0.1352548,0.1596564,0.22570916,0.10148771,1.0319964,-0.5768641,-1.5605875,-0.022125114,-0.9655461,-0.3110504,-0.3007472,1.11391,-0.3236547,-0.4346991,-0.7915929,0.12290508,-0.09460116,0.015252505,0.5147528,2.0405624,0.18896203,-0.8486341,0.09918193,-0.23755491,-0.56187534,0.07887892,1.5292832,0.35578954,-0.44936916,-0.0069096778,0.6695083,-0.31156012,-0.48943344,-0.026559725,-0.45058292,-0.67962474,-1.1690941,-0.2760829,-0.5926988,0.91995996,0.34831798,0.20489816,0.063080184,-1.3768765,-0.14865662,0.6189588,-0.40268815,-0.9648959,-0.68610144,0.42481452,-0.12967354,0.34111828,-0.07514922,-0.2506305,-1.008998,-0.72362363,-0.8397005,0.62979466,1.4300159,0.66460186,-0.50656885,-0.06662467,1.1134993,-0.37021762,1.3533728,0.97803104,-0.28121033,-0.19518842,0.28237948,-0.11132829,-0.43607056,0.10684109,-0.9330841,1.5705503,0.25313413,-0.69467586,0.5201241,-0.18385041,-1.0018609,1.125615,-1.1039997,0.24497199,-0.150086,-0.71930313,-0.5994483,0.44704604,0.64944506,-1.2550479,0.13805619,-0.52018285,-0.36102727,-0.9108606,0.25805765,-0.6070177,0.39951,0.16279335,0.4863386,0.5887081,-0.094323374,-0.77694637,0.9025212,0.9713494,-0.33941495,0.8159272,1.392321,0.45810822,-0.6877195,1.3947495,1.8634396,-0.97801465,-1.1776103,-0.118989095,0.35357815,0.26228786,-0.06838665,-0.5668164,-0.5157442,0.6698109,0.5098656,0.39516726,-1.285885,0.7702621,-0.66719294,-0.4516222,-0.8019023,-0.57939464,-0.024145497,1.1878724,0.29834473,0.14362758,-0.49936345,0.5370408,-0.104221135,1.1594299,1.2447144,0.5793179,-0.5263348,0.12682728,-0.31219313,0.15729952,-0.38247454,0.23081782,-0.16672668,0.9287017,-0.3954636,-1.1483676,-0.55337733,-1.1855681,-0.4232308,0.44908154,0.35538876,0.23237872,0.2818829,-0.14821328,-0.14884646,0.1629239,1.0516996,-1.0184239,1.4184886,0.4445259,0.25163573,0.1865375,-0.009174484,0.01872689,0.13060293,0.008334868,-0.57513106,-0.3118323,0.85736364,-1.0946157,1.0871769,0.7423399,0.053133775,-0.05575633,-0.20885165,0.15264618,-0.0620587,1.004962,-0.67718107,-0.51033175,0.06503524,0.26837486,-0.6755792,0.7474814,-0.6669781,-0.35538706,0.68600273,0.32372266,-0.8611659,-0.15749641,0.8410618,-0.39111975,-0.35653782,-0.7402334,-0.019570576,-1.2454967,-0.6292328,-1.294249,0.74401903,0.18069483,-0.14713477,0.7598779,-0.35852852,-1.0914581,0.20193274,-0.4894269,-0.7209864,0.7242633,-0.65626484,-1.0576028,0.03503091,-0.3508207,0.5961982,-0.47698063,0.36075437,2.5642653,-0.044167876,0.59890187,-0.05105487,1.0215055,-0.20239115,-0.99134666,-0.84373796,0.19485487,-0.0113715865]::vector(768) 
#    LIMIT 5;
from sqlalchemy import func

session = SessionLocal()

query_out = session.query(ContentEmbedding).filter(
    ContentEmbedding.embedding.op('<->')(embeddings_list[0][1]).isnot(None)  # 使用pgvector的距离操作符
).order_by(func.embedding_distance(ContentEmbedding.embedding, embeddings_list[0][1])).limit(5).all()

print(query_out)
session.close

2025-02-14 00:53:27,088 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-14 00:53:27,089 INFO sqlalchemy.engine.Engine SELECT content_embedding.embedding_id AS content_embedding_embedding_id, content_embedding.text AS content_embedding_text, content_embedding.embedding AS content_embedding_embedding 
FROM content_embedding 
WHERE (content_embedding.embedding <-> %(embedding_1)s::FLOAT[]) IS NOT NULL ORDER BY embedding_distance(content_embedding.embedding, %(embedding_distance_1)s) 
 LIMIT %(param_1)s
2025-02-14 00:53:27,090 INFO sqlalchemy.engine.Engine [cached since 106s ago] {'embedding_1': [-0.13561296463012695, 1.5017563104629517, -2.9744138717651367, -1.0196518898010254, 1.5531883239746094, -0.22718578577041626, 0.46872732043266296, 0. ... (15652 characters truncated) ... , -0.05105486884713173, 1.0215054750442505, -0.2023911476135254, -0.9913466572761536, -0.8437379598617554, 0.19485487043857574, -0.011371586471796036], 'embedding_distance_1': [-0.13561296463012695, 1.50175

ProgrammingError: (psycopg2.errors.UndefinedFunction) operator does not exist: vector <-> double precision[]
LINE 3: WHERE (content_embedding.embedding <-> ARRAY[ -0.13561296463...
                                           ^
HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.

[SQL: SELECT content_embedding.embedding_id AS content_embedding_embedding_id, content_embedding.text AS content_embedding_text, content_embedding.embedding AS content_embedding_embedding 
FROM content_embedding 
WHERE (content_embedding.embedding <-> %(embedding_1)s::FLOAT[]) IS NOT NULL ORDER BY embedding_distance(content_embedding.embedding, %(embedding_distance_1)s) 
 LIMIT %(param_1)s]
[parameters: {'embedding_1': [-0.13561296463012695, 1.5017563104629517, -2.9744138717651367, -1.0196518898010254, 1.5531883239746094, -0.22718578577041626, 0.46872732043266296, 0. ... (15652 characters truncated) ... , -0.05105486884713173, 1.0215054750442505, -0.2023911476135254, -0.9913466572761536, -0.8437379598617554, 0.19485487043857574, -0.011371586471796036], 'embedding_distance_1': [-0.13561296463012695, 1.5017563104629517, -2.9744138717651367, -1.0196518898010254, 1.5531883239746094, -0.22718578577041626, 0.46872732043266296, 0. ... (15652 characters truncated) ... , -0.05105486884713173, 1.0215054750442505, -0.2023911476135254, -0.9913466572761536, -0.8437379598617554, 0.19485487043857574, -0.011371586471796036], 'param_1': 5}]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
# todo 如何处理向量查询，还是一个待解决的问题

In [70]:
reference_index = title_indices.get("references", None)  # 获取 Abstract 标题的索引
print(reference_index)

522


In [72]:
print(section_chunks["references"])
print(len(section_chunks["references"]))

['References', 'Anthony, T., Tian, Z., and Barber, D. Thinking fast and slow with deep learning and tree search.', 'Advances in neural information processing systems, 30, 2017. 8', 'Anthropic. Anthropics responsible scaling policy.', 'https://www.anthropic.com/index/', 'anthropics-responsible-scaling-policy, 2023. 1, 2, 4', 'Anthropic. Responsible scaling policy evaluations report – claude 3 opus. https://cdn.sanity.', 'io/files/4zrzovbb/website/210523b8e11b09c704c5e185fd362fe9e648d457.pdf,', '2024. 6', 'Anwar, U., Saparov, A., Rando, J., Paleka, D., Turpin, M., Hase, P., Lubana, E. S., Jenner, E., Casper,', 'S., Sourbut, O., et al. Foundational challenges in assuring alignment and safety of large language', 'models. arXiv preprint arXiv:2404.09932, 2024. 3', 'Austin, J., Odena, A., Nye, M., Bosma, M., Michalewski, H., Dohan, D., Jiang, E., Cai, C., Terry, M.,', 'Le, Q., et al. Program synthesis with large language models. arXiv preprint arXiv:2108.07732,', '2021. 5', 'Bi, X., Chen, D.

In [79]:
# 接下来处理reference

def split_references_with_overlap(reference_lines, block_size=REFERENCE_BLOCK_SIZE, overlap=2):    
    # 计算每个参考文献块的开始和结束位置，并在相邻块之间添加重叠
    reference_chunks = []
    for i in range(0, len(reference_lines), block_size - overlap):
        chunk = "\n".join(reference_lines[i:i + block_size])
        reference_chunks.append(chunk)
    
    return reference_chunks


In [82]:
reference_lines = section_chunks["references"]
reference_chunk = split_references_with_overlap(reference_lines)
print(reference_chunk[1])

Anthropic. Responsible scaling policy evaluations report – claude 3 opus. https://cdn.sanity.
io/files/4zrzovbb/website/210523b8e11b09c704c5e185fd362fe9e648d457.pdf,
2024. 6
Anwar, U., Saparov, A., Rando, J., Paleka, D., Turpin, M., Hase, P., Lubana, E. S., Jenner, E., Casper,
S., Sourbut, O., et al. Foundational challenges in assuring alignment and safety of large language
models. arXiv preprint arXiv:2404.09932, 2024. 3
Austin, J., Odena, A., Nye, M., Bosma, M., Michalewski, H., Dohan, D., Jiang, E., Cai, C., Terry, M.,
Le, Q., et al. Program synthesis with large language models. arXiv preprint arXiv:2108.07732,


In [94]:
# 调用 Ollama 模型
def process_references_with_model(reference_chunk_text, model_name="llama3.2"):
    prompt = f"""
    我会给你很多论文的引用，请你根据片段文字，帮我格式化成为一个json 格式，包括以下属性：
    title = Column(String(255), nullable=False) # 参考文献标题，不能为空 
    author = Column(Text) # 作者，多个作者用逗号分隔，可以为空 
    year = Column(Integer) # 参考文献出版年份 
    journal = Column(String(255)) # 参考文献所属期刊名称 
    web_url = Column(String(255)) # 参考文献的网页 URL 或指向原始论文的 URL
    输入：
    {reference_chunk_text}
    请注意，这里的输入信息会有截断现象，如果你无法分析某一个小段文字，就忽略，你的任务是经可能从片段信息中推理出论文引用的关键信息。
    严格按照json格式输出，能够保证被解析。
    不需要任何别的注解信息，不需要任何解释和辅助信息。如果不知道，就写留空白或者写unknown。
    """

    # 调用 Ollama API 进行对话
    response = client.chat(model=model_name, messages=[{"role": "user", "content": prompt}])
    # print(f'==debug == llm message:{response.message}')
    # 如果是r1模型，移除 <think> 标签及其内容
    cleaned_text = re.sub(r'<think>.*?</think>', '', response.message.content, flags=re.DOTALL).strip()
    # 去掉前后 ```json 和 ```
    cleaned_text = cleaned_text.lstrip('`').lstrip('json\n').rstrip('`').rstrip('\n')
    # print(f'==debug == clean response:{cleaned_text}')
    # 使用正则表达式查找 JSON 对象
    json_match = re.search(r'\{.*\}', cleaned_text, re.DOTALL)
    if json_match:
        json_str = json_match.group(0)
        try:
            # 尝试解析 JSON 字符串
            json_data = json.loads(cleaned_text)
            return json_data
        except json.JSONDecodeError:
            print("JSON 解析失败")
            return None
    else:
        print("未找到")
        return None

In [92]:
reference_info = process_references_with_model(reference_chunk[1], model_name="deepseek-r1:7b")

# 打印解析结果
print(json.dumps(reference_info, ensure_ascii=False, indent=2))


==debug == llm message:role='assistant' content='<think>\n好的，我现在需要帮用户将提供的论文引用片段格式化成JSON。首先，我要仔细分析用户的输入内容。\n\n用户给了三个引用片段：\n\n第一个是Anthropic的报告，标题中有“Responsible scaling policy evaluations report – claude 3 opus”，后面跟着年份2024和一个链接。所以title应该是完整标题，作者为空，因为没有提到，year是2024， journal可能是Anthropic或者报告中的公司名称，web_url就是给的链接。\n\n第二个引用有多个作者，用逗号分隔，年份是2024，后面跟着期刊名和arXiv的预印本链接。所以title要完整，author是名字列表，year是2024， journal可能是arXiv，web_url则是arXiv的链接。\n\n第三个引用同样有作者列表，年份2024，后面跟的是arXiv链接。所以结构类似第二个。\n\n我需要注意用户提到输入信息可能有截断，如果无法分析某个片段就忽略。因此，每个引用都要尽可能提取相关信息。\n\n现在整理每个条目的属性：\n\n第一个条目：\ntitle: "Responsible scaling policy evaluations report – claude 3 opus"\nauthor: null\nyear: 2024\njournal: Anthropic（可能不确定是否正确，但根据内容推断是报告）\nweb_url: https://cdn.sanity.io/files/4zrzovbb/website/210523b8e11b09c704c5e185fd362fe9e648d457.pdf\n\n第二个条目：\ntitle需要完整，应该是“Foundational challenges in assuring alignment and safety of large language models”\nauthor是多个名字，后面跟年份和链接。所以作者部分用逗号分隔，year是2024， journal可能是arXiv，web_url是给的链接。\n\n第三个条目：\n标题是“Program 

In [97]:
print(len(reference_chunk))
print(reference_chunk[0])

100
References
Anthony, T., Tian, Z., and Barber, D. Thinking fast and slow with deep learning and tree search.
Advances in neural information processing systems, 30, 2017. 8
Anthropic. Anthropics responsible scaling policy.
https://www.anthropic.com/index/
anthropics-responsible-scaling-policy, 2023. 1, 2, 4
Anthropic. Responsible scaling policy evaluations report – claude 3 opus. https://cdn.sanity.
io/files/4zrzovbb/website/210523b8e11b09c704c5e185fd362fe9e648d457.pdf,


In [98]:
# 用来存储参考文献的列表
references_result_list = []

# 用于去重的字典
seen_titles = set()
for chunk in reference_chunk[:5]:
    reference_info = process_references_with_model(chunk, model_name="deepseek-r1:7b")
    if reference_info:
        for ref in reference_info:
            title = ref.get("title")
            print(f'==debug== ref:title {title}')
            if ref.get("title") and title not in seen_titles:
                references_result_list.append(ref)
                # 将 title 加入去重集合
                seen_titles.add(title)

print(len(references_result_list))


==debug== ref:title Thinking Fast and Slow with Deep Learning and Tree Search
==debug== ref:title Anthropics Responsible Scaling Policy
==debug== ref:title Responsible Scaling Policy Evaluations Report – Claude 3 Opus
==debug== ref:title Responsible scaling policy evaluations report – claude 3 opus
==debug== ref:title Foundational challenges in assuring alignment and safety of large language models
==debug== ref:title Program synthesis with large language models
==debug== ref:title Program synthesis with large language models
==debug== ref:title Deepseek llm: Scaling open-source language models with longtermism
==debug== ref:title Pythia: A suite for analyzing large language models
==debug== ref:title Pythia: A suite for analyzing large language models across training and scaling
==debug== ref:title Language models are few-shot learners
==debug== ref:title information processing systems
==debug== ref:title Weak-to-strong generalization: Eliciting strong capabilities with weak supervisi