# Write the code for the following task; a well-structured framework is needed


**Aim**: classify articles based on their abstracts

**Structure**: use an LLM to automatically create tags based on the abstract

     1 Read the abstract and category from the DB.

     2 The category is the first-level tag; too many articles share the same category, so loop over all categories

        2.1 In the loop, for each category, label all of its articles with multiple tags using an OpenAI-compatible LLM API served by the vLLM engine,
        2.2 there can be 10~20 tags, and they should capture the essence of the abstract,
        2.3 add new column(s) to the database as needed; store all tags in a single column
        
     3 Now take a new paper abstract, analyze its keywords, and search the database to find the most relevant papers; apply a threshold to get rid of irrelevant papers


In [2]:
# DB connection & sample read (PostgreSQL)

import psycopg2
from psycopg2 import sql

HOST = 'localhost'  # database host
PORT = '5432'  # default PostgreSQL port
USER = 'postgres'  # administrator user name (used to create the database)
PASSWORD = '123456'  # administrator password
DB_NAME = 'arxiv_db'  # name of the database
DB_USER = 'postgres'  # user this notebook connects as
DB_USER_PASSWORD = '123456'  # password of that user
# NOTE(review): credentials are hard-coded in the notebook; consider reading
# them from the environment before sharing this file.

# Sanity check: connect, read every paper that still needs tags, and print one
# sample record. Errors are reported but not re-raised so the notebook keeps
# running.
conn = None
try:
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_USER_PASSWORD,
        host=HOST,
        port=PORT
    )
    # The cursor context manager closes the cursor even if the query fails.
    with conn.cursor() as cur:
        # Read abstracts and first-level category labels of untagged papers.
        cur.execute("""
        SELECT id, abstract, categories FROM arxiv_papers
        WHERE abstract IS NOT NULL AND categories IS NOT NULL AND llm_tags IS NULL;
    """)
        papers = cur.fetchall()

    # Example: print the first record only (abstract truncated to 200 chars).
    for paper in papers[:1]:
        print(f"ID: {paper[0]}\nCategory: {paper[2]}\nAbstract: {paper[1][:200]}...\n{'-'*40}")
except Exception as e:
    print(f"Error reading abstracts from database: {e}")
finally:
    # Previously the connection was only closed on the success path, so any
    # failure after connect() leaked it.
    if conn is not None:
        conn.close()

Error reading abstracts from database: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?



In [4]:

import requests
import aiohttp
import asyncio
import re

def remove_think_tag(text):
    """
    Strip every ``<think>...</think>`` block (and the whitespace following it) from *text*.

    The match is non-greedy so that two separate think blocks are removed
    individually; a greedy match would also delete the real content that sits
    between the first ``<think>`` and the last ``</think>``.

    Args:
        text: Raw model output.
    Returns:
        The text with all think blocks removed.
    """
    return re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)


async def async_generate_tags_with_llm(abstracts_batch, n_tags=20, session=None):
    """
    Ask the OpenAI-compatible chat endpoint at localhost:8889 for tags describing one paper.

    Args:
        abstracts_batch: A single record ``(id, abstract, category)``; only the
            abstract (index 1) is inserted into the prompt.
        n_tags: Upper bound on the number of tags requested in the prompt
            (the reply is not truncated to this number).
        session: An ``aiohttp.ClientSession`` to reuse. When ``None`` a
            temporary session is opened for this single request.

    Returns:
        List of tag strings parsed from the comma-separated reply, or an empty
        list if the request or the parsing failed (the error is printed).
    """
    if session is None:
        # The default used to reach `session.post` as None, which raised an
        # AttributeError that the handler below swallowed, so a call without a
        # session could never succeed.
        async with aiohttp.ClientSession() as own_session:
            return await async_generate_tags_with_llm(abstracts_batch, n_tags, own_session)

    # The whole instruction is sent as a single user message; "/no_think" is
    # part of the prompt text sent to the model.
    prompt = (
        f"Read the following scientific abstract and generate less than {n_tags} concise, relevant tags "
        f"that capture its main topics and concepts. Return the tags as a comma-separated list.\n\n "
        f"Abstract:{abstracts_batch[1]}  Tags: \n\n /no_think"
    )
    payload = {
        "model": "/models/Qwen3-8B",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1280,
        "temperature": 0.3,
    }
    try:
        async with session.post(
            "http://localhost:8889/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            # aiohttp documents `timeout` as a ClientTimeout object rather
            # than a bare number of seconds.
            timeout=aiohttp.ClientTimeout(total=30),
        ) as response:
            response.raise_for_status()
            result = await response.json()
            tag_text = remove_think_tag(result["choices"][0]["message"]["content"])
            return [tag.strip() for tag in tag_text.split(",") if tag.strip()]
    except Exception as e:
        # Best effort: a failed paper yields no tags instead of aborting the batch.
        print(f"LLM tag generation failed: {e}")
        return []
    

async def batch_generate_tags_with_llm(abstracts_batch, n_tags=20):
    """
    Tag a batch of papers concurrently over one shared HTTP session.

    Args:
        abstracts_batch: Sequence of ``(paper_id, abstract, category)`` records.
        n_tags: Upper bound on tags per paper, forwarded to the prompt.

    Returns:
        ``[(paper_id, category, tags), ...]`` in the same order as the input.
        ``tags`` is an empty list for papers whose request failed.
    """
    # At most 50 requests are in flight at once. The semaphore used to be
    # created but never acquired, so it did not limit anything.
    semaphore = asyncio.Semaphore(50)

    async def _tag_one(record, session):
        async with semaphore:
            return await async_generate_tags_with_llm(record, n_tags, session)

    async with aiohttp.ClientSession() as session:
        tags_list = await asyncio.gather(
            *(_tag_one(record, session) for record in abstracts_batch)
        )

    # Assemble the result, dropping the abstract text.
    return [
        (paper_id, category, tags)
        for (paper_id, _, category), tags in zip(abstracts_batch, tags_list)
    ]


from typing import List, Tuple

def store_tags_in_db(tagged_papers: List[Tuple[str, str, List[str]]]) -> None:
    """
    Store the generated tags into the original arxiv_papers table.

    Adds a TEXT column 'llm_tags' if it does not exist, then writes each
    paper's tags as one ", "-joined string. Papers with an empty tag list are
    skipped so their llm_tags stays NULL: an empty string would otherwise mark
    a failed paper as done, because untagged papers are selected with
    ``llm_tags IS NULL``.

    Errors are printed, not raised; the connection is always closed.

    Args:
        tagged_papers: List of tuples (paper_id, category, tags). The category
            is not written.
    Returns:
        None
    """
    conn = None
    try:
        conn = psycopg2.connect(
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_USER_PASSWORD,
            host=HOST,
            port=PORT
        )
        with conn.cursor() as cur:
            # Add the 'llm_tags' column if it doesn't exist
            cur.execute("""
            DO $$
            BEGIN
                IF NOT EXISTS (
                    SELECT 1 FROM information_schema.columns 
                    WHERE table_name='arxiv_papers' AND column_name='llm_tags'
                ) THEN
                    ALTER TABLE arxiv_papers ADD COLUMN llm_tags TEXT;
                END IF;
            END$$;
        """)
            conn.commit()

            # One (tags_str, paper_id) parameter pair per paper that has tags.
            updates = [
                (", ".join(tags), paper_id)
                for paper_id, _category, tags in tagged_papers
                if tags
            ]
            if updates:
                cur.executemany(
                    "UPDATE arxiv_papers SET llm_tags = %s WHERE id = %s;",
                    updates
                )
        conn.commit()
        print("Tags successfully stored in the database.")
    except Exception as e:
        print(f"Error storing tags in database: {e}")
    finally:
        # Closing without commit discards any partial batch; previously the
        # connection leaked whenever an update raised.
        if conn is not None:
            conn.close()

In [5]:
import nest_asyncio
from pprint import pprint
import time

nest_asyncio.apply()

BATCH_SIZE = 50

def fetch_papers_batch(offset, limit):
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_USER_PASSWORD,
        host=HOST,
        port=PORT
    )
    cur = conn.cursor()
    cur.execute("""
        SELECT id, abstract, categories FROM arxiv_papers
        WHERE abstract IS NOT NULL AND categories IS NOT NULL AND llm_tags IS NULL
        ORDER BY id
        OFFSET %s LIMIT %s;
    """, (offset, limit))
    batch = cur.fetchall()
    cur.close()
    conn.close()
    return batch

start_time = time.time()

offset = 0
while offset<1000000:
    papers_batch = fetch_papers_batch(offset, BATCH_SIZE)
    if not papers_batch:
        break
    tagged_batch = await batch_generate_tags_with_llm(papers_batch, n_tags=20)
    store_tags_in_db(tagged_batch)
    print(f"Processed batch offset {offset}")
    offset += BATCH_SIZE

elapsed = time.time() - start_time
print(f"✅ Completed in {elapsed:.2f} seconds")



OperationalError: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?


In [6]:
import psycopg2

HOST = 'localhost'  # database host
PORT = '5432'  # default PostgreSQL port
USER = 'postgres'  # administrator user name (used to create the database)
PASSWORD = '123456'  # administrator password
DB_NAME = 'arxiv_db'  # name of the database
DB_USER = 'postgres'  # user this notebook connects as
DB_USER_PASSWORD = '123456'  # password of that user


def _run_arxiv_query(query, fetch=None, commit=False):
    """
    Run one SQL statement on a fresh connection and always close it.

    Args:
        query: SQL text to execute (no parameters).
        fetch: "all" to return fetchall(), "one" to return fetchone(),
            None to return nothing.
        commit: Commit after executing (needed for UPDATE statements).

    Returns:
        The fetched rows/row, or None when ``fetch`` is None.
    """
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_USER_PASSWORD,
        host=HOST,
        port=PORT
    )
    try:
        with conn.cursor() as cur:
            cur.execute(query)
            if fetch == "all":
                result = cur.fetchall()
            elif fetch == "one":
                result = cur.fetchone()
            else:
                result = None
        if commit:
            conn.commit()
        return result
    finally:
        # The original steps closed the connection only on success.
        conn.close()


# 1) How many papers already have llm_tags.
try:
    rows = _run_arxiv_query("""
        SELECT id, categories, llm_tags
        FROM arxiv_papers
        WHERE llm_tags IS NOT NULL;
    """, fetch="all")
    print(f"rows 共返回 {len(rows)} 条")
except Exception as e:
    print(f"Error fetching papers with LLM tags: {e}")

# 2) Show the 10 most recent non-empty llm_tags rows.
# NOTE(review): rows are ordered by `update_date`; presumably that is the
# paper's own update date, not the time llm_tags was written - confirm, or add
# a dedicated timestamp column if "last written" is what is wanted.
try:
    last_written_rows = _run_arxiv_query("""
        SELECT id, categories, llm_tags
        FROM arxiv_papers
        WHERE llm_tags IS NOT NULL AND TRIM(llm_tags) <> ''
        ORDER BY arxiv_papers.update_date DESC
        LIMIT 10;
    """, fetch="all")
    print("最后写入的10条llm_tags：")
    for row in last_written_rows:
        print(f"ID: {row[0]}, Categories: {row[1]}, LLM Tags: {row[2]}")
except Exception as e:
    print(f"Error fetching last written llm_tags: {e}")

# 3) Count rows whose llm_tags is set but blank (left behind by failed LLM calls).
try:
    count = _run_arxiv_query("""
        SELECT COUNT(*) FROM arxiv_papers
        WHERE llm_tags IS NOT NULL AND TRIM(llm_tags) = '';
    """, fetch="one")[0]
    print(f"llm_tags为非空但内容为空字符串的条数: {count}")
except Exception as e:
    print(f"Error counting empty llm_tags: {e}")

# 4) Reset those blank rows to NULL so the tagging loop picks them up again.
try:
    _run_arxiv_query("""
        UPDATE arxiv_papers
        SET llm_tags = NULL
        WHERE llm_tags IS NOT NULL AND TRIM(llm_tags) = '';
    """, commit=True)
    print(f"已将llm_tags为非空但内容为空字符串的行恢复为NULL")
except Exception as e:
    print(f"Error updating empty llm_tags to NULL: {e}")

Error fetching papers with LLM tags: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?

Error fetching last written llm_tags: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?

Error counting empty llm_tags: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?

Err

Error counting empty llm_tags: name 'psycopg2' is not defined


In [None]:
# Sample abstract text used as manual test input for tag generation.
test_abstract=""" Scientific progress depends on researchers’ ability to synthesize the growing body
of literature. Can large language models (LMs) assist scientists in this task? We
introduce OPENSCHOLAR, a specialized retrieval-augmented LM that answers
scientific queries by identifying relevant passages from 45 million open-access
papers and synthesizing citation-backed responses. To evaluate OPENSCHOLAR,
we develop SCHOLARQABENCH, the first large-scale multi-domain benchmark
for literature search, comprising 2,967 expert-written queries and 208 long-form
answers across computer science, physics, neuroscience, and biomedicine. On
SCHOLARQABENCH, OPENSCHOLAR-8B outperforms GPT-4o by 5% and PaperQA2 by 7% in correctness, despite being a smaller, open model. While GPT4o
hallucinates citations 78–90% of the time, OPENSCHOLAR achieves citation accuracy on par with human experts. OPENSCHOLAR’s datastore, retriever, and
self-feedback inference loop also improves off-the-shelf LMs: for instance, OPENSCHOLAR-GPT4o improves GPT-4o’s correctness by 12%. In human evaluations,
experts preferred OPENSCHOLAR-8B and OPENSCHOLAR-GPT4o responses over
expert-written ones 51% and 70% of the time, respectively, compared to GPT4o’s
32%. We open-source all of our code, models, datastore, data and a public demo."""

In [None]:
import psycopg2
import aiohttp
import asyncio
import re
import nest_asyncio
from pprint import pprint
import time

# Lets asyncio.run() be called further down even when an event loop is
# already running (as it is inside a notebook).
nest_asyncio.apply()

# Database connection parameters
# NOTE(review): not referenced by the code visible below in this cell, which
# only talks to the LLM endpoint.
HOST = 'localhost'
PORT = '5432'
DB_NAME = 'arxiv_db'
DB_USER = 'postgres'
DB_USER_PASSWORD = '123456'

def remove_think_tag(text):
    """
    Strip every ``<think>...</think>`` block (and the whitespace following it) from *text*.

    The match is non-greedy so that two separate think blocks are removed
    individually; a greedy match would also delete the real content that sits
    between the first ``<think>`` and the last ``</think>``.
    """
    return re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)

async def generate_tags_for_abstract(abstract, n_tags=20):
    """
    Generate tags for a single abstract via the chat endpoint at localhost:8889.

    Opens its own aiohttp session for the one request.

    Args:
        abstract: Abstract text inserted into the prompt.
        n_tags: Upper bound on the number of tags requested in the prompt
            (the reply is not truncated to this number).

    Returns:
        List of tag strings parsed from the comma-separated reply, or an empty
        list if the request or the parsing failed (the error is printed).
    """
    # The instruction is sent as a single user message; "/no_think" is part of
    # the prompt text sent to the model.
    prompt = (
        f"Read the following scientific abstract and generate less than {n_tags} concise, relevant tags "
        f"that capture its main topics and concepts. Return the tags as a comma-separated list.\n\n "
        f"Abstract:{abstract}  Tags: \n\n /no_think"
    )
    payload = {
        "model": "/models/Qwen3-8B",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1280,
        "temperature": 0.3,
    }
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                "http://localhost:8889/v1/chat/completions",
                headers={"Content-Type": "application/json"},
                json=payload,
                # aiohttp documents `timeout` as a ClientTimeout object rather
                # than a bare number of seconds.
                timeout=aiohttp.ClientTimeout(total=30),
            ) as response:
                response.raise_for_status()
                result = await response.json()
                tag_text = remove_think_tag(result["choices"][0]["message"]["content"])
                return [tag.strip() for tag in tag_text.split(",") if tag.strip()]
        except Exception as e:
            # Best effort: report the failure and return no tags.
            print(f"LLM tag generation failed: {e}")
            return []

def test_single_abstract(abstract, n_tags=20):
    """Run the tag generator on one abstract, print the resulting tags, and return them."""
    generated = asyncio.run(generate_tags_for_abstract(abstract, n_tags))
    # Header and list go on separate lines.
    print("Generated Tags:", generated, sep="\n")
    return generated

# Example usage: a sample abstract used as manual test input.
test_abstract = """ Scientific progress depends on researchers’ ability to synthesize the growing body
of literature. Can large language models (LMs) assist scientists in this task? We
introduce OPENSCHOLAR, a specialized retrieval-augmented LM that answers
scientific queries by identifying relevant passages from 45 million open-access
papers and synthesizing citation-backed responses. To evaluate OPENSCHOLAR,
we develop SCHOLARQABENCH, the first large-scale multi-domain benchmark
for literature search, comprising 2,967 expert-written queries and 208 long-form
answers across computer science, physics, neuroscience, and biomedicine. On
SCHOLARQABENCH, OPENSCHOLAR-8B outperforms GPT-4o by 5% and PaperQA2 by 7% in correctness, despite being a smaller, open model. While GPT4o
hallucinates citations 78–90% of the time, OPENSCHOLAR achieves citation accuracy on par with human experts. OPENSCHOLAR’s datastore, retriever, and
self-feedback inference loop also improves off-the-shelf LMs: for instance, OPENSCHOLAR-GPT4o improves GPT-4o’s correctness by 12%. In human evaluations,
experts preferred OPENSCHOLAR-8B and OPENSCHOLAR-GPT4o responses over
expert-written ones 51% and 70% of the time, respectively, compared to GPT4o’s
32%. We open-source all of our code, models, datastore, data and a public demo."""

# Run the test: sends the sample abstract to the LLM endpoint and prints the tags.
test_single_abstract(test_abstract, n_tags=20)

 