In [1]:
from utils import llm_utils

In [2]:
llm = llm_utils.get_llm()

In [3]:
# llm.invoke('介绍你自己')

In [4]:
llm_utils.run_generate_queries_chain('图在反欺诈领域的研究',llm)

['graph fraud detection',
 'graph neural network fraud',
 'GNN anti-fraud',
 'graph mining fraud detection',
 'network analysis fraud',
 'knowledge graph fraud',
 'heterogeneous graph fraud',
 'graph embedding fraud',
 'social network fraud detection',
 'financial fraud graph',
 'anomaly detection graph',
 'fraud prevention graph',
 'graph algorithms fraud',
 'relational learning fraud',
 'graph-based anomaly detection']

In [5]:
import os
import json
from typing import Dict, List
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain.chat_models.base import BaseChatModel
from langchain.prompts import ChatPromptTemplate
from mcp_prompts import query_prompt
from langchain_core.output_parsers.openai_tools import JsonOutputKeyToolsParser


# 加载环境变量（读取 .env 文件中的 DEEPSEEK_API_KEY）
load_dotenv(override=True)

def get_llm(model_name: str = "deepseek-chat", model_provider: str = "deepseek") -> BaseChatModel:
    """
    初始化一个 LLM（这里默认是 DeepSeek）。
    
    Args:
        model_name: 模型名称，例如 "deepseek-chat"
        model_provider: 模型提供商，例如 "deepseek"
    Returns:
        已初始化的 LangChain ChatModel
    """
    return init_chat_model(model=model_name, model_provider=model_provider)


import json
from langchain.schema import BaseOutputParser

class QueriesListParser(BaseOutputParser):
    def parse(self, text: str) -> list[str]:
        try:
            # 移除可能的 Markdown 包裹
            if text.startswith("```") and text.endswith("```"):
                text = "\n".join(text.split("\n")[1:-1])
            data = json.loads(text)
            queries = data.get("queries", [])
            if not isinstance(queries, list):
                raise ValueError(f"'queries' 不是列表: {queries}")
            return queries
        except Exception as e:
            print("解析 queries 失败:", e)
            print("原始文本:", text)
            return []


chat_prompt = ChatPromptTemplate.from_template(query_prompt.generate_search_query.content)
    
# 构建链式调用
parser = QueriesListParser()
querry_chain = chat_prompt | llm | parser

queries = querry_chain.invoke({"topic": "行为序列在反欺诈领域的研究"})
print(queries)
    

['behavior sequence fraud detection', 'user behavior sequence fraud', 'sequential behavior fraud detection', 'temporal sequence fraud detection', 'fraud detection behavioral patterns', 'sequence mining fraud detection', 'RNN fraud detection behavior', 'LSTM fraud detection sequence', 'Transformer fraud detection behavior', 'graph neural network fraud detection behavior', 'anomaly detection behavior sequence', 'financial fraud behavior sequence', 'e-commerce fraud behavior sequence', 'clickstream fraud detection', 'user action sequence fraud']


In [8]:
type(queries)

list

In [6]:
import arxiv

In [7]:
queries = querry_chain.invoke({"topic": "异常检测在反欺诈领域的研究"})
print(queries)

['fraud detection anomaly', 'anomaly detection fraud', 'financial fraud anomaly', 'credit card fraud anomaly', 'insurance fraud anomaly', 'banking fraud anomaly', 'transaction fraud anomaly', 'network intrusion anomaly', 'cybersecurity fraud anomaly', 'money laundering anomaly', 'outlier detection fraud', 'unsupervised fraud detection', 'semi-supervised fraud detection', 'deep learning fraud detection', 'machine learning fraud detection', 'GAN fraud detection', 'autoencoder fraud detection', 'isolation forest fraud', 'one-class SVM fraud', 'graph neural network fraud', 'time series anomaly fraud', 'real-time fraud detection', 'imbalanced learning fraud', 'fraud pattern recognition', 'behavioral analytics fraud']


In [None]:
import arxiv
import json
import time
import random

query = 'fraud detection behavior sequence'

print(query)
batch_size = 20
max_results = 10000  # 总共最多抓多少篇
save_path = "arxiv_results.json"

fetched = 0


def _process_paper(paper: arxiv.Result) -> Dict[str, Any]:
    """Process paper information with resource URI."""
    return {
        "id": paper.get_short_id(),
        "title": paper.title,
        "authors": [author.name for author in paper.authors],
        "abstract": paper.summary,
        "categories": paper.categories,
        "published": paper.published.isoformat(),
        "url": paper.pdf_url,
        "doi": paper.doi
    }
result_meta_data = []
# 创建搜索对象
search = arxiv.Search(
    query=query,
    max_results=max_results,
    sort_by=arxiv.SortCriterion.Relevance
)

client = arxiv.Client(page_size=batch_size, delay_seconds=3)

results_iter = client.results(search)

while True:
    try:
        paper = next(results_iter)
        paper_meta_data = _process_paper(paper)
        result_meta_data.append(paper_meta_data)
        fetched += 1

        # 每抓 batch_size 条就写入文件
        if fetched % batch_size == 0:
            # with open(save_path, "w", encoding="utf-8") as f:
            #     json.dump([{"title": t, "pdf_url": u} for t, u in zip(titles, pdf_urls)],
            #               f, ensure_ascii=False, indent=2)
            # print(f"✅ 已抓取 {fetched} 篇论文，已保存到 {save_path}")
            time.sleep(3 + random.random() * 2) # 防限流

    except StopIteration:
        break

    except Exception as e:  # 捕获空页等所有异常
        print(f"⚠️ 遇到异常: {e}，sleep 10秒后继续...")
        time.sleep(10)

# 写入最终结果
# with open(save_path, "w", encoding="utf-8") as f:
#     json.dump([{"title": t, "pdf_url": u} for t, u in zip(titles, pdf_urls)],
#               f, ensure_ascii=False, indent=2)

print(f"🎉 抓取完成，总共 {fetched} 篇论文")

In [9]:
search = arxiv.Search(
    query='behavior sequence fraud detection', max_results=10)
client = arxiv.Client(page_size=200, delay_seconds=3)

In [17]:
paper = next(client.results(arxiv.Search(id_list=['2201.01004v1'])))
paper.download_pdf(dirpath='/Users/zhangmin/Documents/3-大模型/5-项目/fraud_research_agent')

'/Users/zhangmin/Documents/3-大模型/5-项目/fraud_research_agent/2201.01004v1.Modeling_Users__Behavior_Sequences_with_Hierarchical_Explainable_Network_for_Cross_domain_Fraud_Detection.pdf'

In [23]:
r = {'entry_id':[],
    'updated':[],
     'published':[],
     'title':[],
     'authors':[],
     'summary':[],
     'comment':[],
     'journal_ref':[],
     'doi':[],
     'primary_category':[],
     'categories':[],
     'links':[]
}

for result in client.results(search):
    # print(result.entry_id)
    # print(result.updated)
    # print(result.published.isoformat())
    # print(result.title)
    # print([author.name for author in result.authors])
    # print(result.summary)
    # print(result.comment)
    # print(result.journal_ref)
    # print(result.doi)
    # print(result.primary_category)
    # print(result.categories)
    # print(result.pdf_url)
    
    # result.download_pdf('/Users/zhangmin/Documents/3-大模型/5-项目/fraud_research_agent')
    print(result._from_feed_entry)
    break
    # print(help(result))
    # print(result.get_short_id())
    # print(result)

<bound method Result._from_feed_entry of arxiv.Result(entry_id='http://arxiv.org/abs/2201.01004v1', updated=datetime.datetime(2022, 1, 4, 6, 37, 16, tzinfo=datetime.timezone.utc), published=datetime.datetime(2022, 1, 4, 6, 37, 16, tzinfo=datetime.timezone.utc), title="Modeling Users' Behavior Sequences with Hierarchical Explainable Network for Cross-domain Fraud Detection", authors=[arxiv.Result.Author('Yongchun Zhu'), arxiv.Result.Author('Dongbo Xi'), arxiv.Result.Author('Bowen Song'), arxiv.Result.Author('Fuzhen Zhuang'), arxiv.Result.Author('Shuai Chen'), arxiv.Result.Author('Xi Gu'), arxiv.Result.Author('Qing He')], summary="With the explosive growth of the e-commerce industry, detecting online\ntransaction fraud in real-world applications has become increasingly important\nto the development of e-commerce platforms. The sequential behavior history of\nusers provides useful information in differentiating fraudulent payments from\nregular ones. Recently, some approaches have been pr

In [11]:
def _process_paper(paper: arxiv.Result) -> Dict[str, Any]:
    """Process paper information with resource URI."""
    return {
        "id": paper.get_short_id(),
        "title": paper.title,
        "authors": [author.name for author in paper.authors],
        "abstract": paper.summary,
        "categories": paper.categories,
        "published": paper.published.isoformat(),
        "url": paper.pdf_url,
        "doi": paper.doi
    }

itertools.islice

In [24]:
import arxiv
import json
import time
import random
from typing import Dict, Any, List

# ===============================
# 配置
# ===============================
QUERY = "fraud detection behavior sequence"
BATCH_SIZE = 20          # 每次请求多少篇
CHUNK_SIZE = 2000        # 每批抓多少篇
TOTAL_LIMIT = 10000      # 总共最多抓多少篇
SAVE_PATH = "arxiv_results.json"

# ===============================
# 工具函数
# ===============================
def _process_paper(paper: arxiv.Result) -> Dict[str, Any]:
    """抽取论文关键信息"""
    return {
        "id": paper.get_short_id(),
        "title": paper.title.strip(),
        "authors": [author.name for author in paper.authors],
        "abstract": paper.summary.strip(),
        "categories": paper.categories,
        "published": paper.published.isoformat(),
        "url": paper.pdf_url,
        "doi": paper.doi
    }

def fetch_chunk(query: str, start: int, max_results: int) -> List[Dict[str, Any]]:
    """抓取一批论文"""
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        start=start,
        sort_by=arxiv.SortCriterion.Relevance
    )

    client = arxiv.Client(
        page_size=BATCH_SIZE,
        delay_seconds=3,
        num_retries=5
    )

    results = []
    for paper in client.results(search):
        results.append(_process_paper(paper))
        # 每篇随机 sleep，模拟正常用户
        time.sleep(random.uniform(1.5, 3.5))

    return results

# ===============================
# 主流程：分批抓取 + 合并
# ===============================
def fetch_all(query: str, total_limit: int, chunk_size: int, save_path: str):
    all_results = []
    start = 0

    while start < total_limit:
        batch_size = min(chunk_size, total_limit - start)
        print(f"\n📥 抓取第 {start} ~ {start+batch_size} 篇...")

        try:
            chunk_results = fetch_chunk(query, start, batch_size)
            all_results.extend(chunk_results)

            # 增量保存，防止中途崩溃丢数据
            with open(save_path, "w", encoding="utf-8") as f:
                json.dump(all_results, f, ensure_ascii=False, indent=2)

            print(f"✅ 已累计抓取 {len(all_results)} 篇论文，保存至 {save_path}")

        except Exception as e:
            print(f"⚠️ 批次抓取失败: {e}，等待 60 秒后重试...")
            time.sleep(60)
            continue

        start += batch_size

        # 批次之间也随机 sleep，进一步防止限流
        time.sleep(random.uniform(15, 30))

    print(f"\n🎉 抓取完成，总共 {len(all_results)} 篇论文，已保存至 {save_path}")


# ===============================
# 运行
# ===============================

fetch_all(QUERY, TOTAL_LIMIT, CHUNK_SIZE, SAVE_PATH)


📥 抓取第 0 ~ 2000 篇...
⚠️ 批次抓取失败: Search.__init__() got an unexpected keyword argument 'start'，等待 60 秒后重试...


KeyboardInterrupt: 

In [26]:
help(arxiv.Client)

Help on class Client in module arxiv:

class Client(builtins.object)
 |  Client(page_size: 'int' = 100, delay_seconds: 'float' = 3.0, num_retries: 'int' = 3)
 |  
 |  Specifies a strategy for fetching results from arXiv's API.
 |  
 |  This class obscures pagination and retry logic, and exposes
 |  `Client.results`.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, page_size: 'int' = 100, delay_seconds: 'float' = 3.0, num_retries: 'int' = 3)
 |      Constructs an arXiv API client with the specified options.
 |      
 |      Note: the default parameters should provide a robust request strategy
 |      for most use cases. Extreme page sizes, delays, or retries risk
 |      violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou),
 |      brittle behavior, and inconsistent results.
 |  
 |  __repr__(self) -> 'str'
 |      Return repr(self).
 |  
 |  __str__(self) -> 'str'
 |      Return str(self).
 |  
 |  results(self, search: 'Search', offset: 'int' = 0) -> 'Generator

In [27]:
help(client.results)

Help on method results in module arxiv:

results(search: 'Search', offset: 'int' = 0) -> 'Generator[Result, None, None]' method of arxiv.Client instance
    Uses this client configuration to fetch one page of the search results
    at a time, yielding the parsed `Result`s, until `max_results` results
    have been yielded or there are no more search results.
    
    If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`.
    
    Setting a nonzero `offset` discards leading records in the result set.
    When `offset` is greater than or equal to `search.max_results`, the full
    result set is discarded.
    
    For more on using generators, see
    [Generators](https://wiki.python.org/moin/Generators).



In [41]:
import arxiv
import json
import time
import random
from typing import Dict, Any, List

def _process_paper(paper: arxiv.Result) -> Dict[str, Any]:
    """格式化论文信息"""
    return {
        "id": paper.get_short_id(),
        "title": paper.title,
        "authors": [author.name for author in paper.authors],
        "abstract": paper.summary,
        "categories": paper.categories,
        "published": paper.published.isoformat(),
        "url": paper.pdf_url,
        "doi": paper.doi
    }


def fetch_arxiv_papers(query: str, batch_size: int = 50, max_results: int = 500):
    client = arxiv.Client(page_size=batch_size, delay_seconds=3, num_retries=3)
    # client = arxiv.Client()
    all_results = []
    offset = 0

    search = arxiv.Search(
        query=query,
        max_results=max_results,  # ✅ 设置为总目标
        sort_by=arxiv.SortCriterion.Relevance
    )

    while True:
        try:
            print(offset)
            results_iter = client.results(search, offset)
            batch = [_process_paper(p) for p in results_iter]

            if not batch:
                print(f"⚠️ offset={offset} 返回空页，可能到头了，停止抓取。")
                break

            all_results.extend(batch)
            print(f"✅ 已抓取 {len(all_results)} 篇 (offset={offset})")

            offset += batch_size

            if len(all_results) >= max_results:
                break

        except Exception as e:
            print(f"⚠️ 抓取失败: {e}，等待 60 秒后重试...")
            time.sleep(60)

    print(f"🎉 抓取完成，总共 {len(all_results)} 篇论文")
    return all_results


all_results = fetch_arxiv_papers("fraud detection behavior sequence", batch_size=20, max_results=100)


0
✅ 已抓取 100 篇 (offset=0)
🎉 抓取完成，总共 100 篇论文


In [31]:
query = "fraud detection behavior sequence"
max_results = 1000
search = arxiv.Search(
        query=query,
        max_results=max_results,  # ✅ 设置为总目标
        sort_by=arxiv.SortCriterion.Relevance
    )

In [33]:
search.results

<bound method Search.results of arxiv.Search(query='fraud detection behavior sequence', id_list=[], max_results=1000, sort_by=<SortCriterion.Relevance: 'relevance'>, sort_order=<SortOrder.Descending: 'descending'>)>

In [34]:
help(arxiv.Client)

Help on class Client in module arxiv:

class Client(builtins.object)
 |  Client(page_size: 'int' = 100, delay_seconds: 'float' = 3.0, num_retries: 'int' = 3)
 |  
 |  Specifies a strategy for fetching results from arXiv's API.
 |  
 |  This class obscures pagination and retry logic, and exposes
 |  `Client.results`.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, page_size: 'int' = 100, delay_seconds: 'float' = 3.0, num_retries: 'int' = 3)
 |      Constructs an arXiv API client with the specified options.
 |      
 |      Note: the default parameters should provide a robust request strategy
 |      for most use cases. Extreme page sizes, delays, or retries risk
 |      violating the arXiv [API Terms of Use](https://arxiv.org/help/api/tou),
 |      brittle behavior, and inconsistent results.
 |  
 |  __repr__(self) -> 'str'
 |      Return repr(self).
 |  
 |  __str__(self) -> 'str'
 |      Return str(self).
 |  
 |  results(self, search: 'Search', offset: 'int' = 0) -> 'Generator

In [44]:
help(client.results)

Help on method results in module arxiv:

results(search: 'Search', offset: 'int' = 0) -> 'Generator[Result, None, None]' method of arxiv.Client instance
    Uses this client configuration to fetch one page of the search results
    at a time, yielding the parsed `Result`s, until `max_results` results
    have been yielded or there are no more search results.
    
    If all tries fail, raises an `UnexpectedEmptyPageError` or `HTTPError`.
    
    Setting a nonzero `offset` discards leading records in the result set.
    When `offset` is greater than or equal to `search.max_results`, the full
    result set is discarded.
    
    For more on using generators, see
    [Generators](https://wiki.python.org/moin/Generators).



In [46]:
batch_size = 20
max_results = 100
client = arxiv.Client(page_size=batch_size, delay_seconds=3, num_retries=3)
search = arxiv.Search(
        query=query,
        max_results=max_results,  # ✅ 设置为总目标
        sort_by=arxiv.SortCriterion.Relevance
    )

results_iter = client.results(search, offset=0)

j = 0
i_list = []
for i in results_iter:
    i_list.append(i.get_short_id())
    if j%5 ==0:
        print(j)
    j+=1

In [68]:
import arxiv
import json
import time
import os

def fetch_arxiv_papers(query, batch_size=50, max_results=1000, output_file="arxiv_results.json"):
    """
    分批抓取 arXiv 论文，自动保存 JSON
    """
    # 初始化客户端和搜索
    client = arxiv.Client(page_size=batch_size, delay_seconds=3, num_retries=3)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )

    all_results = []

    # 如果已有文件，先读取已有数据
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            all_results = json.load(f)
        start_offset = len(all_results)
        print(f"发现已有 {start_offset} 条记录，将从 offset={start_offset} 继续抓取")
    else:
        start_offset = 0

    offset = start_offset
    while offset < max_results:
        try:
            results_iter = client.results(search, offset=offset)
            batch = []
            for j, result in enumerate(results_iter, 1):
                paper = {
                    "id": result.get_short_id(),
                    "title": result.title,
                    "authors": [a.name for a in result.authors],
                    "summary": result.summary,
                    "published": result.published.strftime("%Y-%m-%d"),
                    "updated": result.updated.strftime("%Y-%m-%d"),
                    "primary_category": result.primary_category,
                    "categories": result.categories,
                    "pdf_url": result.pdf_url
                }
                batch.append(paper)

                # 每 batch_size 条处理一次
                if j % batch_size == 0:
                    time.sleep(3 + random.random() * 2)
                    break

            if not batch:
                print(f"⚠️ offset={offset} 返回空页，等待 60 秒后重试...")
                time.sleep(60)
                continue

            all_results.extend(batch)
            offset += len(batch)

            # 保存到 JSON
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(all_results, f, ensure_ascii=False, indent=2)

            print(f"✅ 已抓取 {len(all_results)} 篇 (offset={offset})")

        except Exception as e:
            print(f"⚠️ 抓取失败: {e}, 等待 60 秒后重试...")
            time.sleep(60)
            continue

    print(f"🎉 抓取完成，总共 {len(all_results)} 篇论文，已保存到 {output_file}")
    return all_results



query = "fraud detection behavior sequence"
papers = fetch_arxiv_papers(query, batch_size=100, max_results=1000)


发现已有 205 条记录，将从 offset=205 继续抓取
✅ 已抓取 305 篇 (offset=305)
✅ 已抓取 405 篇 (offset=405)
✅ 已抓取 505 篇 (offset=505)
✅ 已抓取 605 篇 (offset=605)
✅ 已抓取 705 篇 (offset=705)
✅ 已抓取 805 篇 (offset=805)
⚠️ offset=805 返回空页，等待 60 秒后重试...


KeyboardInterrupt: 

In [116]:
import arxiv
import json
import time
from datetime import datetime, timedelta

def fetch_arxiv_auto_window(
    query,
    batch_size=50,
    save_path="arxiv_results.json",
    start_date=datetime(2020, 1, 1),
    max_retries=3,
    delay_seconds=3,
    window_days=1
):
    """
    自动按时间窗口抓取 Arxiv 论文，直到最新
    """
    client = arxiv.Client(page_size=batch_size, delay_seconds=delay_seconds, num_retries=max_retries)

    # 读取已有结果，支持断点续抓
    try:
        with open(save_path, "r", encoding="utf-8") as f:
            all_results = json.load(f)
            if all_results:
                # 已抓取的最新时间
                latest_date = max(datetime.fromisoformat(p['published']) for p in all_results)
                start_date = latest_date + timedelta(seconds=1)
    except FileNotFoundError:
        all_results = []

    # query_keywords = '((ti:"{query}") OR (abs:"{query}"))'.format(query=query)
    query_keywords = '((abs:fraud detection behavior sequence) OR (ti:fraud detection behavior sequence))'
    query_keywords = '((ti:fraud AND ti:detection AND ti:behavior AND ti:sequence) OR (abs:fraud AND abs:detection AND abs:behavior AND abs:sequence))'
    query_keywords = '((ti: fraud AND detection AND behavior AND sequence) OR (abs: fraud AND detection AND behavior AND sequence))'
    query_keywords = 'fraud AND detection AND behavior AND sequence'
    query_keywords = '((behavior AND sequence AND fraud AND detection) OR (user AND behavior AND sequence AND fraud) OR (sequential AND behavior AND fraud AND detection) OR (temporal AND sequence AND fraud AND detection) OR (fraud AND detection AND behavioral AND patterns) OR (sequence AND mining AND fraud AND detection) OR (RNN AND fraud AND detection AND behavior) OR (LSTM AND fraud AND detection AND sequence) OR (Transformer AND fraud AND detection AND behavior) OR (anomaly AND detection AND behavior AND sequence) OR (financial AND fraud AND behavior AND sequence) OR (e-commerce AND fraud AND behavior AND sequence) OR (clickstream AND fraud AND detection) OR (user AND action AND sequence AND fraud))'
    category_filter = ' AND cat:cs'
    
    today = datetime.utcnow()
    current_start = start_date

    while current_start < today:
        current_end = min(current_start + timedelta(days=window_days), today)
        time_filter = f" AND submittedDate:[{current_start.strftime('%Y%m%d0000')} TO {current_end.strftime('%Y%m%d2359')}]"
        # search_query = query + time_filter + category_filter
        search_query = query_keywords + time_filter

        print(f"\n⏳ 抓取时间窗口: {current_start.date()} -> {current_end.date()}")

        search = arxiv.Search(
            query=search_query,
            max_results=batch_size * 100,  # 设置足够大，分批抓取
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        offset = 0
        while True:
            retries = 0
            while retries < max_retries:
                try:
                    results_iter = client.results(search, offset=offset)
                    batch = []
                    for i, result in enumerate(results_iter):
                        batch.append({
                            "id": result.get_short_id(),
                            "title": result.title,
                            "authors": [a.name for a in result.authors],
                            "abstract": result.summary,
                            "categories": result.categories,
                            "published": result.published.isoformat(),
                            "updated": result.updated.isoformat(),
                            "url": result.pdf_url
                        })
                        if len(batch) >= batch_size:
                            break

                    if not batch:
                        # 空页结束当前时间窗口
                        break

                    # 保存增量结果
                    all_results.extend(batch)
                    with open(save_path, "w", encoding="utf-8") as f:
                        json.dump(all_results, f, ensure_ascii=False, indent=2)
                    
                    print(f"✅ 已抓取 {len(batch)} 篇 (offset={offset})")
                    offset += len(batch)
                    time.sleep(delay_seconds)
                    break  # 成功跳出重试

                except Exception as e:
                    retries += 1
                    wait_time = delay_seconds * 2 ** retries
                    print(f"⚠️ 抓取失败: {e}, 重试 {retries}/{max_retries}, 等待 {wait_time} 秒...")
                    time.sleep(wait_time)
            else:
                print(f"❌ 多次重试仍失败，跳过 offset={offset}")
                offset += batch_size

            # 当抓取结果少于 batch_size，说明该窗口抓完
            if len(batch) < batch_size:
                break

        current_start = current_end

    print(f"\n🎉 抓取完成，总共 {len(all_results)} 篇论文，已保存到 {save_path}")
    return all_results


# ===========================
# 示例调用
# ===========================

papers = fetch_arxiv_auto_window(
    query="fraud detection behavior sequence",
    batch_size=20,
    start_date=datetime(2000, 1, 1),
    save_path="arxiv_results.json",
    window_days=30
)



⏳ 抓取时间窗口: 2000-01-01 -> 2000-01-31

⏳ 抓取时间窗口: 2000-01-31 -> 2000-03-01

⏳ 抓取时间窗口: 2000-03-01 -> 2000-03-31

⏳ 抓取时间窗口: 2000-03-31 -> 2000-04-30

⏳ 抓取时间窗口: 2000-04-30 -> 2000-05-30

⏳ 抓取时间窗口: 2000-05-30 -> 2000-06-29

⏳ 抓取时间窗口: 2000-06-29 -> 2000-07-29

⏳ 抓取时间窗口: 2000-07-29 -> 2000-08-28

⏳ 抓取时间窗口: 2000-08-28 -> 2000-09-27

⏳ 抓取时间窗口: 2000-09-27 -> 2000-10-27

⏳ 抓取时间窗口: 2000-10-27 -> 2000-11-26

⏳ 抓取时间窗口: 2000-11-26 -> 2000-12-26

⏳ 抓取时间窗口: 2000-12-26 -> 2001-01-25

⏳ 抓取时间窗口: 2001-01-25 -> 2001-02-24

⏳ 抓取时间窗口: 2001-02-24 -> 2001-03-26

⏳ 抓取时间窗口: 2001-03-26 -> 2001-04-25

⏳ 抓取时间窗口: 2001-04-25 -> 2001-05-25

⏳ 抓取时间窗口: 2001-05-25 -> 2001-06-24

⏳ 抓取时间窗口: 2001-06-24 -> 2001-07-24

⏳ 抓取时间窗口: 2001-07-24 -> 2001-08-23

⏳ 抓取时间窗口: 2001-08-23 -> 2001-09-22

⏳ 抓取时间窗口: 2001-09-22 -> 2001-10-22

⏳ 抓取时间窗口: 2001-10-22 -> 2001-11-21

⏳ 抓取时间窗口: 2001-11-21 -> 2001-12-21

⏳ 抓取时间窗口: 2001-12-21 -> 2002-01-20

⏳ 抓取时间窗口: 2002-01-20 -> 2002-02-19

⏳ 抓取时间窗口: 2002-02-19 -> 2002-03-21

⏳ 抓取时间窗口: 2002-03-21 -> 200

In [None]:
# 原始关键词
query_keywords = '(ti:"{query}") OR (abs:"{query}")'.format(query=query)

# 时间窗口
time_filter = ' AND submittedDate:[2023-01-01 TO 2025-09-08]'

# 指定类别
category_filter = ' AND (cat:cs.CR OR cat:stat.ML)'

# 最终 arXiv 查询
search_query = query_keywords + time_filter + category_filter

In [117]:
x = ['behavior sequence fraud detection', 'user behavior sequence fraud', 'sequential behavior fraud detection', 'temporal sequence fraud detection', 'fraud detection behavioral patterns', 'sequence mining fraud detection', 'RNN fraud detection behavior', 'LSTM fraud detection sequence', 'Transformer fraud detection behavior', 'anomaly detection behavior sequence', 'financial fraud behavior sequence', 'e-commerce fraud behavior sequence', 'clickstream fraud detection', 'user action sequence fraud']
# x = [' AND '.join(i.split(' ')) for i in x]

In [114]:
x

['behavior AND sequence AND fraud AND detection',
 'user AND behavior AND sequence AND fraud',
 'sequential AND behavior AND fraud AND detection',
 'temporal AND sequence AND fraud AND detection',
 'fraud AND detection AND behavioral AND patterns',
 'sequence AND mining AND fraud AND detection',
 'RNN AND fraud AND detection AND behavior',
 'LSTM AND fraud AND detection AND sequence',
 'Transformer AND fraud AND detection AND behavior',
 'anomaly AND detection AND behavior AND sequence',
 'financial AND fraud AND behavior AND sequence',
 'e-commerce AND fraud AND behavior AND sequence',
 'clickstream AND fraud AND detection',
 'user AND action AND sequence AND fraud']

In [118]:
query = x
query_keywords = '(('+') OR ('.join([' AND '.join(i.split(' ')) for i in query])+'))'

In [119]:
query_keywords

'((behavior AND sequence AND fraud AND detection) OR (user AND behavior AND sequence AND fraud) OR (sequential AND behavior AND fraud AND detection) OR (temporal AND sequence AND fraud AND detection) OR (fraud AND detection AND behavioral AND patterns) OR (sequence AND mining AND fraud AND detection) OR (RNN AND fraud AND detection AND behavior) OR (LSTM AND fraud AND detection AND sequence) OR (Transformer AND fraud AND detection AND behavior) OR (anomaly AND detection AND behavior AND sequence) OR (financial AND fraud AND behavior AND sequence) OR (e-commerce AND fraud AND behavior AND sequence) OR (clickstream AND fraud AND detection) OR (user AND action AND sequence AND fraud))'

In [115]:
') OR ('.join(x)

'behavior AND sequence AND fraud AND detection) OR (user AND behavior AND sequence AND fraud) OR (sequential AND behavior AND fraud AND detection) OR (temporal AND sequence AND fraud AND detection) OR (fraud AND detection AND behavioral AND patterns) OR (sequence AND mining AND fraud AND detection) OR (RNN AND fraud AND detection AND behavior) OR (LSTM AND fraud AND detection AND sequence) OR (Transformer AND fraud AND detection AND behavior) OR (anomaly AND detection AND behavior AND sequence) OR (financial AND fraud AND behavior AND sequence) OR (e-commerce AND fraud AND behavior AND sequence) OR (clickstream AND fraud AND detection) OR (user AND action AND sequence AND fraud'

In [None]:
((behavior AND sequence AND fraud AND detection) OR (user AND behavior AND sequence AND fraud) OR (sequential AND behavior AND fraud AND detection) OR (temporal AND sequence AND fraud AND detection) OR (fraud AND detection AND behavioral AND patterns) OR (sequence AND mining AND fraud AND detection) OR (RNN AND fraud AND detection AND behavior) OR (LSTM AND fraud AND detection AND sequence) OR (Transformer AND fraud AND detection AND behavior) OR (anomaly AND detection AND behavior AND sequence) OR (financial AND fraud AND behavior AND sequence) OR (e-commerce AND fraud AND behavior AND sequence) OR (clickstream AND fraud AND detection) OR (user AND action AND sequence AND fraud))

In [1]:
from mcp_tools import arxiv_tool

In [3]:
# query = ['behavior sequence fraud detection', 'user behavior sequence fraud', 'sequential behavior fraud detection', 'temporal sequence fraud detection', 'fraud detection behavioral patterns', 'sequence mining fraud detection', 'RNN fraud detection behavior', 'LSTM fraud detection sequence', 'Transformer fraud detection behavior', 'anomaly detection behavior sequence', 'financial fraud behavior sequence', 'e-commerce fraud behavior sequence', 'clickstream fraud detection', 'user action sequence fraud']
# query_result = arxiv_tool.search_arxiv(query)

In [2]:
from utils.llm_utils import *

In [1]:
import json

# 打开文件并解析 JSON
with open("data/raw/arxiv_results.json", "r", encoding="utf-8") as f:
    papers = json.load(f)

print(type(papers))
print(len(papers))  # 如果是list，输出论文数量


<class 'list'>
127


In [6]:
papers[0]

{'id': '2001.04734v1',
 'title': 'Change Detection in Dynamic Attributed Networks',
 'authors': ['Isuru Udayangani Hewapathirana'],
 'abstract': 'A network provides powerful means of representing complex relationships\nbetween entities by abstracting entities as vertices, and relationships as\nedges connecting vertices in a graph. Beyond the presence or absence of\nrelationships, a network may contain additional information that can be\nattributed to the entities and their relationships. Attaching these additional\nattribute data to the corresponding vertices and edges yields an attributed\ngraph. Moreover, in the majority of real-world applications, such as online\nsocial networks, financial networks and transactional networks, relationships\nbetween entities evolve over time.\n  Change detection in dynamic attributed networks is an important problem in\nmany areas, such as fraud detection, cyber intrusion detection and health care\nmonitoring. It is a challenging problem because it i

In [3]:
paper_classified = run_classification_chain(papers[0])

In [4]:
paper_classified

{'data_source_type': ['social networks',
  'financial networks',
  'transactional networks'],
 'data_source_name': [],
 'fraud_type': 'change detection in dynamic attributed networks',
 'technical_approach_category': ['survey',
  'anomaly detection',
  'graph analysis'],
 'technical_approach_method': [],
 'technical_approach_description': 'This survey paper provides an overview of existing change detection methods in dynamic attributed networks that utilize attribute information. Methods are categorized based on the levels of graph structure exploited: vertices, edges, subgraphs, communities, and overall graph. The paper analyzes strengths, weaknesses, performance, and scalability of these approaches.',
 'innovation_points': 'Categorization of change detection methods based on structural levels in attributed graphs, comprehensive analysis of performance and scalability trade-offs, and overview of available datasets and simulation models for dynamic attributed networks.',
 'github_repo'

In [5]:
# 合并字典
merged_dict = {**papers[0], **paper_classified}  # 如果键重复，dict2的值会覆盖dict1

# 转换回JSON字符串
merged_json = json.dumps(merged_dict)


In [6]:
merged_json

'{"id": "2001.04734v1", "title": "Change Detection in Dynamic Attributed Networks", "authors": ["Isuru Udayangani Hewapathirana"], "abstract": "A network provides powerful means of representing complex relationships\\nbetween entities by abstracting entities as vertices, and relationships as\\nedges connecting vertices in a graph. Beyond the presence or absence of\\nrelationships, a network may contain additional information that can be\\nattributed to the entities and their relationships. Attaching these additional\\nattribute data to the corresponding vertices and edges yields an attributed\\ngraph. Moreover, in the majority of real-world applications, such as online\\nsocial networks, financial networks and transactional networks, relationships\\nbetween entities evolve over time.\\n  Change detection in dynamic attributed networks is an important problem in\\nmany areas, such as fraud detection, cyber intrusion detection and health care\\nmonitoring. It is a challenging problem bec

In [12]:
import json

# 打开文件并解析 JSON
with open("data/raw/arxiv_results.json", "r", encoding="utf-8") as f:
    papers = json.load(f)

paper_classification = []
for i, paper in enumerate(papers):
    if i%10 == 0:
        print(i)
    classification = run_classification_chain(paper)
    merged_dict = {**paper, **classification}
    merged_json = json.dumps(merged_dict)
    
    paper_classification.append(merged_json)

0


KeyboardInterrupt: 

In [13]:
len(paper_classification)

0

In [14]:
from utils.llm_utils import get_llm

In [15]:
llm = get_llm()

In [16]:
import json
with open("data/processed/arxiv_results.json", "r", encoding="utf-8") as f:
    papers = json.load(f)

In [17]:
len(papers)

127

In [26]:
query = '行为序列在反欺诈领域的研究'
filtered_papers = []
for paper in papers[0:5]:
    papers_str = json.dumps(paper, ensure_ascii=False, indent=2)
    print(paper['title'])
    prompt = "判断这篇论文是否与用户查询 '{query}' 相关: '{paper}'，相关输出为1，否则输出0".format(query=query, paper=papers_str)
    match = llm.invoke(prompt)
    if match:
        print(paper['title'])
        filtered_papers.append(paper)

Change Detection in Dynamic Attributed Networks
Change Detection in Dynamic Attributed Networks
Detecting Deep-Fake Videos from Appearance and Behavior
Detecting Deep-Fake Videos from Appearance and Behavior
Multi-IF : An Approach to Anomaly Detection in Self-Driving Systems
Multi-IF : An Approach to Anomaly Detection in Self-Driving Systems
Sequential Anomaly Detection using Inverse Reinforcement Learning
Sequential Anomaly Detection using Inverse Reinforcement Learning
Analyze and Development System with Multiple Biometric Identification
Analyze and Development System with Multiple Biometric Identification


In [54]:
filtered_papers = papers
clustering_prompt = f"""
    我有 {len(filtered_papers)} 篇论文, 每篇论文包含三个字段:
    data_source_type, fraud_type, technical_approach_category。
    请将每个字段的值聚类成大约10类，不要超过20类，并统计每类数量。
    返回 JSON 格式:
    {{
        "data_source_type": {{}},
        "fraud_type": {{}},
        "technical_approach_category": {{}}
    }}
    """

filtered_papers_str = json.dumps(filtered_papers, ensure_ascii=False, indent=2)
clustering_prompt += "\n" + str(filtered_papers)
cluster_stats = llm.invoke(clustering_prompt)

In [55]:
raw_str = cluster_stats.content
clean_str = raw_str.strip()
if clean_str.startswith("```json"):
    clean_str = clean_str[len("```json"):].strip()
if clean_str.endswith("```"):
    clean_str = clean_str[:-3].strip()

In [56]:
clean_json = json.loads(clean_str)['data_source_type']
i = 0
for k in clean_json:
    i+=int(clean_json[k])
print(i)

164


In [57]:
clean_json = json.loads(clean_str)['fraud_type']
i = 0
for k in clean_json:
    i+=int(clean_json[k])
print(i)

144


In [58]:
clean_json = json.loads(clean_str)['technical_approach_category']
i = 0
for k in clean_json:
    i+=int(clean_json[k])
print(i)

308


In [59]:
json.loads(clean_str)['technical_approach_category']

{'anomaly detection': 42,
 'deep learning/neural networks': 38,
 'graph neural networks (GNN)': 27,
 'sequence modeling/RNN/LSTM': 25,
 'unsupervised learning': 22,
 'feature engineering': 19,
 'Transformer/attention mechanisms': 18,
 'reinforcement learning': 9,
 'probabilistic modeling': 8,
 'clustering/pattern mining': 12,
 'explainable AI/interpretability': 11,
 'transfer learning/domain adaptation': 10,
 'multimodal fusion': 13,
 'self-supervised learning': 11,
 'generative models (GAN/VAE)': 9,
 'statistical methods': 8,
 'ensemble methods': 7,
 'computer vision': 11,
 'natural language processing': 8}

In [4]:
from agent import paper_search_agent

ModuleNotFoundError: No module named 'fraud_research_agent'