In [1]:
import pathlib
import re

import pandas as pd

# Opt in to pandas' future "no silent downcasting" behaviour, as suggested by
# the FutureWarning message.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Directory (relative path) that holds the CSV files this notebook reads and writes.
DATA_DIR = pathlib.Path('../data')

# 4. LLM関連論文の判定
LLM に関連する論文を判定します。  
判定方法は [Topics, Authors, and Networks in Large Language Model Research: Trends from a Survey of 17K arXiv Papers](https://arxiv.org/abs/2307.10700) の手法を参考にしています。  

In [3]:
# Input CSV loaded with pd.read_csv in the next cell.
DATA_PATH = DATA_DIR / 'papers_med.csv'

# Output CSV written by the final cell of this notebook.
SAVE_PATH = DATA_DIR / 'papers_llm_med.csv'

In [4]:
# Column holding the medicine label, stored as 'No' / 'Yes' in the input CSV.
RELATED_TO_MEDICINE_COL = 'related_to_medicine'

# Load the papers and recode the medicine label from 'No'/'Yes' to 0/1.
# replace() is chained so it returns a new frame instead of mutating one in
# place; any value other than 'No'/'Yes' passes through unchanged.
df = pd.read_csv(DATA_PATH).replace(
    {RELATED_TO_MEDICINE_COL: {'No': 0, 'Yes': 1}})

# Row count with thousands separators, then a two-row preview as the cell output.
print(f'{len(df):,}')
df.head(2)

191,185


Unnamed: 0,date,year_month,title,abstract,categories,url,related_to_medicine
0,2022-01-01,2022-01,PatchTrack: Multiple Object Tracking Using Fra...,Object motion and object appearance are common...,['cs.CV'],https://arxiv.org/abs/2201.00080,0
1,2022-01-01,2022-01,Automated Fake News Detection using cross-chec...,"Over the past decade, fake news and misinforma...","['cs.CL', 'cs.SI', 'physics.soc-ph']",https://arxiv.org/abs/2201.00083,0


In [5]:
# Columns whose text is scanned for LLM keywords.
TITLE_COL = 'title'
ABSTRACT_COL = 'abstract'

# Keywords matched regardless of letter case.
LLM_CASE_INSENSITIVE_KEYWORDS = [
    'LLM', 'LLMs', 'Large Language Model', 'Large Language Models'
]
# Keywords matched only with exactly the capitalisation written here.
LLM_CASE_SENSITIVE_KEYWORDS = [
    'GPT3', 'GPT-3', 'GPT 3', 'GPT4', 'GPT-4', 'GPT 4', 'ChatGPT', 'Chat-GPT',
    'Chat GPT', 'PaLM', 'Bard', 'LLaMA', 'Llama', 'Claude'
]

# Both patterns are a single alternation wrapped in word boundaries, so a keyword
# only matches as a whole word ("LLM" does not match inside "allm").
# Each keyword goes through re.escape so that regex metacharacters in a keyword
# are always treated literally.
# NOTE(review): multi-word keywords contain one literal space, so a keyword that
# is split by a line break or a double space in the text will not match --
# confirm the input text is whitespace-normalised upstream.

# Compiled with re.IGNORECASE so it is case-insensitive on raw text as well as on
# text the caller has already lower-cased.
LLM_CASE_INSENSITIVE_KEYWORDS_REGEX = re.compile(
    fr'\b({"|".join(map(re.escape, LLM_CASE_INSENSITIVE_KEYWORDS))})\b',
    re.IGNORECASE)
LLM_CASE_SENSITIVE_KEYWORDS_REGEX = re.compile(
    fr'\b({"|".join(map(re.escape, LLM_CASE_SENSITIVE_KEYWORDS))})\b')


def add_related_to_llm_col(df: pd.DataFrame, target_col: str = 'related_to_llm') -> None:
    """Add a 0/1 column flagging rows whose title or abstract contains an LLM keyword.

    Args:
        df: Frame containing the ``TITLE_COL`` and ``ABSTRACT_COL`` columns.
            It is modified in place.
        target_col: Name of the column to create (or overwrite) on ``df``.

    Returns:
        None; the flags are written to ``df[target_col]``.
    """
    # Title and abstract are joined with a newline and checked as one text.
    # Only the two needed columns are iterated, which also works when a column
    # name is not a valid Python identifier.
    df[target_col] = [
        1 if is_related_to_llm(f'{title}\n{abstract}') else 0
        for title, abstract in zip(df[TITLE_COL], df[ABSTRACT_COL])
    ]


def is_related_to_llm(text: str) -> bool:
    """Return True when ``text`` contains at least one LLM-related keyword.

    The case-insensitive keywords are searched in a lower-cased copy of the
    text; the case-sensitive keywords are searched in the text as given.
    """
    lowered = text.lower()
    return (
        LLM_CASE_INSENSITIVE_KEYWORDS_REGEX.search(lowered) is not None
        or LLM_CASE_SENSITIVE_KEYWORDS_REGEX.search(text) is not None
    )

In [6]:
# Adds the default 'related_to_llm' column to df in place.
add_related_to_llm_col(df)

df.head(2)

Unnamed: 0,date,year_month,title,abstract,categories,url,related_to_medicine,related_to_llm
0,2022-01-01,2022-01,PatchTrack: Multiple Object Tracking Using Fra...,Object motion and object appearance are common...,['cs.CV'],https://arxiv.org/abs/2201.00080,0,0
1,2022-01-01,2022-01,Automated Fake News Detection using cross-chec...,"Over the past decade, fake news and misinforma...","['cs.CL', 'cs.SI', 'physics.soc-ph']",https://arxiv.org/abs/2201.00083,0,0


In [7]:
# Write the labelled table to SAVE_PATH with a header row and without the DataFrame index.
# NOTE(review): the 'Unnamed: 0' column visible in the previews above is a regular
# column, so it is written out as well -- confirm that is intended.
df.to_csv(SAVE_PATH, header=True, index=False)