In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

import statextract.helpers as helpers


In [2]:
all_topics = helpers.all_topics()

all_topics[0]

import pydantic


class Summary(pydantic.BaseModel):
    display_name: str
    description: str
    keywords: list[str]


# all_topics[0]
def process(topic: dict):
    # try:
    return Summary(
        display_name=topic["display_name"],
        description=topic["description"],
        keywords=topic["keywords"],
    )
    # except Exception as e:
    #     print(e)


topics = [process(t) for t in all_topics]


In [18]:
all_topics[0]

{'id': 'https://openalex.org/T11881',
 'display_name': 'Crystallization Processes and Control',
 'description': 'This cluster of papers focuses on the crystallization processes and control, including topics such as nucleation, solubility, polymorphism, ultrasound-assisted crystallization, process analytical technology, crystal growth, pharmaceutical crystallization, continuous crystallization, and crystal engineering.',
 'keywords': ['Crystallization',
  'Nucleation',
  'Solubility',
  'Polymorphism',
  'Ultrasound-Assisted Crystallization',
  'Process Analytical Technology',
  'Crystal Growth',
  'Pharmaceutical Crystallization',
  'Continuous Crystallization',
  'Crystal Engineering'],
 'ids': {'openalex': 'https://openalex.org/T11881',
  'wikipedia': 'https://en.wikipedia.org/wiki/Crystallization'},
 'subfield': {'id': 'https://openalex.org/subfields/2505',
  'display_name': 'Materials Chemistry'},
 'field': {'id': 'https://openalex.org/fields/25',
  'display_name': 'Materials Scien

In [23]:
unique_domains = list(set([(t['field']['id'], t['field']['display_name']) if 'field' in t else None for t in all_topics]))
# unique_domains

target_domains = ['Social Sciences']

topics_in_social_sciences = [t for t in all_topics if t['field']['display_name'] in target_domains]
[t['display_name'] for t in topics_in_social_sciences]


['Territorial Governance and Environmental Participation',
 'American Political Thought and History',
 'Society and Economy in Ancient Mediterranean Civilizations',
 'Social Work in Spanish-Speaking Countries',
 'Vocational Education and Training in Australia',
 'Judicial Review in European Legal Systems',
 'Religious Diversity and Regulation in Chinese Society',
 'Pedagogical and Educational Research',
 'Research Methodology in Social Sciences and Education',
 'Role of Social Innovation in Territorial Development',
 'Evolution of Legal Systems and Jurisprudence',
 'Economy and Society in Ancient Rome',
 'Nazi Germany and Postwar Europe',
 'Civil Rights Movement and Black Studies',
 'Political Governance and Leadership in Modern Societies',
 'The Responsibility to Protect in International Relations',
 'Qualitative Research in Social Sciences and Education',
 'Impact of the Covid-19 Pandemic on Small and Medium Enterprises in Indonesia',
 'Reform of Family Law and Democratic Progress',


In [3]:
import vllm.entrypoints.chat_utils


def form_prompt(topic: Summary) -> list[vllm.entrypoints.chat_utils.ChatCompletionMessageParam]:
    return [
        {
            "role": "system",
            "content": "You are a helpful expert research taxonomy bot.",
        },
        {
            "role": "user",
            "content": f"""
Your task is to decide whether the following detailed research topic can be classified as Social Science. For example, economics, sociology, psychology, anthropology, political science would return Yes, but botany, physics, chemistry, mathematics, or biology would return No. The topic is:

Name: {topic.display_name}
Description: {topic.description}
Keywords: {topic.keywords}

Can this topic be classified as Social Science?
""",
        },
    ]


In [4]:
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", max_model_len=2048)


INFO 10-07 13:42:45 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, num_scheduler

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 10-07 13:42:57 model_runner.py:1025] Loading model weights took 14.9888 GB
INFO 10-07 13:42:58 gpu_executor.py:122] # GPU blocks: 1841, # CPU blocks: 2048
INFO 10-07 13:42:58 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-07 13:42:58 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-07 13:43:08 model_runner.py:1456] Graph capturing finished in 10 secs.


In [5]:
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


In [6]:
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct"
)

# prompt_formatt

In [7]:
tokenizer.encode("Yes", add_special_tokens=False), \
tokenizer.encode("No", add_special_tokens=False)


([9642], [2822])

In [8]:
# LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor],
import torch


def logit_processor_yes_no(_, logits: torch.Tensor):
    # set all logits to -inf except for the yes and no logits
    mask = torch.full_like(logits, fill_value=-float("inf"))
    mask[9642] = 0
    mask[2822] = 0
    return logits + mask


sampling_params = SamplingParams(
    temperature=0.0, logits_processors=[logit_processor_yes_no], max_tokens=1
)

res: list[str] = []

import tqdm

BATCH_SIZE = 32
for batch_range in tqdm.tqdm(range(0, len(topics), 16)):
    batch = topics[batch_range:batch_range+BATCH_SIZE]
    ppts = [form_prompt(t) for t in batch]
    
    outputs = llm.chat(messages=ppts, sampling_params=sampling_params, use_tqdm=False)
    res.extend([output.outputs[0].text for output in outputs])
    
    # break

res
    
# ptt = form_prompt(topics[0])
# outputs = llm.chat(messages=[ptt], sampling_params=sampling_params)

# # batch_size

# # Print the outputs.
# for output in outputs:
#     prompt = output.prompt
#     generated_text = output.outputs[0].text
#     toks = output.outputs[0].token_ids
#     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}, Tokens: {toks}")


100%|██████████| 283/283 [04:12<00:00,  1.12it/s]


['No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'N

In [10]:
ss_topics = [t for t, r in zip(topics, res) if r == "Yes"]
[(s.display_name, s.description) for s in ss_topics]


[('Territorial Governance and Environmental Participation',
  'This cluster of papers explores the intersection of territorial governance, environmental participation, and sustainable development. It delves into topics such as citizen participation, local development, social justice, and community engagement in the context of rural territories. The papers also discuss the role of proximity and innovation in shaping public policy for sustainable and inclusive development.'),
 ('American Political Thought and History',
  'This cluster of papers explores the development of American political thought and history, focusing on topics such as the American founding, constitutional government, religious freedom, public opinion, separation of church and state, presidential prerogative, early republic, federalism, political philosophy, and nationalism.'),
 ('Society and Economy in Ancient Mediterranean Civilizations',
  'This cluster of papers explores various aspects of ancient Mediterranean civ

In [None]:
logits_processor_yes_no = 