In [6]:
pip install a2t


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
import json
from types import SimpleNamespace
from a2t.base import EntailmentClassifier
from a2t.tasks import TopicClassificationTask, TopicClassificationFeatures


In [8]:
# Define your topic labels (customize these based on your needs)
labels = [
    "Technology",
    "Data Analysis", 
    "Natural Language Processing",
    "Education",
    "Personal Development",
    "Literature and Books",
    "Finance and Economics",
    "Programming",
    "Philosophy",
    "General Knowledge"
]

# Create the task
task = TopicClassificationTask(
    name="Prompt Topic Classification",
    labels=labels,
    hypothesis_template="This prompt is about {label}."
)

# Initialize the classifier
nlp = EntailmentClassifier(
    'roberta-large-mnli',
    use_cuda=False,  # Set to True if you have GPU
    half=False
)

In [14]:
# Load your prompts
df = pd.read_csv('conversation.csv')
prompts = df['title'].tolist()

In [None]:
from concurrent.futures import ThreadPoolExecutor

import concurrent.futures

MAX_CHARS = 500
truncated_prompts = [text[:MAX_CHARS] for text in prompts]

def classify_batch(batch_texts):
    batch_features = [TopicClassificationFeatures(context=text, label=None) for text in batch_texts]
    return nlp(
        task=task,
        features=batch_features,
        return_labels=True,
        return_confidences=True
    )

# Split into batches and process in parallel
batch_size = 10
batches = [truncated_prompts[i:i+batch_size] for i in range(0, len(truncated_prompts), batch_size)]

all_topics = []
all_confidences = []

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(classify_batch, batch) for batch in batches]
    for future in concurrent.futures.as_completed(futures):
        topics, confidences = future.result()
        all_topics.extend(topics)
        all_confidences.extend(confidences)

df['predicted_topic'] = all_topics[:len(df)]
df['confidence'] = all_confidences[:len(df)]

 27%|██▋       | 271/988 [02:43<07:11,  1.66it/s]


KeyboardInterrupt: 

In [11]:
# To process all prompts:
all_features = [TopicClassificationFeatures(context=text, label=None) for text in prompts]  # Process first 100
all_predictions = nlp(
    task=task,
    features=all_features,
    return_labels=True,
    return_confidences=True
)

# Add predictions to dataframe
df['predicted_topic'] = all_predictions[0][:len(df)]
df['confidence'] = all_predictions[1][:len(df)]

  1%|          | 1/100 [00:01<02:21,  1.43s/it]


RuntimeError: The expanded size of the tensor (544) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [10, 544].  Tensor sizes: [1, 514]

In [15]:
# Process a sample prompt or all prompts
sample_text = "fix this bug?"  # First prompt
features = [TopicClassificationFeatures(context=sample_text, label=None)]

# Get predictions
predictions = nlp(
    task=task,
    features=features,
    return_labels=True,
    return_confidences=True
)

print(f"Prompt: {sample_text}")
print(f"Predictions: {predictions}")

100%|██████████| 1/1 [00:00<00:00,  2.30it/s]

Prompt: fix this bug?
Predictions: [('Programming', 0.20142296)]



