Label Step One

In [4]:
import pandas as pd
from datetime import datetime
from together import Together
from tqdm import tqdm

import os

# Initialize the Together AI client.
# The key is taken from the TOGETHER_API_KEY environment variable so that no
# credential has to be typed into (and accidentally committed with) the
# notebook. The empty-string fallback matches what this cell passed before.
client = Together(api_key=os.environ.get("TOGETHER_API_KEY", ""))

# Path to the input CSV file containing BERTopic results
input_csv_path = 'First_Step_BERTopic_topic_info.csv'

# Load the dataset from the CSV file.
# The labelling loop further down reads the 'Topic', 'Representation' and
# 'Representative_Docs' columns from this frame.
df = pd.read_csv(input_csv_path)

def generate_human_readable_label(topic, representation, representative_docs):
    """
    Generates a human-readable label for a given topic using Together AI.

    The three arguments are interpolated into a single user prompt, the
    completion is streamed from the module-level ``client``, and the streamed
    text fragments are concatenated into the final label.

    Args:
        topic: The non-human readable topic identifier. It is formatted into
            the prompt, so any value with a sensible ``str()`` works.
        representation: Keywords associated with the topic.
        representative_docs: Representative documents for the topic.

    Returns:
        str | None: The model's answer with surrounding whitespace stripped,
        or ``None`` if creating or consuming the stream raised an exception
        (the error is printed, not re-raised). The 60-character limit is only
        requested in the prompt; it is not enforced on the returned text.
    """
    prompt = (
        "You are an intelligent assistant skilled in generating labels for topics. "
        "Given the following topic details about the data:\n"
        f"Non-human readable Topic: {topic}\n"
        f"Representative Documents: {representative_docs}\n"
        f"Keywords: {representation}\n"
        # The trailing space keeps this sentence from running into the next one.
        "The label has to be a maximum of 60 characters long, concise and descriptive of the data. "
        "Return strictly only the label nothing else! "
    )

    fragments = []
    try:
        stream = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            temperature=0.1,
        )

        for chunk in stream:
            # NOTE(review): OpenAI-compatible streams can emit chunks whose
            # `choices` list is empty (e.g. a usage-only chunk) - confirm for
            # Together. Indexing [0] on such a chunk would raise IndexError
            # and throw away the text already received, so skip it instead.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            text = getattr(delta, "content", None)
            if text:
                fragments.append(text)

    except Exception as e:
        # Best effort: one failed topic must not abort the whole labelling run.
        print(f"Error generating label: {e}")
        return None

    return "".join(fragments).strip()

# Label every topic row. tqdm wraps the row iterator, so the progress bar
# advances once for each row that has been processed.
labels = [
    generate_human_readable_label(
        topic=row['Topic'],
        representation=row['Representation'],
        representative_docs=row['Representative_Docs'],
    )
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Generating Labels")
]

# Attach the labels as a new column; list order matches row order
df['Human_Readable_Topic'] = labels

# Destination for the labelled topic table
output_csv_path = 'First_Step_BERTopic_topic_info_labelled.csv'

# Write the frame back out without the pandas index column
df.to_csv(output_csv_path, index=False)

print(f"Label generation complete. Updated CSV file saved to {output_csv_path}.")


Generating Labels: 100%|██████████| 524/524 [06:02<00:00,  1.45it/s]

Label generation complete. Updated CSV file saved to First_Step_BERTopic_topic_info_labelled.csv.





Label Step Two

In [5]:
import pandas as pd
from datetime import datetime
from together import Together
from tqdm import tqdm

import os

# Initialize the Together AI client.
# The key is taken from the TOGETHER_API_KEY environment variable so that no
# credential has to be typed into (and accidentally committed with) the
# notebook. The empty-string fallback matches what this cell passed before.
client = Together(api_key=os.environ.get("TOGETHER_API_KEY", ""))

# Path to the input CSV file containing the second-step clustering results
input_csv_path = 'Second_step_clustering_results.csv'

# Load the dataset from the CSV file.
# The labelling loop further down reads the 'Topic', 'Representation' and
# 'Representative_Docs' columns from this frame.
df = pd.read_csv(input_csv_path)

def generate_human_readable_label(topic, representation, representative_docs):
    """
    Generates a human-readable label for a given topic using Together AI.

    The three arguments are interpolated into a single user prompt, the
    completion is streamed from the module-level ``client``, and the streamed
    text fragments are concatenated into the final label.

    Args:
        topic: The non-human readable topic identifier. It is formatted into
            the prompt, so any value with a sensible ``str()`` works.
        representation: Keywords associated with the topic.
        representative_docs: Representative documents for the topic.

    Returns:
        str | None: The model's answer with surrounding whitespace stripped,
        or ``None`` if creating or consuming the stream raised an exception
        (the error is printed, not re-raised). The 60-character limit is only
        requested in the prompt; it is not enforced on the returned text.
    """
    prompt = (
        "You are an intelligent assistant skilled in generating labels for topics. "
        "Given the following topic details about the data:\n"
        f"Non-human readable Topic: {topic}\n"
        f"Representative Documents: {representative_docs}\n"
        f"Keywords: {representation}\n"
        # The trailing space keeps this sentence from running into the next one.
        "The label has to be a maximum of 60 characters long, concise and descriptive of the data. "
        "Return strictly only the label nothing else! "
    )

    fragments = []
    try:
        stream = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            temperature=0.1,
        )

        for chunk in stream:
            # NOTE(review): OpenAI-compatible streams can emit chunks whose
            # `choices` list is empty (e.g. a usage-only chunk) - confirm for
            # Together. Indexing [0] on such a chunk would raise IndexError
            # and throw away the text already received, so skip it instead.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            text = getattr(delta, "content", None)
            if text:
                fragments.append(text)

    except Exception as e:
        # Best effort: one failed topic must not abort the whole labelling run.
        print(f"Error generating label: {e}")
        return None

    return "".join(fragments).strip()

# Label every topic row. tqdm wraps the row iterator, so the progress bar
# advances once for each row that has been processed.
labels = [
    generate_human_readable_label(
        topic=row['Topic'],
        representation=row['Representation'],
        representative_docs=row['Representative_Docs'],
    )
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Generating Labels")
]

# Attach the labels as a new column; list order matches row order
df['Human_Readable_Topic'] = labels

# Destination for the labelled clustering results
output_csv_path = 'Second_step_clustering_results_labelled.csv'

# Write the frame back out without the pandas index column
df.to_csv(output_csv_path, index=False)

print(f"Label generation complete. Updated CSV file saved to {output_csv_path}.")


Generating Labels: 100%|██████████| 313/313 [03:24<00:00,  1.53it/s]

Label generation complete. Updated CSV file saved to Second_step_clustering_results_labelled.csv.



