In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Chat-completions client aimed at the SambaNova base URL; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Approximate token counting via whitespace splitting
def count_tokens(text):
    """Return how many whitespace-separated words ``text`` contains.

    This is only a rough stand-in for a tokenizer: it counts the pieces
    produced by ``str.split()`` and nothing else.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Input string.
        max_tokens: Total budget, counted in whitespace-separated words.
        reserve_tokens: Words held back for the rest of the prompt.

    Returns:
        The retained words joined by single spaces (runs of whitespace are
        collapsed even when nothing is cut). Empty when the reserve meets
        or exceeds the budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep text instead of dropping all of it.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read the headerless CSV and replace numeric class ids with names.

    Args:
        test_file_path: CSV without a header row; its three columns are
            named "class", "title" and "content".
        class_file_path: UTF-8 text file with one class name per line;
            line k (1-based) names class id k.

    Returns:
        A DataFrame whose "class" column holds class names. Ids with no
        matching line in the class file map to NaN.
    """
    # Class names, one per line, in id order.
    with open(class_file_path, "r", encoding="utf-8") as handle:
        label_names = [raw.strip() for raw in handle]
    # Ids are compared as strings because the column is cast with astype(str).
    id_to_label = {str(idx): name for idx, name in enumerate(label_names, start=1)}

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group records by class, keeping the first few of each.

    Selection is deterministic, not random: records are taken in the order
    in which they appear in ``data``.

    Args:
        data: Iterable of dicts, each with a "class" key.
        num_samples_per_class: Maximum number of records kept per class.

    Returns:
        Dict mapping every class found in ``data`` to a list of at most
        ``num_samples_per_class`` records. Keys follow first-appearance
        order rather than arbitrary set order, so the evaluation order is
        the same on every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when the per-class cap is 0.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the zero-shot prompt: the text, then the classification request.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string asking for the category name only.
    """
    template = (
        "{body}\n\n"
        "Classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the "think step by step" variant of the classification prompt.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string.
    """
    # NOTE(review): the prompt asks for step-by-step thinking and for the
    # category name only; confirm that combination is intended.
    template = (
        "{body}\n\n"
        "Think step by step to classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the query text.

    Args:
        text: Passage to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs.
        classes_formatted: Category names already joined into one string.

    Returns:
        The example blocks followed by the classification request.
    """
    shots = "".join(
        f"Text: {sample}\nLabel: {label}\n\n" for sample, label in examples
    )
    request = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        f"Provide only the category name as your answer."
    )
    return shots + request

# Cost estimation from a per-model price table
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        model_name: Key into the price table; models not listed are
            charged at the fallback rates (0.10 input, 0.20 output).

    Returns:
        Input cost plus output cost, with prices applied per million tokens.
    """
    fallback_rates = (0.10, 0.20)
    # (input price, output price) per million tokens.
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rate_in, rate_out = rates_by_model.get(model_name, fallback_rates)
    millions_in = input_tokens / 1e6
    millions_out = output_tokens / 1e6
    return millions_in * rate_in + millions_out * rate_out

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Score one model with one prompting method on a per-class sample.

    For every record chosen by ``select_samples`` a prompt is built, sent to
    the chat-completions endpoint through the module-level ``client``, and
    the streamed reply is compared with the expected class name (exact
    match after stripping whitespace and lower-casing).

    Args:
        data: List of record dicts with "content" and "class" keys.
        classes: Iterable of class names listed in the prompt as options.
        model_name: Model identifier sent to the API and used for pricing.
        prompting_method: "direct", "chain-of-thought" or "few-shot".
        num_samples_per_class: Maximum number of records scored per class.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not scored.
        delay: Seconds to pause after each sample and before retrying
            after an API error.
        max_retries: Retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``: accuracy is the fraction of scored
        samples answered correctly (0 when nothing was scored); total_cost
        is the sum of ``estimate_cost`` over the scored samples.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    failed_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare record so the sample under test can be
                # dropped from its own examples (it would leak the label).
                pool = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"]) for ex in pool if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            # Stays None unless a complete reply was received.
            response_text = None
            retries = 0
            while retries <= max_retries:
                try:
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # Chunks without choices carry no text; skip them
                    # instead of failing on choices[0].
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1
                    # Brief pause so retries are not fired back-to-back.
                    time.sleep(delay)

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            if response_text is None:
                # No answer was obtained, so there is nothing to score; like
                # token-limit skips, this sample is left out of the accuracy.
                failed_samples += 1
            else:
                # Count output tokens and estimate cost
                output_token_count = count_tokens(response_text)
                total_cost += estimate_cost(input_token_count, output_token_count, model_name)
                total_samples += 1

                # Check if the prediction is correct
                if response_text.strip().lower() == str(cls).lower():
                    correct_predictions += 1

            time.sleep(delay)

    if failed_samples:
        print(f"{failed_samples} sample(s) excluded from accuracy after failed API calls")

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to ``file_path``, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the "model" column.
        prompting_method: Value for the "prompting_method" column.
        accuracy: Value for the "accuracy" column.
        cost: Value for the "cost" column.

    The header row is written when the file is missing or empty, so a
    zero-byte file left behind earlier still ends up with column names.
    """
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    if needs_header:
        result.to_csv(file_path, index=False)
    else:
        result.to_csv(file_path, mode="a", header=False, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.2-1B-Instruct"  # Change this to the desired model name
    prompting_method = "direct"  # Change this to the desired method

    # evaluate_model works on plain dicts, one per dataframe row.
    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # Prices are per million tokens, so two decimals rendered the cost as
    # $0.00; six decimals keep the value visible.
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the results file.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.2-1B-Instruct, Prompting Method: direct
Accuracy: 0.52, Cost: $0.00
Results saved to dbpedia_results.csv


In [2]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Chat-completions client aimed at the SambaNova base URL; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Approximate token counting via whitespace splitting
def count_tokens(text):
    """Return how many whitespace-separated words ``text`` contains.

    This is only a rough stand-in for a tokenizer: it counts the pieces
    produced by ``str.split()`` and nothing else.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Input string.
        max_tokens: Total budget, counted in whitespace-separated words.
        reserve_tokens: Words held back for the rest of the prompt.

    Returns:
        The retained words joined by single spaces (runs of whitespace are
        collapsed even when nothing is cut). Empty when the reserve meets
        or exceeds the budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep text instead of dropping all of it.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read the headerless CSV and replace numeric class ids with names.

    Args:
        test_file_path: CSV without a header row; its three columns are
            named "class", "title" and "content".
        class_file_path: UTF-8 text file with one class name per line;
            line k (1-based) names class id k.

    Returns:
        A DataFrame whose "class" column holds class names. Ids with no
        matching line in the class file map to NaN.
    """
    # Class names, one per line, in id order.
    with open(class_file_path, "r", encoding="utf-8") as handle:
        label_names = [raw.strip() for raw in handle]
    # Ids are compared as strings because the column is cast with astype(str).
    id_to_label = {str(idx): name for idx, name in enumerate(label_names, start=1)}

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group records by class, keeping the first few of each.

    Selection is deterministic, not random: records are taken in the order
    in which they appear in ``data``.

    Args:
        data: Iterable of dicts, each with a "class" key.
        num_samples_per_class: Maximum number of records kept per class.

    Returns:
        Dict mapping every class found in ``data`` to a list of at most
        ``num_samples_per_class`` records. Keys follow first-appearance
        order rather than arbitrary set order, so the evaluation order is
        the same on every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when the per-class cap is 0.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the zero-shot prompt: the text, then the classification request.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string asking for the category name only.
    """
    template = (
        "{body}\n\n"
        "Classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the "think step by step" variant of the classification prompt.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string.
    """
    # NOTE(review): the prompt asks for step-by-step thinking and for the
    # category name only; confirm that combination is intended.
    template = (
        "{body}\n\n"
        "Think step by step to classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the query text.

    Args:
        text: Passage to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs.
        classes_formatted: Category names already joined into one string.

    Returns:
        The example blocks followed by the classification request.
    """
    shots = "".join(
        f"Text: {sample}\nLabel: {label}\n\n" for sample, label in examples
    )
    request = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        f"Provide only the category name as your answer."
    )
    return shots + request

# Cost estimation from a per-model price table
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        model_name: Key into the price table; models not listed are
            charged at the fallback rates (0.10 input, 0.20 output).

    Returns:
        Input cost plus output cost, with prices applied per million tokens.
    """
    fallback_rates = (0.10, 0.20)
    # (input price, output price) per million tokens.
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rate_in, rate_out = rates_by_model.get(model_name, fallback_rates)
    millions_in = input_tokens / 1e6
    millions_out = output_tokens / 1e6
    return millions_in * rate_in + millions_out * rate_out

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Score one model with one prompting method on a per-class sample.

    For every record chosen by ``select_samples`` a prompt is built, sent to
    the chat-completions endpoint through the module-level ``client``, and
    the streamed reply is compared with the expected class name (exact
    match after stripping whitespace and lower-casing).

    Args:
        data: List of record dicts with "content" and "class" keys.
        classes: Iterable of class names listed in the prompt as options.
        model_name: Model identifier sent to the API and used for pricing.
        prompting_method: "direct", "chain-of-thought" or "few-shot".
        num_samples_per_class: Maximum number of records scored per class.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not scored.
        delay: Seconds to pause after each sample and before retrying
            after an API error.
        max_retries: Retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``: accuracy is the fraction of scored
        samples answered correctly (0 when nothing was scored); total_cost
        is the sum of ``estimate_cost`` over the scored samples.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    failed_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare record so the sample under test can be
                # dropped from its own examples (it would leak the label).
                pool = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"]) for ex in pool if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            # Stays None unless a complete reply was received.
            response_text = None
            retries = 0
            while retries <= max_retries:
                try:
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # Chunks without choices carry no text; skip them
                    # instead of failing on choices[0].
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1
                    # Brief pause so retries are not fired back-to-back.
                    time.sleep(delay)

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            if response_text is None:
                # No answer was obtained, so there is nothing to score; like
                # token-limit skips, this sample is left out of the accuracy.
                failed_samples += 1
            else:
                # Count output tokens and estimate cost
                output_token_count = count_tokens(response_text)
                total_cost += estimate_cost(input_token_count, output_token_count, model_name)
                total_samples += 1

                # Check if the prediction is correct
                if response_text.strip().lower() == str(cls).lower():
                    correct_predictions += 1

            time.sleep(delay)

    if failed_samples:
        print(f"{failed_samples} sample(s) excluded from accuracy after failed API calls")

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to ``file_path``, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the "model" column.
        prompting_method: Value for the "prompting_method" column.
        accuracy: Value for the "accuracy" column.
        cost: Value for the "cost" column.

    The header row is written when the file is missing or empty, so a
    zero-byte file left behind earlier still ends up with column names.
    """
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    if needs_header:
        result.to_csv(file_path, index=False)
    else:
        result.to_csv(file_path, mode="a", header=False, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.2-1B-Instruct"  # Change this to the desired model name
    prompting_method = "chain-of-thought"  # Change this to the desired method

    # evaluate_model works on plain dicts, one per dataframe row.
    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # Prices are per million tokens, so two decimals rendered the cost as
    # $0.00; six decimals keep the value visible.
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the results file.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.2-1B-Instruct, Prompting Method: chain-of-thought
Accuracy: 0.47, Cost: $0.00
Results saved to dbpedia_results.csv


In [3]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Chat-completions client aimed at the SambaNova base URL; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Approximate token counting via whitespace splitting
def count_tokens(text):
    """Return how many whitespace-separated words ``text`` contains.

    This is only a rough stand-in for a tokenizer: it counts the pieces
    produced by ``str.split()`` and nothing else.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Input string.
        max_tokens: Total budget, counted in whitespace-separated words.
        reserve_tokens: Words held back for the rest of the prompt.

    Returns:
        The retained words joined by single spaces (runs of whitespace are
        collapsed even when nothing is cut). Empty when the reserve meets
        or exceeds the budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep text instead of dropping all of it.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read the headerless CSV and replace numeric class ids with names.

    Args:
        test_file_path: CSV without a header row; its three columns are
            named "class", "title" and "content".
        class_file_path: UTF-8 text file with one class name per line;
            line k (1-based) names class id k.

    Returns:
        A DataFrame whose "class" column holds class names. Ids with no
        matching line in the class file map to NaN.
    """
    # Class names, one per line, in id order.
    with open(class_file_path, "r", encoding="utf-8") as handle:
        label_names = [raw.strip() for raw in handle]
    # Ids are compared as strings because the column is cast with astype(str).
    id_to_label = {str(idx): name for idx, name in enumerate(label_names, start=1)}

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group records by class, keeping the first few of each.

    Selection is deterministic, not random: records are taken in the order
    in which they appear in ``data``.

    Args:
        data: Iterable of dicts, each with a "class" key.
        num_samples_per_class: Maximum number of records kept per class.

    Returns:
        Dict mapping every class found in ``data`` to a list of at most
        ``num_samples_per_class`` records. Keys follow first-appearance
        order rather than arbitrary set order, so the evaluation order is
        the same on every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when the per-class cap is 0.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the zero-shot prompt: the text, then the classification request.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string asking for the category name only.
    """
    template = (
        "{body}\n\n"
        "Classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the "think step by step" variant of the classification prompt.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string.
    """
    # NOTE(review): the prompt asks for step-by-step thinking and for the
    # category name only; confirm that combination is intended.
    template = (
        "{body}\n\n"
        "Think step by step to classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the query text.

    Args:
        text: Passage to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs.
        classes_formatted: Category names already joined into one string.

    Returns:
        The example blocks followed by the classification request.
    """
    shots = "".join(
        f"Text: {sample}\nLabel: {label}\n\n" for sample, label in examples
    )
    request = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        f"Provide only the category name as your answer."
    )
    return shots + request

# Cost estimation from a per-model price table
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        model_name: Key into the price table; models not listed are
            charged at the fallback rates (0.10 input, 0.20 output).

    Returns:
        Input cost plus output cost, with prices applied per million tokens.
    """
    fallback_rates = (0.10, 0.20)
    # (input price, output price) per million tokens.
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rate_in, rate_out = rates_by_model.get(model_name, fallback_rates)
    millions_in = input_tokens / 1e6
    millions_out = output_tokens / 1e6
    return millions_in * rate_in + millions_out * rate_out

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Score one model with one prompting method on a per-class sample.

    For every record chosen by ``select_samples`` a prompt is built, sent to
    the chat-completions endpoint through the module-level ``client``, and
    the streamed reply is compared with the expected class name (exact
    match after stripping whitespace and lower-casing).

    Args:
        data: List of record dicts with "content" and "class" keys.
        classes: Iterable of class names listed in the prompt as options.
        model_name: Model identifier sent to the API and used for pricing.
        prompting_method: "direct", "chain-of-thought" or "few-shot".
        num_samples_per_class: Maximum number of records scored per class.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not scored.
        delay: Seconds to pause after each sample and before retrying
            after an API error.
        max_retries: Retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``: accuracy is the fraction of scored
        samples answered correctly (0 when nothing was scored); total_cost
        is the sum of ``estimate_cost`` over the scored samples.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    failed_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare record so the sample under test can be
                # dropped from its own examples (it would leak the label).
                pool = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"]) for ex in pool if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            # Stays None unless a complete reply was received.
            response_text = None
            retries = 0
            while retries <= max_retries:
                try:
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # Chunks without choices carry no text; skip them
                    # instead of failing on choices[0].
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1
                    # Brief pause so retries are not fired back-to-back.
                    time.sleep(delay)

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            if response_text is None:
                # No answer was obtained, so there is nothing to score; like
                # token-limit skips, this sample is left out of the accuracy.
                failed_samples += 1
            else:
                # Count output tokens and estimate cost
                output_token_count = count_tokens(response_text)
                total_cost += estimate_cost(input_token_count, output_token_count, model_name)
                total_samples += 1

                # Check if the prediction is correct
                if response_text.strip().lower() == str(cls).lower():
                    correct_predictions += 1

            time.sleep(delay)

    if failed_samples:
        print(f"{failed_samples} sample(s) excluded from accuracy after failed API calls")

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to ``file_path``, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the "model" column.
        prompting_method: Value for the "prompting_method" column.
        accuracy: Value for the "accuracy" column.
        cost: Value for the "cost" column.

    The header row is written when the file is missing or empty, so a
    zero-byte file left behind earlier still ends up with column names.
    """
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    if needs_header:
        result.to_csv(file_path, index=False)
    else:
        result.to_csv(file_path, mode="a", header=False, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.2-1B-Instruct"  # Change this to the desired model name
    prompting_method = "few-shot"  # Change this to the desired method

    # evaluate_model works on plain dicts, one per dataframe row.
    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # Prices are per million tokens, so two decimals rendered the cost as
    # $0.00; six decimals keep the value visible.
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the results file.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.2-1B-Instruct, Prompting Method: few-shot
Accuracy: 0.40, Cost: $0.00
Results saved to dbpedia_results.csv


In [4]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Chat-completions client aimed at the SambaNova base URL; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Approximate token counting via whitespace splitting
def count_tokens(text):
    """Return how many whitespace-separated words ``text`` contains.

    This is only a rough stand-in for a tokenizer: it counts the pieces
    produced by ``str.split()`` and nothing else.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Input string.
        max_tokens: Total budget, counted in whitespace-separated words.
        reserve_tokens: Words held back for the rest of the prompt.

    Returns:
        The retained words joined by single spaces (runs of whitespace are
        collapsed even when nothing is cut). Empty when the reserve meets
        or exceeds the budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep text instead of dropping all of it.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read the headerless CSV and replace numeric class ids with names.

    Args:
        test_file_path: CSV without a header row; its three columns are
            named "class", "title" and "content".
        class_file_path: UTF-8 text file with one class name per line;
            line k (1-based) names class id k.

    Returns:
        A DataFrame whose "class" column holds class names. Ids with no
        matching line in the class file map to NaN.
    """
    # Class names, one per line, in id order.
    with open(class_file_path, "r", encoding="utf-8") as handle:
        label_names = [raw.strip() for raw in handle]
    # Ids are compared as strings because the column is cast with astype(str).
    id_to_label = {str(idx): name for idx, name in enumerate(label_names, start=1)}

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group records by class, keeping the first few of each.

    Selection is deterministic, not random: records are taken in the order
    in which they appear in ``data``.

    Args:
        data: Iterable of dicts, each with a "class" key.
        num_samples_per_class: Maximum number of records kept per class.

    Returns:
        Dict mapping every class found in ``data`` to a list of at most
        ``num_samples_per_class`` records. Keys follow first-appearance
        order rather than arbitrary set order, so the evaluation order is
        the same on every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when the per-class cap is 0.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the zero-shot prompt: the text, then the classification request.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string asking for the category name only.
    """
    template = (
        "{body}\n\n"
        "Classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the "think step by step" variant of the classification prompt.

    Args:
        text: Passage to classify.
        classes_formatted: Category names already joined into one string.

    Returns:
        The prompt string.
    """
    # NOTE(review): the prompt asks for step-by-step thinking and for the
    # category name only; confirm that combination is intended.
    template = (
        "{body}\n\n"
        "Think step by step to classify the above text into one of the following categories: {options}.\n"
        "Provide only the category name as your answer."
    )
    return template.format(body=text, options=classes_formatted)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the query text.

    Args:
        text: Passage to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs.
        classes_formatted: Category names already joined into one string.

    Returns:
        The example blocks followed by the classification request.
    """
    shots = "".join(
        f"Text: {sample}\nLabel: {label}\n\n" for sample, label in examples
    )
    request = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        f"Provide only the category name as your answer."
    )
    return shots + request

# Cost estimation from a per-model price table
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        model_name: Key into the price table; models not listed are
            charged at the fallback rates (0.10 input, 0.20 output).

    Returns:
        Input cost plus output cost, with prices applied per million tokens.
    """
    fallback_rates = (0.10, 0.20)
    # (input price, output price) per million tokens.
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rate_in, rate_out = rates_by_model.get(model_name, fallback_rates)
    millions_in = input_tokens / 1e6
    millions_out = output_tokens / 1e6
    return millions_in * rate_in + millions_out * rate_out

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Score one model with one prompting method on a per-class sample.

    For every record chosen by ``select_samples`` a prompt is built, sent to
    the chat-completions endpoint through the module-level ``client``, and
    the streamed reply is compared with the expected class name (exact
    match after stripping whitespace and lower-casing).

    Args:
        data: List of record dicts with "content" and "class" keys.
        classes: Iterable of class names listed in the prompt as options.
        model_name: Model identifier sent to the API and used for pricing.
        prompting_method: "direct", "chain-of-thought" or "few-shot".
        num_samples_per_class: Maximum number of records scored per class.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not scored.
        delay: Seconds to pause after each sample and before retrying
            after an API error.
        max_retries: Retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``: accuracy is the fraction of scored
        samples answered correctly (0 when nothing was scored); total_cost
        is the sum of ``estimate_cost`` over the scored samples.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    failed_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare record so the sample under test can be
                # dropped from its own examples (it would leak the label).
                pool = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"]) for ex in pool if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            # Stays None unless a complete reply was received.
            response_text = None
            retries = 0
            while retries <= max_retries:
                try:
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # Chunks without choices carry no text; skip them
                    # instead of failing on choices[0].
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1
                    # Brief pause so retries are not fired back-to-back.
                    time.sleep(delay)

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            if response_text is None:
                # No answer was obtained, so there is nothing to score; like
                # token-limit skips, this sample is left out of the accuracy.
                failed_samples += 1
            else:
                # Count output tokens and estimate cost
                output_token_count = count_tokens(response_text)
                total_cost += estimate_cost(input_token_count, output_token_count, model_name)
                total_samples += 1

                # Check if the prediction is correct
                if response_text.strip().lower() == str(cls).lower():
                    correct_predictions += 1

            time.sleep(delay)

    if failed_samples:
        print(f"{failed_samples} sample(s) excluded from accuracy after failed API calls")

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to ``file_path``, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the "model" column.
        prompting_method: Value for the "prompting_method" column.
        accuracy: Value for the "accuracy" column.
        cost: Value for the "cost" column.

    The header row is written when the file is missing or empty, so a
    zero-byte file left behind earlier still ends up with column names.
    """
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    if needs_header:
        result.to_csv(file_path, index=False)
    else:
        result.to_csv(file_path, mode="a", header=False, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.2-3B-Instruct"  # Change this to the desired model name
    prompting_method = "direct"  # Change this to the desired method

    # evaluate_model works on plain dicts, one per dataframe row.
    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # Prices are per million tokens, so two decimals rendered the cost as
    # $0.00; six decimals keep the value visible.
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the results file.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.2-3B-Instruct, Prompting Method: direct
Accuracy: 0.81, Cost: $0.00
Results saved to dbpedia_results.csv


In [5]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Chat-completions client aimed at the SambaNova base URL; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text``.

    Whitespace-separated words stand in for tokens; no model tokenizer
    is involved.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Trim ``text`` to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated. The result is always re-joined with
    single spaces, so runs of whitespace are collapsed even when nothing
    is cut.

    Args:
        text: Input string.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of
            the prompt.

    Returns:
        The (possibly shortened) text; an empty string when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep words instead of dropping all of them.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read a DBpedia CSV file and attach readable class labels.

    Line ``k`` (1-based) of the classes file provides the label for numeric
    class id ``k``. The CSV is read without a header row as the columns
    ``class``, ``title`` and ``content``.

    Returns:
        A DataFrame whose ``class`` column holds the label strings; ids
        with no matching line in the classes file become NaN.
    """
    # Ids are matched as strings: "1" -> first line, "2" -> second, ...
    with open(class_file_path, "r", encoding="utf-8") as class_file:
        id_to_label = {
            str(line_no): raw.strip()
            for line_no, raw in enumerate(class_file, start=1)
        }

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )

    # Replace the numeric ids by their labels.
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group the leading items of ``data`` by class.

    Selection is positional, not random: for every class, the first
    ``num_samples_per_class`` items in iteration order are kept.

    Args:
        data: Iterable of records with a ``"class"`` key. It is consumed in
            a single pass, so a one-shot iterator works as well as a list.
        num_samples_per_class: Maximum number of items kept per class.

    Returns:
        Dict mapping each class seen in ``data`` to its list of kept items.
        Keys appear in first-seen order, so the result is the same on
        every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when its quota is zero or
        # already full, so every class seen ends up as a key.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the plain prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the step-by-step prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Think step by step to classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the question.

    ``examples`` is an iterable of ``(text, label)`` pairs; each becomes a
    ``Text:`` / ``Label:`` block placed ahead of the text to classify.
    """
    shots = "".join(
        f"Text: {shot_text}\nLabel: {shot_label}\n\n"
        for shot_text, shot_label in examples
    )
    question = (
        "Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        "Provide only the category name as your answer."
    )
    return shots + question

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    The table holds (input, output) prices per million tokens. A model that
    is not listed is priced at (0.10, 0.20).
    """
    fallback_rates = (0.10, 0.20)
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rates = rates_by_model.get(model_name, fallback_rates)
    prompt_cost = (input_tokens / 1e6) * rates[0]
    completion_cost = (output_tokens / 1e6) * rates[1]
    return prompt_cost + completion_cost

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure classification accuracy of one model with one prompt style.

    Up to ``num_samples_per_class`` items per class are picked with
    ``select_samples``; each one is turned into a prompt and sent through
    the module-level ``client``. A prediction counts as correct only when
    the stripped, lower-cased reply equals the lower-cased class name.

    Args:
        data: Sequence of records with ``"content"`` and ``"class"`` keys.
        classes: Iterable of class names listed in every prompt.
        model_name: Model identifier passed to the API and to
            ``estimate_cost``.
        prompting_method: ``"direct"``, ``"chain-of-thought"`` or
            ``"few-shot"``.
        num_samples_per_class: Per-class cap on evaluated samples.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not counted.
        delay: Seconds to sleep after each evaluated sample.
        max_retries: Extra attempts allowed after a rate-limit or API error.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is 0 when no sample was
        evaluated. A sample whose request ultimately fails still counts
        in the denominator.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare candidate so the sample being evaluated can
                # be dropped from the examples; showing it together with its
                # label would leak the answer into the prompt.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # Streamed chat completion request
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # A streamed chunk may carry an empty ``choices`` list
                    # (e.g. a usage-only chunk); skip those instead of
                    # letting an IndexError discard the whole reply.
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    # Count output tokens and estimate cost
                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    # Check if the prediction is correct
                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1

                except Exception as e:
                    # Anything else is not retried; the sample is scored as
                    # a miss because total_samples is still incremented.
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to a CSV file, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single table.
    """
    # A zero-byte file left behind by an earlier run still needs the header;
    # otherwise the first data row would be read back as the column names.
    write_header = (
        not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    )
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    # Append mode creates the file when it is missing.
    result.to_csv(file_path, mode="a", header=write_header, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the DBpedia test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.2-3B-Instruct"  # Change this to the desired model name
    prompting_method = "chain-of-thought"  # Change this to the desired method

    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # A run costs a small fraction of a cent at the per-million-token prices
    # used by estimate_cost, so two decimals always printed "$0.00".
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the shared results table.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.2-3B-Instruct, Prompting Method: chain-of-thought
Accuracy: 0.75, Cost: $0.00
Results saved to dbpedia_results.csv


In [6]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize an OpenAI SDK client that targets the SambaNova endpoint.
# The key is read from the SAMBANOVA_API_KEY environment variable;
# os.environ.get() yields None when that variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text``.

    Whitespace-separated words stand in for tokens; no model tokenizer
    is involved.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Trim ``text`` to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated. The result is always re-joined with
    single spaces, so runs of whitespace are collapsed even when nothing
    is cut.

    Args:
        text: Input string.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of
            the prompt.

    Returns:
        The (possibly shortened) text; an empty string when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep words instead of dropping all of them.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read a DBpedia CSV file and attach readable class labels.

    Line ``k`` (1-based) of the classes file provides the label for numeric
    class id ``k``. The CSV is read without a header row as the columns
    ``class``, ``title`` and ``content``.

    Returns:
        A DataFrame whose ``class`` column holds the label strings; ids
        with no matching line in the classes file become NaN.
    """
    # Ids are matched as strings: "1" -> first line, "2" -> second, ...
    with open(class_file_path, "r", encoding="utf-8") as class_file:
        id_to_label = {
            str(line_no): raw.strip()
            for line_no, raw in enumerate(class_file, start=1)
        }

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )

    # Replace the numeric ids by their labels.
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group the leading items of ``data`` by class.

    Selection is positional, not random: for every class, the first
    ``num_samples_per_class`` items in iteration order are kept.

    Args:
        data: Iterable of records with a ``"class"`` key. It is consumed in
            a single pass, so a one-shot iterator works as well as a list.
        num_samples_per_class: Maximum number of items kept per class.

    Returns:
        Dict mapping each class seen in ``data`` to its list of kept items.
        Keys appear in first-seen order, so the result is the same on
        every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when its quota is zero or
        # already full, so every class seen ends up as a key.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the plain prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the step-by-step prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Think step by step to classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the question.

    ``examples`` is an iterable of ``(text, label)`` pairs; each becomes a
    ``Text:`` / ``Label:`` block placed ahead of the text to classify.
    """
    shots = "".join(
        f"Text: {shot_text}\nLabel: {shot_label}\n\n"
        for shot_text, shot_label in examples
    )
    question = (
        "Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        "Provide only the category name as your answer."
    )
    return shots + question

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    The table holds (input, output) prices per million tokens. A model that
    is not listed is priced at (0.10, 0.20).
    """
    fallback_rates = (0.10, 0.20)
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rates = rates_by_model.get(model_name, fallback_rates)
    prompt_cost = (input_tokens / 1e6) * rates[0]
    completion_cost = (output_tokens / 1e6) * rates[1]
    return prompt_cost + completion_cost

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure classification accuracy of one model with one prompt style.

    Up to ``num_samples_per_class`` items per class are picked with
    ``select_samples``; each one is turned into a prompt and sent through
    the module-level ``client``. A prediction counts as correct only when
    the stripped, lower-cased reply equals the lower-cased class name.

    Args:
        data: Sequence of records with ``"content"`` and ``"class"`` keys.
        classes: Iterable of class names listed in every prompt.
        model_name: Model identifier passed to the API and to
            ``estimate_cost``.
        prompting_method: ``"direct"``, ``"chain-of-thought"`` or
            ``"few-shot"``.
        num_samples_per_class: Per-class cap on evaluated samples.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not counted.
        delay: Seconds to sleep after each evaluated sample.
        max_retries: Extra attempts allowed after a rate-limit or API error.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is 0 when no sample was
        evaluated. A sample whose request ultimately fails still counts
        in the denominator.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare candidate so the sample being evaluated can
                # be dropped from the examples; showing it together with its
                # label would leak the answer into the prompt.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # Streamed chat completion request
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # A streamed chunk may carry an empty ``choices`` list
                    # (e.g. a usage-only chunk); skip those instead of
                    # letting an IndexError discard the whole reply.
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    # Count output tokens and estimate cost
                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    # Check if the prediction is correct
                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1

                except Exception as e:
                    # Anything else is not retried; the sample is scored as
                    # a miss because total_samples is still incremented.
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to a CSV file, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single table.
    """
    # A zero-byte file left behind by an earlier run still needs the header;
    # otherwise the first data row would be read back as the column names.
    write_header = (
        not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    )
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    # Append mode creates the file when it is missing.
    result.to_csv(file_path, mode="a", header=write_header, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the DBpedia test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.2-3B-Instruct"  # Change this to the desired model name
    prompting_method = "few-shot"  # Change this to the desired method

    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # A run costs a small fraction of a cent at the per-million-token prices
    # used by estimate_cost, so two decimals always printed "$0.00".
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the shared results table.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.2-3B-Instruct, Prompting Method: few-shot
Accuracy: 0.88, Cost: $0.00
Results saved to dbpedia_results.csv


In [7]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize an OpenAI SDK client that targets the SambaNova endpoint.
# The key is read from the SAMBANOVA_API_KEY environment variable;
# os.environ.get() yields None when that variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text``.

    Whitespace-separated words stand in for tokens; no model tokenizer
    is involved.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Trim ``text`` to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated. The result is always re-joined with
    single spaces, so runs of whitespace are collapsed even when nothing
    is cut.

    Args:
        text: Input string.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of
            the prompt.

    Returns:
        The (possibly shortened) text; an empty string when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep words instead of dropping all of them.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read a DBpedia CSV file and attach readable class labels.

    Line ``k`` (1-based) of the classes file provides the label for numeric
    class id ``k``. The CSV is read without a header row as the columns
    ``class``, ``title`` and ``content``.

    Returns:
        A DataFrame whose ``class`` column holds the label strings; ids
        with no matching line in the classes file become NaN.
    """
    # Ids are matched as strings: "1" -> first line, "2" -> second, ...
    with open(class_file_path, "r", encoding="utf-8") as class_file:
        id_to_label = {
            str(line_no): raw.strip()
            for line_no, raw in enumerate(class_file, start=1)
        }

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )

    # Replace the numeric ids by their labels.
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group the leading items of ``data`` by class.

    Selection is positional, not random: for every class, the first
    ``num_samples_per_class`` items in iteration order are kept.

    Args:
        data: Iterable of records with a ``"class"`` key. It is consumed in
            a single pass, so a one-shot iterator works as well as a list.
        num_samples_per_class: Maximum number of items kept per class.

    Returns:
        Dict mapping each class seen in ``data`` to its list of kept items.
        Keys appear in first-seen order, so the result is the same on
        every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when its quota is zero or
        # already full, so every class seen ends up as a key.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the plain prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the step-by-step prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Think step by step to classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the question.

    ``examples`` is an iterable of ``(text, label)`` pairs; each becomes a
    ``Text:`` / ``Label:`` block placed ahead of the text to classify.
    """
    shots = "".join(
        f"Text: {shot_text}\nLabel: {shot_label}\n\n"
        for shot_text, shot_label in examples
    )
    question = (
        "Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        "Provide only the category name as your answer."
    )
    return shots + question

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    The table holds (input, output) prices per million tokens. A model that
    is not listed is priced at (0.10, 0.20).
    """
    fallback_rates = (0.10, 0.20)
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rates = rates_by_model.get(model_name, fallback_rates)
    prompt_cost = (input_tokens / 1e6) * rates[0]
    completion_cost = (output_tokens / 1e6) * rates[1]
    return prompt_cost + completion_cost

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure classification accuracy of one model with one prompt style.

    Up to ``num_samples_per_class`` items per class are picked with
    ``select_samples``; each one is turned into a prompt and sent through
    the module-level ``client``. A prediction counts as correct only when
    the stripped, lower-cased reply equals the lower-cased class name.

    Args:
        data: Sequence of records with ``"content"`` and ``"class"`` keys.
        classes: Iterable of class names listed in every prompt.
        model_name: Model identifier passed to the API and to
            ``estimate_cost``.
        prompting_method: ``"direct"``, ``"chain-of-thought"`` or
            ``"few-shot"``.
        num_samples_per_class: Per-class cap on evaluated samples.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not counted.
        delay: Seconds to sleep after each evaluated sample.
        max_retries: Extra attempts allowed after a rate-limit or API error.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is 0 when no sample was
        evaluated. A sample whose request ultimately fails still counts
        in the denominator.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare candidate so the sample being evaluated can
                # be dropped from the examples; showing it together with its
                # label would leak the answer into the prompt.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # Streamed chat completion request
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # A streamed chunk may carry an empty ``choices`` list
                    # (e.g. a usage-only chunk); skip those instead of
                    # letting an IndexError discard the whole reply.
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    # Count output tokens and estimate cost
                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    # Check if the prediction is correct
                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1

                except Exception as e:
                    # Anything else is not retried; the sample is scored as
                    # a miss because total_samples is still incremented.
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to a CSV file, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single table.
    """
    # A zero-byte file left behind by an earlier run still needs the header;
    # otherwise the first data row would be read back as the column names.
    write_header = (
        not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    )
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    # Append mode creates the file when it is missing.
    result.to_csv(file_path, mode="a", header=write_header, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the DBpedia test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.1-8B-Instruct"  # Change this to the desired model name
    prompting_method = "direct"  # Change this to the desired method

    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # A run costs a small fraction of a cent at the per-million-token prices
    # used by estimate_cost, so two decimals always printed "$0.00".
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the shared results table.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.1-8B-Instruct, Prompting Method: direct
Accuracy: 0.92, Cost: $0.00
Results saved to dbpedia_results.csv


In [8]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize an OpenAI SDK client that targets the SambaNova endpoint.
# The key is read from the SAMBANOVA_API_KEY environment variable;
# os.environ.get() yields None when that variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text``.

    Whitespace-separated words stand in for tokens; no model tokenizer
    is involved.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Trim ``text`` to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated. The result is always re-joined with
    single spaces, so runs of whitespace are collapsed even when nothing
    is cut.

    Args:
        text: Input string.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of
            the prompt.

    Returns:
        The (possibly shortened) text; an empty string when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep words instead of dropping all of them.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the DBpedia dataset
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Read a DBpedia CSV file and attach readable class labels.

    Line ``k`` (1-based) of the classes file provides the label for numeric
    class id ``k``. The CSV is read without a header row as the columns
    ``class``, ``title`` and ``content``.

    Returns:
        A DataFrame whose ``class`` column holds the label strings; ids
        with no matching line in the classes file become NaN.
    """
    # Ids are matched as strings: "1" -> first line, "2" -> second, ...
    with open(class_file_path, "r", encoding="utf-8") as class_file:
        id_to_label = {
            str(line_no): raw.strip()
            for line_no, raw in enumerate(class_file, start=1)
        }

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )

    # Replace the numeric ids by their labels.
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group the leading items of ``data`` by class.

    Selection is positional, not random: for every class, the first
    ``num_samples_per_class`` items in iteration order are kept.

    Args:
        data: Iterable of records with a ``"class"`` key. It is consumed in
            a single pass, so a one-shot iterator works as well as a list.
        num_samples_per_class: Maximum number of items kept per class.

    Returns:
        Dict mapping each class seen in ``data`` to its list of kept items.
        Keys appear in first-seen order, so the result is the same on
        every run.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class even when its quota is zero or
        # already full, so every class seen ends up as a key.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the plain prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the step-by-step prompt: the text, a blank line, then the task."""
    parts = [
        f"{text}",
        "",
        f"Think step by step to classify the above text into one of the following categories: {classes_formatted}.",
        "Provide only the category name as your answer.",
    ]
    return "\n".join(parts)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the question.

    ``examples`` is an iterable of ``(text, label)`` pairs; each becomes a
    ``Text:`` / ``Label:`` block placed ahead of the text to classify.
    """
    shots = "".join(
        f"Text: {shot_text}\nLabel: {shot_label}\n\n"
        for shot_text, shot_label in examples
    )
    question = (
        "Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        "Provide only the category name as your answer."
    )
    return shots + question

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    The table holds (input, output) prices per million tokens. A model that
    is not listed is priced at (0.10, 0.20).
    """
    fallback_rates = (0.10, 0.20)
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rates = rates_by_model.get(model_name, fallback_rates)
    prompt_cost = (input_tokens / 1e6) * rates[0]
    completion_cost = (output_tokens / 1e6) * rates[1]
    return prompt_cost + completion_cost

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure classification accuracy of one model with one prompt style.

    Up to ``num_samples_per_class`` items per class are picked with
    ``select_samples``; each one is turned into a prompt and sent through
    the module-level ``client``. A prediction counts as correct only when
    the stripped, lower-cased reply equals the lower-cased class name.

    Args:
        data: Sequence of records with ``"content"`` and ``"class"`` keys.
        classes: Iterable of class names listed in every prompt.
        model_name: Model identifier passed to the API and to
            ``estimate_cost``.
        prompting_method: ``"direct"``, ``"chain-of-thought"`` or
            ``"few-shot"``.
        num_samples_per_class: Per-class cap on evaluated samples.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped and not counted.
        delay: Seconds to sleep after each evaluated sample.
        max_retries: Extra attempts allowed after a rate-limit or API error.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is 0 when no sample was
        evaluated. A sample whose request ultimately fails still counts
        in the denominator.

    Raises:
        ValueError: If ``prompting_method`` is not recognised.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare candidate so the sample being evaluated can
                # be dropped from the examples; showing it together with its
                # label would leak the answer into the prompt.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # Streamed chat completion request
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # A streamed chunk may carry an empty ``choices`` list
                    # (e.g. a usage-only chunk); skip those instead of
                    # letting an IndexError discard the whole reply.
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    # Count output tokens and estimate cost
                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    # Check if the prediction is correct
                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    retries += 1

                except Exception as e:
                    # Anything else is not retried; the sample is scored as
                    # a miss because total_samples is still incremented.
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to a CSV file, creating the file if needed.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single table.
    """
    # A zero-byte file left behind by an earlier run still needs the header;
    # otherwise the first data row would be read back as the column names.
    write_header = (
        not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    )
    result = pd.DataFrame([{
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost
    }])

    # Append mode creates the file when it is missing.
    result.to_csv(file_path, mode="a", header=write_header, index=False)

# Main execution
if __name__ == "__main__":
    # Input files: the DBpedia test CSV and the list of class names.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"

    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.1-8B-Instruct"  # Change this to the desired model name
    prompting_method = "chain-of-thought"  # Change this to the desired method

    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # A run costs a small fraction of a cent at the per-million-token prices
    # used by estimate_cost, so two decimals always printed "$0.00".
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.6f}")

    # Append this run to the shared results table.
    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.1-8B-Instruct, Prompting Method: chain-of-thought
Accuracy: 0.93, Cost: $0.00
Results saved to dbpedia_results.csv


In [9]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize an OpenAI SDK client that targets the SambaNova endpoint.
# The key is read from the SAMBANOVA_API_KEY environment variable;
# os.environ.get() yields None when that variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text``.

    Whitespace-separated words stand in for tokens; no model tokenizer
    is involved.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Trim ``text`` to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated. The result is always re-joined with
    single spaces, so runs of whitespace are collapsed even when nothing
    is cut.

    Args:
        text: Input string.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of
            the prompt.

    Returns:
        The (possibly shortened) text; an empty string when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit would make the slice below count from
    # the end of the list and keep words instead of dropping all of them.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Read the DBpedia CSV and replace numeric class ids with class names
def load_dbpedia_dataset(test_file_path, class_file_path):
    """Load the header-less DBpedia CSV as a DataFrame with named labels.

    Args:
        test_file_path: Path to the CSV; its three columns are named
            "class", "title" and "content".
        class_file_path: Path to a text file with one class name per line.
            Line 1 names class id "1", line 2 class id "2", and so on.

    Returns:
        A DataFrame whose "class" column holds the mapped class names
        (ids with no matching line map to NaN).
    """
    # Build the id -> name lookup; ids are 1-based and compared as strings.
    with open(class_file_path, "r", encoding="utf-8") as handle:
        id_to_label = {
            str(position): raw_line.strip()
            for position, raw_line in enumerate(handle, start=1)
        }

    frame = pd.read_csv(
        test_file_path,
        header=None,
        names=["class", "title", "content"],
        encoding="utf-8",
    )

    # Stringify the ids first so they match the lookup keys.
    frame["class"] = frame["class"].astype(str).map(id_to_label)
    return frame

# Select the first N samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group records by class, keeping the first few of each.

    The selection is positional, not random: for every class the first
    ``num_samples_per_class`` records encountered in *data* are kept.

    Args:
        data: Iterable of records (dicts) that each have a "class" key.
            It is traversed once, so a generator works too.
        num_samples_per_class: Maximum number of records kept per class.

    Returns:
        A dict mapping each class seen in *data* to its list of kept
        records. Keys appear in the order the classes are first seen, which
        keeps the evaluation order reproducible between runs.
    """
    selected_data = {}
    for item in data:
        # setdefault registers the class on first sight, so every class gets
        # a key even when the per-class limit is zero.
        bucket = selected_data.setdefault(item["class"], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build a zero-shot prompt: the text, then a request for one category.

    Args:
        text: The passage to classify.
        classes_formatted: The candidate categories, already joined into
            one display string.

    Returns:
        The prompt string.
    """
    instruction = (
        f"Classify the above text into one of the following categories: {classes_formatted}."
    )
    answer_format = "Provide only the category name as your answer."
    return f"{text}\n\n{instruction}\n{answer_format}"

def chain_of_thought_prompt(text, classes_formatted):
    """Build a prompt that asks the model to think step by step.

    Args:
        text: The passage to classify.
        classes_formatted: The candidate categories, already joined into
            one display string.

    Returns:
        The prompt string; like the direct prompt, it still asks for only
        the category name as the answer.
    """
    instruction = (
        f"Think step by step to classify the above text into one of the following categories: {classes_formatted}."
    )
    answer_format = "Provide only the category name as your answer."
    return f"{text}\n\n{instruction}\n{answer_format}"

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the query text.

    Args:
        text: The passage to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs,
            rendered in order ahead of the query.
        classes_formatted: The candidate categories, already joined into
            one display string.

    Returns:
        The prompt string.
    """
    demonstrations = "".join(
        f"Text: {sample}\nLabel: {label}\n\n" for sample, label in examples
    )
    query = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the category from the following options: {classes_formatted}.\n"
        f"Provide only the category name as your answer."
    )
    return demonstrations + query

# Estimate the dollar cost of a call from per-million-token prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Return the estimated cost of one request.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        model_name: Key into the price table; models not listed there are
            charged at the fallback rate of (0.10, 0.20).

    Returns:
        Input cost plus output cost, each computed as
        ``tokens / 1e6 * price_per_million``.
    """
    fallback_rates = (0.10, 0.20)
    # model -> (input price, output price), both per one million tokens
    rates_by_model = {
        'Meta-Llama-3.1-8B-Instruct': (0.10, 0.20),
        'Meta-Llama-3.2-1B-Instruct': (0.04, 0.08),
        'Meta-Llama-3.2-3B-Instruct': (0.08, 0.16),
    }
    rate_in, rate_out = rates_by_model.get(model_name, fallback_rates)
    return (input_tokens / 1e6) * rate_in + (output_tokens / 1e6) * rate_out

# Evaluate a single method and model
def evaluate_model(
    data, classes, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Estimate accuracy and API cost for one model / prompting-method pair.

    Up to ``num_samples_per_class`` records per class are taken from *data*
    (via ``select_samples``). Each is turned into a prompt, sent to the chat
    completions endpoint, and the reply is compared with the true class name
    (exact match after stripping whitespace and lower-casing).

    Args:
        data: List of records (dicts) with "class" and "content" keys.
        classes: Iterable of class names offered to the model.
        model_name: Model identifier passed to the API and to the price
            lookup in ``estimate_cost``.
        prompting_method: One of "direct", "chain-of-thought", "few-shot".
        num_samples_per_class: Maximum records evaluated per class.
        token_budget: Word budget (as counted by ``count_tokens``); prompts
            above it are skipped and not counted as samples.
        delay: Seconds to sleep after every sample, and before retrying a
            request that failed with an API error.
        max_retries: Number of retries allowed after the first attempt.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is correct predictions divided
        by attempted samples (0 when nothing was attempted); a sample whose
        request ultimately failed still counts as attempted.

    Raises:
        ValueError: If *prompting_method* is not a known method.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    # Ensure all classes are strings
    classes_formatted = ", ".join(map(str, classes))

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item["content"], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == "direct":
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == "chain-of-thought":
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == "few-shot":
                # Draw one spare candidate so the record under test can be
                # dropped: showing it as a labelled example would leak the
                # answer into its own prompt.
                pool = random.sample(data, min(4, len(data)))
                examples = [
                    (ex["content"], ex["class"]) for ex in pool if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            # Count input tokens and check token budget
            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # Stream the completion and stitch the text deltas together
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1,
                    )
                    # Chunks with an empty `choices` list carry no text;
                    # skipping them avoids an IndexError that would discard
                    # the whole response.
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    # Count output tokens and estimate cost
                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    # Check if the prediction is correct
                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    # Pause briefly instead of retrying immediately
                    time.sleep(delay)
                    retries += 1

                except Exception as e:
                    # Unknown failure: do not retry, move on to the next sample
                    print(f"Unexpected error: {e}")
                    break

            # Counted even when the request failed, so failures lower accuracy
            total_samples += 1
            time.sleep(delay)

    # Calculate accuracy
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Append one result row to a CSV file, creating the file when needed
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Write a single evaluation result as one CSV row.

    If *file_path* already exists the row is appended without a header;
    otherwise the file is created with a header line followed by the row.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the "model" column.
        prompting_method: Value for the "prompting_method" column.
        accuracy: Value for the "accuracy" column.
        cost: Value for the "cost" column.
    """
    already_there = os.path.isfile(file_path)
    row = {
        "model": model_name,
        "prompting_method": prompting_method,
        "accuracy": accuracy,
        "cost": cost,
    }
    pd.DataFrame([row]).to_csv(
        file_path,
        mode="a" if already_there else "w",
        header=not already_there,
        index=False,
    )

# Main execution
if __name__ == "__main__":
    # Run one evaluation (model + prompting method chosen below), print the
    # metrics, and append them to a results CSV.
    test_file_path = "data/dbpedia_csv/test.csv"
    class_file_path = "data/dbpedia_csv/classes.txt"
    
    dbpedia_data = load_dbpedia_dataset(test_file_path, class_file_path)
    # Distinct values of the "class" column; passed on as the candidate labels.
    classes = dbpedia_data["class"].unique()

    model_name = "Meta-Llama-3.1-8B-Instruct"  # Change this to the desired model name
    prompting_method = "few-shot"  # Change this to the desired method

    # Rows go in as a list of dicts; the remaining evaluate_model parameters
    # keep their defaults (10 samples per class, 4000-token budget, ...).
    accuracy, cost = evaluate_model(
        dbpedia_data.to_dict(orient="records"),
        classes,
        model_name,
        prompting_method
    )

    print(f"Model: {model_name}, Prompting Method: {prompting_method}")
    # NOTE(review): two-decimal formatting rendered the recorded run's cost as
    # $0.00; the unrounded value is what gets written to the CSV below.
    print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

    results_file = "dbpedia_results.csv"
    save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
    print(f"Results saved to {results_file}")


Model: Meta-Llama-3.1-8B-Instruct, Prompting Method: few-shot
Accuracy: 0.96, Cost: $0.00
Results saved to dbpedia_results.csv
