In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize OpenAI client
# The OpenAI SDK is pointed at the SambaNova endpoint; the API key is read
# from the SAMBANOVA_API_KEY environment variable (None when it is unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Token counting using a simple string split method
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    This is a rough stand-in for a real tokenizer: it simply counts the
    pieces produced by ``str.split()`` called without arguments.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Text to shorten.
        max_tokens: Overall word budget.
        reserve_tokens: Words held back from the budget (for example for the
            rest of a prompt).

    Returns:
        The kept words joined by single spaces. Whitespace is therefore
        normalized even when nothing is cut. An empty string is returned when
        the reserve uses up the whole budget.
    """
    # Clamp at zero: a negative limit would otherwise be read by the slice as
    # "drop this many words from the end" instead of "keep none".
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Load phrases and their location labels from ``data_dir``.

    Three UTF-8 text files are read, one entry per line:
    ``phrase_text.txt`` (the texts), ``locations.txt`` (location names; the
    0-based line index is the label id) and ``locations_label.txt`` (one
    integer label id per phrase, in phrase order).

    Blank lines at the end of a file are ignored, so a stray trailing newline
    neither creates an empty entry nor breaks integer parsing.

    Returns:
        A tuple ``(data, locations)``. ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts and ``locations`` is the list
        of location names. A label id with no matching line in
        ``locations.txt`` maps to ``"Unknown"``.

    Raises:
        ValueError: If there are fewer labels than phrases, or a label line
            is not an integer.
    """
    def read_entries(file_name):
        # One stripped entry per line, minus any blank lines at the end.
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
            entries = [line.strip() for line in f]
        while entries and not entries[-1]:
            entries.pop()
        return entries

    phrases = read_entries('phrase_text.txt')
    locations = read_entries('locations.txt')
    location_labels = [int(raw) for raw in read_entries('locations_label.txt')]

    if len(location_labels) < len(phrases):
        raise ValueError(
            f"{len(phrases)} phrases but only {len(location_labels)} labels "
            f"in {data_dir}"
        )

    # Label ids index into the list of location names.
    location_mapping = dict(enumerate(locations))

    # zip stops at the last phrase; surplus labels are ignored.
    data = [
        {'text': phrase, 'location': location_mapping.get(label, "Unknown")}
        for phrase, label in zip(phrases, location_labels)
    ]
    return data, locations

# Select the first samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group items by location, keeping the first few items of each.

    Selection is not random: items are taken in the order they appear in
    ``data``. Classes appear in the result in order of first occurrence, so
    the iteration order is the same on every run.

    Args:
        data: Iterable of dicts that each have a ``'location'`` key.
        num_samples_per_class: Maximum number of items kept per location.

    Returns:
        A dict mapping each location found in ``data`` to a list of at most
        ``num_samples_per_class`` items (the original dict objects).
    """
    selected_data = {}
    for item in data:
        # setdefault registers every class, even when the limit is zero.
        bucket = selected_data.setdefault(item['location'], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build a zero-shot prompt: the text followed by the classification request.

    ``classes_formatted`` is inserted verbatim as the list of candidate
    locations.
    """
    passage = f"{text}\n\n"
    request = f"Classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def chain_of_thought_prompt(text, classes_formatted):
    """Build a prompt that asks the model to think step by step.

    The text comes first, then the step-by-step classification request over
    ``classes_formatted`` (inserted verbatim), then the instruction to answer
    with the location name only.
    """
    passage = f"{text}\n\n"
    request = f"Think step by step to classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: Text the model should classify.
        examples: Iterable of ``(example_text, example_label)`` pairs, rendered
            in order as ``Text:`` / ``Label:`` blocks.
        classes_formatted: Candidate locations, inserted verbatim.
    """
    shots = "".join(
        f"Text: {ex_text}\nLabel: {ex_label}\n\n" for ex_text, ex_label in examples
    )
    question = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
    )
    options = f"Choose the location from the following options: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return f"{shots}" + question + options + answer_format

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of generated tokens.
        model_name: Either a display name from the price table
            (``"Llama 3.2 1B"``) or an API-style identifier
            (``"Meta-Llama-3.2-1B-Instruct"``). Both spellings are reduced to
            the same lookup key.

    Returns:
        Input cost plus output cost. Models missing from the table are priced
        at the default ``(0.10, 0.20)`` per million tokens.
    """
    # (input price, output price) per million tokens.
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }

    def normalize(name):
        # Lower-case, treat "-" and "_" as spaces, and drop the "meta" and
        # "instruct" decorations so API ids match the table's display names.
        words = name.lower().replace("-", " ").replace("_", " ").split()
        return " ".join(w for w in words if w not in ("meta", "instruct"))

    prices = {normalize(name): price for name, price in cost_per_million.items()}
    input_price, output_price = prices.get(normalize(model_name), (0.10, 0.20))
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Classify a per-class sample of ``data`` with one model and prompt style.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Location names offered to the model as answer options.
        model_name: Model identifier passed to the chat completions API.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or
            ``'few-shot'``.
        num_samples_per_class: Maximum number of samples evaluated per
            location.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped. This only approximates the model's
            own token count, so the API can still reject a prompt as too long.
        delay: Seconds to sleep after each sample that was sent to the API.
        max_retries: Extra attempts after the first one for retryable errors.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is the share of answered samples
        whose stripped, case-insensitive reply equals the expected location.
        Samples that were skipped or never received a reply are left out of
        both figures. Accuracy is 0 when no sample was answered.

    Raises:
        ValueError: If ``prompting_method`` is not recognized.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)
    reserve_tokens = 100  # words held back for the instructions and class list
    num_examples = 3      # few-shot examples per prompt

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                text = truncate_text(item['text'], token_budget, reserve_tokens=reserve_tokens)
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                text = truncate_text(item['text'], token_budget, reserve_tokens=reserve_tokens)
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so the sample under test can be
                # dropped (it would leak its own label) without shrinking
                # the example set.
                candidates = random.sample(data, min(num_examples + 1, len(data)))
                chosen = [ex for ex in candidates if ex is not item][:num_examples]
                # Split the budget evenly between the examples and the text
                # being classified; untruncated examples alone could exceed
                # it and force the sample to be skipped.
                share = max(0, token_budget - reserve_tokens) // (len(chosen) + 1)
                text = truncate_text(item['text'], share, reserve_tokens=0)
                examples = [
                    (truncate_text(ex['text'], share, reserve_tokens=0), ex['location'])
                    for ex in chosen
                ]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            response_text = None  # stays None when no attempt succeeds
            for _attempt in range(max_retries + 1):
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Skip chunks without choices rather than failing on
                    # choices[0].
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )
                    break  # Exit retry loop on success

                # RateLimitError is listed first so the more general APIError
                # handler below does not take it.
                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)

                except APIError as e:
                    print(f"API Error: {e}")
                    if "maximum sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        # The same prompt would fail the same way again.
                        break

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            if response_text is not None:
                output_token_count = count_tokens(response_text)
                total_cost += estimate_cost(input_token_count, output_token_count, model_name)
                # Only answered samples count, matching how skipped samples
                # are treated above.
                total_samples += 1
                if response_text.strip().lower() == cls.lower():
                    correct_predictions += 1

            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to the CSV file at ``file_path``.

    The file is created when it does not exist. The header row
    (``model, prompting_method, accuracy, cost``) is written only when the
    file is new or empty, so repeated calls build up a single table.
    """
    # An existing but empty file still needs its header row.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    result = pd.DataFrame([{
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }])

    # Append mode also creates the file when it is missing.
    result.to_csv(file_path, mode='a', header=needs_header, index=False)

# Define your model and method here
results_file = 'nyt_location_result'
model_name = "Meta-Llama-3.2-1B-Instruct"  # Change to the desired model
prompting_method = "direct"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(
    data=data,
    locations=locations,
    model_name=model_name,
    prompting_method=prompting_method,
)

# Print and save results
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# Each run appends one row to the same results file.
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")


Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
API Error: Requested generation length 1 is not possible! The provided prompt is 4445 tokens long, so generating 1 tokens requires a sequence length of 4446, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4445 tokens long, so generating 1 tokens requires a sequence length of 4446, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4445 tokens long, so generating 1 tokens requires a sequence length of 4446, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4445 tokens long, so generating 1 tokens requires a sequence length of 4446, but the 

In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize OpenAI client
# The OpenAI SDK is pointed at the SambaNova endpoint; the API key is read
# from the SAMBANOVA_API_KEY environment variable (None when it is unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Token counting using a simple string split method
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    This is a rough stand-in for a real tokenizer: it simply counts the
    pieces produced by ``str.split()`` called without arguments.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Text to shorten.
        max_tokens: Overall word budget.
        reserve_tokens: Words held back from the budget (for example for the
            rest of a prompt).

    Returns:
        The kept words joined by single spaces. Whitespace is therefore
        normalized even when nothing is cut. An empty string is returned when
        the reserve uses up the whole budget.
    """
    # Clamp at zero: a negative limit would otherwise be read by the slice as
    # "drop this many words from the end" instead of "keep none".
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Load phrases and their location labels from ``data_dir``.

    Three UTF-8 text files are read, one entry per line:
    ``phrase_text.txt`` (the texts), ``locations.txt`` (location names; the
    0-based line index is the label id) and ``locations_label.txt`` (one
    integer label id per phrase, in phrase order).

    Blank lines at the end of a file are ignored, so a stray trailing newline
    neither creates an empty entry nor breaks integer parsing.

    Returns:
        A tuple ``(data, locations)``. ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts and ``locations`` is the list
        of location names. A label id with no matching line in
        ``locations.txt`` maps to ``"Unknown"``.

    Raises:
        ValueError: If there are fewer labels than phrases, or a label line
            is not an integer.
    """
    def read_entries(file_name):
        # One stripped entry per line, minus any blank lines at the end.
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
            entries = [line.strip() for line in f]
        while entries and not entries[-1]:
            entries.pop()
        return entries

    phrases = read_entries('phrase_text.txt')
    locations = read_entries('locations.txt')
    location_labels = [int(raw) for raw in read_entries('locations_label.txt')]

    if len(location_labels) < len(phrases):
        raise ValueError(
            f"{len(phrases)} phrases but only {len(location_labels)} labels "
            f"in {data_dir}"
        )

    # Label ids index into the list of location names.
    location_mapping = dict(enumerate(locations))

    # zip stops at the last phrase; surplus labels are ignored.
    data = [
        {'text': phrase, 'location': location_mapping.get(label, "Unknown")}
        for phrase, label in zip(phrases, location_labels)
    ]
    return data, locations

# Select the first samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group items by location, keeping the first few items of each.

    Selection is not random: items are taken in the order they appear in
    ``data``. Classes appear in the result in order of first occurrence, so
    the iteration order is the same on every run.

    Args:
        data: Iterable of dicts that each have a ``'location'`` key.
        num_samples_per_class: Maximum number of items kept per location.

    Returns:
        A dict mapping each location found in ``data`` to a list of at most
        ``num_samples_per_class`` items (the original dict objects).
    """
    selected_data = {}
    for item in data:
        # setdefault registers every class, even when the limit is zero.
        bucket = selected_data.setdefault(item['location'], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build a zero-shot prompt: the text followed by the classification request.

    ``classes_formatted`` is inserted verbatim as the list of candidate
    locations.
    """
    passage = f"{text}\n\n"
    request = f"Classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def chain_of_thought_prompt(text, classes_formatted):
    """Build a prompt that asks the model to think step by step.

    The text comes first, then the step-by-step classification request over
    ``classes_formatted`` (inserted verbatim), then the instruction to answer
    with the location name only.
    """
    passage = f"{text}\n\n"
    request = f"Think step by step to classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: Text the model should classify.
        examples: Iterable of ``(example_text, example_label)`` pairs, rendered
            in order as ``Text:`` / ``Label:`` blocks.
        classes_formatted: Candidate locations, inserted verbatim.
    """
    shots = "".join(
        f"Text: {ex_text}\nLabel: {ex_label}\n\n" for ex_text, ex_label in examples
    )
    question = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
    )
    options = f"Choose the location from the following options: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return f"{shots}" + question + options + answer_format

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of generated tokens.
        model_name: Either a display name from the price table
            (``"Llama 3.2 1B"``) or an API-style identifier
            (``"Meta-Llama-3.2-1B-Instruct"``). Both spellings are reduced to
            the same lookup key.

    Returns:
        Input cost plus output cost. Models missing from the table are priced
        at the default ``(0.10, 0.20)`` per million tokens.
    """
    # (input price, output price) per million tokens.
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }

    def normalize(name):
        # Lower-case, treat "-" and "_" as spaces, and drop the "meta" and
        # "instruct" decorations so API ids match the table's display names.
        words = name.lower().replace("-", " ").replace("_", " ").split()
        return " ".join(w for w in words if w not in ("meta", "instruct"))

    prices = {normalize(name): price for name, price in cost_per_million.items()}
    input_price, output_price = prices.get(normalize(model_name), (0.10, 0.20))
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Classify a per-class sample of ``data`` with one model and prompt style.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Location names offered to the model as answer options.
        model_name: Model identifier passed to the chat completions API.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or
            ``'few-shot'``.
        num_samples_per_class: Maximum number of samples evaluated per
            location.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped. This only approximates the model's
            own token count, so the API can still reject a prompt as too long.
        delay: Seconds to sleep after each sample that was sent to the API.
        max_retries: Extra attempts after the first one for retryable errors.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is the share of answered samples
        whose stripped, case-insensitive reply equals the expected location.
        Samples that were skipped or never received a reply are left out of
        both figures. Accuracy is 0 when no sample was answered.

    Raises:
        ValueError: If ``prompting_method`` is not recognized.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)
    reserve_tokens = 100  # words held back for the instructions and class list
    num_examples = 3      # few-shot examples per prompt

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                text = truncate_text(item['text'], token_budget, reserve_tokens=reserve_tokens)
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                text = truncate_text(item['text'], token_budget, reserve_tokens=reserve_tokens)
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so the sample under test can be
                # dropped (it would leak its own label) without shrinking
                # the example set.
                candidates = random.sample(data, min(num_examples + 1, len(data)))
                chosen = [ex for ex in candidates if ex is not item][:num_examples]
                # Split the budget evenly between the examples and the text
                # being classified; untruncated examples alone could exceed
                # it and force the sample to be skipped.
                share = max(0, token_budget - reserve_tokens) // (len(chosen) + 1)
                text = truncate_text(item['text'], share, reserve_tokens=0)
                examples = [
                    (truncate_text(ex['text'], share, reserve_tokens=0), ex['location'])
                    for ex in chosen
                ]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            response_text = None  # stays None when no attempt succeeds
            for _attempt in range(max_retries + 1):
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Skip chunks without choices rather than failing on
                    # choices[0].
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )
                    break  # Exit retry loop on success

                # RateLimitError is listed first so the more general APIError
                # handler below does not take it.
                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)

                except APIError as e:
                    print(f"API Error: {e}")
                    if "maximum sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        # The same prompt would fail the same way again.
                        break

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            if response_text is not None:
                output_token_count = count_tokens(response_text)
                total_cost += estimate_cost(input_token_count, output_token_count, model_name)
                # Only answered samples count, matching how skipped samples
                # are treated above.
                total_samples += 1
                if response_text.strip().lower() == cls.lower():
                    correct_predictions += 1

            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to the CSV file at ``file_path``.

    The file is created when it does not exist. The header row
    (``model, prompting_method, accuracy, cost``) is written only when the
    file is new or empty, so repeated calls build up a single table.
    """
    # An existing but empty file still needs its header row.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    result = pd.DataFrame([{
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }])

    # Append mode also creates the file when it is missing.
    result.to_csv(file_path, mode='a', header=needs_header, index=False)

# Define your model and method here
results_file = 'nyt_location_result'
model_name = "Meta-Llama-3.2-1B-Instruct"  # Change to the desired model
prompting_method = "chain-of-thought"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(
    data=data,
    locations=locations,
    model_name=model_name,
    prompting_method=prompting_method,
)

# Print and save results
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# Each run appends one row to the same results file.
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
API Error: Requested generation length 1 is not possible! The provided prompt is 4449 tokens long, so generating 1 tokens requires a sequence length of 4450, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4449 tokens long, so generating 1 tokens requires a sequence length of 4450, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4449 tokens long, so generating 1 tokens requires a sequence length of 4450, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4449 tokens long, so generating 1 tok

In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize OpenAI client
# The OpenAI SDK is pointed at the SambaNova endpoint; the API key is read
# from the SAMBANOVA_API_KEY environment variable (None when it is unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Token counting using a simple string split method
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    This is a rough stand-in for a real tokenizer: it simply counts the
    pieces produced by ``str.split()`` called without arguments.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Text to shorten.
        max_tokens: Overall word budget.
        reserve_tokens: Words held back from the budget (for example for the
            rest of a prompt).

    Returns:
        The kept words joined by single spaces. Whitespace is therefore
        normalized even when nothing is cut. An empty string is returned when
        the reserve uses up the whole budget.
    """
    # Clamp at zero: a negative limit would otherwise be read by the slice as
    # "drop this many words from the end" instead of "keep none".
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Load phrases and their location labels from ``data_dir``.

    Three UTF-8 text files are read, one entry per line:
    ``phrase_text.txt`` (the texts), ``locations.txt`` (location names; the
    0-based line index is the label id) and ``locations_label.txt`` (one
    integer label id per phrase, in phrase order).

    Blank lines at the end of a file are ignored, so a stray trailing newline
    neither creates an empty entry nor breaks integer parsing.

    Returns:
        A tuple ``(data, locations)``. ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts and ``locations`` is the list
        of location names. A label id with no matching line in
        ``locations.txt`` maps to ``"Unknown"``.

    Raises:
        ValueError: If there are fewer labels than phrases, or a label line
            is not an integer.
    """
    def read_entries(file_name):
        # One stripped entry per line, minus any blank lines at the end.
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
            entries = [line.strip() for line in f]
        while entries and not entries[-1]:
            entries.pop()
        return entries

    phrases = read_entries('phrase_text.txt')
    locations = read_entries('locations.txt')
    location_labels = [int(raw) for raw in read_entries('locations_label.txt')]

    if len(location_labels) < len(phrases):
        raise ValueError(
            f"{len(phrases)} phrases but only {len(location_labels)} labels "
            f"in {data_dir}"
        )

    # Label ids index into the list of location names.
    location_mapping = dict(enumerate(locations))

    # zip stops at the last phrase; surplus labels are ignored.
    data = [
        {'text': phrase, 'location': location_mapping.get(label, "Unknown")}
        for phrase, label in zip(phrases, location_labels)
    ]
    return data, locations

# Select the first samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group items by location, keeping the first few items of each.

    Selection is not random: items are taken in the order they appear in
    ``data``. Classes appear in the result in order of first occurrence, so
    the iteration order is the same on every run.

    Args:
        data: Iterable of dicts that each have a ``'location'`` key.
        num_samples_per_class: Maximum number of items kept per location.

    Returns:
        A dict mapping each location found in ``data`` to a list of at most
        ``num_samples_per_class`` items (the original dict objects).
    """
    selected_data = {}
    for item in data:
        # setdefault registers every class, even when the limit is zero.
        bucket = selected_data.setdefault(item['location'], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build a zero-shot prompt: the text followed by the classification request.

    ``classes_formatted`` is inserted verbatim as the list of candidate
    locations.
    """
    passage = f"{text}\n\n"
    request = f"Classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def chain_of_thought_prompt(text, classes_formatted):
    """Build a prompt that asks the model to think step by step.

    The text comes first, then the step-by-step classification request over
    ``classes_formatted`` (inserted verbatim), then the instruction to answer
    with the location name only.
    """
    passage = f"{text}\n\n"
    request = f"Think step by step to classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: Text the model should classify.
        examples: Iterable of ``(example_text, example_label)`` pairs, rendered
            in order as ``Text:`` / ``Label:`` blocks.
        classes_formatted: Candidate locations, inserted verbatim.
    """
    shots = "".join(
        f"Text: {ex_text}\nLabel: {ex_label}\n\n" for ex_text, ex_label in examples
    )
    question = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
    )
    options = f"Choose the location from the following options: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return f"{shots}" + question + options + answer_format

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of generated tokens.
        model_name: Either a display name from the price table
            (``"Llama 3.2 1B"``) or an API-style identifier
            (``"Meta-Llama-3.2-1B-Instruct"``). Both spellings are reduced to
            the same lookup key.

    Returns:
        Input cost plus output cost. Models missing from the table are priced
        at the default ``(0.10, 0.20)`` per million tokens.
    """
    # (input price, output price) per million tokens.
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }

    def normalize(name):
        # Lower-case, treat "-" and "_" as spaces, and drop the "meta" and
        # "instruct" decorations so API ids match the table's display names.
        words = name.lower().replace("-", " ").replace("_", " ").split()
        return " ".join(w for w in words if w not in ("meta", "instruct"))

    prices = {normalize(name): price for name, price in cost_per_million.items()}
    input_price, output_price = prices.get(normalize(model_name), (0.10, 0.20))
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Classify a per-class sample of ``data`` with one model and prompt style.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Location names offered to the model as answer options.
        model_name: Model identifier passed to the chat completions API.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or
            ``'few-shot'``.
        num_samples_per_class: Maximum number of samples evaluated per
            location.
        token_budget: Maximum prompt size in whitespace-separated words;
            larger prompts are skipped. This only approximates the model's
            own token count, so the API can still reject a prompt as too long.
        delay: Seconds to sleep after each sample that was sent to the API.
        max_retries: Extra attempts after the first one for retryable errors.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is the share of answered samples
        whose stripped, case-insensitive reply equals the expected location.
        Samples that were skipped or never received a reply are left out of
        both figures. Accuracy is 0 when no sample was answered.

    Raises:
        ValueError: If ``prompting_method`` is not recognized.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)
    reserve_tokens = 100  # words held back for the instructions and class list
    num_examples = 3      # few-shot examples per prompt

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                text = truncate_text(item['text'], token_budget, reserve_tokens=reserve_tokens)
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                text = truncate_text(item['text'], token_budget, reserve_tokens=reserve_tokens)
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so the sample under test can be
                # dropped (it would leak its own label) without shrinking
                # the example set.
                candidates = random.sample(data, min(num_examples + 1, len(data)))
                chosen = [ex for ex in candidates if ex is not item][:num_examples]
                # Split the budget evenly between the examples and the text
                # being classified; untruncated examples alone could exceed
                # it and force the sample to be skipped.
                share = max(0, token_budget - reserve_tokens) // (len(chosen) + 1)
                text = truncate_text(item['text'], share, reserve_tokens=0)
                examples = [
                    (truncate_text(ex['text'], share, reserve_tokens=0), ex['location'])
                    for ex in chosen
                ]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            response_text = None  # stays None when no attempt succeeds
            for _attempt in range(max_retries + 1):
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Skip chunks without choices rather than failing on
                    # choices[0].
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )
                    break  # Exit retry loop on success

                # RateLimitError is listed first so the more general APIError
                # handler below does not take it.
                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)

                except APIError as e:
                    print(f"API Error: {e}")
                    if "maximum sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        # The same prompt would fail the same way again.
                        break

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            if response_text is not None:
                output_token_count = count_tokens(response_text)
                total_cost += estimate_cost(input_token_count, output_token_count, model_name)
                # Only answered samples count, matching how skipped samples
                # are treated above.
                total_samples += 1
                if response_text.strip().lower() == cls.lower():
                    correct_predictions += 1

            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one result row to the CSV file at ``file_path``.

    The file is created when it does not exist. The header row
    (``model, prompting_method, accuracy, cost``) is written only when the
    file is new or empty, so repeated calls build up a single table.
    """
    # An existing but empty file still needs its header row.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    result = pd.DataFrame([{
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }])

    # Append mode also creates the file when it is missing.
    result.to_csv(file_path, mode='a', header=needs_header, index=False)

# Define your model and method here
results_file = 'nyt_location_result'
model_name = "Meta-Llama-3.2-1B-Instruct"  # Change to the desired model
prompting_method = "few-shot"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(
    data=data,
    locations=locations,
    model_name=model_name,
    prompting_method=prompting_method,
)

# Print and save results
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# Each run appends one row to the same results file.
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

API Error: Requested generation length 1 is not possible! The provided prompt is 4212 tokens long, so generating 1 tokens requires a sequence length of 4213, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4212 tokens long, so generating 1 tokens requires a sequence length of 4213, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4212 tokens long, so generating 1 tokens requires a sequence length of 4213, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4212 tokens long, so generating 1 tokens requires a sequence length of 4213, but the maximum supported sequence length is just 4096!
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Skipping sample due to token limit: 6178 tokens


In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize OpenAI client
# The OpenAI SDK is pointed at the SambaNova endpoint; the API key is read
# from the SAMBANOVA_API_KEY environment variable (None when it is unset).
client = openai.OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Token counting using a simple string split method
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    This is a rough stand-in for a real tokenizer: it simply counts the
    pieces produced by ``str.split()`` called without arguments.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Keep at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Text to shorten.
        max_tokens: Overall word budget.
        reserve_tokens: Words held back from the budget (for example for the
            rest of a prompt).

    Returns:
        The kept words joined by single spaces. Whitespace is therefore
        normalized even when nothing is cut. An empty string is returned when
        the reserve uses up the whole budget.
    """
    # Clamp at zero: a negative limit would otherwise be read by the slice as
    # "drop this many words from the end" instead of "keep none".
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    return " ".join(text.split()[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Load phrases and their location labels from ``data_dir``.

    Three UTF-8 text files are read, one entry per line:
    ``phrase_text.txt`` (the texts), ``locations.txt`` (location names; the
    0-based line index is the label id) and ``locations_label.txt`` (one
    integer label id per phrase, in phrase order).

    Blank lines at the end of a file are ignored, so a stray trailing newline
    neither creates an empty entry nor breaks integer parsing.

    Returns:
        A tuple ``(data, locations)``. ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts and ``locations`` is the list
        of location names. A label id with no matching line in
        ``locations.txt`` maps to ``"Unknown"``.

    Raises:
        ValueError: If there are fewer labels than phrases, or a label line
            is not an integer.
    """
    def read_entries(file_name):
        # One stripped entry per line, minus any blank lines at the end.
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
            entries = [line.strip() for line in f]
        while entries and not entries[-1]:
            entries.pop()
        return entries

    phrases = read_entries('phrase_text.txt')
    locations = read_entries('locations.txt')
    location_labels = [int(raw) for raw in read_entries('locations_label.txt')]

    if len(location_labels) < len(phrases):
        raise ValueError(
            f"{len(phrases)} phrases but only {len(location_labels)} labels "
            f"in {data_dir}"
        )

    # Label ids index into the list of location names.
    location_mapping = dict(enumerate(locations))

    # zip stops at the last phrase; surplus labels are ignored.
    data = [
        {'text': phrase, 'location': location_mapping.get(label, "Unknown")}
        for phrase, label in zip(phrases, location_labels)
    ]
    return data, locations

# Select the first samples of each class for evaluation
def select_samples(data, num_samples_per_class=10):
    """Group items by location, keeping the first few items of each.

    Selection is not random: items are taken in the order they appear in
    ``data``. Classes appear in the result in order of first occurrence, so
    the iteration order is the same on every run.

    Args:
        data: Iterable of dicts that each have a ``'location'`` key.
        num_samples_per_class: Maximum number of items kept per location.

    Returns:
        A dict mapping each location found in ``data`` to a list of at most
        ``num_samples_per_class`` items (the original dict objects).
    """
    selected_data = {}
    for item in data:
        # setdefault registers every class, even when the limit is zero.
        bucket = selected_data.setdefault(item['location'], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build a zero-shot prompt: the text followed by the classification request.

    ``classes_formatted`` is inserted verbatim as the list of candidate
    locations.
    """
    passage = f"{text}\n\n"
    request = f"Classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def chain_of_thought_prompt(text, classes_formatted):
    """Build a prompt that asks the model to think step by step.

    The text comes first, then the step-by-step classification request over
    ``classes_formatted`` (inserted verbatim), then the instruction to answer
    with the location name only.
    """
    passage = f"{text}\n\n"
    request = f"Think step by step to classify the above text into one of the following locations: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return passage + request + answer_format

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: Text the model should classify.
        examples: Iterable of ``(example_text, example_label)`` pairs, rendered
            in order as ``Text:`` / ``Label:`` blocks.
        classes_formatted: Candidate locations, inserted verbatim.
    """
    shots = "".join(
        f"Text: {ex_text}\nLabel: {ex_label}\n\n" for ex_text, ex_label in examples
    )
    question = (
        f"Now, classify the following text:\n"
        f"Text: {text}\n\n"
    )
    options = f"Choose the location from the following options: {classes_formatted}.\n"
    answer_format = f"Provide only the location name as your answer."
    return f"{shots}" + question + options + answer_format

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the cost of one request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of generated tokens.
        model_name: Either a display name from the price table
            (``"Llama 3.2 1B"``) or an API-style identifier
            (``"Meta-Llama-3.2-1B-Instruct"``). Both spellings are reduced to
            the same lookup key.

    Returns:
        Input cost plus output cost. Models missing from the table are priced
        at the default ``(0.10, 0.20)`` per million tokens.
    """
    # (input price, output price) per million tokens.
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }

    def normalize(name):
        # Lower-case, treat "-" and "_" as spaces, and drop the "meta" and
        # "instruct" decorations so API ids match the table's display names.
        words = name.lower().replace("-", " ").replace("_", " ").split()
        return " ".join(w for w in words if w not in ("meta", "instruct"))

    prices = {normalize(name): price for name, price in cost_per_million.items()}
    input_price, output_price = prices.get(normalize(model_name), (0.10, 0.20))
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure accuracy and estimated cost of one model / prompting method.

    Each selected item is truncated, wrapped in the requested prompt and sent
    to the chat-completions endpoint through the module-level ``client``. A
    prediction is correct only when the whole response, stripped and
    lower-cased, equals the lower-cased class name.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Iterable of class names offered to the model.
        model_name: Model id passed to the API and to ``estimate_cost``.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or ``'few-shot'``.
        num_samples_per_class: Maximum number of items evaluated per class.
        token_budget: Prompt limit in the unit ``count_tokens`` measures.
        delay: Seconds to sleep after every attempted sample.
        max_retries: Number of retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``. Samples whose request never succeeds still
        count in the denominator (as incorrect); samples skipped for exceeding
        the budget do not.

    Raises:
        ValueError: If ``prompting_method`` is not one of the known methods.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item['text'], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so the item under evaluation can be
                # dropped: showing it with its label would leak the answer.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex['text'], ex['location'])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Guard against streamed chunks that carry no choices
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    # The server words this as "maximum supported sequence
                    # length", so match on the shared "sequence length" part.
                    if "sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        break  # Resending the same oversized prompt cannot succeed
                    retries += 1

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Write one result row to ``file_path``, creating the CSV when needed.

    A new file gets a header line followed by the row; an existing file only
    gets the row appended.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.
    """
    row = {
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }
    frame = pd.DataFrame([row])

    if not os.path.isfile(file_path):
        # First write: include the header line
        frame.to_csv(file_path, index=False)
        return

    # The file is already there: add the row without repeating the header
    frame.to_csv(file_path, mode='a', header=False, index=False)

# Experiment configuration for this run
model_name = "Meta-Llama-3.2-3B-Instruct"  # Change to the desired model
prompting_method = "direct"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate (evaluate_model runs with its default sampling, budget and retry settings)
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(data, locations, model_name, prompting_method)

# Print and save results (both figures are rounded to two decimals for display only)
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# One CSV row is written per run; the row is appended when the file already exists
results_file = 'nyt_location_result'
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
API Error: Requested generation length 1 is not possible! The provided prompt is 4445 tokens long, so generating 1 tokens requires a sequence length of 4446, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4445 tokens long, so generating 1 tokens requires a sequence length of 4446, but the maximum supported sequence length is just 4096!
Rate limit exceeded. Sleeping for 30 seconds...
API Error: Requested generation length 1 is not possible! The provided prompt is 4445 tokens long, so generating 1 tokens requires a sequence length of 4446, but the maximum supported sequence length is just 4096!
Rate limit exceeded. Sleeping for 30 seconds...
Model: Meta-Llama-3.2-3B-Instruct, Prompting Method: direct
Accuracy: 

In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# OpenAI-compatible client pointed at the SambaNova endpoint; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when it is unset)
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text`` by counting whitespace-separated words.

    This is only a word count; a model tokenizer may report a different
    (typically larger) number for the same text.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Cut ``text`` down to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated; the result is re-joined with single
    spaces, so runs of whitespace are collapsed even when nothing is cut.

    Args:
        text: Text to truncate.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of the prompt.

    Returns:
        The leading words of ``text`` that fit, or ``""`` when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit used as a slice bound would drop words
    # from the END of the text instead of yielding nothing.
    max_input_tokens = max(max_tokens - reserve_tokens, 0)
    return " ".join(text.split()[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Read the phrase, location-name and location-label files of the dataset.

    ``phrase_text.txt`` holds one text per line, ``locations.txt`` one class
    name per line, and ``locations_label.txt`` one integer per line that
    indexes into the class names (by line order) for the matching text.

    Args:
        data_dir: Directory containing the three files.

    Returns:
        ``(data, locations)`` where ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts and ``locations`` is the list
        of class names. A label with no matching class name maps to "Unknown".
    """
    def _stripped_lines(file_name):
        # Read one dataset file as a list of whitespace-stripped lines
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle]

    phrases = _stripped_lines('phrase_text.txt')
    locations = _stripped_lines('locations.txt')
    location_labels = [int(entry) for entry in _stripped_lines('locations_label.txt')]

    # Label value -> class name, following the line order of locations.txt
    name_by_index = dict(enumerate(locations))

    data = [
        {
            'text': phrase,
            'location': name_by_index.get(location_labels[idx], "Unknown"),
        }
        for idx, phrase in enumerate(phrases)
    ]
    return data, locations

# Select the first N samples of each class for evaluation (deterministic, not random)
def select_samples(data, num_samples_per_class=10):
    """Keep up to ``num_samples_per_class`` items of every class, in data order.

    Selection is deterministic, not random: the first items encountered for
    each class are kept. Classes appear in the result in order of first
    appearance in ``data`` so that repeated runs visit them identically
    (iterating a ``set`` of strings would vary with hash randomization).

    Args:
        data: Iterable of dicts that each carry a ``'location'`` key.
        num_samples_per_class: Maximum number of items kept per class.

    Returns:
        Dict mapping each location seen in ``data`` to its list of kept items.
        Every class gets a key, even when the limit keeps its list empty.
    """
    selected_data = {}
    for item in data:
        bucket = selected_data.setdefault(item['location'], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the zero-shot prompt: the text followed by the classification request.

    Args:
        text: Text to classify.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string.
    """
    template = (
        "{}\n\n"
        "Classify the above text into one of the following locations: {}.\n"
        "Provide only the location name as your answer."
    )
    return template.format(text, classes_formatted)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the "think step by step" variant of the classification prompt.

    Args:
        text: Text to classify.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string. It still asks for only the location name as the
        answer.
    """
    template = (
        "{}\n\n"
        "Think step by step to classify the above text into one of the following locations: {}.\n"
        "Provide only the location name as your answer."
    )
    return template.format(text, classes_formatted)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: Text to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string: one "Text/Label" stanza per example, then the query.
    """
    shots = "".join(
        "Text: {}\nLabel: {}\n\n".format(sample, label)
        for sample, label in examples
    )
    return (
        shots
        + "Now, classify the following text:\n"
        + "Text: {}\n\n".format(text)
        + "Choose the location from the following options: {}.\n".format(classes_formatted)
        + "Provide only the location name as your answer."
    )

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the dollar cost of one request from its token counts.

    The price table is keyed by display names ("Llama 3.2 3B"), while callers
    pass API model ids ("Meta-Llama-3.2-3B-Instruct"). Names are therefore
    normalized before the lookup; without that every API id would silently
    fall back to the default price.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of generated tokens.
        model_name: Display name or API id of the model.

    Returns:
        Estimated cost in dollars. Unknown models are priced at the default
        rate of (0.10, 0.20) dollars per million input/output tokens.
    """
    # (input price, output price) in dollars per million tokens
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }
    default_prices = (0.10, 0.20)

    def _normalize(name):
        # "Meta-Llama-3.2-3B-Instruct" and "Llama 3.2 3B" both become "llama 3.2 3b"
        words = name.lower().replace("-", " ").replace("_", " ").split()
        return " ".join(w for w in words if w not in ("meta", "instruct"))

    prices_by_key = {
        _normalize(label): prices for label, prices in cost_per_million.items()
    }
    input_price, output_price = prices_by_key.get(
        _normalize(str(model_name)), default_prices
    )
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure accuracy and estimated cost of one model / prompting method.

    Each selected item is truncated, wrapped in the requested prompt and sent
    to the chat-completions endpoint through the module-level ``client``. A
    prediction is correct only when the whole response, stripped and
    lower-cased, equals the lower-cased class name.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Iterable of class names offered to the model.
        model_name: Model id passed to the API and to ``estimate_cost``.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or ``'few-shot'``.
        num_samples_per_class: Maximum number of items evaluated per class.
        token_budget: Prompt limit in the unit ``count_tokens`` measures.
        delay: Seconds to sleep after every attempted sample.
        max_retries: Number of retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``. Samples whose request never succeeds still
        count in the denominator (as incorrect); samples skipped for exceeding
        the budget do not.

    Raises:
        ValueError: If ``prompting_method`` is not one of the known methods.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item['text'], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so the item under evaluation can be
                # dropped: showing it with its label would leak the answer.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex['text'], ex['location'])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Guard against streamed chunks that carry no choices
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    # The server words this as "maximum supported sequence
                    # length", so match on the shared "sequence length" part.
                    if "sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        break  # Resending the same oversized prompt cannot succeed
                    retries += 1

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Write one result row to ``file_path``, creating the CSV when needed.

    A new file gets a header line followed by the row; an existing file only
    gets the row appended.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.
    """
    row = {
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }
    frame = pd.DataFrame([row])

    if not os.path.isfile(file_path):
        # First write: include the header line
        frame.to_csv(file_path, index=False)
        return

    # The file is already there: add the row without repeating the header
    frame.to_csv(file_path, mode='a', header=False, index=False)

# Experiment configuration for this run
model_name = "Meta-Llama-3.2-3B-Instruct"  # Change to the desired model
prompting_method = "chain-of-thought"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate (evaluate_model runs with its default sampling, budget and retry settings)
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(data, locations, model_name, prompting_method)

# Print and save results (both figures are rounded to two decimals for display only)
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# One CSV row is written per run; the row is appended when the file already exists
results_file = 'nyt_location_result'
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

Rate limit exceeded. Sleeping for 30 seconds...
API Error: Requested generation length 1 is not possible! The provided prompt is 4449 tokens long, so generating 1 tokens requires a sequence length of 4450, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4449 tokens long, so generating 1 tokens requires a sequence length of 4450, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4449 tokens long, so generating 1 tokens requires a sequence length of 4450, but the maximum supported sequence length is just 4096!
Rate limit exceeded. Sleeping for 30 seconds...
Rate limit exceeded. Sleeping for 30 seconds...
Model: Meta-Llama-3.2-3B-Instruct, Prompting Method: chain-of-thought
Accuracy: 0.78, Cost: $0.01
Results saved to nyt_location_result


In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# OpenAI-compatible client pointed at the SambaNova endpoint; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when it is unset)
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text`` by counting whitespace-separated words.

    This is only a word count; a model tokenizer may report a different
    (typically larger) number for the same text.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Cut ``text`` down to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated; the result is re-joined with single
    spaces, so runs of whitespace are collapsed even when nothing is cut.

    Args:
        text: Text to truncate.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of the prompt.

    Returns:
        The leading words of ``text`` that fit, or ``""`` when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit used as a slice bound would drop words
    # from the END of the text instead of yielding nothing.
    max_input_tokens = max(max_tokens - reserve_tokens, 0)
    return " ".join(text.split()[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Read the phrase, location-name and location-label files of the dataset.

    ``phrase_text.txt`` holds one text per line, ``locations.txt`` one class
    name per line, and ``locations_label.txt`` one integer per line that
    indexes into the class names (by line order) for the matching text.

    Args:
        data_dir: Directory containing the three files.

    Returns:
        ``(data, locations)`` where ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts and ``locations`` is the list
        of class names. A label with no matching class name maps to "Unknown".
    """
    def _stripped_lines(file_name):
        # Read one dataset file as a list of whitespace-stripped lines
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle]

    phrases = _stripped_lines('phrase_text.txt')
    locations = _stripped_lines('locations.txt')
    location_labels = [int(entry) for entry in _stripped_lines('locations_label.txt')]

    # Label value -> class name, following the line order of locations.txt
    name_by_index = dict(enumerate(locations))

    data = [
        {
            'text': phrase,
            'location': name_by_index.get(location_labels[idx], "Unknown"),
        }
        for idx, phrase in enumerate(phrases)
    ]
    return data, locations

# Select the first N samples of each class for evaluation (deterministic, not random)
def select_samples(data, num_samples_per_class=10):
    """Keep up to ``num_samples_per_class`` items of every class, in data order.

    Selection is deterministic, not random: the first items encountered for
    each class are kept. Classes appear in the result in order of first
    appearance in ``data`` so that repeated runs visit them identically
    (iterating a ``set`` of strings would vary with hash randomization).

    Args:
        data: Iterable of dicts that each carry a ``'location'`` key.
        num_samples_per_class: Maximum number of items kept per class.

    Returns:
        Dict mapping each location seen in ``data`` to its list of kept items.
        Every class gets a key, even when the limit keeps its list empty.
    """
    selected_data = {}
    for item in data:
        bucket = selected_data.setdefault(item['location'], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the zero-shot prompt: the text followed by the classification request.

    Args:
        text: Text to classify.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string.
    """
    template = (
        "{}\n\n"
        "Classify the above text into one of the following locations: {}.\n"
        "Provide only the location name as your answer."
    )
    return template.format(text, classes_formatted)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the "think step by step" variant of the classification prompt.

    Args:
        text: Text to classify.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string. It still asks for only the location name as the
        answer.
    """
    template = (
        "{}\n\n"
        "Think step by step to classify the above text into one of the following locations: {}.\n"
        "Provide only the location name as your answer."
    )
    return template.format(text, classes_formatted)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: Text to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string: one "Text/Label" stanza per example, then the query.
    """
    shots = "".join(
        "Text: {}\nLabel: {}\n\n".format(sample, label)
        for sample, label in examples
    )
    return (
        shots
        + "Now, classify the following text:\n"
        + "Text: {}\n\n".format(text)
        + "Choose the location from the following options: {}.\n".format(classes_formatted)
        + "Provide only the location name as your answer."
    )

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the dollar cost of one request from its token counts.

    The price table is keyed by display names ("Llama 3.2 3B"), while callers
    pass API model ids ("Meta-Llama-3.2-3B-Instruct"). Names are therefore
    normalized before the lookup; without that every API id would silently
    fall back to the default price.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of generated tokens.
        model_name: Display name or API id of the model.

    Returns:
        Estimated cost in dollars. Unknown models are priced at the default
        rate of (0.10, 0.20) dollars per million input/output tokens.
    """
    # (input price, output price) in dollars per million tokens
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }
    default_prices = (0.10, 0.20)

    def _normalize(name):
        # "Meta-Llama-3.2-3B-Instruct" and "Llama 3.2 3B" both become "llama 3.2 3b"
        words = name.lower().replace("-", " ").replace("_", " ").split()
        return " ".join(w for w in words if w not in ("meta", "instruct"))

    prices_by_key = {
        _normalize(label): prices for label, prices in cost_per_million.items()
    }
    input_price, output_price = prices_by_key.get(
        _normalize(str(model_name)), default_prices
    )
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure accuracy and estimated cost of one model / prompting method.

    Each selected item is truncated, wrapped in the requested prompt and sent
    to the chat-completions endpoint through the module-level ``client``. A
    prediction is correct only when the whole response, stripped and
    lower-cased, equals the lower-cased class name.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Iterable of class names offered to the model.
        model_name: Model id passed to the API and to ``estimate_cost``.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or ``'few-shot'``.
        num_samples_per_class: Maximum number of items evaluated per class.
        token_budget: Prompt limit in the unit ``count_tokens`` measures.
        delay: Seconds to sleep after every attempted sample.
        max_retries: Number of retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``. Samples whose request never succeeds still
        count in the denominator (as incorrect); samples skipped for exceeding
        the budget do not.

    Raises:
        ValueError: If ``prompting_method`` is not one of the known methods.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item['text'], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so the item under evaluation can be
                # dropped: showing it with its label would leak the answer.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex['text'], ex['location'])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Guard against streamed chunks that carry no choices
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    # The server words this as "maximum supported sequence
                    # length", so match on the shared "sequence length" part.
                    if "sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        break  # Resending the same oversized prompt cannot succeed
                    retries += 1

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Write one result row to ``file_path``, creating the CSV when needed.

    A new file gets a header line followed by the row; an existing file only
    gets the row appended.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.
    """
    row = {
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }
    frame = pd.DataFrame([row])

    if not os.path.isfile(file_path):
        # First write: include the header line
        frame.to_csv(file_path, index=False)
        return

    # The file is already there: add the row without repeating the header
    frame.to_csv(file_path, mode='a', header=False, index=False)

# Experiment configuration for this run
model_name = "Meta-Llama-3.2-3B-Instruct"   # Change to the desired model
prompting_method = "few-shot"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate (evaluate_model runs with its default sampling, budget and retry settings)
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(data, locations, model_name, prompting_method)

# Print and save results (both figures are rounded to two decimals for display only)
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# One CSV row is written per run; the row is appended when the file already exists
results_file = 'nyt_location_result'
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

API Error: Requested generation length 1 is not possible! The provided prompt is 4213 tokens long, so generating 1 tokens requires a sequence length of 4214, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4213 tokens long, so generating 1 tokens requires a sequence length of 4214, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4213 tokens long, so generating 1 tokens requires a sequence length of 4214, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4213 tokens long, so generating 1 tokens requires a sequence length of 4214, but the maximum supported sequence length is just 4096!
API Error: Requested generation length 1 is not possible! The provided prompt is 4310 tokens long, so generating 1 tokens requires a sequence le

In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# OpenAI-compatible client pointed at the SambaNova endpoint; the API key is
# read from the SAMBANOVA_API_KEY environment variable (None when it is unset).
# The keyword must be spelled `api_key`: the constructor does not accept the
# previous misspelling `aapi_key`.
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of ``text`` by counting whitespace-separated words.

    This is only a word count; a model tokenizer may report a different
    (typically larger) number for the same text.
    """
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Cut ``text`` down to at most ``max_tokens - reserve_tokens`` words.

    Words are whitespace-separated; the result is re-joined with single
    spaces, so runs of whitespace are collapsed even when nothing is cut.

    Args:
        text: Text to truncate.
        max_tokens: Total word budget.
        reserve_tokens: Words held back from the budget for the rest of the prompt.

    Returns:
        The leading words of ``text`` that fit, or ``""`` when the reserve
        uses up the whole budget.
    """
    # Clamp at zero: a negative limit used as a slice bound would drop words
    # from the END of the text instead of yielding nothing.
    max_input_tokens = max(max_tokens - reserve_tokens, 0)
    return " ".join(text.split()[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Read the phrase, location-name and location-label files of the dataset.

    ``phrase_text.txt`` holds one text per line, ``locations.txt`` one class
    name per line, and ``locations_label.txt`` one integer per line that
    indexes into the class names (by line order) for the matching text.

    Args:
        data_dir: Directory containing the three files.

    Returns:
        ``(data, locations)`` where ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts and ``locations`` is the list
        of class names. A label with no matching class name maps to "Unknown".
    """
    def _stripped_lines(file_name):
        # Read one dataset file as a list of whitespace-stripped lines
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle]

    phrases = _stripped_lines('phrase_text.txt')
    locations = _stripped_lines('locations.txt')
    location_labels = [int(entry) for entry in _stripped_lines('locations_label.txt')]

    # Label value -> class name, following the line order of locations.txt
    name_by_index = dict(enumerate(locations))

    data = [
        {
            'text': phrase,
            'location': name_by_index.get(location_labels[idx], "Unknown"),
        }
        for idx, phrase in enumerate(phrases)
    ]
    return data, locations

# Select the first N samples of each class for evaluation (deterministic, not random)
def select_samples(data, num_samples_per_class=10):
    """Keep up to ``num_samples_per_class`` items of every class, in data order.

    Selection is deterministic, not random: the first items encountered for
    each class are kept. Classes appear in the result in order of first
    appearance in ``data`` so that repeated runs visit them identically
    (iterating a ``set`` of strings would vary with hash randomization).

    Args:
        data: Iterable of dicts that each carry a ``'location'`` key.
        num_samples_per_class: Maximum number of items kept per class.

    Returns:
        Dict mapping each location seen in ``data`` to its list of kept items.
        Every class gets a key, even when the limit keeps its list empty.
    """
    selected_data = {}
    for item in data:
        bucket = selected_data.setdefault(item['location'], [])
        if len(bucket) < num_samples_per_class:
            bucket.append(item)
    return selected_data

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build the zero-shot prompt: the text followed by the classification request.

    Args:
        text: Text to classify.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string.
    """
    template = (
        "{}\n\n"
        "Classify the above text into one of the following locations: {}.\n"
        "Provide only the location name as your answer."
    )
    return template.format(text, classes_formatted)

def chain_of_thought_prompt(text, classes_formatted):
    """Build the "think step by step" variant of the classification prompt.

    Args:
        text: Text to classify.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string. It still asks for only the location name as the
        answer.
    """
    template = (
        "{}\n\n"
        "Think step by step to classify the above text into one of the following locations: {}.\n"
        "Provide only the location name as your answer."
    )
    return template.format(text, classes_formatted)

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: Text to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs.
        classes_formatted: Candidate locations, already joined into one string.

    Returns:
        The prompt string: one "Text/Label" stanza per example, then the query.
    """
    shots = "".join(
        "Text: {}\nLabel: {}\n\n".format(sample, label)
        for sample, label in examples
    )
    return (
        shots
        + "Now, classify the following text:\n"
        + "Text: {}\n\n".format(text)
        + "Choose the location from the following options: {}.\n".format(classes_formatted)
        + "Provide only the location name as your answer."
    )

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the dollar cost of one request from its token counts.

    The price table is keyed by display names ("Llama 3.2 3B"), while callers
    pass API model ids ("Meta-Llama-3.2-3B-Instruct"). Names are therefore
    normalized before the lookup; without that every API id would silently
    fall back to the default price.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of generated tokens.
        model_name: Display name or API id of the model.

    Returns:
        Estimated cost in dollars. Unknown models are priced at the default
        rate of (0.10, 0.20) dollars per million input/output tokens.
    """
    # (input price, output price) in dollars per million tokens
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }
    default_prices = (0.10, 0.20)

    def _normalize(name):
        # "Meta-Llama-3.2-3B-Instruct" and "Llama 3.2 3B" both become "llama 3.2 3b"
        words = name.lower().replace("-", " ").replace("_", " ").split()
        return " ".join(w for w in words if w not in ("meta", "instruct"))

    prices_by_key = {
        _normalize(label): prices for label, prices in cost_per_million.items()
    }
    input_price, output_price = prices_by_key.get(
        _normalize(str(model_name)), default_prices
    )
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure accuracy and estimated cost of one model / prompting method.

    Each selected item is truncated, wrapped in the requested prompt and sent
    to the chat-completions endpoint through the module-level ``client``. A
    prediction is correct only when the whole response, stripped and
    lower-cased, equals the lower-cased class name.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Iterable of class names offered to the model.
        model_name: Model id passed to the API and to ``estimate_cost``.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or ``'few-shot'``.
        num_samples_per_class: Maximum number of items evaluated per class.
        token_budget: Prompt limit in the unit ``count_tokens`` measures.
        delay: Seconds to sleep after every attempted sample.
        max_retries: Number of retries allowed after the first failed attempt.

    Returns:
        ``(accuracy, total_cost)``. Samples whose request never succeeds still
        count in the denominator (as incorrect); samples skipped for exceeding
        the budget do not.

    Raises:
        ValueError: If ``prompting_method`` is not one of the known methods.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Truncate text to fit within token budget
            text = truncate_text(item['text'], token_budget, reserve_tokens=100)

            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so the item under evaluation can be
                # dropped: showing it with its label would leak the answer.
                candidates = random.sample(data, min(4, len(data)))
                examples = [
                    (ex['text'], ex['location'])
                    for ex in candidates
                    if ex is not item
                ][:3]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Guard against streamed chunks that carry no choices
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    # The server words this as "maximum supported sequence
                    # length", so match on the shared "sequence length" part.
                    if "sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        break  # Resending the same oversized prompt cannot succeed
                    retries += 1

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Write one result row to ``file_path``, creating the CSV when needed.

    A new file gets a header line followed by the row; an existing file only
    gets the row appended.

    Args:
        file_path: Destination CSV path.
        model_name: Value for the ``model`` column.
        prompting_method: Value for the ``prompting_method`` column.
        accuracy: Value for the ``accuracy`` column.
        cost: Value for the ``cost`` column.
    """
    row = {
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }
    frame = pd.DataFrame([row])

    if not os.path.isfile(file_path):
        # First write: include the header line
        frame.to_csv(file_path, index=False)
        return

    # The file is already there: add the row without repeating the header
    frame.to_csv(file_path, mode='a', header=False, index=False)

# Experiment configuration for this run
model_name = "Meta-Llama-3.1-8B-Instruct"  # Change to the desired model
prompting_method = "direct"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate (evaluate_model runs with its default sampling, budget and retry settings)
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(data, locations, model_name, prompting_method)

# Print and save results (both figures are rounded to two decimals for display only)
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# One CSV row is written per run; the row is appended when the file already exists
results_file = 'nyt_location_result'
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

Model: Meta-Llama-3.1-8B-Instruct, Prompting Method: direct
Accuracy: 0.87, Cost: $0.01
Results saved to nyt_location_result


In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize OpenAI client
# The OpenAI SDK client is pointed at the SambaNova endpoint through base_url.
# The key is read from the SAMBANOVA_API_KEY environment variable;
# os.environ.get yields None when that variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of *text* as its number of whitespace-separated words."""
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Trim *text* to at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Input string; it is split on whitespace to obtain "tokens".
        max_tokens: Total token budget.
        reserve_tokens: Part of the budget that is kept free and therefore
            subtracted from ``max_tokens``.

    Returns:
        The kept words joined by single spaces (runs of whitespace in the
        input are collapsed). An empty string when the reserve uses up the
        whole budget.
    """
    tokens = text.split()
    # Clamp at zero: a negative limit would make the slice below count from
    # the END of the list and keep most of the text instead of none of it.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    # Slicing past the end is harmless, so no length check is needed.
    return " ".join(tokens[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Read the NYT phrase and location files found in *data_dir*.

    Three UTF-8 text files are read, one record per line:
    ``phrase_text.txt`` (the texts), ``locations.txt`` (location names, whose
    line index serves as the label id) and ``locations_label.txt`` (one
    integer label per phrase).

    Returns:
        ``(data, locations)`` where ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts in phrase order and
        ``locations`` is the list of location names. A label with no matching
        line in ``locations.txt`` is reported as ``"Unknown"``.
    """
    def _read_lines(filename):
        # Every line is stripped of surrounding whitespace, newline included.
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle]

    phrases = _read_lines('phrase_text.txt')
    locations = _read_lines('locations.txt')
    location_labels = [int(raw) for raw in _read_lines('locations_label.txt')]

    # Label id -> location name, by position in locations.txt
    name_by_label = dict(enumerate(locations))

    data = [
        {
            'text': phrase,
            'location': name_by_label.get(location_labels[idx], "Unknown"),
        }
        for idx, phrase in enumerate(phrases)
    ]

    return data, locations

# Select up to N samples per class, taking the first ones in dataset order
def select_samples(data, num_samples_per_class=10):
    """Group items by location, keeping at most *num_samples_per_class* per group.

    No shuffling is done: for every location the items kept are the first
    ones encountered while walking *data* in order.

    Returns:
        A dict mapping each location found in *data* to its list of kept items
        (the very same dict objects that are in *data*).
    """
    buckets = {label: [] for label in set(entry['location'] for entry in data)}

    for entry in data:
        bucket = buckets[entry['location']]
        if len(bucket) < num_samples_per_class:
            bucket.append(entry)

    return buckets

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build a zero-shot prompt: the text first, then the classification instruction.

    Args:
        text: The passage to classify.
        classes_formatted: The candidate locations, already joined into one string.
    """
    instruction = (
        f"Classify the above text into one of the following locations: {classes_formatted}.\n"
        "Provide only the location name as your answer."
    )
    return f"{text}\n\n{instruction}"

def chain_of_thought_prompt(text, classes_formatted):
    """Build a prompt that asks the model to think step by step before classifying.

    The instruction still requests only the location name as the answer.

    Args:
        text: The passage to classify.
        classes_formatted: The candidate locations, already joined into one string.
    """
    instruction = (
        f"Think step by step to classify the above text into one of the following locations: {classes_formatted}.\n"
        "Provide only the location name as your answer."
    )
    return f"{text}\n\n{instruction}"

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: The passage to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs, rendered
            in order as ``Text:`` / ``Label:`` blocks.
        classes_formatted: The candidate locations, already joined into one string.
    """
    demonstrations = "".join(
        f"Text: {ex_text}\nLabel: {ex_label}\n\n" for ex_text, ex_label in examples
    )
    query = (
        "Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the location from the following options: {classes_formatted}.\n"
        "Provide only the location name as your answer."
    )
    return demonstrations + query

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the dollar cost of a request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        model_name: Either a display name from the price table
            (``"Llama 3.1 70B"``) or the matching API identifier
            (``"Meta-Llama-3.1-70B-Instruct"``).

    Returns:
        Input cost plus output cost. Models that are not in the table are
        priced at the fallback rate of (0.10, 0.20) per million tokens.
    """
    # (input price, output price) in dollars per one million tokens
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }

    def _canonical(name):
        # Callers pass API ids, not the display names used as table keys, so an
        # exact lookup would always miss and silently charge the fallback price.
        # Both "Meta-Llama-3.1-8B-Instruct" and "Llama 3.1 8B" become "llama 3.1 8b".
        words = str(name).lower().replace('-', ' ').replace('_', ' ').split()
        return " ".join(word for word in words if word not in ('meta', 'instruct'))

    prices_by_key = {_canonical(label): prices for label, prices in cost_per_million.items()}
    input_price, output_price = prices_by_key.get(_canonical(model_name), (0.10, 0.20))
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure classification accuracy and estimated cost for one model and prompting method.

    For every class, the items chosen by ``select_samples`` are turned into a
    prompt, sent through the module-level ``client``, and the reply is compared
    case-insensitively with the true location name.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Location names offered to the model as answer options.
        model_name: Model identifier sent to the API and used for pricing.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or ``'few-shot'``.
        num_samples_per_class: Maximum number of evaluated items per class.
        token_budget: Maximum prompt size, in whitespace-separated words.
        delay: Seconds to sleep after each evaluated sample.
        max_retries: Number of extra attempts after a retryable API error.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is correct / counted samples (0
        when nothing was counted). Samples skipped for exceeding the budget are
        not counted; samples whose request ultimately failed are counted as
        incorrect.

    Raises:
        ValueError: If ``prompting_method`` is not one of the known methods.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)
    num_shots = 3  # demonstrations shown in a few-shot prompt

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                text = truncate_text(item['text'], token_budget, reserve_tokens=100)
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                text = truncate_text(item['text'], token_budget, reserve_tokens=100)
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so that the item under evaluation can
                # be dropped: showing it as a demonstration would leak its label.
                candidates = random.sample(data, min(num_shots + 1, len(data)))
                shots = [ex for ex in candidates if ex is not item][:num_shots]

                # Split the budget evenly between the query and the examples.
                # Untruncated examples push the prompt over the budget and the
                # sample is then skipped instead of evaluated.
                share = token_budget // (len(shots) + 1)
                reserve = min(100, share)  # keeps the word limit non-negative
                text = truncate_text(item['text'], share, reserve_tokens=reserve)
                examples = [
                    (truncate_text(ex['text'], share, reserve_tokens=reserve), ex['location'])
                    for ex in shots
                ]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Chunks with an empty ``choices`` list are ignored rather
                    # than indexed, which would raise IndexError.
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                # RateLimitError must be handled before the broader APIError.
                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    if "maximum sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        break  # the same prompt would fail again; do not retry
                    retries += 1

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one evaluation result as a row of the CSV file at *file_path*.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single well-formed table.

    Args:
        file_path: Destination path of the CSV file.
        model_name: Value stored in the ``model`` column.
        prompting_method: Value stored in the ``prompting_method`` column.
        accuracy: Value stored in the ``accuracy`` column.
        cost: Value stored in the ``cost`` column.
    """
    # Create a one-row DataFrame with the current results
    result = pd.DataFrame([{
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }])

    # An existing but zero-byte file still needs the header; checking only for
    # existence would append header-less rows to it.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    # Append mode also creates the file when it is missing.
    result.to_csv(file_path, mode='a', header=needs_header, index=False)

# Define your model and method here
# model_name is used both as the API model id and as the pricing lookup key.
model_name = "Meta-Llama-3.1-8B-Instruct"  # Change to the desired model
prompting_method = "chain-of-thought"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate
# NOTE: evaluate_model sends one API request per selected sample and sleeps
# after each one (delay defaults to 2 seconds), so this step takes a while.
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(data, locations, model_name, prompting_method)

# Print and save results
# Both figures are shown rounded to two decimals.
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# The file name has no extension, but its content is CSV; when the file
# already exists the new row is appended to it.
results_file = 'nyt_location_result'
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

Model: Meta-Llama-3.1-8B-Instruct, Prompting Method: chain-of-thought
Accuracy: 0.88, Cost: $0.01
Results saved to nyt_location_result


In [None]:
import os
import random
import time
import pandas as pd
import openai
from openai import RateLimitError, APIError

# Initialize OpenAI client
# The OpenAI SDK client is pointed at the SambaNova endpoint through base_url.
# The key is read from the SAMBANOVA_API_KEY environment variable;
# os.environ.get yields None when that variable is not set.
client = openai.OpenAI(
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
    base_url="https://api.sambanova.ai/v1",
)

# Token counting using a simple string split method
def count_tokens(text):
    """Approximate the token count of *text* as its number of whitespace-separated words."""
    words = text.split()
    return len(words)

# Truncate text to fit within a specified token budget
def truncate_text(text, max_tokens, reserve_tokens=50):
    """Trim *text* to at most ``max_tokens - reserve_tokens`` whitespace-separated words.

    Args:
        text: Input string; it is split on whitespace to obtain "tokens".
        max_tokens: Total token budget.
        reserve_tokens: Part of the budget that is kept free and therefore
            subtracted from ``max_tokens``.

    Returns:
        The kept words joined by single spaces (runs of whitespace in the
        input are collapsed). An empty string when the reserve uses up the
        whole budget.
    """
    tokens = text.split()
    # Clamp at zero: a negative limit would make the slice below count from
    # the END of the list and keep most of the text instead of none of it.
    max_input_tokens = max(0, max_tokens - reserve_tokens)
    # Slicing past the end is harmless, so no length check is needed.
    return " ".join(tokens[:max_input_tokens])

# Load the NYT dataset
def load_nyt_dataset(data_dir='data/nyt_data'):
    """Read the NYT phrase and location files found in *data_dir*.

    Three UTF-8 text files are read, one record per line:
    ``phrase_text.txt`` (the texts), ``locations.txt`` (location names, whose
    line index serves as the label id) and ``locations_label.txt`` (one
    integer label per phrase).

    Returns:
        ``(data, locations)`` where ``data`` is a list of
        ``{'text': ..., 'location': ...}`` dicts in phrase order and
        ``locations`` is the list of location names. A label with no matching
        line in ``locations.txt`` is reported as ``"Unknown"``.
    """
    def _read_lines(filename):
        # Every line is stripped of surrounding whitespace, newline included.
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle]

    phrases = _read_lines('phrase_text.txt')
    locations = _read_lines('locations.txt')
    location_labels = [int(raw) for raw in _read_lines('locations_label.txt')]

    # Label id -> location name, by position in locations.txt
    name_by_label = dict(enumerate(locations))

    data = [
        {
            'text': phrase,
            'location': name_by_label.get(location_labels[idx], "Unknown"),
        }
        for idx, phrase in enumerate(phrases)
    ]

    return data, locations

# Select up to N samples per class, taking the first ones in dataset order
def select_samples(data, num_samples_per_class=10):
    """Group items by location, keeping at most *num_samples_per_class* per group.

    No shuffling is done: for every location the items kept are the first
    ones encountered while walking *data* in order.

    Returns:
        A dict mapping each location found in *data* to its list of kept items
        (the very same dict objects that are in *data*).
    """
    buckets = {label: [] for label in set(entry['location'] for entry in data)}

    for entry in data:
        bucket = buckets[entry['location']]
        if len(bucket) < num_samples_per_class:
            bucket.append(entry)

    return buckets

# Prompting methods
def direct_prompt(text, classes_formatted):
    """Build a zero-shot prompt: the text first, then the classification instruction.

    Args:
        text: The passage to classify.
        classes_formatted: The candidate locations, already joined into one string.
    """
    instruction = (
        f"Classify the above text into one of the following locations: {classes_formatted}.\n"
        "Provide only the location name as your answer."
    )
    return f"{text}\n\n{instruction}"

def chain_of_thought_prompt(text, classes_formatted):
    """Build a prompt that asks the model to think step by step before classifying.

    The instruction still requests only the location name as the answer.

    Args:
        text: The passage to classify.
        classes_formatted: The candidate locations, already joined into one string.
    """
    instruction = (
        f"Think step by step to classify the above text into one of the following locations: {classes_formatted}.\n"
        "Provide only the location name as your answer."
    )
    return f"{text}\n\n{instruction}"

def few_shot_prompt(text, examples, classes_formatted):
    """Build a prompt that shows labelled examples before the text to classify.

    Args:
        text: The passage to classify.
        examples: Iterable of ``(example_text, example_label)`` pairs, rendered
            in order as ``Text:`` / ``Label:`` blocks.
        classes_formatted: The candidate locations, already joined into one string.
    """
    demonstrations = "".join(
        f"Text: {ex_text}\nLabel: {ex_label}\n\n" for ex_text, ex_label in examples
    )
    query = (
        "Now, classify the following text:\n"
        f"Text: {text}\n\n"
        f"Choose the location from the following options: {classes_formatted}.\n"
        "Provide only the location name as your answer."
    )
    return demonstrations + query

# Cost estimation with actual prices
def estimate_cost(input_tokens, output_tokens, model_name):
    """Estimate the dollar cost of a request from its token counts.

    Args:
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        model_name: Either a display name from the price table
            (``"Llama 3.1 70B"``) or the matching API identifier
            (``"Meta-Llama-3.1-70B-Instruct"``).

    Returns:
        Input cost plus output cost. Models that are not in the table are
        priced at the fallback rate of (0.10, 0.20) per million tokens.
    """
    # (input price, output price) in dollars per one million tokens
    cost_per_million = {
        'Llama 3.1 8B': (0.10, 0.20),
        'Llama 3.1 70B': (0.60, 1.20),
        'Llama 3.1 405B': (5.00, 10.00),
        'Llama 3.2 1B': (0.04, 0.08),
        'Llama 3.2 3B': (0.08, 0.16),
        'Llama 3.2 11B Vision': (0.15, 0.30),
        'Llama 3.2 90B Vision': (0.80, 1.60),
    }

    def _canonical(name):
        # Callers pass API ids, not the display names used as table keys, so an
        # exact lookup would always miss and silently charge the fallback price.
        # Both "Meta-Llama-3.1-8B-Instruct" and "Llama 3.1 8B" become "llama 3.1 8b".
        words = str(name).lower().replace('-', ' ').replace('_', ' ').split()
        return " ".join(word for word in words if word not in ('meta', 'instruct'))

    prices_by_key = {_canonical(label): prices for label, prices in cost_per_million.items()}
    input_price, output_price = prices_by_key.get(_canonical(model_name), (0.10, 0.20))
    total_input_cost = (input_tokens / 1e6) * input_price
    total_output_cost = (output_tokens / 1e6) * output_price
    return total_input_cost + total_output_cost

# Evaluate a single method and model
def evaluate_model(
    data, locations, model_name, prompting_method,
    num_samples_per_class=10, token_budget=4000, delay=2, max_retries=3
):
    """Measure classification accuracy and estimated cost for one model and prompting method.

    For every class, the items chosen by ``select_samples`` are turned into a
    prompt, sent through the module-level ``client``, and the reply is compared
    case-insensitively with the true location name.

    Args:
        data: List of ``{'text': ..., 'location': ...}`` dicts.
        locations: Location names offered to the model as answer options.
        model_name: Model identifier sent to the API and used for pricing.
        prompting_method: ``'direct'``, ``'chain-of-thought'`` or ``'few-shot'``.
        num_samples_per_class: Maximum number of evaluated items per class.
        token_budget: Maximum prompt size, in whitespace-separated words.
        delay: Seconds to sleep after each evaluated sample.
        max_retries: Number of extra attempts after a retryable API error.

    Returns:
        ``(accuracy, total_cost)``. Accuracy is correct / counted samples (0
        when nothing was counted). Samples skipped for exceeding the budget are
        not counted; samples whose request ultimately failed are counted as
        incorrect.

    Raises:
        ValueError: If ``prompting_method`` is not one of the known methods.
    """
    correct_predictions = 0
    total_cost = 0
    total_samples = 0
    classes_formatted = ", ".join(locations)
    num_shots = 3  # demonstrations shown in a few-shot prompt

    selected_data = select_samples(data, num_samples_per_class)

    for cls, samples in selected_data.items():
        for item in samples:
            # Create prompt based on the chosen prompting method
            if prompting_method == 'direct':
                text = truncate_text(item['text'], token_budget, reserve_tokens=100)
                prompt = direct_prompt(text, classes_formatted)
            elif prompting_method == 'chain-of-thought':
                text = truncate_text(item['text'], token_budget, reserve_tokens=100)
                prompt = chain_of_thought_prompt(text, classes_formatted)
            elif prompting_method == 'few-shot':
                # Draw one spare candidate so that the item under evaluation can
                # be dropped: showing it as a demonstration would leak its label.
                candidates = random.sample(data, min(num_shots + 1, len(data)))
                shots = [ex for ex in candidates if ex is not item][:num_shots]

                # Split the budget evenly between the query and the examples.
                # Untruncated examples push the prompt over the budget and the
                # sample is then skipped instead of evaluated.
                share = token_budget // (len(shots) + 1)
                reserve = min(100, share)  # keeps the word limit non-negative
                text = truncate_text(item['text'], share, reserve_tokens=reserve)
                examples = [
                    (truncate_text(ex['text'], share, reserve_tokens=reserve), ex['location'])
                    for ex in shots
                ]
                prompt = few_shot_prompt(text, examples, classes_formatted)
            else:
                raise ValueError(f"Unknown prompting method: {prompting_method}")

            input_token_count = count_tokens(prompt)
            if input_token_count > token_budget:
                print(f"Skipping sample due to token limit: {input_token_count} tokens")
                continue

            retries = 0
            while retries <= max_retries:
                try:
                    # API call
                    completion = client.chat.completions.create(
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                        stream=True,
                        temperature=0.1,
                        top_p=0.1
                    )
                    # Chunks with an empty ``choices`` list are ignored rather
                    # than indexed, which would raise IndexError.
                    response_text = "".join(
                        chunk.choices[0].delta.content or ""
                        for chunk in completion
                        if chunk.choices
                    )

                    output_token_count = count_tokens(response_text)
                    total_cost += estimate_cost(input_token_count, output_token_count, model_name)

                    if response_text.strip().lower() == cls.lower():
                        correct_predictions += 1
                    break  # Exit retry loop on success

                # RateLimitError must be handled before the broader APIError.
                except RateLimitError:
                    print("Rate limit exceeded. Sleeping for 30 seconds...")
                    time.sleep(30)
                    retries += 1

                except APIError as e:
                    print(f"API Error: {e}")
                    if "maximum sequence length" in str(e):
                        print("Token limit exceeded. Please adjust the input size.")
                        break  # the same prompt would fail again; do not retry
                    retries += 1

                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

            total_samples += 1
            time.sleep(delay)  # Respect delay between requests

    accuracy = correct_predictions / total_samples if total_samples > 0 else 0
    return accuracy, total_cost

# Save results to a CSV file
def save_results_to_csv(file_path, model_name, prompting_method, accuracy, cost):
    """Append one evaluation result as a row of the CSV file at *file_path*.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single well-formed table.

    Args:
        file_path: Destination path of the CSV file.
        model_name: Value stored in the ``model`` column.
        prompting_method: Value stored in the ``prompting_method`` column.
        accuracy: Value stored in the ``accuracy`` column.
        cost: Value stored in the ``cost`` column.
    """
    # Create a one-row DataFrame with the current results
    result = pd.DataFrame([{
        'model': model_name,
        'prompting_method': prompting_method,
        'accuracy': accuracy,
        'cost': cost
    }])

    # An existing but zero-byte file still needs the header; checking only for
    # existence would append header-less rows to it.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    # Append mode also creates the file when it is missing.
    result.to_csv(file_path, mode='a', header=needs_header, index=False)

# Define your model and method here
# model_name is used both as the API model id and as the pricing lookup key.
model_name = "Meta-Llama-3.1-8B-Instruct"  # Change to the desired model
prompting_method = "few-shot"  # Change to 'direct', 'chain-of-thought', or 'few-shot'

# Load dataset and evaluate
# NOTE: evaluate_model sends one API request per selected sample and sleeps
# after each one (delay defaults to 2 seconds), so this step takes a while.
# Samples whose prompt exceeds the token budget are skipped with a printed notice.
data, locations = load_nyt_dataset()
accuracy, cost = evaluate_model(data, locations, model_name, prompting_method)

# Print and save results
# Both figures are shown rounded to two decimals.
print(f"Model: {model_name}, Prompting Method: {prompting_method}")
print(f"Accuracy: {accuracy:.2f}, Cost: ${cost:.2f}")

# The file name has no extension, but its content is CSV; when the file
# already exists the new row is appended to it.
results_file = 'nyt_location_result'
save_results_to_csv(results_file, model_name, prompting_method, accuracy, cost)
print(f"Results saved to {results_file}")

Skipping sample due to token limit: 5970 tokens
Skipping sample due to token limit: 4716 tokens
Skipping sample due to token limit: 4996 tokens
Skipping sample due to token limit: 5762 tokens
Skipping sample due to token limit: 4211 tokens
Skipping sample due to token limit: 4577 tokens
Skipping sample due to token limit: 4251 tokens
Skipping sample due to token limit: 4179 tokens
Skipping sample due to token limit: 7911 tokens
Skipping sample due to token limit: 6328 tokens
Skipping sample due to token limit: 4256 tokens
Skipping sample due to token limit: 4130 tokens
Skipping sample due to token limit: 4574 tokens
Skipping sample due to token limit: 6312 tokens
Skipping sample due to token limit: 4010 tokens
Skipping sample due to token limit: 4429 tokens
Skipping sample due to token limit: 7311 tokens
Skipping sample due to token limit: 5114 tokens
Skipping sample due to token limit: 6350 tokens
Skipping sample due to token limit: 4342 tokens
Skipping sample due to token limit: 4180