In [111]:
from sqlalchemy import text
from sqlmodel import Session

from ypl.backend.db import get_engine

# Sample recent user prompts for the classifier comparison.
#
# The inner query picks up to 3000 random USER_MESSAGE rows from the last
# 8 weeks that contain more than three whitespace-separated tokens and are not
# the literal string 'test'. The outer DISTINCT then removes duplicate contents,
# so the final row count can be lower than 3000.
#
# The SQL is a raw string (and not an f-string: nothing is interpolated) so the
# regex '\s+' reaches PostgreSQL unchanged without relying on Python keeping an
# unrecognised escape sequence. The derived table carries an alias because
# PostgreSQL releases before 16 reject an unaliased subquery in FROM.
raw_query = text(r"""
SELECT DISTINCT content FROM (
SELECT
    cm.content
FROM chat_messages cm
WHERE cm.message_type = 'USER_MESSAGE'
  AND cm.created_at > NOW() - INTERVAL '8 weeks'
  AND array_length(regexp_split_to_array(cm.content, '\s+'), 1) > 3
  AND cm.content <> 'test'
ORDER BY RANDOM()
LIMIT 3000
) AS sampled_messages
""")

# The session is only needed while the rows are fetched; the context manager
# closes it afterwards.
with Session(get_engine()) as session:
    comp_results = session.exec(raw_query).fetchall()

print(f"Number of results: {len(comp_results)}")

Number of results: 1414


In [113]:
import pandas as pd

# One row per sampled prompt. The query selects a single column, which is
# exposed here as `content`.
df = pd.DataFrame(comp_results, columns=["content"])

In [114]:
# Destination of the sampled prompts.
# The file is written by the following cell, which opens this path in "w" mode
# and therefore truncates anything written earlier; a CSV dump here would be
# discarded immediately, so none is made.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
PROMPTS_FILE = "/Users/wangtian/tmp/prompts.txt"

In [115]:
prompts = df["content"].tolist()


def _escape_prompt(prompt: str) -> str:
    """Encode a prompt so that it occupies exactly one line of the output file.

    Backslashes are doubled first so that a backslash already present in the
    prompt (for example the two characters ``\\n`` inside pasted source code)
    stays distinguishable from an escaped line break. Carriage returns and
    newlines are then replaced by the two-character sequences ``\\r`` and
    ``\\n``. A reader reverses the encoding by undoing these three
    substitutions.
    """
    return (
        prompt.replace("\\", "\\\\")
        .replace("\r", "\\r")
        .replace("\n", "\\n")
    )


# Write prompts to file, one escaped prompt per line.
with open(PROMPTS_FILE, "w", encoding="utf-8") as f:
    f.writelines(_escape_prompt(prompt) + "\n" for prompt in prompts)

# Show the first ten (unescaped) prompts and the total count as the cell output.
prompts[:10], len(prompts)


(['    ifstream file(fileName, ios::binary);\n    if (!file) {\n        cout << "Failed to open file.\\n";\n        return;\n    }\n\n    vector<string> chunks;\n    char buffer[CHUNK_SIZE];\n    int chunkID = 0;\n\n    while (file.read(buffer, CHUNK_SIZE) || file.gcount() > 0) {\n        string chunk(buffer, file.gcount());\n        string chunkName = fileName + "_chunk" + to_string(chunkID++);\n        chunks.push_back(chunkName);\n\n        auto availableNodes = server.getAvailableNodes(world_size);\n        if (availableNodes.size() < 3) {\n            cout << "Not enough nodes available for replication.\\n";\n            return;\n        }\n\n        vector<int> selectedNodes(availableNodes.begin(), availableNodes.begin() + 3);\n        server.chunkToNodes[chunkName] = selectedNodes;\n\n        for (int node : selectedNodes) {\n            server.assignTask(node, "UPLOAD", chunkName);  // Assign upload task\n        }\n    }\n\nexplain the code ignore the metadataserver parts, I o

Get all results from a local server

In [117]:
import concurrent.futures
import requests
from typing import Any

# Prompt-classification endpoint of the locally running server.
CLASSIFY_PROMPT_ENDPOINT = "http://localhost:8000/api/v1/classify/prompt"

def fetch_classifications(
    prompt: str, model_name: str | None = None, timeout: float = 60.0
) -> dict[str, Any]:
    """POST one prompt to the classification endpoint and return the JSON reply.

    Args:
        prompt: Text to classify; sent as the ``prompt`` query parameter.
        model_name: Optional model override, sent as the ``model_name`` query
            parameter. requests leaves out parameters whose value is ``None``,
            so ``None`` sends no override.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stuck request would block its worker thread forever.

    Returns:
        The decoded JSON body as a plain ``dict``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        CLASSIFY_PROMPT_ENDPOINT,
        params={"prompt": prompt, "model_name": model_name},
        headers={"X-API-KEY": ""},
        timeout=timeout,
    )
    # Fail here with the HTTP status instead of handing an error payload to the
    # caller, where it would only surface as a missing-key error.
    response.raise_for_status()
    return dict(response.json())

def classify_with_model_parallel(prompts: list[str], model_name: str | None = None, max_workers: int = 30) -> list[tuple[str, list[str], list[str]]]:
    """Classify every prompt concurrently and return the results sorted by prompt.

    Each prompt is passed to ``fetch_classifications`` on a thread pool. A
    prompt whose request or response handling raises is reported on stdout and
    left out of the result, so the returned list can be shorter than
    ``prompts``; a summary line is printed when that happens.

    Args:
        prompts: Prompts to classify.
        model_name: Model to request; ``None`` sends no override.
        max_workers: Upper bound on concurrent requests (thread pool size).

    Returns:
        ``(prompt, categories, modifiers)`` tuples taken from the
        ``"categories"`` and ``"modifiers"`` keys of each response, sorted by
        prompt text.
    """
    # Kept as a function-local import, as in the original, but placed up front
    # instead of in the middle of the executor block.
    from tqdm import tqdm

    results: list[tuple[str, list[str], list[str]]] = []
    failures = 0
    # Thread pool bounded by `max_workers` (30 by default).
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all HTTP calls up front; the pool throttles actual concurrency.
        future_to_prompt = {
            executor.submit(fetch_classifications, prompt, model_name): prompt
            for prompt in prompts
        }
        # Consume futures in completion order so the progress bar advances as
        # soon as any request finishes.
        progress = tqdm(
            concurrent.futures.as_completed(future_to_prompt),
            total=len(future_to_prompt),
            desc=f"Processing with {model_name or 'default'}",
        )
        for future in progress:
            prompt = future_to_prompt[future]
            try:
                resp = future.result()
                results.append((prompt, resp["categories"], resp["modifiers"]))
            except Exception as exc:
                # Best effort: one bad prompt must not abort the whole run.
                failures += 1
                print(f"Prompt {prompt} generated an exception: {exc}")
    if failures:
        # Make the shortfall visible; callers pairing two runs need to know
        # that the result lists may not line up.
        print(f"{failures} of {len(future_to_prompt)} prompts failed and are missing from the results")
    return sorted(results, key=lambda x: x[0])




In [118]:
# Classify the same prompts twice so the two runs can be compared:
#   before - no model override (the server default; gpt-4o-mini at the time of writing)
#   after  - the candidate model, gemini-1.5-flash-8b
before = classify_with_model_parallel(prompts, model_name=None)
after = classify_with_model_parallel(prompts, model_name="gemini-1.5-flash-8b")

Processing with default: 100%|██████████| 1414/1414 [00:28<00:00, 49.39it/s]
Processing with gemini-1.5-flash-8b: 100%|██████████| 1414/1414 [00:25<00:00, 55.11it/s]


In [119]:
# Peek at the first four entries of each run as the cell output.
(before[0:4], after[0:4])


([('    ifstream file(fileName, ios::binary);\n    if (!file) {\n        cout << "Failed to open file.\\n";\n        return;\n    }\n\n    vector<string> chunks;\n    char buffer[CHUNK_SIZE];\n    int chunkID = 0;\n\n    while (file.read(buffer, CHUNK_SIZE) || file.gcount() > 0) {\n        string chunk(buffer, file.gcount());\n        string chunkName = fileName + "_chunk" + to_string(chunkID++);\n        chunks.push_back(chunkName);\n\n        auto availableNodes = server.getAvailableNodes(world_size);\n        if (availableNodes.size() < 3) {\n            cout << "Not enough nodes available for replication.\\n";\n            return;\n        }\n\n        vector<int> selectedNodes(availableNodes.begin(), availableNodes.begin() + 3);\n        server.chunkToNodes[chunkName] = selectedNodes;\n\n        for (int node : selectedNodes) {\n            server.assignTask(node, "UPLOAD", chunkName);  // Assign upload task\n        }\n    }\n\nexplain the code ignore the metadataserver parts, I 

In [134]:
from collections import Counter
from typing import Literal, TypeAlias

ComparisonResult: TypeAlias = Literal["same", "more_before", "more_after", "others"]

def compare_lists(before: list[str], after: list[str]) -> ComparisonResult:
    before_set = set(before)
    after_set = set(after)
    
    if before_set == after_set:
        return "same"
    elif before_set.issubset(after_set):  # before is all in after
        return "more_after"
    elif after_set.issubset(before_set):  # after is all in before
        return "more_before"
    else:
        return "others"


# One comparison row: (prompt, outcome, labels before, labels after).
_Comparison: TypeAlias = tuple[str, ComparisonResult, list[str], list[str]]

# Outcomes in the order their example sections are printed.
_RESULT_ORDER: tuple[ComparisonResult, ...] = ("same", "more_before", "more_after", "others")


def _mark_missing(labels: list[str], other: list[str]) -> list[str]:
    """Return ``labels`` with every entry absent from ``other`` wrapped in ``<...>``."""
    return [label if label in other else f"<{label}>" for label in labels]


def _print_counts(title: str, counts: Counter) -> None:
    """Print ``title`` followed by one 'outcome: count (share%)' line per outcome."""
    print(title)
    total = sum(counts.values())
    for result, count in counts.items():
        print(f"{result}: {count} ({count/total*100:.1f}%)")


def _print_examples(comparisons: list[_Comparison], max_examples: int = 5) -> list[_Comparison]:
    """Print up to ``max_examples`` rows per outcome and return the printed rows.

    In the returned rows, labels present on only one side are wrapped in
    ``<...>``. Only the printed examples are returned, not every comparison.
    """
    marked: list[_Comparison] = []
    total = len(comparisons)
    for result in _RESULT_ORDER:
        matching = [row for row in comparisons if row[1] == result]
        if not matching:
            continue
        count = len(matching)
        pct = count / total * 100
        print(f"\n{result.upper()} ({count}/{total}, {pct:.1f}%):")
        for prompt, diff_type, before_labels, after_labels in matching[:max_examples]:
            print(f"Prompt: {prompt[:200]}...")
            # Mark differences between before and after
            before_marked = _mark_missing(before_labels, after_labels)
            after_marked = _mark_missing(after_labels, before_labels)
            marked.append((prompt, diff_type, before_marked, after_marked))
            print(f"Before: {before_marked}")
            print(f"After: {after_marked}")
            print("-" * 50)
    return marked


before_results = before
after_results = after

# Pair the two runs by prompt text rather than by position: a run leaves out
# every prompt whose request failed, so the two lists are not guaranteed to have
# the same length or alignment.
after_by_prompt = {prompt: (cats, mods) for prompt, cats, mods in after_results}
before_prompts = {prompt for prompt, _, _ in before_results}

category_comparisons: list[tuple[str, ComparisonResult, list[str], list[str]]] = []
modifier_comparisons: list[tuple[str, ComparisonResult, list[str], list[str]]] = []

missing_after = 0
for prompt, cats_before, mods_before in before_results:
    if prompt not in after_by_prompt:
        missing_after += 1
        continue
    cats_after, mods_after = after_by_prompt[prompt]
    category_comparisons.append((prompt, compare_lists(cats_before, cats_after), cats_before, cats_after))
    modifier_comparisons.append((prompt, compare_lists(mods_before, mods_after), mods_before, mods_after))

missing_before = len(after_by_prompt.keys() - before_prompts)
if missing_after or missing_before:
    print(
        f"Skipped unmatched prompts: {missing_after} only in before, "
        f"{missing_before} only in after"
    )

# Count results
cat_counts = Counter(comp[1] for comp in category_comparisons)
mod_counts = Counter(comp[1] for comp in modifier_comparisons)

# Separate result lists for categories and modifiers (same rows as the
# comparison lists above).
cat_results = list(category_comparisons)
mod_results = list(modifier_comparisons)

# Print comparison counts
_print_counts("Category comparison counts:", cat_counts)
_print_counts("\nModifier comparison counts:", mod_counts)

# Show examples; the marked lists hold only the examples that were printed
# (at most five per outcome).
print("\nCategory comparison examples:")
cat_marked_results = _print_examples(cat_results)

print("\nModifier comparison examples:")
mod_marked_results = _print_examples(mod_results)




Category comparison counts:
others: 593 (41.9%)
more_after: 97 (6.9%)
same: 289 (20.4%)
more_before: 435 (30.8%)

Modifier comparison counts:
others: 529 (37.4%)
more_before: 314 (22.2%)
same: 306 (21.6%)
more_after: 265 (18.7%)

Category comparison examples:

MORE_BEFORE (435/1414, 30.8%):
Prompt: 1 + 1 ?...
Before: ['offline', 'Mathematics', '<Reasoning>']
After: ['offline', 'Mathematics']
--------------------------------------------------
Prompt: 1+1 in step by step...
Before: ['offline', 'Mathematics', '<Factual>', '<Reasoning>']
After: ['offline', 'Mathematics']
--------------------------------------------------
Prompt: 150 hourly rate to salary...
Before: ['offline', 'Mathematics', '<Factual>', '<Analysis>']
After: ['offline', 'Mathematics']
--------------------------------------------------
Prompt: 300 hourly rate to salary at 40 hours per week...
Before: ['offline', 'Mathematics', '<Factual>']
After: ['offline', 'Mathematics']
--------------------------------------------------


In [None]:
# Show the first ten marked category examples as the cell output.
cat_marked_results[0:10]

In [136]:
# Save results to CSV files
import pandas as pd

# Output locations for the marked comparison examples.
CSV_CAT = "/Users/wangtian/tmp/category_comparisons.csv"
CSV_MOD = "/Users/wangtian/tmp/modifier_comparisons.csv"

# Tabulate the marked examples: one row per example, with the before/after
# label lists stored in their own columns.
cat_df = pd.DataFrame(
    cat_marked_results,
    columns=["Prompt", "Diff_Type", "Before_Categories", "After_Categories"],
)
mod_df = pd.DataFrame(
    mod_marked_results,
    columns=["Prompt", "Diff_Type", "Before_Modifiers", "After_Modifiers"],
)

# Write both tables without the DataFrame index column.
for frame, destination in ((cat_df, CSV_CAT), (mod_df, CSV_MOD)):
    frame.to_csv(destination, index=False)