In [None]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

API_KEY  = os.getenv("OPENAI_API_KEY", "cse476")
API_BASE = os.getenv("API_BASE", "http://10.4.58.53:41701/v1")  
MODEL    = os.getenv("MODEL_NAME", "bens_model")              

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer‚Äîno explanation.",
                                model: str = MODEL,
                                temperature: float = 0.3,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Calls an OpenAI-style /v1/chat/completions endpoint and returns:
    { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        #{'id': 'chatcmpl-88b6d7e18a5542b5bed5bf2828f0661e', 'object': 'chat.completion', 'created': 1763204718, 'model': 'bens_model', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'US Highway 281', 'refusal': None, 'annotations': None, 'audio': None, 'function_call': None, 'tool_calls': [], 'reasoning_content': None}, 'logprobs': None, 'finish_reason': 'stop', 'stop_reason': None, 'token_ids': None}], 'service_tier': None, 'system_fingerprint': None, 'usage': {'prompt_tokens': 50, 'total_tokens': 57, 'completion_tokens': 7, 'prompt_tokens_details': None}, 'prompt_logprobs': None, 'prompt_token_ids': None, 'kv_transfer_params': None}
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status == 200:
            data = resp.json()
            #print(data)
            text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
            tokens_used = data.get("usage",[{}]).get("completion_tokens", {})
            #print('used tokens:', tokens_used)
            
            return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs, "tokens_used":tokens_used}
        else:
            # try best-effort to surface error text
            err_text = None
            try:
                err_text = resp.json()
            except Exception:
                err_text = resp.text
            return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}


In [107]:
from IPython.display import Markdown, display

In [None]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number.", temperature=0.2, max_tokens=128):
    demo_prompt = prompt
    result = call_model_chat_completions(demo_prompt, temperature=temperature, max_tokens=max_tokens)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())

    # Optional: Inspect rate-limit headers if your provider exposes them
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in result["headers"]:
            print(f"{k}: {result['headers'][k]}")


In [90]:
# %% Define three tests: input + expected
my_tests = [
    {
        "id": "math_inequality",
        "type": "numeric",  # grader will prefer numeric extraction
        "prompt": "Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        "expected": "8",    # Because 3n > 21 => n > 7, smallest integer is 8
    },
    {
        "id": "commonsense_ice",
        "type": "text",
        "prompt": (
            "You place an ice cube in a glass of water and mark the water level. "
            "After the ice melts, does the water level rise, fall, or stay the same? "
            "Answer with exactly one of: 'rise', 'fall', 'stay the same'."
        ),
        "expected": "stay the same",
    },
    {
        "id": "logic_race",
        "type": "text",
        "prompt": (
            "In a race, you pass the person in second place. What position are you now in? "
            "Answer with a single word like 'first', 'second', 'third'."
        ),
        "expected": "second",
    },
]


In [142]:
import json
from pprint import pprint
import random
from collections import Counter

POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']

all_tests = json.load(open("parsed_dev_data.json", "r", encoding="utf-8"))

type_counts = Counter(t['domain'] for t in all_tests)
print(type_counts)

formatted_tests = []
for i, t in enumerate(all_tests, start=1):
    
    formatted_tests.append({
        "id": t['id'], # domain_domainIndex_domainTestIndex_testIndex
        "type": t['domain'],
        "prompt": t['input'],
        "expected": t['output'],
        "char_count": t['input_char_count'],
        "exp_word_count": t['exp_word_count']
    })
    
all_tests = formatted_tests

Counter({'common_sense': 400, 'math': 300, 'coding': 100, 'future_prediction': 100, 'planning': 100})


In [298]:
def print_test(test):
    print(json.dumps(test, indent=2, ensure_ascii=False))

#pass test_type as a list of types
#generalized get test function
def get_tests(n=0, test_type=POSSIBLE_TYPES, start=0, end=None, lower_char=0, upper_char=float('inf'), lower_exp=0, upper_exp=float('inf'), seed=None):
    filtered_tests = [t for t in all_tests if t['type'] in test_type and lower_char <= t['char_count'] <= upper_char and lower_exp <= t['exp_word_count'] <= upper_exp]
    print('filtered size:', len(filtered_tests))
    sample_size = min(n, len(filtered_tests))
    
    if seed is not None: random.seed(seed)
    
    if n == 0:
        return filtered_tests[start:end]
    elif n == -1:
        filtered_type_counts = Counter(t['type'] for t in filtered_tests)
        each_test = []
        count = 0
        
        for val in filtered_type_counts.values():
            rand = random.randint(count, count + val)
            count = count + val
            each_test.append(filtered_tests[rand])
            
        print("sampled size:", len(each_test))    
        return each_test
    else:
        return random.sample(filtered_tests, sample_size)
    
""" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):
    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]
    return tests[start:end]

def get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):
    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]
    sample_size = min(n, len(filtered_tests)) #prevent error
    return random.sample(filtered_tests, sample_size) """

" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):\n    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]\n    return tests[start:end]\n\ndef get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):\n    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]\n    sample_size = min(n, len(filtered_tests)) #prevent error\n    return random.sample(filtered_tests, sample_size) "

In [None]:
tests = get_tests(n=1, upper_char=300) #get_test_type('math', end=10, lower=0, upper=500)
pprint(tests)

filtered size: 440
['First',
 'figure',
 'out',
 'how',
 'many',
 'square',
 'feet',
 'the',
 'original',
 'bolt',
 'of',
 'fabric',
 'was:',
 '16',
 'feet',
 '*',
 '12',
 'feet',
 '=',
 '<<16*12=192>>192',
 'square',
 'feet',
 'Then',
 'figure',
 'out',
 'how',
 'much',
 'fabric',
 'Ann',
 'took',
 'for',
 'the',
 'living',
 'room',
 'curtains:',
 '4',
 'feet',
 '*',
 '6',
 'feet',
 '=',
 '<<4*6=24>>24',
 'square',
 'feet',
 'Then',
 'figure',
 'out',
 'how',
 'much',
 'fabric',
 'Ann',
 'took',
 'for',
 'the',
 'bathroom',
 'curtains:',
 '2',
 'feet',
 '*',
 '4',
 'feet',
 '=',
 '<<2*4=8>>8',
 'square',
 'feet',
 'Finally,',
 'subtract',
 'the',
 'square',
 'footage',
 'of',
 'both',
 'sets',
 'of',
 'curtains',
 'from',
 'the',
 'total',
 'square',
 'footage:',
 '192',
 '-',
 '24',
 '-',
 '8',
 '=',
 '<<192-24-8=160>>160',
 'square',
 'feet',
 '####',
 '160']


In [94]:
#simple hello world call to kick off the commits
#direct_call(prompt="how do I find the derivative of y=x^2 using python?")

In [95]:
def interactive_chat():
    messages = ["<Start of message history>"]
    count = 0
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Exiting chat.")
            break
        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)
        count += 1
        messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {response['text']}]")
        if response["ok"]:
            print("Model:", response["text"].strip())
        else:
            print("Error:", response["error"])
        print(messages)
#interactive_chat()

In [None]:
""" def execute_tests():
    rows = []
    for t in tests:
        r = call_model_chat_completions(
            prompt,
            system=system,
            model=model,
            temperature=0.3,
            max_tokens=128
        ) """

' def execute_tests():\n    rows = []\n    for t in tests:\n        r = call_model_chat_completions(\n            prompt,\n            system=system,\n            model=model,\n            temperature=0.3,\n            max_tokens=128\n        ) '

In [198]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.
    Returns True if the model says the prediction matches the expected answer; else False.
    Falls back to a simple normalized string compare if the model's reply is malformed.
    """
    import re

    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # No Fallback yet


In [272]:
def self_evaluate2(question, model_output, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.
    Returns True if the model says the prediction matches the expected answer; else False.
    Falls back to a simple normalized string compare if the model's reply is malformed.
    """
    import re

    system = "You are a strict grader. Reply with exactly Yes or No. No punctuation. No explanation."
    prompt = f"""MODEL_1 thinks this ANSWER is {prediction}, do you agree with MODEL_1 decision?

QUESTION:
{question}

ANSWER:
{model_output}

EXPECTED_ANSWER:
{expected_answer}

-----------------------
MODEL_1 OUTPUT:
{prediction}
-----------------------

Answer with exactly: Yes or No. Do you agree with MODEL_1?
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true") or reply.startswith("yes"):
        return True
    if reply.startswith("false") or reply.startswith("no"):
        return False

    # No Fallback yet


In [296]:
tf_map = {'yes':'true', 'no':'false'}
def map_tf(output, exp):
    exp = exp.lower().strip('.')
    out = output.lower().strip('.')
    
    #rare case when exp is actually yes/now and model output is true/false
    if exp == "yes" and out == "true": return "yes"
    if exp == "no" and out == "false": return "no"
    
    return tf_map.get(out) if out in tf_map else output

In [303]:

def basic_match_check(test, output):
    exp = test["expected"]
    
    output = map_tf(output, exp)
    
    matches = re.findall(re.escape(str(exp)), output, re.IGNORECASE)
    
    num_matches = len(matches)
    if num_matches > 0:
        #print('MATCH(ES) FOUND:', matches)
        return True
    
    #print('NO MATCH FOUND')
    return False

In [279]:
def seperator(text, tokens_used=None, max_tokens=400):
    if tokens_used is not None:
        print(f'{text} (TOKENS USED: {tokens_used}/{max_tokens})')
        if int(tokens_used) == max_tokens:
            print('MAXED TOKENS REACHED - OUTPUT TRUNCATED')
            return False
    else:
        print(text)
    print('-'*32)
    
    return True

In [273]:
def check_correct(bool1, bool2):
    correctness = bool1 and bool2
    agreement = bool1 == bool2
    
    print('‚úÖ CORRECT') if correctness else print('‚ùå INCORRECT')
    print('üÜó AGREED') if agreement else print('üÜò DISAGREED')
    
    return correctness, agreement

In [311]:
import math

def create_matches(toCount, toMatch):
    counter = Counter(toCount)
    match_counts = {word: counter.get(word, 0) for word in toMatch}
    total_matches = sum(match_counts.values())
    output_len = len(toCount)
    print(f"{total_matches}/{output_len} : {(total_matches / output_len) * 100 if output_len != 0 else 0}%")
    print('match counts:', match_counts)
    return total_matches, output_len

def get_cosine(expected_counter, output_counter):
    dot_product = sum(expected_counter[word] * output_counter.get(word, 0) for word in expected_counter)
    
    print(f"Dot product: {dot_product}")
    
    exp_mag = math.sqrt(sum(v**2 for v in expected_counter.values()))
    out_mag = math.sqrt(sum(v**2 for v in output_counter.values()))
    
    if exp_mag > 0 and out_mag > 0:
        cosine_sim = dot_product / (exp_mag * out_mag)
        print(f"Cosine similarity: {cosine_sim}")
        
    return cosine_sim

def super_match(test, output):
    expected = test["expected"].lower().split()
    output = output.lower().split()
    
    expected_counter = Counter(expected)
    output_counter = Counter(output)
    
    #not very helpful in the long run...
    get_cosine(expected_counter, output_counter)
    
    exp_matches, out_len = create_matches(output, expected)
    out_matches, exp_len = create_matches(expected, output)
    
    start_matches = False
    end_matches = False
    if expected[0] in output[0]: start_matches = True
    if expected[exp_len-1] in output[out_len-1]: end_matches = True
    print(f"start {start_matches} end {end_matches}")
    #return match_counts

In [304]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True):
    """
    Run the tests by querying the model for each prompt, then use LLM-as-a-judge
    (self_evaluate) to determine correctness.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: small delay between calls to be polite to the API
        verbose: if True, print a summary line per test

    Returns:
        rows: list of dicts with fields:
              id, expected, got, correct, status, error
    """
    import time

    judge_model = grader_model or model
    MAX_TOKENS = 400
    final_answers = []
    count = 0
    
    for t in tests:
        count += 1
        # 1) Get model prediction
        #print('prompt:', t['prompt'])
        print('\n','='*64)
        seperator('TEST_CASE')
        print_test(t)
        r = call_model_chat_completions(
            f"{t['prompt']}",
            system="Give a short answer to each prompt, don't explain.",
            model=model,
            temperature=0.3,
            max_tokens=MAX_TOKENS
        )
        got = (r.get("text") or "").strip()
        tokens_used = r.get("tokens_used")
        

        got = map_tf(got, t["expected"])
        
        #If output is truncated and both evals return true, return false
        not_truncated = seperator('\nMODEL_OUTPUT', tokens_used, MAX_TOKENS)
        display(Markdown(f"\n{got}"))
        #print('raw: ', got)
        
        super_match(t, got)
        match_check = basic_match_check(t, got)
        match_check = bool(match_check)
        
        # 2) LLM-as-a-judge: strict True/False
        is_correct = self_evaluate(
            question=t["prompt"],
            prediction=got,
            expected_answer=t["expected"],
            model=judge_model,
        )
        is_correct = bool(is_correct)
        
        seperator('\nEVALUATION')
        print('match check:', match_check)
        print('self_eval:', is_correct)
        correctness, agreement = check_correct(match_check, is_correct)
        
        if not agreement:
            seperator('\nDISAGREEMENT --> SECOND EVAL')
            is_correct2 = self_evaluate2(
                question=t["prompt"],
                model_output=got,
                expected_answer=t["expected"],
                prediction=is_correct,
                model=judge_model
            )
            is_correct2 = bool(is_correct2)
            
            print('self_eval2:', is_correct2)
            correctness, agreement = check_correct(is_correct, is_correct2)
            
            if not not_truncated and correctness:
                correctness = False
                print("‚ùå INCORRECT | BOTH EVALS RETURNED TRUE BUT OUTPUT WAS TRUNCATED")

        final_answers.append(correctness)
        
        if sleep_sec:
            time.sleep(sleep_sec)

    return final_answers

# Example:


In [120]:
import re

In [313]:
test_prompts = get_tests(n=5) #get_test_type(["math"],end=10, upper=300) get_random_tests(n=3, upper=300)
results_llm_judge = self_evaluate_tests(test_prompts, verbose=True, model=MODEL, grader_model=MODEL)
print("\n","="*64)
print(results_llm_judge)

filtered size: 1000

TEST_CASE
--------------------------------
{
  "id": "math_3_5_605",
  "type": "math",
  "prompt": "Three clever monkeys divide a pile of bananas. The first monkey takes some bananas from the pile, keeps three-fourths of them, and divides the rest equally between the other two. The second monkey takes some bananas from the pile, keeps one-fourth of them, and divides the rest equally between the other two. The third monkey takes the remaining bananas from the pile, keeps one-twelfth of them, and divides the rest equally between the other two. Given that each monkey receives a whole number of bananas whenever the bananas are divided, and the numbers of bananas the first, second, and third monkeys have at the end of the process are in the ratio $3: 2: 1,$ what is the least possible total for the number of bananas?",
  "expected": "408",
  "char_count": 722,
  "exp_word_count": 1
}

MODEL_OUTPUT (TOKENS USED: 367/400)
--------------------------------



Let the total number of bananas be $ N $.

Let the number of bananas taken by the first monkey be $ x $, the second monkey be $ y $, and the third monkey be $ z $, with $ x + y + z = N $.

From the problem:

- First monkey keeps $ \frac{3}{4}x $, gives $ \frac{1}{4}x $ to the other two monkeys (so each gets $ \frac{1}{8}x $).
- Second monkey keeps $ \frac{1}{4}y $, gives $ \frac{3}{4}y $ to the other two monkeys (so each gets $ \frac{3}{8}y $).
- Third monkey keeps $ \frac{1}{12}z $, gives $ \frac{11}{12}z $ to the other two monkeys (so each gets $ \frac{11}{24}z $).

Let the final number of bananas each monkey has be:

- First monkey: $ \frac{3}{4}x + \frac{1}{8}y + \frac{11}{24}z $
- Second monkey: $ \frac{1}{4}x + \frac{3}{8}y + \frac{11}{24}z $
- Third monkey: $ \frac{1}{12}z + \frac{1}{8}x + \frac{3}{8}y $

Given the ratio of their final counts is $ 3:2:1 $, and all divisions must result in whole numbers.

After solving the system of equations and ensuring all divisions are integers, the least possible total number of bananas is:

**144**

Dot product: 0
Cosine similarity: 0.0
0/198 : 0.0%
match counts: {'408': 0}
0/1 : 0.0%
match counts: {'let': 0, 'the': 0, 'total': 0, 'number': 0, 'of': 0, 'bananas': 0, 'be': 0, '$': 0, 'n': 0, '$.': 0, 'taken': 0, 'by': 0, 'first': 0, 'monkey': 0, 'x': 0, '$,': 0, 'second': 0, 'y': 0, 'and': 0, 'third': 0, 'z': 0, 'with': 0, '+': 0, '=': 0, 'from': 0, 'problem:': 0, '-': 0, 'keeps': 0, '\\frac{3}{4}x': 0, 'gives': 0, '\\frac{1}{4}x': 0, 'to': 0, 'other': 0, 'two': 0, 'monkeys': 0, '(so': 0, 'each': 0, 'gets': 0, '\\frac{1}{8}x': 0, '$).': 0, '\\frac{1}{4}y': 0, '\\frac{3}{4}y': 0, '\\frac{3}{8}y': 0, '\\frac{1}{12}z': 0, '\\frac{11}{12}z': 0, '\\frac{11}{24}z': 0, 'final': 0, 'has': 0, 'be:': 0, 'monkey:': 0, '\\frac{1}{8}y': 0, 'given': 0, 'ratio': 0, 'their': 0, 'counts': 0, 'is': 0, '3:2:1': 0, 'all': 0, 'divisions': 0, 'must': 0, 'result': 0, 'in': 0, 'whole': 0, 'numbers.': 0, 'after': 0, 'solving': 0, 'system': 0, 'equations': 0, 'ensuring': 0, 'are': 0, 'integers,': 0, 'least'


true

Dot product: 0
Cosine similarity: 0.0
0/1 : 0.0%
match counts: {'yes': 0}
0/1 : 0.0%
match counts: {'true': 0}
start False end False

EVALUATION
--------------------------------
match check: True
self_eval: True
‚úÖ CORRECT
üÜó AGREED

TEST_CASE
--------------------------------
{
  "id": "coding_0_33_33",
  "type": "coding",
  "prompt": "Analyze the salary distribution within the department with code 'EMPXX'. Generate random salaries for each employee and create a histogram. - For the department of interest, randomly generate as many salaries as its number of employees. - Make sure that the salary is within SALARY_RANGE. - The histogram title should be 'Salary Distribution in EMPXX Department' - The x-label should be set to 'Salary' - The y-label should be set to 'Number of Employees'\nThe function should output with:\n    matplotlib.axes._axes.Axes: Axes object representing the histogram.\nYou should write self-contained code starting with:\n```\nimport random\nimport matplotlib.pypl


import random
import matplotlib.pyplot as plt

# Constants
SALARY_RANGE = (20000, 100000)

def task_func(dict1):
    dept = dict1.get('EMPXX', {})
    num_employees = len(dept)
    salaries = [random.randint(*SALARY_RANGE) for _ in range(num_employees)]
    
    fig, ax = plt.subplots()
    ax.hist(salaries, bins=10, edgecolor='black')
    ax.set_title('Salary Distribution in EMPXX Department')
    ax.set_xlabel('Salary')
    ax.set_ylabel('Number of Employees')
    
    return ax

Dot product: 27
Cosine similarity: 0.47970161180012355
17/46 : 36.95652173913043%
match counts: {'emp_salaries': 0, '=': 5, '[]': 0, 'for': 1, 'prefix,': 0, 'num_employees': 1, 'in': 2, 'dict1.items():': 0, 'if': 0, 'not': 0, "prefix.startswith('empxx'):": 0, 'continue': 0, '_': 1, 'range(num_employees):': 0, 'salary': 0, 'random.randint(*salary_range)': 0, 'emp_salaries.append(salary)': 0, 'plt.hist(emp_salaries,': 0, 'bins=10,': 1, 'alpha=0.5)': 0, "plt.title('salary": 0, 'distribution': 1, 'empxx': 1, "department')": 1, "plt.xlabel('salary')": 0, "plt.ylabel('number": 0, 'of': 1, "employees')": 1, 'return': 1, 'plt.gca()': 0}
16/34 : 47.05882352941176%
match counts: {'import': 0, 'random': 0, 'matplotlib.pyplot': 0, 'as': 0, 'plt': 0, '#': 0, 'constants': 0, 'salary_range': 0, '=': 2, '(20000,': 0, '100000)': 0, 'def': 0, 'task_func(dict1):': 0, 'dept': 0, "dict1.get('empxx',": 0, '{})': 0, 'num_employees': 1, 'len(dept)': 0, 'salaries': 0, '[random.randint(*salary_range)': 0, 'for'


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def task_func(product_dict, product_keys):
    if not product_keys:
        return (None, None)
    
    df = pd.DataFrame(product_dict, index=product_keys).reset_index()
    df.columns = ['Product', 'Quantity', 'Price', 'Profit']
    avg_price = df['Price'].mean()
    avg_profit = df['Profit'].mean()
    df['Average Price'] = avg_price
    df['Average Profit'] = avg_profit
    
    fig, ax = plt.subplots()
    ax.bar(df['Product'], df['Profit'])
    
    return (df, ax)

Dot product: 108
Cosine similarity: 0.6354700506655651
30/54 : 55.55555555555556%
match counts: {'columns': 0, '=': 7, "['product',": 1, "'quantity',": 1, "'price',": 1, "'profit']": 1, 'data': 0, '[]': 0, 'for': 0, 'key': 0, 'in': 0, 'product_keys:': 1, 'quantity,': 0, 'price': 0, 'product_dict[key]': 0, 'profit': 0, 'quantity': 0, '*': 0, 'data.append([key,': 0, 'price,': 0, 'profit])': 0, 'df': 1, 'pd.dataframe(data,': 0, 'columns=columns)': 0, 'if': 1, 'not': 1, 'df.empty:': 0, '#': 0, 'calculate': 0, 'average': 0, 'and': 0, 'using': 0, 'numpy': 1, 'avg_price': 2, "np.mean(df['price'])": 0, 'avg_profit': 2, "np.mean(df['profit'])": 0, 'add': 0, 'as': 3, 'new': 0, 'to': 0, 'the': 0, 'dataframe': 0, "df['average": 2, "price']": 1, "profit']": 1, 'ax': 1, "df.plot(x='product',": 0, "y='profit',": 0, "kind='bar',": 0, 'legend=false,': 0, 'title="profit': 0, 'each': 0, 'product")': 0, 'ax.set_ylabel("profit")': 0, 'else:': 0, 'none': 0, 'return': 2, 'df,': 0}
33/87 : 37.93103448275862%



```python
import pandas as pd
from collections import Counter
def task_func(d):
    counts = {k: Counter() for k in ['x', 'y', 'z']}
    for item in d:
        for key in ['x', 'y', 'z']:
            if key in item:
                counts[key][item[key]] += 1
    return counts
```

Dot product: 31
Cosine similarity: 0.5563279654601043
20/41 : 48.78048780487805%
match counts: {'df': 0, '=': 1, 'pd.dataframe(d)': 0, 'counts': 2, '{}': 0, 'for': 3, 'key': 2, 'in': 4, "['x',": 2, "'y',": 2, "'z']:": 1, 'if': 1, 'df.columns:': 0, 'counts[key]': 0, 'counter(df[key].dropna().tolist())': 0, 'else:': 0, 'counter()': 1, 'return': 1}
17/25 : 68.0%
match counts: {'```python': 0, 'import': 0, 'pandas': 0, 'as': 0, 'pd': 0, 'from': 0, 'collections': 0, 'counter': 0, 'def': 0, 'task_func(d):': 0, 'counts': 2, '=': 4, '{k:': 0, 'counter()': 1, 'for': 1, 'k': 0, 'in': 2, "['x',": 1, "'y',": 1, "'z']}": 0, 'item': 0, 'd:': 0, 'key': 2, "'z']:": 1, 'if': 1, 'item:': 0, 'counts[key][item[key]]': 0, '+=': 0, '1': 0, 'return': 1, '```': 0}
start False end False

EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED

[False, True, False, False, False]
