In [1]:
%load_ext jupyter_ai

In [2]:
import nest_asyncio
nest_asyncio.apply()

import sys

sys.path.append("../src/")

from llm import LLM

from argparse import Namespace

# Settings handed to the project-local LLM wrapper, one option per line so
# individual values are easy to spot and change.
llm_config = Namespace(
    inference_mode="api_async",
    api_key="None",
    base_url="http://node16:9876/v1/",
    llm_in_use="meta-llama/Llama-3.3-70B-Instruct",
    fast_mode=False,
    max_retry=3,
    max_tokens=16384,
    num_workers=1,
    max_batch_size=16,
)
llm = LLM(llm_config)

# Kept as the last expression of the cell so its return value is displayed.
llm.initialize()

<llm.LLM at 0x1555169473d0>

In [2]:
import glob
import json

In [3]:
# Collect every "<directory>/<file>.json" result file under the merge folder.
# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# the load order reproducible, which matters because the loader keeps only the
# first file seen for a given (model, dataset) pair.
all_results = sorted(glob.glob("../tpqa_result_final_merge/*/*.json"))

In [4]:
# Nested mapping: final_results[model_name][dataset_name] -> parsed JSON content.
final_results = {}
for result_path in all_results:
    # The last two path components are "<run directory>/<dataset>_results.json".
    model_dir, file_name = result_path.split("/")[-2:]
    # The model name is the part of the directory name after the last underscore.
    model_name = model_dir.split("_")[-1]
    dataset_name = file_name.replace("_results.json", "")

    model_results = final_results.setdefault(model_name, {})
    # The first file seen for a (model, dataset) pair wins; later ones are skipped.
    if dataset_name not in model_results:
        # Context manager so the file handle is closed as soon as it is parsed.
        with open(result_path) as result_file:
            model_results[dataset_name] = json.load(result_file)

In [6]:
import numpy as np
import pandas as pd

# Model identifiers; used as the first-level keys of final_results.
all_models = [
    'Llama-3.1-8B-Instruct', 'Qwen2.5-7B-Instruct',
    'Llama-3.3-70B-Instruct', 'Qwen2.5-72B-Instruct',
    'DeepSeek-R1-Distill-Llama-70B', 'QwQ-32B',
]

# Dataset identifiers; used as the second-level keys of final_results.
all_tasks = [
    'type1_dataset', 'type1_easy_dataset', 'type1_very_easy_dataset',
    'type2_dataset', 'type2_misleading_dataset',
    'type2_misleading_with_token_dataset',
]

# One concrete (model, task) pair for spot checks.
sample_model, sample_task = all_models[0], all_tasks[0]

# Every run record has the keys:
# 'accuracy', 'total_count', 'correct_count', 'results'
sample_runs = final_results[sample_model][sample_task]['runs']
sample_avg_accuracy = np.mean([run['accuracy'] for run in sample_runs])

# Create a Table: compute average accuracy for all of the combination in all_models and all_tasks, put them into a pandas table and assigned to the variable df

In [7]:
# Peek at the keys of the first run record for the sample (model, task) pair.
final_results[sample_model][sample_task]['runs'][0].keys()

dict_keys(['accuracy', 'total_count', 'correct_count', 'results'])

In [8]:
import pandas as pd
import numpy as np

# results[model][task] -> accuracy averaged over all runs of that pair.
results = {
    model_name: {
        task_name: np.mean(
            [run['accuracy'] for run in final_results[model_name][task_name]['runs']]
        )
        for task_name in all_tasks
    }
    for model_name in all_models
}

# Outer keys (models) become columns and inner keys (tasks) become the index.
df = pd.DataFrame(results).fillna(0)

In [9]:
# Transpose so each row is a model and each column is a task.
# NOTE: rebinding df in place means re-running this cell flips it back.
df = df.T

In [10]:
# Scale the accuracy fractions to percentages.
# NOTE: not idempotent; every re-run of this cell multiplies by 100 again.
df = df * 100

In [11]:
# Render the table with two decimals (display only; df itself is unchanged).
df.style.format("{:.2f}")

Unnamed: 0,type1_dataset,type1_easy_dataset,type1_very_easy_dataset,type2_dataset,type2_misleading_dataset,type2_misleading_with_token_dataset
Llama-3.1-8B-Instruct,79.73,83.0,99.4,76.8,47.73,16.33
Qwen2.5-7B-Instruct,66.8,71.2,92.67,80.93,26.8,12.87
Llama-3.3-70B-Instruct,91.33,93.2,99.87,93.47,74.0,40.07
Qwen2.5-72B-Instruct,83.8,84.53,99.93,96.6,82.53,48.8
DeepSeek-R1-Distill-Llama-70B,87.93,92.27,100.0,99.2,98.0,94.27
QwQ-32B,91.53,94.8,100.0,99.33,99.2,99.07


In [12]:
import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials

In [13]:
# scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
# creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
# client = gspread.authorize(creds)

# spreadsheet = client.open("TimeEvalResults")

# spreadsheet.worksheets()

# working_sheet = spreadsheet.worksheet("TEST")

# set_with_dataframe(working_sheet, df, include_index=True)

In [14]:
# Attribute analysis

In [15]:
# System prompt for the attribution classifier: it asks for exactly one of the
# four category names (time / text_length / semantic / other) and no explanation.
# The text is stripped of surrounding whitespace before being sent (see promtpify).
sys_prompt = """
You are a classification agent. Your task is to classify the reasoning provided by an LLM into one of the following four categories, based on **how** the LLM determines which response took longer time to generate:

### Allowed categories (return only one of the category names):
- `time`
- `text_length`
- `semantic`
- `other`

### Classification Rules:

1. **`time`**:  
   The reason explicitly involves **timing information** — such as start time, end time, duration (e.g., “1 minute and 45 seconds”), timestamps, or calculations of elapsed time.  
   If the decision is made **primarily or solely based on these time-based values**, without switching judgment due to other factors, classify it as `time`.

2. **`text_length`**:  
   The reason makes a judgment based on the **length of the text**, such as token count, number of words, number of sentences, or how long the generated response is.  
   This includes explicitly mentioning phrases like “Response A is longer,” “has more tokens,” or “took more space to explain”, etc.

3. **`semantic`**:  
   The reason does **not mention time or length difference** at all, but solely relies on **semantic or cognitive complexity** — such as the depth of explanation, difficulty of the topic, use of logic or math, or other indicators of **conceptual effort**.

4. **`other`**:  
   Use this category if the reasoning doesn’t clearly match any of the above — for example, if the model relies on **irrelevant metadata**, contradictory logic, unclear rationale, or vague comparison that doesn’t fit well into the previous categories.

Do **not** include any explanation or justification in your response.
"""

In [16]:
# User-message template; the {reason} placeholder is filled via str.format in
# promtpify and the surrounding blank lines are stripped there.
user_prompt = """

Here is the explanation to classify:
```
{reason}
```

"""

In [17]:
import numpy as np

# Model identifiers; used as the first-level keys of final_results.
all_models = [
    'Llama-3.1-8B-Instruct', 'Qwen2.5-7B-Instruct',
    'Llama-3.3-70B-Instruct', 'Qwen2.5-72B-Instruct',
    'DeepSeek-R1-Distill-Llama-70B', 'QwQ-32B',
]

# Dataset identifiers; used as the second-level keys of final_results.
all_tasks = [
    'type1_dataset', 'type1_easy_dataset', 'type1_very_easy_dataset',
    'type2_dataset', 'type2_misleading_dataset',
    'type2_misleading_with_token_dataset',
]

# One concrete (model, task) pair for spot checks.
sample_model, sample_task = all_models[0], all_tasks[0]

# Keys of each run record, i.e. final_results[sample_model][sample_task]['runs'][0]:
# 'accuracy', 'total_count', 'correct_count', 'results'



In [18]:
def get_raw_response(text):
    """Return ``text`` with the think/solution wrapper tags removed.

    The four tags are deleted one after another, in a fixed order; everything
    else in the text (including what was between the tags) is kept.
    """
    for tag in ("<think>", "</think>", "<solution>", "</solution>"):
        text = text.replace(tag, "")
    return text
def get_reason(result):
    """Return the explanation text to classify for one result record.

    Uses ``result['parsed_response']['reason']`` when a parsed response exists
    and contains a ``"reason"`` entry; otherwise falls back to the raw
    ``result['response']`` with its wrapper tags stripped.
    """
    parsed = result['parsed_response']
    if parsed is None or "reason" not in parsed:
        # No usable parsed explanation: fall back to the raw model output.
        return get_raw_response(result['response'])
    return parsed['reason']

In [19]:
def promtpify(reason, max_chars=10000):
    """Build the chat messages that ask the classifier to label one explanation.

    Args:
        reason: Explanation text to classify.
        max_chars: Maximum number of characters of ``reason`` to send. Longer
            text is cut down to its last ``max_chars`` characters. ``None``
            disables truncation. Defaults to 10000.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the
        system message (stripped ``sys_prompt``) followed by the user message
        (stripped ``user_prompt`` with ``reason`` filled in).

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars is not None:
        if max_chars < 0:
            raise ValueError("max_chars must be non-negative or None")
        if len(reason) > max_chars:
            # Keep the tail. An explicit start index (rather than
            # reason[-max_chars:]) stays correct when max_chars is 0.
            reason = reason[len(reason) - max_chars:]
    return [
        {"role": "system", "content": sys_prompt.strip()},
        {"role": "user", "content": user_prompt.format(reason=reason).strip()},
    ]

In [20]:

# Extra keyword arguments for llm.generate (the commented-out calls further down
# pass them as **eb_args). The guided_choice list holds the four category names
# from sys_prompt; presumably the serving backend uses it to restrict the output
# to one of them (TODO confirm against the inference server).
eb_args = {"extra_body": {"guided_choice": ["time", "text_length", "semantic", "other"]}}

In [25]:
# all_prompts= []
# for sample_model in all_models:
#     for sample_task in all_tasks:
#         for run in final_results[sample_model][sample_task]['runs']:
#             # print(f"Running {sample_model} {sample_task}")
#             results = run['results']
#             reason_prompt = [ promtpify(get_reason(x)) for x in results]
#             all_prompts.extend(reason_prompt)
#             # responses = llm.generate(reason_prompt, **eb_args)
#             # attributions = list(map(lambda x: x['solution'], responses['responses']))
#             # assert len(attributions) == len(reason_prompt[0])
#             # for i, result in enumerate(run['results']):
#             #     run['results'][i]['attribute'] = attributions[i]

# # json.dump(all_prompts, open("./batch_prompt.json", 'w'))

# # all_prompts[0]

# # all_responses = llm.generate(all_prompts, **eb_args)



# len(all_prompts)

# len(all_prompts)

# saved_files = []
# for i in range(len(all_prompts)):
#     saved_files.append({"custom_id": f"request-{i}", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3.3-70B-Instruct", "messages": all_prompts[i], "max_completion_tokens": 1000, "extra_body": {"guided_choice": ["time", "text_length", "semantic", "other"]}}})

# open("./batch_file.jsonl",'w').write("\n".join([json.dumps(x) for x in saved_files])+"\n")

In [26]:
# Load the saved classifier outputs; later cells read responses['responses'][i]['solution'].
# The context manager closes the file handle as soon as the JSON is parsed.
with open("./batch_prompt_output.json") as responses_file:
    responses = json.load(responses_file)

In [31]:
# One predicted category per classified explanation, in the order stored in the file.
attributions = [item['solution'] for item in responses['responses']]

In [33]:
# Attach one classifier label to every result record, walking models -> tasks ->
# runs -> results in the same order the prompts were generated.
#
# The labels are consumed through an iterator instead of attributions.pop(0):
# pop(0) is quadratic and empties the list, so re-running the cell would fail.
# Checking the total first turns a silent misalignment into an explicit error.
expected_count = sum(
    len(run['results'])
    for model_name in all_models
    for task_name in all_tasks
    for run in final_results[model_name][task_name]['runs']
)
if expected_count != len(attributions):
    raise ValueError(
        f"Got {len(attributions)} attributions for {expected_count} results; "
        "labels would be misaligned."
    )

attribution_iter = iter(attributions)
for sample_model in all_models:
    for sample_task in all_tasks:
        for run in final_results[sample_model][sample_task]['runs']:
            for result in run['results']:
                # result is the same dict object stored in run['results'],
                # so this updates final_results in place.
                result['attribute'] = next(attribution_iter)

In [36]:
# sample_model doubles as the loop variable in the cell above, so at this point
# it holds the last model iterated rather than all_models[0].
sample_model

'QwQ-32B'

In [37]:
# sample_task doubles as the loop variable in the cell above, so at this point
# it holds the last task iterated rather than all_tasks[0].
sample_task

'type2_misleading_with_token_dataset'

In [38]:
import pandas as pd

In [53]:
# One row per (model, task): how often each attribution category occurs,
# averaged over that pair's runs.
rows = []
for sample_model in all_models:
    for sample_task in all_tasks:
        runs = final_results[sample_model][sample_task]['runs']
        # Iterate over the runs that actually exist instead of assuming five.
        per_run_counts = [
            pd.DataFrame(run['results'])['attribute'].value_counts()
            for run in runs
        ]
        # Align runs on the category index; a category absent from a run
        # counts as 0 for that run before averaging across runs.
        attr_result = pd.concat(per_run_counts, axis=1).fillna(0).mean(axis=1)
        rows.append({"model": sample_model, "task": sample_task, **attr_result.to_dict()})
        

In [57]:
# One row per (model, task). A category that never occurred for a pair is
# missing from that row's dict, so it is NaN after construction and set to 0.
attr_statistics = pd.DataFrame(rows).fillna(0)

In [76]:
# scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
# creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
# client = gspread.authorize(creds)

# spreadsheet = client.open("TimeEvalResults")

# spreadsheet.worksheets()


# spreadsheet.add_worksheet("AttributionStatistics", rows=attr_statistics.shape[0]+1, cols=attr_statistics.shape[1]+1)


# working_sheet = spreadsheet.worksheet("AttributionStatistics")

# set_with_dataframe(working_sheet, attr_statistics, include_index=True)

In [96]:
# Attribution categories to tabulate; these are the same four names listed in
# sys_prompt and in eb_args' guided_choice.
all_attrs = ["time", "text_length", "semantic", "other"]

In [107]:
# One row per (model, task): for each attribution category, the number of
# correct results carrying that category, averaged over the pair's runs.
rows = []
for sample_model in all_models:
    for sample_task in all_tasks:
        runs = final_results[sample_model][sample_task]['runs']
        num_runs = len(runs)

        # Single pass over the results (instead of one full scan per category).
        correct_counts = dict.fromkeys(all_attrs, 0)
        for run in runs:
            for result in run['results']:
                attr = result['attribute']
                # Results whose category is not in all_attrs are ignored.
                if attr in correct_counts and result['correct']:
                    correct_counts[attr] += 1

        # Average count per run (not a ratio). A within-category accuracy
        # would instead divide by the number of results in that category.
        avg_correct_per_run = {attr: correct_counts[attr] / num_runs for attr in all_attrs}

        rows.append({"model": sample_model, "task": sample_task, **avg_correct_per_run})
        

In [108]:
# One row per (model, task) with "model", "task" and one column per entry of all_attrs.
attr_corr_statistics = pd.DataFrame(rows)

In [110]:
# Upload attr_corr_statistics to the "TimeEvalResults" Google spreadsheet.
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
client = gspread.authorize(creds)

spreadsheet = client.open("TimeEvalResults")

sheet_title = "AttributionCorrectStatisticsBAR"
try:
    # Reuse the worksheet when it already exists so the cell can be re-run;
    # add_worksheet fails when the title is already taken.
    working_sheet = spreadsheet.worksheet(sheet_title)
except gspread.exceptions.WorksheetNotFound:
    # +1 row for the header and +1 column for the index written below.
    working_sheet = spreadsheet.add_worksheet(
        title=sheet_title,
        rows=attr_corr_statistics.shape[0] + 1,
        cols=attr_corr_statistics.shape[1] + 1,
    )
else:
    # Drop stale cells from a previous upload before writing the new table.
    working_sheet.clear()

set_with_dataframe(working_sheet, attr_corr_statistics, include_index=True)

In [85]:
import requests

In [91]:
# Fetch the /metrics page from the same host and port as the LLM base_url.
# requests has no default timeout, so set one to keep an unreachable server
# from blocking the kernel indefinitely.
response = requests.get("http://node16:9876/metrics", timeout=30)