In [None]:
import os
import getpass
import langchain

# LangSmith: debug output and tracing are switched off for these runs, and the
# endpoint / API-key variables are set to empty strings.
langchain.debug = False
os.environ["LANGSMITH_TRACING"] = "false"
os.environ["LANGCHAIN_ENDPOINT"] = ""
os.environ["LANGCHAIN_API_KEY"] = ""
# To re-enable tracing, take the credentials from the environment or an
# interactive prompt. Never paste a key into the notebook; any key that has
# been committed to a notebook should be revoked.
# if "LANGSMITH_API_KEY" not in os.environ:
#     os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter API key for LangSmith: ")
# if "LANGSMITH_PROJECT" not in os.environ:
#     os.environ["LANGSMITH_PROJECT"] = "nlp_final"

# Prompt for the Gemini key only when it is missing or empty; `.get` covers
# both cases, so no placeholder value has to be written first.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

In [None]:
import mmmlu_preparer
from mmmlu_preparer.read_mmmlu_dataset import (
    TARGET_SUBTASKS,
    MMMLULanguage,
    create_mmmlu_dataset,
    sample_first_n_data_from_subtask
)

# Language codes available for the experiment; the first entry is the one used for this run.
lang_list = ["EN", "JA_JP"]
curr_language = lang_list[0]
# Look up the MMMLULanguage member whose name equals the selected code.
dataset_language_enum = MMMLULanguage[curr_language]

# Build the dataset for that language, then take the subset for the target subtasks.
# NOTE(review): how many rows are kept per subtask is decided inside
# sample_first_n_data_from_subtask (no count is passed here) — confirm its default.
mmmlu_ds = create_mmmlu_dataset(dataset_language_enum)
chosen_subtasks = TARGET_SUBTASKS
mmmlu_subset = sample_first_n_data_from_subtask(mmmlu_ds, chosen_subtasks)
# Bare expression so the notebook displays the subset.
mmmlu_subset

In [None]:
from mmmlu_preparer.answer_extract import extract_answer_from_response
# Smoke-test the answer extractor on several response shapes: free text,
# an "Answer:" prefix, an XML-like tag, and single-/double-quoted keys.
sample_responses = (
    "TEST A TEST",
    "Answer: B",
    "<Answer> D",
    "'Answer': C",
    '"Answer": A',
)
for sample_response in sample_responses:
    print(extract_answer_from_response(sample_response))

In [None]:
import pandas as pd

# Draft of the per-response record schema. Every field starts out as an empty
# string; the order of this list is the column order of the DataFrame below.
experiment_columns = [
    "Model",
    "Question id",
    "Shuffle method",
    "Original to shuffled",
    "Input format",
    "Output format",
    "Query",
    "Language",
    "Subtask",
    "Original correct answer",
    "Shuffled correct answer",
    "Response answer",
    "Model output",   # Output text only
    "Full response",  # All the output
]
experiment_save_dict = dict.fromkeys(experiment_columns, "")

# One-row frame built from the single placeholder record.
experiment_list = [experiment_save_dict]
experiment_df = pd.DataFrame(experiment_list)

In [None]:
# Bare expression so the notebook renders the one-row draft DataFrame.
experiment_df

In [None]:
import os
# Directory that receives the JSONL result files.
save_dir = "./mmmlu_output"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

In [None]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.rate_limiters import InMemoryRateLimiter

# NOTE(review): load_dotenv() runs in this cell, i.e. after the first cell has
# already checked GOOGLE_API_KEY — a key stored only in .env would not be seen
# by that earlier check. Confirm the intended cell order.
try:
    # load environment variables from .env file (requires `python-dotenv`)
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    # python-dotenv is optional; without it nothing is loaded from .env.
    pass


# Client-side throttle handed to the chat model below.
# Gemini 2.0 Flash Free rate: RPM 15
# Tier 1 RPM 2000
# The values below allow 20 requests per second (1200 per minute), which fits
# the Tier 1 figure above but is far beyond the free-tier figure.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=20,
    check_every_n_seconds=0.1,
    max_bucket_size=20,
)


# model_name is also embedded in the output file name built in a later cell.
model_name = "gemini-2.0-flash"
model = init_chat_model(model_name,
                        model_provider="google_genai",
                        rate_limiter=rate_limiter,
                        temperature=0.0,
                        max_tokens=4096
                        )

# text: the complete query string, passed through unchanged as a single user message
prompt_template = ChatPromptTemplate.from_messages(
    [("user", "{text}")]
)

In [None]:
from mmmlu_preparer.query_formats import (
    get_current_queries,
    InputFormat,
    OutputFormat,
    ShuffleMethod
)

# Input format for the queries. Options: BASE, JSON, XML
curr_input_format = InputFormat.BASE

# Output format requested from the model. Options: BASE, JSON_FULL, XML_FULL
curr_output_format = OutputFormat.BASE

# Shuffle method. Options: DEFAULT, REVERSE, LONGEST_FIRST, SHORTEST_FIRST, MOST_KANA_RATIO, FEWEST_KANA_RATIO
curr_shuffle_method = ShuffleMethod.DEFAULT

# File-name fragments: lower-cased, with "_" replaced by "-" so that "_" is
# only used as the separator between fields of save_name.
# NOTE(review): the input format is derived from `.value` while the output
# format and shuffle method use `.name` — confirm this difference is intended.
input_format_save_name = curr_input_format.value.lower().replace("_", "-")
output_format_save_name = curr_output_format.name.lower().replace("_", "-")
shuffle_method_save_name = curr_shuffle_method.name.lower().replace("_", "-")

language_name = curr_language.lower().replace("_", "-")

# Base name (without extension) of the result file for this configuration.
save_name = f"{model_name}_{language_name}_{input_format_save_name}_input_{output_format_save_name}_output_{shuffle_method_save_name}_shuffle"
# Bare expression so the notebook displays the generated name.
save_name

In [None]:
# Build the queries for the selected language, subtasks, formats and shuffle method.
# Later cells index the result and read each item's 'Query' field.
curr_queries = get_current_queries(mmmlu_subset,
                                   dataset_language_enum,
                                   chosen_subtasks,
                                   curr_input_format,
                                   curr_output_format,
                                   curr_shuffle_method,
                                   )

In [None]:
# Inspect the 'Question' field of the first row whose Subject is "abstract_algebra".
mmmlu_subset.filter(lambda x: x['Subject'] == "abstract_algebra")[0]['Question']

In [None]:
# Print the query text of the first generated item (print keeps its line breaks readable).
print(curr_queries[0]['Query'])

In [None]:
import json
from tqdm.auto import trange
from pathlib import Path
from typing import Optional
async def run_experiemnts(
    queries: list[dict],
    save_path: str,
    try_first_n: Optional[int] = None,
    mini_batch_size: int = 20,
) -> list[dict]:
    """Query the chat model in mini-batches and append every response to a JSONL file.

    The run is resumable: the number of non-empty lines already present in the
    output file is taken as the index of the first query still to be sent, so
    calling the function again continues where the previous call stopped.

    Uses the module-level ``prompt_template`` and ``model`` objects.
    (The misspelled function name is kept because other cells call it.)

    Args:
        queries: Items whose ``'Query'`` entry is the text sent to the model.
        save_path: Output file; any suffix other than ``.jsonl`` is replaced
            by ``.jsonl``. Missing parent directories are created.
        try_first_n: When given, send at most this many *additional* queries
            (counted from the resume position); ``None`` sends all that remain.
        mini_batch_size: Number of prompts passed to ``model.abatch`` per call.

    Returns:
        The responses obtained during this call only (lines already in the
        file are not reloaded), in query order.

    Raises:
        ValueError: If ``mini_batch_size`` is smaller than 1.
    """
    if mini_batch_size < 1:
        raise ValueError("mini_batch_size must be a positive integer")

    text_queries = [{"text": query['Query']} for query in queries]
    input_prompts = prompt_template.batch(text_queries)

    results = []

    target_save_path = Path(save_path)
    if target_save_path.suffix != ".jsonl":
        print("Output should be jsonl file")
        target_save_path = target_save_path.with_suffix(".jsonl")
    # Create the directory as well, so touch() cannot fail on a fresh checkout.
    target_save_path.parent.mkdir(parents=True, exist_ok=True)
    target_save_path.touch()

    with target_save_path.open('r', encoding='utf-8') as file:
        # One saved response per non-empty line -> index of the next query to send.
        start_idx = sum(1 for line in file if line.strip())
    print(f"Start from {start_idx = }")

    total_process_num = len(text_queries)
    if try_first_n is not None:
        # Never run past the end of the query list.
        total_process_num = min(total_process_num, start_idx + try_first_n)

    for batch_i in trange(start_idx, total_process_num, mini_batch_size):
        # Clamp the last batch so try_first_n is honoured exactly, not rounded
        # up to a whole mini-batch.
        batch_end = min(batch_i + mini_batch_size, total_process_num)
        try:
            batched_prompts = input_prompts[batch_i:batch_end]
            responses = await model.abatch(batched_prompts)
            results.extend(responses)
            # Append right away so a later failure does not lose finished batches;
            # leaving the `with` block flushes and closes the file.
            with target_save_path.open('a', encoding='utf-8') as file:
                for response in responses:
                    json.dump(response.to_json(), file, ensure_ascii=False)
                    file.write("\n")
            print(f"Finish {batch_end} data")
        except Exception as e:
            # Deliberately best-effort: stop on the first failure (e.g. a rate
            # limit) and keep what has been saved; a re-run resumes from the file.
            print(f"Current idx:{batch_i}\nencounters exception: {e}\nIt might be daily rate limit or error.")
            break
    return results


In [None]:
# Output file for the current configuration (directory and base name come from earlier cells).
save_path = f"{save_dir}/{save_name}.jsonl"
# Top-level await: works in a notebook cell. try_first_n=None processes every remaining query.
results = await run_experiemnts(curr_queries, save_path, try_first_n=None)

In [None]:
from pathlib import Path
import json
# Reload the saved responses from disk, so the analysis below does not depend
# on the in-memory `results` of the run cell.
save_path = f"{save_dir}/{save_name}.jsonl"
target_save_path = Path(save_path)
with target_save_path.open('r', encoding='utf-8') as file:
    # One JSON document per line; blank lines are skipped.
    result_dicts = [json.loads(line) for line in file if line.strip()]

In [None]:
# Spot-check: print the 'content' text of the saved record at index 1.
print(result_dicts[1]['kwargs']['content'])

In [None]:
# Response text of every saved record, and the answer extracted from each one
# (None when the extractor finds no answer).
output_text = [result['kwargs']['content'] for result in result_dicts]
output_answer = [extract_answer_from_response(text) for text in output_text]

# Positions and raw texts of the responses that produced no answer.
none_answer_indice = [idx for idx, answer in enumerate(output_answer) if answer is None]
none_answer_output = [output_text[idx] for idx in none_answer_indice]

# Show each unparsed response for manual inspection, then the total count.
for idx, output in zip(none_answer_indice, none_answer_output):
    print(f"{idx}:\n{output = }\n")

print(f"{len(none_answer_indice) = }")

In [None]:
import numpy as np
# NOTE: despite the name, this holds the 'total_tokens' value of each
# response's usage_metadata, not an output-only count.
output_tokens_list = [result['kwargs']['usage_metadata']['total_tokens'] for result in result_dicts]
token_counts = np.array(output_tokens_list)
# Response indices ordered from most to fewest tokens (stable: ties keep file order).
descending_order = np.argsort(-token_counts, kind="stable")
print(descending_order.tolist())
# Token counts in that same order. Indexing the counts (instead of sorting the
# negated array) keeps the printed values positive and aligned with the indices above.
print(token_counts[descending_order].tolist())
print(f"median: {np.median(output_tokens_list)}")
print(f"mean: {np.mean(output_tokens_list)}")

In [None]:
# Spot-check: print the 'content' text of the saved record at index 7.
print(result_dicts[7]['kwargs']['content'])

In [None]:
# from langchain_core.output_parsers.json import JsonOutputParser
# import re
# from tqdm.auto import tqdm
# parser = JsonOutputParser()

# for result in tqdm(result_dicts):
#     #try:
#     string = result['kwargs']['content']

#     def escape_single_backslash(match):
#         c = match.group(0)
#         return c.replace("\\", "\\\\")

#     # ChatGPT
#     string = re.sub(r'(?<!\\)\\(?![\\ntbrf"u])', escape_single_backslash, string)

#     x = (parser.parse(string))