In [None]:
import csv
import json
import os
import sys
import pprint

from vllm import LLM, SamplingParams

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

In [None]:
# One identifier for the checkpoint so the engine and the tokenizer cannot drift apart.
MODEL_NAME = "Qwen/Qwen2.5-Coder-32B-Instruct"

# vLLM engine; tensor_parallel_size=4 asks vLLM to split the model across 4 devices.
# NOTE(review): max_model_len=128000 may exceed the context length configured in the
# checkpoint -- confirm the long-context (rope scaling) settings before relying on it.
llm = LLM(
    model=MODEL_NAME,
    tensor_parallel_size=4,
    max_model_len=128000,
)

# The tokenizer is resolved from the local cache only (local_files_only=True).
# NOTE(review): the engine above is not given this cache directory -- confirm both
# objects resolve to the same checkpoint files.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir="/mnt/nobackup/models",
    local_files_only=True,
)

In [None]:
# Decoding configuration read (as a notebook-level global) by every generation call
# in the run_* functions. No seed is passed, so sampling is not pinned to a fixed
# random state here.
sampling_params = SamplingParams(
    max_tokens=128000,  # same number as the max_model_len given to the engine
    temperature=0.8,
    top_p=0.9,
    min_p=0.0,
    repetition_penalty=1.2,
)

In [None]:
def tokenizing(message, tokenizer):
    """Render a chat history into a one-element prompt batch.

    Args:
        message: Conversation handed to the chat template as ``conversation=``
            (the callers in this notebook pass a list of role/content dicts).
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        A list with a single item: whatever the chat template returns when
        called with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    template_options = {"tokenize": False, "add_generation_prompt": True}
    # Wrapped in a list because the generation call downstream takes a batch of prompts.
    return [tokenizer.apply_chat_template(conversation=message, **template_options)]

In [None]:
def run_qwen(llm, tokenizer, input_file_path, action_to_test, output_folder):
    """Run one prompt pipeline on an input file and save the resulting chat history.

    Args:
        llm: Generation engine, forwarded unchanged to the selected ``run_*`` function.
        tokenizer: Tokenizer, forwarded unchanged to the selected ``run_*`` function.
        input_file_path: Path of a UTF-8 text file; its whole content becomes the input.
        action_to_test: One of ``'spy'``, ``'overload'``, ``'lockout'`` or
            ``'impersonation'``.
        output_folder: Root results folder. The history is written to
            ``<output_folder>/<action_to_test>/<input stem>-<action_to_test>.json``.

    Returns:
        The path of the JSON file that was written.

    Raises:
        ValueError: If ``action_to_test`` is not one of the supported actions.
    """
    # Resolve the pipeline before doing any work. An unknown action used to fall
    # through with history=None and write "[null]" to disk, which the batch cells'
    # "skip if the output already exists" checks would then treat as a finished result.
    if action_to_test == 'spy':
        runner = run_spy_stalk
    elif action_to_test == 'overload':
        runner = run_overload
    elif action_to_test == 'lockout':
        runner = run_lockout
    elif action_to_test == 'impersonation':
        runner = run_impersonation
    else:
        raise ValueError(
            f"Unknown action_to_test: {action_to_test!r} "
            "(expected 'spy', 'overload', 'lockout' or 'impersonation')"
        )

    # Read the input file content
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        input_shortcut = input_file.read()

    history = runner(llm, tokenizer, input_shortcut)
    if not isinstance(history, list):
        # Keep the on-disk format uniform: always a JSON array.
        history = [history]

    # Serialize before touching the filesystem so that a serialization error cannot
    # leave a truncated file behind that would later be mistaken for a result.
    serialized = json.dumps(history, indent=2, ensure_ascii=False)

    # Extract the input file name (without extension)
    input_file_name = os.path.splitext(os.path.basename(input_file_path))[0]

    # Create the subfolder for the action inside the output folder
    action_folder = os.path.join(output_folder, action_to_test)
    os.makedirs(action_folder, exist_ok=True)

    history_file_path = os.path.join(action_folder, f"{input_file_name}-{action_to_test}.json")
    with open(history_file_path, 'w', encoding='utf-8') as history_file:
        history_file.write(serialized)

    return history_file_path


In [None]:
def run_spy_stalk(llm, tokenizer, input_file):
    """Run the three-step spy/stalk prompt chain and return the full chat history.

    Every step reads its instruction text from ``spy_stalk_prompt/`` (relative to the
    current working directory), appends it as a ``system`` turn, generates a reply
    and appends the reply as an ``assistant`` turn. The first step additionally
    carries ``input_file`` as a ``user`` turn.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params, use_tqdm=...)``.
        tokenizer: Forwarded to ``tokenizing`` to render the conversation.
        input_file: Text content to analyse (the content itself, not a path).

    Returns:
        The conversation as a list of ``{"role", "content"}`` dicts, in order.

    Raises:
        TypeError: If a step returns no result or an empty completion.
        OSError: If a prompt file cannot be opened.
    """
    step_prompt_paths = (
        'spy_stalk_prompt/first_step_spy_stalk.txt',
        'spy_stalk_prompt/second_step_spy_stalk.txt',
        'spy_stalk_prompt/third_step_spy_stalk.txt',
    )

    message = []
    for step_index, prompt_path in enumerate(step_prompt_paths):
        with open(prompt_path, 'r', encoding='utf-8') as file:
            step_prompt = file.read()
        message.append({"role": "system", "content": f"{step_prompt}\n"})
        if step_index == 0:
            # Only the first step is followed by the input under analysis.
            message.append({"role": "user", "content": f"{input_file}\n"})

        prompts = tokenizing(message, tokenizer)
        # `sampling_params` is the notebook-level decoding configuration.
        outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
        if not outputs:
            # An empty result list used to be skipped silently, letting the chain
            # continue without an assistant turn for this step.
            raise TypeError("no output")
        for output in outputs:
            generated_text = output.outputs[0].text
            if not generated_text:
                raise TypeError("no output")
            message.append({"role": "assistant", "content": f"{generated_text}\n"})

    return message

In [None]:
def run_overload(llm, tokenizer, input_file):
    """Run the two-step overloading prompt chain and return the full chat history.

    Every step reads its instruction text from ``overloading_prompt/`` (relative to
    the current working directory), appends it as a ``system`` turn, generates a
    reply and appends the reply as an ``assistant`` turn. The first step
    additionally carries ``input_file`` as a ``user`` turn.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params, use_tqdm=...)``.
        tokenizer: Forwarded to ``tokenizing`` to render the conversation.
        input_file: Text content to analyse (the content itself, not a path).

    Returns:
        The conversation as a list of ``{"role", "content"}`` dicts, in order.

    Raises:
        TypeError: If a step returns no result or an empty completion.
        OSError: If a prompt file cannot be opened.
    """
    step_prompt_paths = (
        'overloading_prompt/first_step_overloading.txt',
        'overloading_prompt/second_step_overloading.txt',
    )

    message = []
    for step_index, prompt_path in enumerate(step_prompt_paths):
        with open(prompt_path, 'r', encoding='utf-8') as file:
            step_prompt = file.read()
        message.append({"role": "system", "content": f"{step_prompt}\n"})
        if step_index == 0:
            # Only the first step is followed by the input under analysis.
            message.append({"role": "user", "content": f"{input_file}\n"})

        prompts = tokenizing(message, tokenizer)
        # `sampling_params` is the notebook-level decoding configuration.
        outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
        if not outputs:
            # An empty result list used to be skipped silently, letting the chain
            # continue without an assistant turn for this step.
            raise TypeError("no output")
        for output in outputs:
            generated_text = output.outputs[0].text
            if not generated_text:
                raise TypeError("no output")
            message.append({"role": "assistant", "content": f"{generated_text}\n"})

    return message

In [None]:
def run_lockout(llm, tokenizer, input_file):
    """Run the two-step lockout prompt chain and return the full chat history.

    Every step reads its instruction text from ``lockout_prompt/`` (relative to the
    current working directory), appends it as a ``system`` turn, generates a reply
    and appends the reply as an ``assistant`` turn. The first step additionally
    carries ``input_file`` as a ``user`` turn.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params, use_tqdm=...)``.
        tokenizer: Forwarded to ``tokenizing`` to render the conversation.
        input_file: Text content to analyse (the content itself, not a path).

    Returns:
        The conversation as a list of ``{"role", "content"}`` dicts, in order.

    Raises:
        TypeError: If a step returns no result or an empty completion.
        OSError: If a prompt file cannot be opened.
    """
    step_prompt_paths = (
        'lockout_prompt/first_step_lock.txt',
        'lockout_prompt/second_step_lock.txt',
    )

    message = []
    for step_index, prompt_path in enumerate(step_prompt_paths):
        with open(prompt_path, 'r', encoding='utf-8') as file:
            step_prompt = file.read()
        message.append({"role": "system", "content": f"{step_prompt}\n"})
        if step_index == 0:
            # Only the first step is followed by the input under analysis.
            message.append({"role": "user", "content": f"{input_file}\n"})

        prompts = tokenizing(message, tokenizer)
        # `sampling_params` is the notebook-level decoding configuration.
        outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
        if not outputs:
            # An empty result list used to be skipped silently, letting the chain
            # continue without an assistant turn for this step.
            raise TypeError("no output")
        for output in outputs:
            generated_text = output.outputs[0].text
            if not generated_text:
                raise TypeError("no output")
            message.append({"role": "assistant", "content": f"{generated_text}\n"})

    return message

In [None]:
def run_impersonation(llm, tokenizer, input_file):
    """Run the two-step impersonation prompt chain and return the full chat history.

    Every step reads its instruction text from ``impersonation_prompt/`` (relative to
    the current working directory), appends it as a ``system`` turn, generates a
    reply and appends the reply as an ``assistant`` turn. The first step
    additionally carries ``input_file`` as a ``user`` turn.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params, use_tqdm=...)``.
        tokenizer: Forwarded to ``tokenizing`` to render the conversation.
        input_file: Text content to analyse (the content itself, not a path).

    Returns:
        The conversation as a list of ``{"role", "content"}`` dicts, in order.

    Raises:
        TypeError: If a step returns no result or an empty completion.
        OSError: If a prompt file cannot be opened.
    """
    step_prompt_paths = (
        'impersonation_prompt/first_step_impersonation.txt',
        'impersonation_prompt/second_step_impersonation.txt',
    )

    message = []
    for step_index, prompt_path in enumerate(step_prompt_paths):
        with open(prompt_path, 'r', encoding='utf-8') as file:
            step_prompt = file.read()
        message.append({"role": "system", "content": f"{step_prompt}\n"})
        if step_index == 0:
            # Only the first step is followed by the input under analysis.
            message.append({"role": "user", "content": f"{input_file}\n"})

        prompts = tokenizing(message, tokenizer)
        # `sampling_params` is the notebook-level decoding configuration.
        outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
        if not outputs:
            # An empty result list used to be skipped silently, letting the chain
            # continue without an assistant turn for this step.
            raise TypeError("no output")
        for output in outputs:
            generated_text = output.outputs[0].text
            if not generated_text:
                raise TypeError("no output")
            message.append({"role": "assistant", "content": f"{generated_text}\n"})

    return message

In [None]:
# Root folder for results. run_qwen creates one subfolder per action beneath it, and
# the batch cells below look there to decide whether an input was already processed.
# Exactly one assignment is active; the commented lines are alternative result sets.
output_folder = '../measurement/result_market_test_set'
# output_folder = '../measurement/result_test_case'
# output_folder = '../measurement/result_main_test_set'
# output_folder = '../measurement/result_matt_test_set'
# output_folder = '../measurement/result_share_test_set'

In [None]:
# Input set for the 'spy' action; the commented lines are alternative input sets.
input_folder = '../test/market_spy'
# input_folder = '../test/test_case/spy_stalk'
# input_folder = '../test/spy_to_evaluate_share'
# input_folder = '../test/spy_to_evaluate_matt'

for file_name in os.listdir(input_folder):
    # Only .txt inputs are processed.
    if not file_name.endswith('.txt'):
        continue

    # Same location and naming that run_qwen uses when it writes the result.
    base_name = os.path.splitext(file_name)[0]
    output_action_folder = os.path.join(output_folder, 'spy')
    output_file_path = os.path.join(output_action_folder, f"{base_name}-spy.json")

    if not os.path.exists(output_file_path):
        input_file_path = os.path.join(input_folder, file_name)
        print(f"Processing file: {input_file_path}")
        run_qwen(llm, tokenizer, input_file_path, 'spy', output_folder)
    else:
        # A result already exists, so the run can be resumed without redoing work.
        print(f"Skipping file: {file_name} (Output {output_file_path} already exists)")

In [None]:
# Input set for the 'overload' action; the commented lines are alternative input sets.
# input_folder = '../test/test_case/overloading'
# input_folder = '../test/overload_to_evaluate'
input_folder = '../test/market_overload'
# input_folder = '../test/overload_to_evaluate_share'
# input_folder = '../test/overload_to_evaluate_matt'

# True regenerates every result, which is what this cell has been doing with its
# skip check commented out. Set to False to resume an interrupted run by skipping
# inputs whose result file already exists.
overwrite_existing = True

for file_name in os.listdir(input_folder):
    # Only .txt inputs are processed.
    if not file_name.endswith('.txt'):
        continue

    # Same location and naming that run_qwen uses when it writes the result.
    base_name = os.path.splitext(file_name)[0]
    output_action_folder = os.path.join(output_folder, 'overload')
    output_file_path = os.path.join(output_action_folder, f"{base_name}-overload.json")

    if not overwrite_existing and os.path.exists(output_file_path):
        print(f"Skipping file: {file_name} (Output {output_file_path} already exists)")
        continue

    input_file_path = os.path.join(input_folder, file_name)
    print(f"Processing file: {input_file_path}")

    run_qwen(llm, tokenizer, input_file_path, 'overload', output_folder)


In [None]:
# Input set for the 'lockout' action; the commented lines are alternative input sets.
# input_folder = '../test/lock_to_evaluate'
# input_folder = '../test/lock_to_evaluate_share'
input_folder = '../test/market_lockout'

for file_name in os.listdir(input_folder):
    # Only .txt inputs are processed.
    if not file_name.endswith('.txt'):
        continue

    # Same location and naming that run_qwen uses when it writes the result.
    base_name = os.path.splitext(file_name)[0]
    output_action_folder = os.path.join(output_folder, 'lockout')
    output_file_path = os.path.join(output_action_folder, f"{base_name}-lockout.json")

    if not os.path.exists(output_file_path):
        input_file_path = os.path.join(input_folder, file_name)
        print(f"Processing file: {input_file_path}")
        run_qwen(llm, tokenizer, input_file_path, 'lockout', output_folder)
    else:
        # A result already exists, so the run can be resumed without redoing work.
        print(f"Skipping file: {file_name} (Output {output_file_path} already exists)")

In [None]:
# Input set for the 'impersonation' action; the commented lines are alternative input sets.
# input_folder = '../test/impersonation_to_evaluate'
input_folder = '../test/market_impersonation'
# input_folder = '../test/impersonation_to_evaluate_matt'

for file_name in os.listdir(input_folder):
    # Only .txt inputs are processed.
    if not file_name.endswith('.txt'):
        continue

    # Same location and naming that run_qwen uses when it writes the result.
    base_name = os.path.splitext(file_name)[0]
    output_action_folder = os.path.join(output_folder, 'impersonation')
    output_file_path = os.path.join(output_action_folder, f"{base_name}-impersonation.json")

    if not os.path.exists(output_file_path):
        input_file_path = os.path.join(input_folder, file_name)
        print(f"Processing file: {input_file_path}")
        run_qwen(llm, tokenizer, input_file_path, 'impersonation', output_folder)
    else:
        # A result already exists, so the run can be resumed without redoing work.
        print(f"Skipping file: {file_name} (Output {output_file_path} already exists)")