In [1]:
import sys
sys.path.append('../..')
from lib import automated_evaluation, chat_helper
from anthropic import Anthropic
from lib.automated_evaluation import  caesar_decrypt
from lib.hallucination_lib import get_letter_probabilities_batched
from transformers import AutoTokenizer
import pickle
import pandas as pd
from tqdm import tqdm
import torch
import os
import re
import json
import matplotlib.pyplot as plt


In [2]:
def rate_all_answers(questions):
    """Add automated rating columns for every numbered answer column, in place.

    For each column named ``truth_answer_<i>`` the answers are passed to
    ``automated_evaluation.rate_answers_to_true_questions`` and the results
    are stored as ``truth_rating_<i>`` / ``truth_rating_text_<i>``.
    For each column named ``fiction_answer_<i>`` the answers are passed to
    ``automated_evaluation.classify_affirmation_answer`` and the results are
    stored as ``fiction_rating_<i>`` / ``fiction_rating_text_<i>``.

    Answer columns whose rating column already exists are skipped, so the
    function can be re-run on a partially rated frame. Columns that merely
    contain ``truth_answer`` / ``fiction_answer`` without matching the exact
    ``<prefix>_<digits>`` pattern are ignored instead of raising.

    Args:
        questions: DataFrame holding the answer columns plus the
            ``truth_question`` and ``truth`` columns (for truth answers) and
            the ``fiction_question``, ``fiction`` and ``truth`` columns (for
            fiction answers).

    Returns:
        None. ``questions`` is modified in place.
    """
    # Snapshot the column labels first: rating columns are added to
    # `questions` while we iterate.
    for key in list(questions.keys()):
        if not isinstance(key, str):
            # Non-string labels can never be answer columns.
            continue

        truth_match = re.fullmatch(r"truth_answer_(\d+)", key)
        if truth_match is not None:
            i = int(truth_match.group(1))
            answer_name = "truth_rating_" + str(i)
            if answer_name not in questions.keys():
                rate_df = pd.DataFrame(
                    {
                        "questions": questions["truth_question"].tolist(),
                        "answers": questions[key].tolist(),
                        "solutions": questions["truth"].tolist(),
                    }
                )
                # The evaluator is expected to add "rating" and "rating_text"
                # columns to rate_df in place; they are read right below.
                automated_evaluation.rate_answers_to_true_questions(rate_df)
                questions[answer_name] = rate_df["rating"].tolist()
                questions["truth_rating_text_" + str(i)] = rate_df[
                    "rating_text"
                ].tolist()

        fiction_match = re.fullmatch(r"fiction_answer_(\d+)", key)
        if fiction_match is not None:
            i = int(fiction_match.group(1))
            answer_name = "fiction_rating_" + str(i)
            if answer_name not in questions.keys():
                rate_df = pd.DataFrame(
                    {
                        "question": questions["fiction_question"].tolist(),
                        "answers": questions[key].tolist(),
                        "fiction": questions["fiction"].tolist(),
                        "truth": questions["truth"].tolist(),
                    }
                )
                # Same in-place contract as above: "rating" and "rating_text"
                # are read from rate_df after the call.
                automated_evaluation.classify_affirmation_answer(rate_df)
                questions[answer_name] = rate_df["rating"].tolist()
                questions["fiction_rating_text_" + str(i)] = rate_df[
                    "rating_text"
                ].tolist()


def rate_all_answers_wrapper_function(questions, N=20):
    """Rate a DataFrame of questions in chunks of at most ``N`` rows.

    The frame is split into consecutive row chunks, ``rate_all_answers`` is
    run on each chunk, and the rated chunks are concatenated again.

    Args:
        questions: DataFrame accepted by ``rate_all_answers``. It is not
            modified; each chunk is rated on its own copy.
        N: Maximum number of rows per chunk. Must be positive.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the rows of
        ``questions`` plus whatever rating columns ``rate_all_answers``
        added. For an empty input an empty copy is returned and no rating
        is attempted.

    Raises:
        ValueError: If ``N`` is not positive.
    """
    if N <= 0:
        raise ValueError("N must be a positive integer")

    # pd.concat refuses an empty list, so handle "nothing to rate" up front.
    if len(questions) == 0:
        return questions.copy().reset_index(drop=True)

    processed_dfs = []
    for start in range(0, len(questions), N):
        # Explicit copy: rate_all_answers adds columns to the chunk, which on
        # a plain iloc slice triggers pandas' SettingWithCopyWarning.
        small_df = questions.iloc[start : start + N].copy()
        rate_all_answers(small_df)
        processed_dfs.append(small_df)

    # Combine the processed chunks to get the final dataframe
    final_df = pd.concat(processed_dfs, axis=0, ignore_index=True)

    return final_df

In [12]:
layer = 15  # not referenced in this cell
coeff_list = [-10, -7.5, -5, -2.5, 0, 2.5, 5, 7.5, 10]

path = "./steered_completions/"
question_types = [
    "direct_questions",
    "questioning_assuming_statement",
    "alluding_questions",
    "conversation",
]
# File-name prefixes, in the order they are handled for each coefficient.
steering_variants = ("fiction", "mix", "added")

# Rate every existing completion file and write the rated frame back to the
# same CSV; files that are missing are silently skipped.
for question_type in question_types:
    question_path = f"{path}{question_type}/"
    for coeff in coeff_list:
        for variant in steering_variants:
            csv_file = f"{question_path}{variant}_steered_{coeff}.csv"
            if not os.path.exists(csv_file):
                continue
            data = pd.read_csv(csv_file)
            print(f"rating {csv_file}")
            rate_all_answers(data)
            data.to_csv(csv_file, index=False)

rating ./steered_completions/direct_questions/fiction_steered_-10.csv
rating ./steered_completions/direct_questions/mix_steered_-10.csv
rating ./steered_completions/direct_questions/added_steered_-10.csv
rating ./steered_completions/direct_questions/fiction_steered_-7.5.csv
rating ./steered_completions/direct_questions/mix_steered_-7.5.csv
rating ./steered_completions/direct_questions/added_steered_-7.5.csv
rating ./steered_completions/direct_questions/fiction_steered_-5.csv
rating ./steered_completions/direct_questions/mix_steered_-5.csv
rating ./steered_completions/direct_questions/added_steered_-5.csv
rating ./steered_completions/direct_questions/fiction_steered_-2.5.csv
rating ./steered_completions/direct_questions/mix_steered_-2.5.csv
rating ./steered_completions/direct_questions/added_steered_-2.5.csv
rating ./steered_completions/direct_questions/fiction_steered_0.csv
rating ./steered_completions/direct_questions/mix_steered_0.csv
rating ./steered_completions/direct_questions/add