In [None]:
!pip install together datasets

In [None]:
# A `!export ...` line runs in a short-lived subshell, so the variable never reaches
# the kernel process. Set it on os.environ instead, and prompt for the value so the
# key is never stored in the notebook or its outputs.
import os
from getpass import getpass

if not os.environ.get("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass("Together API key: ")

In [None]:
import pandas as pd
import json
import requests
from together import Together
from datasets import load_dataset

In [None]:
# Load the StonyBrookNLP/tellmewhy dataset through `datasets.load_dataset`.
# NOTE(review): `ds` is not referenced by any of the cells visible in this notebook
# section — confirm it is still needed before keeping the download.
ds = load_dataset("StonyBrookNLP/tellmewhy")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.76k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

train.json:   0%|          | 0.00/70.1M [00:00<?, ?B/s]

validation.json:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/71892 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8976 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10689 [00:00<?, ? examples/s]

In [None]:
def csv_to_json(csv_path):
    """Read the merged predictions CSV and return its rows as a list of dicts.

    Only the columns consumed by the rest of the pipeline are kept; any other
    column present in the file is ignored.

    Args:
        csv_path: Path (or anything `pd.read_csv` accepts) of the CSV to load.

    Returns:
        A list with one dict per CSV row, keyed by the column names listed below
        and in that order.

    Raises:
        KeyError: If the file has at least one row and lacks one of the columns.
    """
    wanted_columns = (
        "narrative",
        "question",
        "answer",
        "T5-prediction",
        "fine-tuned-T5-prediction",
        "fine-tuned-prompt-generated_answer",
        "t5-prompt-generated_answer",
    )
    frame = pd.read_csv(csv_path)
    return [
        {column: record[column] for column in wanted_columns}
        for _, record in frame.iterrows()
    ]

In [None]:

def _extract_rankings(response_content):
    """Parse the judge's reply into a list of ranking dicts, or return None if unusable."""
    text = response_content.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Chat models often wrap the JSON in Markdown fences or add a sentence
        # around it; retry on the outermost [...] span before giving up.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            return None
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return None

    # Downstream code indexes item["model"] / item["ranking"], so anything that is
    # not a list of such dicts is treated as an invalid response.
    if not isinstance(parsed, list):
        return None
    if not all(isinstance(item, dict) and "model" in item and "ranking" in item for item in parsed):
        return None
    return parsed


def send_to_api(json_data, api_key=None, client=None):
    """Ask the judge model to rank the four model answers for every entry.

    One chat-completion request is sent per entry. The returned list always has
    exactly one element per entry of `json_data`, in the same order, so it can be
    paired positionally with the input. Entries whose request fails or whose reply
    cannot be parsed get `{"rankings": []}` and are recorded in
    `invalid_responses.json` (written to the current working directory).

    Args:
        json_data: Iterable of dicts with the keys "narrative", "question",
            "answer", "T5-prediction", "fine-tuned-T5-prediction",
            "fine-tuned-prompt-generated_answer" and "t5-prompt-generated_answer".
        api_key: Together API key. When omitted, the TOGETHER_API_KEY environment
            variable is used. Ignored if `client` is given.
        client: Optional pre-built client exposing `chat.completions.create(...)`;
            mainly useful for tests or for reusing an existing client.

    Returns:
        A list of `{"rankings": [...]}` dicts, one per input entry.

    Raises:
        ValueError: If no client is given and no API key can be found.
    """
    if client is None:
        import os

        # Never hardcode the key in the notebook; take it from the caller or the environment.
        api_key = api_key or os.environ.get("TOGETHER_API_KEY")
        if not api_key:
            raise ValueError(
                "Together API key not provided. Pass api_key=... or set the "
                "TOGETHER_API_KEY environment variable."
            )
        client = Together(api_key=api_key)

    invalid_responses = []  # To store responses that are not in the expected format

    prompt_template = (
        "You are an impartial evaluation judge tasked with assessing responses from different models based on the following criteria:\n"
        "1. **Accuracy**: How correct and precise the response is compared to the provided answer.\n"
        "2. **Logical Reasoning**: The quality of logical connections and reasoning in the response.\n"
        "3. **Lexical Coherence**: How well-structured and meaningful the language is.\n"
        "4. **Syntactical Similarity**: How closely the syntax matches that of the provided answer.\n\n"
        "Your task is to rank the responses from the models in increasing order of performance:\n"
        "- The **best-performing model** is ranked **1st**.\n"
        "- The **worst-performing model** is ranked **4th**.\n\n"
        "Please return the rankings in the following JSON format:\n"
        "[\n"
        "    {\"model\": \"Model Name\", \"ranking\": 1},\n"
        "    {\"model\": \"Model Name\", \"ranking\": 2},\n"
        "    {\"model\": \"Model Name\", \"ranking\": 3},\n"
        "    {\"model\": \"Model Name\", \"ranking\": 4}\n"
        "]"
    )

    prompt_ending = "Just give the JSON result. Keep the analysis with yourself."
    responses = []

    for entry in json_data:
        # Construct the prompt
        prompt = (
            f"{prompt_template}\n\n"
            f"Narrative:\n{entry['narrative']}\n\n"
            f"Question: {entry['question']}\n"
            f"Correct Answer: {entry['answer']}\n\n"
            f"T5-prediction: {entry['T5-prediction']}\n"
            f"fine-tuned-T5-prediction: {entry['fine-tuned-T5-prediction']}\n"
            f"fine-tuned-prompt-generated_answer: {entry['fine-tuned-prompt-generated_answer']}\n"
            f"t5-prompt-generated_answer: {entry['t5-prompt-generated_answer']}\n\n"
            f"{prompt_ending}"
        )

        response_content = ""
        try:
            print("Sending data to api")
            stream = client.chat.completions.create(
                model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
                messages=[{"role": "user", "content": prompt}],
                stream=True,
            )
            # Consume the stream inside the try: network errors can also surface
            # while iterating, not only when the request is created.
            for chunk in stream:
                if not chunk.choices:
                    # Some chunks carry no choices; skip them.
                    continue
                response_content += chunk.choices[0].delta.content or ""
        except Exception as e:
            print(f"API call failed for question: {entry['question']}. Error: {str(e)}")
            invalid_responses.append(
                {"question": entry["question"], "response": response_content, "error": str(e)}
            )
            # Keep a placeholder and carry on so the output stays aligned with json_data
            # instead of silently dropping this and every later entry.
            responses.append({"rankings": []})
            continue

        rankings = _extract_rankings(response_content)
        if rankings is None:
            print(f"Invalid response format for question: {entry['question']}")
            invalid_responses.append({"question": entry['question'], "response": response_content})
            responses.append({"rankings": []})
        else:
            responses.append({"rankings": rankings})

    with open("invalid_responses.json", "w", encoding="utf-8") as file:
        json.dump(invalid_responses, file, indent=4)

    return responses



In [None]:
def create_ranked_csv(json_data, api_responses, output_csv_path):
    """Write one CSV row per input entry with the four answers and their rankings.

    Entries are paired with `api_responses` by position. An entry with no matching
    response, or whose response lacks a ranking for a model, gets "N/A" in the
    corresponding ranking column, so no input row is dropped. Ranking items that
    are not dicts with a string "model" and a "ranking" key are ignored.

    Args:
        json_data: Iterable of entry dicts with the keys "question", "answer",
            "T5-prediction", "fine-tuned-T5-prediction",
            "fine-tuned-prompt-generated_answer" and "t5-prompt-generated_answer".
        api_responses: Iterable of dicts shaped like `{"rankings": [...]}`, where
            each ranking item is `{"model": <name>, "ranking": <value>}`.
        output_csv_path: Destination path for the CSV (written without the index).

    Returns:
        None. The CSV is written to `output_csv_path` and a confirmation is printed.
    """
    prediction_columns = (
        "T5-prediction",
        "fine-tuned-T5-prediction",
        "fine-tuned-prompt-generated_answer",
        "t5-prompt-generated_answer",
    )
    # The ranking columns deliberately keep the order used by the existing output
    # file, which differs from the order of the prediction columns.
    ranked_models = (
        "T5-prediction",
        "fine-tuned-T5-prediction",
        "t5-prompt-generated_answer",
        "fine-tuned-prompt-generated_answer",
    )
    columns = [
        "Question",
        "Answers",
        *prediction_columns,
        *(f"{model}_ranking" for model in ranked_models),
    ]

    responses = list(api_responses)
    rows = []
    for index, entry in enumerate(json_data):
        # A plain zip() would silently drop entries when fewer responses came back.
        response = responses[index] if index < len(responses) else {}

        ranking_items = response.get("rankings", [])
        if not isinstance(ranking_items, list):
            ranking_items = []

        ranking_by_model = {}
        for item in ranking_items:
            # The rankings come from an LLM, so tolerate malformed items.
            if (
                isinstance(item, dict)
                and isinstance(item.get("model"), str)
                and "ranking" in item
            ):
                ranking_by_model[item["model"]] = item["ranking"]

        row = {"Question": entry["question"], "Answers": entry["answer"]}
        for column in prediction_columns:
            row[column] = entry[column]
        for model in ranked_models:
            row[f"{model}_ranking"] = ranking_by_model.get(model, "N/A")
        rows.append(row)

    # Passing columns explicitly keeps the header even when there are no rows.
    ranked_df = pd.DataFrame(rows, columns=columns)
    ranked_df.to_csv(output_csv_path, index=False)
    print(f"Ranked CSV saved to {output_csv_path}")

In [None]:
# Run the pipeline end to end: read the predictions CSV, ask the judge model to
# rank the answers for every row, and write the rankings to a new CSV.
if __name__ == "__main__":
    csv_path = "merged_predictions.csv"  # input file, resolved against the working directory
    # NOTE(review): "finsl" looks like a typo for "final"; left unchanged because
    # other tooling may already expect this exact file name — confirm before renaming.
    output_csv_path = "ranked_output_finsl.csv"

    json_data = csv_to_json(csv_path)
    # One API request is made per row of json_data.
    api_responses = send_to_api(json_data)
    create_ranked_csv(json_data, api_responses, output_csv_path)

Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
Sending data to api
