# Gemini Code

In [None]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, ChatSession
from vertexai.generative_models import (
    GenerativeModel,
    HarmCategory,
    HarmBlockThreshold,
    Part,
    SafetySetting,
)
# Vertex AI project and region used by every model call in this notebook.
project_id = "rising-precinct-429608-h9"
location = "asia-southeast1"
vertexai.init(location=location, project=project_id)

# Every listed harm category is mapped to the same threshold, BLOCK_NONE.
safety_config = dict.fromkeys(
    (
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    ),
    HarmBlockThreshold.BLOCK_NONE,
)

# One Gemini model and one chat session, shared by the cells below.
model = GenerativeModel("gemini-1.5-pro")
chat = model.start_chat(response_validation=False)

def get_chat_response(chat: ChatSession, prompt: str):
    """Send ``prompt`` through ``chat`` and return the text of the reply.

    The module-level ``safety_config`` is passed as ``safety_settings`` on
    every call.

    NOTE(review): a chat session presumably accumulates conversation history,
    so prompts sent through one shared session may influence later replies —
    confirm whether each evaluation prompt should use a fresh session.
    """
    return chat.send_message(prompt, safety_settings=safety_config).text

# Smoke test: a single round trip through the shared chat session.
prompt = "Hello"
print(get_chat_response(chat=chat, prompt=prompt))


Hello! 👋  

What can I do for you today? 😊 



In [None]:
import json

# Load the contents of medusa_response.json into `medusa`.
# (`output_path` keeps its original name although it is read from here.)
output_path = 'medusa_response.json'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(output_path, "r", encoding="utf-8") as infile:
    medusa = json.load(infile)
print(len(medusa))


304


In [None]:
import json

# Load the contents of vicuna_response.json into `vicuna`.
# (`output_path` keeps its original name although it is read from here.)
output_path = 'vicuna_response.json'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(output_path, "r", encoding="utf-8") as infile:
    vicuna = json.load(infile)
print(len(vicuna))


304


In [None]:
import json

# Load the reference conversations into `ground_truth_qa`.
# (`output_path` keeps its original name although it is read from here.)
output_path = 'hansard_answered_questions_llama3_formatted_test.json'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(output_path, "r", encoding="utf-8") as infile:
    ground_truth_qa = json.load(infile)
print(len(ground_truth_qa))


304


In [None]:
# Collect one reference answer per conversation: the value of the final turn
# when that turn comes from 'gpt', otherwise an empty placeholder (the
# offending turn is printed so it can be inspected).
ground_truth = []
for qa in ground_truth_qa:
    answer = qa['conversations'][-1]
    if answer['from'] != 'gpt':
        ground_truth.append('')
        print(answer)
        continue
    ground_truth.append(answer['value'])
print(len(ground_truth))

{'from': 'human', 'value': 'Question:Mr Louis Ng Kok Kwang asked the Minister for Sustainability and the Environment in assessing ventilation in migrant worker dormitories using carbon dioxide measurements as a proxy (a) whether the Ministry will consider taking measurements in separate locations, including workers’ sleeping rooms; and (b) whether the Ministry will include all types of dormitories, such as Construction Temporary Quarters and Factory-Converted Dormitories.\n\nSupporting points: **Title: Extension of Ventilation Assessment to All Dormitory Types and Locations**\n\n**Introduction**\n\nIn response to growing concerns over indoor air quality and its potential impacts on the health and well-being of dormitory residents, the Ministry has decided to extend its ventilation assessment to encompass all types of dormitories and locations. This extension is a proactive measure aimed at ensuring the living conditions within these spaces are conducive to the health of the occupants

In [None]:
def get_preference(medusa, other):
  """Ask the judge which of two candidate answers it prefers, per question.

  Each entry of ``medusa`` is split at its last answer marker into the question
  part and the answer part. The matching entry of ``other`` is split the same
  way so that only its answer text is compared; an entry without a marker is
  used whole as the answer. Both ``#Answer:`` and ``# Answer:`` spellings of
  the marker are accepted.

  Args:
    medusa: sequence of strings of the form ``<question>#Answer:<answer>``.
    other: sequence of strings aligned index-by-index with ``medusa``.

  Returns:
    A list with one stripped judge reply per entry of ``medusa`` (the prompt
    asks for "1" or "2").

  Raises:
    IndexError: if ``other`` has fewer entries than ``medusa``.
  """
  import re

  answer_marker = re.compile(r"#\s*Answer:")

  def split_at_last_marker(text):
    """Return (text before the last marker, text after it)."""
    matches = list(answer_marker.finditer(text))
    if not matches:
      # No marker: there is no question part and the whole text is the answer.
      return "", text
    last = matches[-1]
    return text[:last.start()], text[last.end():]

  all_responses = []
  for i, medusa_qa in enumerate(medusa):
    medusa_q, medusa_a = split_at_last_marker(medusa_qa)
    _, other_a = split_at_last_marker(other[i])
    prompt = f"""Tell me which answer you prefer to the question below. Strictly reply 1 or 2 only without explanation or preamble.
    Question: {medusa_q}

    Answer 1: {medusa_a}
    Answer 2: {other_a}
    """
    response = get_chat_response(chat, prompt)
    # Record the verdict; previously the reply was computed and then dropped.
    all_responses.append(response.strip())
  return all_responses


In [None]:
#

# RAGAS

In [None]:
# Disabled draft of a context-relevance metric: everything between the triple
# quotes is a plain string literal, so `context_relevance` is never defined.
'''
def context_relevance(questions):
  all_responses = []
  for q in questions:

    prompt = f"""The following is a question, followed by Supporting points, and then the answer. Only consider the lsat question-answer pair.
    {q}
    List all the independent claims in the supporting points, strictly separated by line breaks only, without explanation or preamble. """
    response1 = get_chat_response(chat, prompt)

    prompt = f"""The following is a question, followed by Supporting points, and then the answer. Only consider the lsat question-answer pair.
    {q}

    These are the independent claims in the supporting point: {response1}

    List the number of independent claims in the supporting point which is relevant to the question, separated by line breaks, without explanation or preamble."""
    response2 = get_chat_response(chat, prompt)
    num_response1 = len(response1.split("\n"))
    num_response2 = len(response2.split("\n"))
    all_responses.append([num_response1, num_response2])
  return all_responses
'''

In [None]:
import time
def answer_correctness(questions, ground_truth):
  """Ask the judge whether each predicted answer agrees with its ground truth.

  The prediction is the text after the last ``# Answer:`` marker in each entry
  of ``questions``. Entries whose ground truth is empty, or whose prediction
  is at most one character long, are not sent to the judge and yield ``""``.

  Args:
    questions: sequence of strings containing ``# Answer:`` followed by the
      predicted answer.
    ground_truth: sequence of reference answers aligned with ``questions``.

  Returns:
    A list with one string per question: ``"0"`` or ``"1"`` when the stripped
    reply starts with that digit, otherwise the stripped reply itself, or
    ``""`` for skipped entries.

  Raises:
    Whatever ``get_chat_response`` raises on the third consecutive failure.
  """
  # Seconds to wait before the second and the third attempt.
  retry_delays = (0.1, 60)

  def ask_judge(prompt):
    """Send ``prompt``, retrying after each delay; the last attempt may raise."""
    for delay in retry_delays:
      try:
        return get_chat_response(chat, prompt)
      except Exception:
        # `Exception` rather than a bare except, so that interrupting the
        # kernel is not swallowed and followed by a long sleep.
        time.sleep(delay)
    return get_chat_response(chat, prompt)

  all_responses = []
  for i in range(len(questions)):
    pred = questions[i].split("# Answer:")[-1]
    gt = ground_truth[i]
    if (len(gt)>0) and (len(pred)> 1):
      # The prompt text is kept byte-for-byte (typos included) so that the
      # judge sees exactly what it saw for the recorded scores.
      prompt = f""" Determine if the following statement is correct given the ground truth. Respond striclty with 1 (corect) or 0 (incorrect).
      Statement: {pred}
      Ground Truth: {gt}

      The output must be 0 or 1 without explanation.
      """
      # Strip first so leading whitespace cannot hide the verdict; slicing
      # with [:1] is safe on an empty reply, unlike indexing with [0].
      response = ask_judge(prompt).strip()
      if response[:1] in ("0", "1"):
        response = response[0]
      all_responses.append(response)
    else:
      all_responses.append("")
  return all_responses


In [None]:
medusa_ans_corr = answer_correctness(medusa , ground_truth)
# Keep only clean binary verdicts. Filtering on length alone would let any
# other single character through and make int() raise (or skew the mean).
medusa_ac = [int(a) for a in medusa_ans_corr if a in ("0", "1")]
import numpy as np
np.mean(medusa_ac) #0.80

In [None]:
vicuna_ans_corr = answer_correctness(vicuna , ground_truth)
# Keep only clean binary verdicts. Filtering on length alone would let any
# other single character through and make int() raise (or skew the mean).
vicuna_ac = [int(a) for a in vicuna_ans_corr if a in ("0", "1")]
print(np.mean(vicuna_ac)) #0.47

0.4717607973421927


In [None]:
from vertexai.preview.language_models import TextEmbeddingModel
# NOTE(review): this rebinds the global `model`, which the first cell bound to
# the Gemini GenerativeModel. `chat` was already created from that earlier
# object, but re-running `model.start_chat(...)` after this cell would target
# the embedding model — consider a distinct name such as `embedding_model`.
model = TextEmbeddingModel.from_pretrained("text-embedding-004")

def get_embeddings(text):
  """Return the embedding vector of a single string.

  Args:
    text: the string to embed; it is submitted as a one-element batch to the
      module-level ``model``.

  Returns:
    The ``values`` attribute of the embedding returned for ``text``.

  Raises:
    ValueError: if the model returns no embedding at all.
  """
  embeddings = model.get_embeddings([text])
  if not embeddings:
    # Fail with a clear message instead of an unbound-variable error.
    raise ValueError("embedding model returned no embeddings for the given text")
  # Exactly one text was submitted, so the first entry is its embedding.
  return embeddings[0].values

In [None]:
import numpy as np
import time
def answer_sim(questions, ground_truth):
    """Cosine similarity between each predicted answer and its reference.

    The prediction is whatever follows the last ``# Answer:`` marker in each
    entry of ``questions``. A pair is skipped when the reference is empty or
    the prediction is 10 characters or shorter, so the returned list can be
    shorter than ``questions``.

    Args:
        questions: sequence of strings containing the predicted answers.
        ground_truth: sequence of reference answers aligned with ``questions``.

    Returns:
        A list of similarity scores, one per pair that was not skipped.
    """
    similarities = []
    for idx, question in enumerate(questions):
        predicted = question.split("# Answer:")[-1]
        reference = ground_truth[idx]
        if len(reference) == 0 or len(predicted) <= 10:
            continue
        pred_vec = np.array(get_embeddings(predicted))
        ref_vec = np.array(get_embeddings(reference))
        norm_product = np.linalg.norm(pred_vec) * np.linalg.norm(ref_vec)
        similarities.append(np.dot(pred_vec, ref_vec) / norm_product)
        # Short pause between pairs, as in the other evaluation loops.
        time.sleep(0.1)
    return similarities

In [None]:
# Embedding similarity of the Medusa answers against the references, then the mean.
medusa_sim = answer_sim(questions=medusa, ground_truth=ground_truth)
np.mean(medusa_sim)

[0.9655236357600216, 0.9806240186432382, 0.8944921415080447]

In [None]:
# Display the individual similarity scores held in `medusa_sim`.
medusa_sim

In [None]:
from tqdm import tqdm
import time
def faithfulness(questions):
  """Count, per entry, the answer's claims and how many the judge ties to the supporting points.

  For each entry the judge is first asked to list the independent claims in
  the answer, then asked which of those claims are relevant to the supporting
  points. Only non-blank lines of each reply are counted, so blank separator
  lines or a trailing newline do not inflate either count.

  Args:
    questions: iterable of strings, each holding a question, its supporting
      points and the answer.

  Returns:
    A list of ``[num_claims, num_supported]`` pairs. ``num_claims`` is at
    least 1, so callers can divide by it, and ``num_supported`` never exceeds
    ``num_claims``.

  Raises:
    Whatever ``get_chat_response`` raises when the single retry also fails.
  """
  def ask_judge(prompt):
    """Send ``prompt``; on failure wait 60 s and try once more."""
    try:
      reply = get_chat_response(chat, prompt)
      time.sleep(0.5)
    except Exception:
      # `Exception` rather than a bare except, so that interrupting the
      # kernel is not swallowed and followed by a 60 s sleep.
      time.sleep(60)
      reply = get_chat_response(chat, prompt)
    return reply

  def count_items(reply):
    """Number of non-blank lines in ``reply``."""
    return sum(1 for line in reply.splitlines() if line.strip())

  all_responses = []
  for q in tqdm(questions):

    # The prompt texts are kept byte-for-byte (typos included).
    # NOTE(review): the first prompt truncates `q` at 150000 characters and the
    # second at 1500000 — this looks like a typo; confirm the intended limit.
    prompt = f"""The following is a question, followed by Supporting points, and then the answer. Only consider the lsat question-answer pair.
    {q[:150000]}
    List all the independent claims in the answer, strictly separated by line breaks only, without explanation or preamble. """
    response1 = ask_judge(prompt)

    prompt = f"""The following is a question, followed by Supporting points, and then the answer. Only consider the lsat question-answer pair.
    {q[:1500000]}

    These are the independent claims in the answers: {response1[:100000]}

    List the points from the above which is relevant to the supporting points, separated by line breaks, without explanation or preamble."""
    response2 = ask_judge(prompt)

    # Floor the denominator at 1 so downstream ratios never divide by zero,
    # and cap the numerator so a ratio cannot exceed 1.
    num_response1 = max(1, count_items(response1))
    num_response2 = min(count_items(response2), num_response1)
    all_responses.append([num_response1, num_response2])
  return all_responses

In [None]:
# Faithfulness per entry = second count / first count of each pair; then the mean.
f_medusa = faithfulness(medusa)
faith_medusa = [int(supported) / int(total) for total, supported in f_medusa]
import numpy as np
np.mean(faith_medusa)

100%|██████████| 304/304 [1:54:44<00:00, 22.65s/it]


0.8575170169742539

In [None]:
# Faithfulness per entry = second count / first count of each pair; then the mean.
f_vicuna = faithfulness(vicuna)
faith_vicuna = [int(supported) / int(total) for total, supported in f_vicuna]
import numpy as np
np.mean(faith_vicuna)

100%|██████████| 304/304 [5:33:33<00:00, 65.84s/it]


0.6961408360895768

In [None]:
from tqdm import tqdm

def answer_relevance(questions):
  """Count, per entry, the answer's claims and how many the judge deems relevant to the question.

  For each entry the judge is first asked to list the independent claims in
  the answer. It is then shown only the text that precedes the "Supporting
  points:" marker (matched case-insensitively) and asked which of the claims
  are relevant to it. Only non-blank lines of each reply are counted.

  Args:
    questions: iterable of strings, each holding a question, its supporting
      points and the answer.

  Returns:
    A list of ``[num_claims, num_relevant]`` pairs. ``num_claims`` is at least
    1, so callers can divide by it, and ``num_relevant`` never exceeds
    ``num_claims``.

  Raises:
    Whatever ``get_chat_response`` raises when the single retry also fails.
  """
  import re

  def ask_judge(prompt):
    """Send ``prompt``; on failure wait 120 s and try once more."""
    try:
      reply = get_chat_response(chat, prompt)
      time.sleep(0.1)
    except Exception:
      # `Exception` rather than a bare except, so that interrupting the
      # kernel is not swallowed and followed by a 120 s sleep.
      time.sleep(120)
      reply = get_chat_response(chat, prompt)
    return reply

  def count_items(reply):
    """Number of non-blank lines in ``reply``."""
    return sum(1 for line in reply.splitlines() if line.strip())

  all_responses = []
  for q in tqdm(questions):

    # The prompt texts are kept byte-for-byte (typos included).
    prompt = f"""The following is a question, followed by Supporting points, and then the answer. Only consider the lsat question-answer pair.
    {q[:1500000]}
    List all the independent claims in the answer, strictly separated by line breaks only, without explanation or preamble. """
    response1 = ask_judge(prompt)

    # The sample data spells the marker "Supporting points:", so a
    # case-sensitive split on "Supporting Points:" never matched and the
    # whole entry was sent as "the question".
    question_only = re.split(r"supporting points:", q, maxsplit=1, flags=re.IGNORECASE)[0]
    prompt = f"""The following is a question.
    {question_only[:1500000]}

    These are the independent claims in the answer:
    {response1[:100000]}

    List the claims above which are relevant to the question, separated by line breaks, without explanation or preamble."""
    response2 = ask_judge(prompt)

    # Floor the denominator at 1 so downstream ratios never divide by zero,
    # and cap the numerator so a ratio cannot exceed 1.
    num_response1 = max(1, count_items(response1))
    num_response2 = min(count_items(response2), num_response1)
    all_responses.append([num_response1, num_response2])
  return all_responses

In [None]:
# Answer relevance per entry = second count / first count of each pair; then the mean.
ar_medusa = answer_relevance(medusa)
rel_medusa = [int(relevant) / int(total) for total, relevant in ar_medusa]
import numpy as np
np.mean(rel_medusa)

In [None]:
# Answer relevance per entry = second count / first count of each pair; then the mean.
ar_vicuna = answer_relevance(vicuna)
rel_vicuna = [int(relevant) / int(total) for total, relevant in ar_vicuna]
import numpy as np
np.mean(rel_vicuna)