In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import transformers
from tqdm import tqdm

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

device

In [None]:
# Tokenizer that matches the Mistral instruct checkpoint loaded below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1"
)

In [None]:
# bitsandbytes quantization settings handed to from_pretrained below.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype: bfloat16
    bnb_4bit_quant_type="nf4",              # quantization type: nf4
    bnb_4bit_use_double_quant=True,         # double quantization enabled
    load_in_4bit=True,                      # load the weights in 4-bit
)

In [None]:
# All 4-bit settings (including load_in_4bit=True) live in bnb_config.
# Passing load_in_4bit=True a second time as a keyword is redundant, and recent
# transformers releases reject it when quantization_config is also supplied.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)


In [None]:
#model.to(device) # not needed

In [None]:
# Report the value returned by the model's get_memory_footprint().
mem = model.get_memory_footprint()
print(f"Memory footprint: {mem} ")

## LLM Reranking Test

In [None]:
# get_resp appends the length of every prompt it encodes to this list.
PROMPT_LENGTHS = list()

In [None]:
def get_resp(system_text, user_text, max_new_tokens=400):
  """Generate a completion for ``system_text + user_text`` and return the decoded text.

  Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
  ``PROMPT_LENGTHS``.

  Args:
    system_text: Leading part of the prompt (may be an empty string).
    user_text: Trailing part of the prompt; concatenated directly after
      ``system_text`` with no separator.
    max_new_tokens: Upper bound on the number of generated tokens
      (defaults to 400, the value previously hard-coded).

  Returns:
    The first decoded sequence from ``tokenizer.batch_decode``. Special tokens
    are not skipped, and the result covers the full generated sequence (prompt
    included), which is why callers slice the prompt off themselves.

  Side effects:
    Appends the prompt's token count to ``PROMPT_LENGTHS``.
  """
  input_text = system_text + user_text

  # The prompt text already contains its own <s> / [INST] markers, so the
  # tokenizer is told not to add special tokens on top.
  encoded = tokenizer(input_text, return_tensors="pt", add_special_tokens=False)
  model_inputs = encoded.to(device)

  # Read the length from the input_ids tensor. Integer indexing into a
  # BatchEncoding (model_inputs[0]) only works with fast tokenizers.
  PROMPT_LENGTHS.append(model_inputs["input_ids"].shape[-1])

  # do_sample=False with num_beams=1: no sampling and a single beam.
  generated_ids = model.generate(
      **model_inputs,
      max_new_tokens=max_new_tokens,
      pad_token_id=tokenizer.eos_token_id,
      do_sample=False,
      num_beams=1,
  )
  decoded = tokenizer.batch_decode(generated_ids)
  return decoded[0]

In [None]:
# Few-shot prefix for every prompt: one [INST] ... [/INST] block holding the task
# instructions and a worked example, then the example JSON reply and a closing </s>.
# The user prompt is concatenated directly after this text.
# NOTE: the "POSIBLE ANSWERS:" spelling is repeated verbatim in the user prompts
# and in build_pos_neg; if it is ever corrected, change all of them together.
system_text = """<s>[INST] You are a smart text analysis assistant.
Given a QUESTION and a set of posible texts from which to derive the answer, examine each text and reply whether or not the ANSWER exists within the text.
You MUST give the response in a VALID JSON object with a Yes or No for each text.


So for instance the following:

QUESTION: What is the color of the sky?
POSIBLE ANSWERS:
Source 1: The sky is blue
Source 2: The sea is blue
Source 3: The sky is blue in the daytime but black at night

would give an output of:[/INST]

{
"Source 1": "Yes",
"Source 2": "No",
"Source 3": "Yes"
}
</s>
"""

In [None]:
user_text = """[INST]
QUESTION: what is rba
POSIBLE ANSWERS:
Source 1: Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.
Source 2: Results-Based Accountability® (also known as RBA) is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole. RBA is also used by organizations to improve the performance of their programs. Creating Community Impact with RBA. Community impact focuses on conditions of well-being for children, families and the community as a whole that a group of leaders is working collectively to improve. For example: “Residents with good jobs,” “Children ready for school,” or “A safe and clean neighborhood”.
Source 3: RBA Recognized with the 2014 Microsoft US Regional Partner of the ... by PR Newswire. Contract Awarded for supply and support the. Securitisations System used for risk management and analysis.
Source 4: The inner workings of a rebuildable atomizer are surprisingly simple. The coil inside the RBA is made of some type of resistance wire, normally Kanthal or nichrome. When a current is applied to the coil (resistance wire), it heats up and the heated coil then vaporizes the eliquid. 1 The bottom feed RBA is, perhaps, the easiest of all RBA types to build, maintain, and use. 2  It is filled from below, much like bottom coil clearomizer. 3  Bottom feed RBAs can utilize cotton instead of silica for the wick. 4  The Genesis, or genny, is a top feed RBA that utilizes a short woven mesh wire.
Source 5: Results-Based Accountability® (also known as RBA) is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole. RBA is also used by organizations to improve the performance of their programs. RBA improves the lives of children, families, and communities and the performance of programs because RBA: 1  Gets from talk to action quickly; 2  Is a simple, common sense process that everyone can understand; 3  Helps groups to surface and challenge assumptions that can be barriers to innovation;
[/INST]"""

# Run the prompt, then print the decoded text with its first
# len(system_text) characters sliced off.
a = get_resp(system_text=system_text, user_text=user_text)
print(a[len(system_text):])

In [None]:
user_text = """[INST]
QUESTION: What is the Capital of Spain?
POSIBLE ANSWERS:
Source 1: The Capital of Italy is Rome
Source 2: Delhi is the capital of India
Source 3: Washington DC is the Capital of the United States of Americe
[/INST]
"""

# Run the prompt, then print the decoded text with its first
# len(system_text) characters sliced off.
a = get_resp(system_text=system_text, user_text=user_text)
print(a[len(system_text):])

In [None]:
user_text = """[INST]
QUESTION: was ronald reagan a democrat
POSIBLE ANSWERS:
Source 1: From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic 'malaise' of the 1970s and the presidency of Jimmy Carter (even more than, four years earlier, Liberal Republican Gerald Ford), supported former California governor (and former Democrat) Ronald Reagan.
Source 2: Ronald Reagan began his political life in the Democratic Party, but as he became more and more conservative, he ultimately changed to the Republican Party in the early 1960s. Yes, he switched parties in 1962. He said that he did not desert the Democrats but rather they deserted him. Yes, Ronald Reagan was a member of the Democratic Party until he s â€¦ witched to the Republican Party in 1962, at the age of 51. 8 people found this useful.
Source 3: Ronald Wilson Reagan (/ËˆrÉ’nÉ™ld ËˆwÉªlsÉ™n ËˆreÉªÉ¡É™n/ ; February 6, 1911 â€“ June 5, 2004) was an American politician, commentator, and actor, who served as the 40th President of the United States from 1981 to 1989. I think Ronald Reagan changed the trajectory of America in a way that Richard Nixon did not and in a way that Bill Clinton did not. He put us on a fundamentally different path because the country was ready for it.
[/INST]
"""

# Run the prompt, then print the decoded text with its first
# len(system_text) characters sliced off.
a = get_resp(system_text=system_text, user_text=user_text)
print(a[len(system_text):])

In [None]:
user_text = """[INST]
QUESTION: was ronald reagan a democrat
POSIBLE ANSWERS:
Source 1: From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic 'malaise' of the 1970s and the presidency of Jimmy Carter (even more than, four years earlier, Liberal Republican Gerald Ford), supported former California governor  Ronald Reagan.
Source 2: Ronald Reagan began his political life in the Democratic Party, but as he became more and more conservative, he ultimately changed to the Republican Party in the early 1960s. Yes, he switched parties in 1962. He said that he did not desert the Democrats but rather they deserted him. Yes, Ronald Reagan was a member of the Democratic Party until he s â€¦ witched to the Republican Party in 1962, at the age of 51. 8 people found this useful.
Source 3: Ronald Wilson Reagan (/ËˆrÉ’nÉ™ld ËˆwÉªlsÉ™n ËˆreÉªÉ¡É™n/ ; February 6, 1911 â€“ June 5, 2004) was an American politician, commentator, and actor, who served as the 40th President of the United States from 1981 to 1989. I think Ronald Reagan changed the trajectory of America in a way that Richard Nixon did not and in a way that Bill Clinton did not. He put us on a fundamentally different path because the country was ready for it.
[/INST]
"""

# Run the prompt, then print the decoded text with its first
# len(system_text) characters sliced off.
a = get_resp(system_text=system_text, user_text=user_text)
print(a[len(system_text):])

In [None]:
PROMPT_LENGTHS, max(PROMPT_LENGTHS)


## Run multiple

In [None]:
from datasets import load_dataset
import random
import json

In [None]:
# First 1000 rows of the MS MARCO v1.1 training split.
dataset = load_dataset(
    path="ms_marco",
    name="v1.1",
    split="train[:1000]",
)

In [None]:
# Keep only the rows whose query_type is "description".
dataset = dataset.filter(function=lambda row: row["query_type"] == "description")

In [None]:
len(dataset)

In [None]:
def build_pos_neg(row, system_prompt=None, num_output=5):
  """Build one reranking prompt from a dataset row.

  The selected (positive) passage is placed at a random position among the
  non-selected (negative) passages, and every passage is listed as
  ``Source <k>: <text>`` under the row's query.

  Args:
    row: Mapping with a ``"query"`` string and a ``"passages"`` mapping that
      holds the parallel lists ``"is_selected"`` and ``"passage_text"``.
    system_prompt: Text placed before the question block. When ``None``
      (the default) the notebook-level ``system_text`` is used.
    num_output: Maximum number of sources to list (default 5).

  Returns:
    ``{"prompt": <str>}``. The prompt is the system prompt, then the
    ``[INST]`` question block, then ``[/INST]``.

  Notes:
    * Sources are numbered contiguously from 1. When the row has fewer than
      ``num_output`` passages only that many are listed. The earlier code
      re-appended the previous source line, or raised ``NameError``, once the
      negatives ran out.
    * When no passage is selected, only negatives are listed and no empty
      source entry is emitted.
    * When several passages are selected, the last one is the positive and the
      other selected passages are not used.
  """
  is_selected = row["passages"]["is_selected"]
  passages = row["passages"]["passage_text"]

  # Split the passages into the positive one and the list of negatives.
  pos_passage_text = None
  neg_passage_text = []
  for flag, text in zip(is_selected, passages):
    if flag == 1:
      pos_passage_text = text
    else:
      neg_passage_text.append(text)

  has_positive = pos_passage_text is not None

  # Never open more slots than there are passages available to fill them.
  available = len(neg_passage_text) + (1 if has_positive else 0)
  num_slots = max(0, min(num_output, available))

  # 1-based slot that receives the positive passage; 0 means "no positive".
  n = random.randint(1, num_slots) if (has_positive and num_slots > 0) else 0

  # Negatives are consumed in order, so none is skipped and none is repeated.
  negatives = iter(neg_passage_text)

  parts = ["[INST]\nQUESTION: " + row["query"] + "\nPOSIBLE ANSWERS:\n"]
  for i in range(1, num_slots + 1):
    text = pos_passage_text if i == n else next(negatives)
    parts.append("Source " + str(i) + ": " + text + "\n")

  if system_prompt is None:
    system_prompt = system_text
  prompt = system_prompt + "".join(parts) + "[/INST]"

  return {
        "prompt": prompt
  }

In [None]:
# Save columns
# Capture the current column names (passed to map() as remove_columns later on).
original_columns = list(dataset.column_names)
original_columns

In [None]:
# Format dataset
# Turn every row into a prompt and drop the original columns.
dataset = dataset.map(function=build_pos_neg, remove_columns=original_columns)

In [None]:
print(dataset[1]["prompt"])

In [None]:
# The dataset prompt is passed as the user text with an empty system text.
# The first len(system_text) characters of the decoded text are then sliced off.
a = get_resp(system_text="", user_text=dataset[1]["prompt"])
print(a[len(system_text):])

In [None]:
len(dataset)

In [None]:
dataset[216], dataset[0]

In [None]:
# Start from an empty list so the statistics below only cover the batch run.
PROMPT_LENGTHS = list()

In [None]:
resp = []
# range(len(dataset)) visits every row exactly once. The previous upper bound of
# len(dataset) + 1 indexed one past the last row and raised IndexError at the
# very end of the run.
for i in tqdm(range(len(dataset))):
  p = dataset[i]["prompt"]
  a = get_resp("", p)
  # Skip the first len(system_text) characters of the decoded text and keep the rest.
  resp.append(a[len(system_text):])

In [None]:
print(resp[0])

In [None]:
max(PROMPT_LENGTHS), min(PROMPT_LENGTHS)

In [None]:
sum(PROMPT_LENGTHS) / len(PROMPT_LENGTHS)

In [None]:
import pandas as pd
# One row per collected response.
df = pd.DataFrame(data=resp)

In [None]:
df.head()

In [None]:
# Write the responses to CSV without the index column.
df.to_csv(path_or_buf='outputs-LARGE.csv', index=False)