In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
!pip install trl

In [None]:
!pip install -U "huggingface_hub[cli]"

In [None]:
!huggingface-cli login

In [13]:
# Mount Google Drive at /content/drive; the dataset, checkpoint and output
# paths used by the later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Dataset
from datasets import load_dataset
from trl import SFTTrainer

print("Loading Dataset\n")


def _load_jsonl_split(jsonl_path):
    """Read one JSON-lines file through ``load_dataset`` and return its "train" split."""
    return load_dataset("json", data_files=jsonl_path, split="train")


# The train / validation / test partitions are stored as separate .jsonl files
# in the same Drive folder; each one is loaded as its own dataset object.
train_dataset = _load_jsonl_split("/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/train.jsonl")
valid_dataset = _load_jsonl_split("/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/valid.jsonl")
test_dataset = _load_jsonl_split("/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/test.jsonl")

def formatting_prompts_func_2(example):
  """Build the instruction-style inference prompt for one reddit example.

  Args:
    example: mapping that provides the 'subreddit', 'title' and 'post' fields.

  Returns:
    A string made of the BOS marker, an "### Instruction:" section, an
    "### Input:" section holding the post, and a trailing "### Response:"
    header. The reference summary and the EOS marker are not appended, so
    the text ends where a completion is expected to start.
  """
  bos_token = "<s>"

  instruction_section = "### Instruction:" + "\n" + "Summarize the given reddit post"
  input_section = "\n\n### Input:" + "\n" + f"SUBREDDIT: r/{example['subreddit']}\nTITLE: {example['title']}\nPOST: {example['post']}"
  response_header = "\n\n### Response:"

  return "".join((bos_token, instruction_section, input_section, response_header))

def formatting_prompts_func_3(example):
    """Build the "TL;DR:"-style inference prompt for one reddit example.

    Args:
        example: mapping that provides the 'subreddit', 'title' and 'post' fields.

    Returns:
        The BOS marker followed by the subreddit, title and post lines and a
        final "TL;DR:" cue. The reference summary and the EOS marker are not
        appended.
    """
    bos_token = "<s>"
    post_section = f"SUBREDDIT: r/{example['subreddit']}\nTITLE: {example['title']}\nPOST: {example['post']}\nTL;DR:"
    return bos_token + post_section

Loading Dataset



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
# Model

print("Loading Model\n")

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Quantization settings handed to the model loader: 4-bit loading with the
# "nf4" quant type, double quantization enabled, and bfloat16 as compute dtype.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Loader options kept together so the call below stays short.
# (A Drive copy at "/content/drive/MyDrive/ConceptualGapsProject/huggingFace"
# was previously tried as the model source; the Hub id is used instead.)
model_load_options = dict(
    cache_dir="/content/mistral_weights/",
    device_map='auto',
    quantization_config=nf4_config,
    use_cache=True,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", **model_load_options)

Loading Model



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [11]:
# Tokenizer
print("Loading Tokenizer\n")

from transformers import AutoTokenizer

# Tokenizer for the same Hub id the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    trust_remote_code=True,
)

# Pad on the left and reuse the EOS token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

Loading Tokenizer



tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
# Loading PEFT Model and save the summary results to output file

from peft import PeftModel
from tqdm import tqdm
import json

# Wrap the quantized base `model` with the PEFT weights stored in this
# checkpoint directory (presumably the SFT adapter at step 10000 -- confirm).
sft_model_3 = PeftModel.from_pretrained(model, "/content/drive/MyDrive/ConceptualGapsInML/SFT_Model_3/SFT_Model_3/checkpoint-10000/")

# Trying out queries

def extract_after_tldr(text):
    """Return what follows the first "TL;DR:" marker in ``text``.

    The remainder is stripped of surrounding whitespace. When the marker does
    not occur, ``text`` is returned unchanged (not stripped).
    """
    _, marker, remainder = text.partition("TL;DR:")
    if not marker:
        return text
    return remainder.strip()

def respond(query):
    """Generate a summary for one dataset example with the fine-tuned model.

    Args:
        query: mapping with the 'subreddit', 'title' and 'post' fields read by
            ``formatting_prompts_func_3``.

    Returns:
        The generated continuation as a whitespace-stripped string.
    """
    eval_prompt_3 = formatting_prompts_func_3(query)

    model_input_3 = tokenizer(eval_prompt_3, return_tensors="pt").to("cuda")
    prompt_length = model_input_3["input_ids"].shape[1]

    generated_ids = sft_model_3.generate(**model_input_3, max_new_tokens=256, repetition_penalty=1.15)[0]

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them instead of searching the decoded text for the first
    # "TL;DR:": a post or title that itself contains "TL;DR:" would otherwise
    # make the rest of the post leak into the returned summary.
    output_3 = tokenizer.decode(generated_ids[prompt_length:], skip_special_tokens=True)

    return output_3.strip()

output_file = "/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/SFT_Summaries.jsonl"

# Summarize at most 300 test examples, but never index past the end of a
# shorter test set.
num_examples = min(300, len(test_dataset))

with open(output_file, 'w') as f:
  for i in tqdm(range(num_examples), desc="Processing queries"):
      sft_output_3 = respond(test_dataset[i])
      json.dump({'response': sft_output_3}, f)
      f.write('\n')
      # Generation is slow; flush so each record leaves the write buffer as
      # soon as it is produced and an interrupted run keeps what it finished.
      f.flush()

print(f"Output written to {output_file}")

In [29]:
# Generate the prompts to ask GPT

from datasets import load_dataset

# Reference data: the test split, whose 'post' and 'summary' fields are read below.
actual_summaries = load_dataset("json", data_files="/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/test.jsonl", split="train")
# Model outputs written by the generation cell, one {'response': ...} record per line.
SFT_summaries = load_dataset("json", data_files="/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/SFT_Summaries.jsonl", split="train")

def create_gpt4_prompt(post, summary_a, summary_b):
    """Render the pairwise-ranking prompt for one post and two summaries.

    Args:
        post: text of the forum post, inserted after "Post:".
        summary_a: summary shown as "Summary A" under the model name "Mistral7b_SFT".
        summary_b: summary shown as "Summary B" under the model name "Original".

    Returns:
        The full prompt text. It opens with a "system" section and a "user"
        header line, followed by the question, the post, both summaries and
        the required output format.
    """
    return f"""
system
You are a helpful assistant, that ranks models by the quality of their answers.

user
Which of the following summaries does a better job of summarizing the most important points in the given forum post, without including unimportant or irrelevant details? A good summary is both precise and concise.
Post: "{post}"
Summary A:
{{
"model": "Mistral7b_SFT",
"summary": "{summary_a}"
}}
Summary B:
{{
"model": "Original",
"summary": "{summary_b}"
}}
Now please rank the models by the quality of their summaries, so that the model with rank 1 has the best summary. Then return a list of the model names and ranks, i.e., produce the following output:
[
{{'model': <model-name>, 'rank': <model-rank>}},
{{'model': <model-name>, 'rank': <model-rank>}}
]
Your response must be a valid Python dictionary and should contain nothing else because we will directly execute it in Python. Please provide the ranking that the majority of humans would give.
"""


output_file = "/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/GPT4Prompt_Generalization.jsonl"

# Posts and model summaries are paired by index, so only indices present in
# both datasets can be used; the count stays capped at 300.
num_prompts = min(300, len(actual_summaries), len(SFT_summaries))

with open(output_file, 'w') as f:
  for i in tqdm(range(num_prompts), desc="Processing queries"):
      # Summary A is the model output, Summary B the reference summary.
      prompt = create_gpt4_prompt(actual_summaries[i]['post'], SFT_summaries[i]['response'], actual_summaries[i]['summary'])
      json.dump({'prompt': prompt}, f)
      f.write('\n')

print(f"Prompts written to {output_file}")


Processing queries: 100%|██████████| 300/300 [00:00<00:00, 4384.76it/s]

Prompts written to /content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/GPT4Prompt_Generalization.jsonl





In [None]:
!pip install openai

In [36]:
#Call GPT-4 on the prompts and save the data

import os
from openai import OpenAI
import json

# Read the key once and fail with an actionable message when it is missing or
# empty. The previous bare `os.environ[...]` expression did nothing beyond
# raising a bare KeyError for an unset variable.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")

client = OpenAI(api_key=api_key)

def callGPT4(prompt):
  """Send one ranking prompt to the "gpt-4" chat model and return the reply text.

  Args:
    prompt: text sent as the user message; the system message is fixed.

  Returns:
    The message content of the first choice in the completion.
  """
  chat_messages = [
      {"role": "system", "content": "You are a helpful assistant, that ranks models by the quality of their answers."},
      {"role": "user", "content": prompt},
  ]
  completion = client.chat.completions.create(model="gpt-4", messages=chat_messages)
  first_choice = completion.choices[0]
  return first_choice.message.content

# Prompts produced by the prompt-generation cell, one {'prompt': ...} record per line.
GPT4_prompts = load_dataset("json", data_files="/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/GPT4Prompt_Generalization.jsonl", split="train")

# Destination for the GPT-4 replies, written as JSON lines.
output_file = "/content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/GPT4Responses_Generalization.jsonl"

def extract_after_user(text):
    """Return what follows the first occurrence of "user" in ``text``.

    The remainder is stripped of surrounding whitespace. When "user" does not
    occur, ``text`` is returned unchanged (not stripped).
    """
    _, marker, remainder = text.partition("user")
    if not marker:
        return text
    return remainder.strip()

# Query at most 300 prompts, but never index past the end of a shorter file.
num_prompts = min(300, len(GPT4_prompts))

with open(output_file, 'w') as f:
  for i in tqdm(range(num_prompts), desc="Calling GPT4"):
    # Drop the leading "system ... user" header stored in the prompt text;
    # the system message is supplied separately inside callGPT4.
    prompt = extract_after_user(GPT4_prompts[i]['prompt'])
    response = callGPT4(prompt)
    json.dump({'response': response}, f)
    f.write('\n')
    # Each reply costs an API call; flush so an interrupted run keeps the
    # replies it already received.
    f.flush()

print(f"Responses written to {output_file}")


Calling GPT4: 100%|██████████| 300/300 [17:07<00:00,  3.43s/it]

Prompts written to /content/drive/MyDrive/ConceptualGapsInML/redditDataset/redditDataset/sftDataset/GPT4Responses_Generalization.jsonl



