In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
!pip install trl

In [None]:
!pip install -U "huggingface_hub[cli]"

In [None]:
!huggingface-cli login

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
import json

def load_and_extract_articles(file_path):
    # Load the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Data structure to hold the articles and highlights
    articles_data = []

    # Extract the first two articles and highlights
    for entry in data['rows'][:100]:  # Limit to first two entries
        article_entry = {
            "id": entry['row']['id'],
            "article": entry['row']['article'],
            "highlights": entry['row']['highlights']
        }
        articles_data.append(article_entry)

    return articles_data

# Example usage:
file_path = '/content/drive/MyDrive/ConceptualGapsInML/cnn/test_data_cnn.json'
output_path = '/content/drive/MyDrive/ConceptualGapsInML/cnn/cnn_summary_prompts.json'
cnn_test_data = load_and_extract_articles(file_path)


def formatting_prompts_func_3(example):
    bos_token = "<s>"
    text = ""
    text += bos_token
    text += f"Article: {example['article']}\nTL;DR:"
    return text


formatted_prompts = []
for article in cnn_test_data:
  prompt = formatting_prompts_func_3(article)
  formatted_prompts.append(prompt)

with open(output_path, 'w') as file:
  json.dump(formatted_prompts, file, indent=4)

In [26]:
# Model

print("Loading Model\n")

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    # "/content/drive/MyDrive/ConceptualGapsProject/huggingFace",
    "mistralai/Mistral-7B-v0.1",
    cache_dir = "/content/mistral_weights/",
    device_map='auto',
    quantization_config=nf4_config,
    use_cache=True,
    trust_remote_code=True,
)

Loading Model



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [27]:
# Tokenizer
print("Loading Tokenizer\n")

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True,)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading Tokenizer



tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [45]:
# Loading PEFT Model and save the summary results to output file

from peft import PeftModel
from tqdm import tqdm
import json

sft_model_3 = PeftModel.from_pretrained(model, "/content/drive/MyDrive/ConceptualGapsInML/SFT_Model_3/SFT_Model_3/checkpoint-10000/")

def extract_after_tldr(text):
    index = text.find("TL;DR:")

    if index != -1:
        return text[index + len("TL;DR:"):].strip()
    else:
        return text

def respond(query):
    model_input_3 = tokenizer(query, return_tensors="pt").to("cuda")
    output_3 = tokenizer.decode(sft_model_3.generate(**model_input_3, max_new_tokens=256, repetition_penalty=1.15)[0], skip_special_tokens=True)
    return output_3

output_file = "/content/drive/MyDrive/ConceptualGapsInML/cnn/SFT_Summaries_cnn.jsonl"
input_file = "/content/drive/MyDrive/ConceptualGapsInML/cnn/cnn_summary_prompts.json"

from datasets import load_dataset
cnn_summary_prompts = load_dataset("json", data_files=input_file, split="train")
cnn_summaries = []

with open(output_file, 'w') as file:
  for i in tqdm(range(100), "Processing"):
    response = respond(cnn_summary_prompts[i]['text'])
    response = extract_after_tldr(response)
    cnn_summaries.append(response)
    json.dump({'summary': response}, file)
    file.write('\n')

print(f"Output written to {output_file}")

Processing:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing:   1%|          | 1/100 [00:07<12:13,  7.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing:   2%|▏         | 2/100 [00:12<10:12,  6.25s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing:   3%|▎         | 3/100 [00:22<12:19,  7.63s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing:   4%|▍         | 4/100 [00:26<10:05,  6.31s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing:   5%|▌         | 5/100 [00:31<09:33,  6.03s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing:   6%|▌         | 6/100 [00:38<09:46,  6.24s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Processing:   7%|▋         | 7/100 [00:49<11:51,  7.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end gene

Output written to /content/drive/MyDrive/ConceptualGapsInML/cnn/SFT_Summaries_cnn.jsonl





In [47]:
# Generate the prompts to ask GPT

from datasets import load_dataset

SFT_summaries_cnn = load_dataset("json", data_files="/content/drive/MyDrive/ConceptualGapsInML/cnn/SFT_Summaries_cnn.jsonl", split="train")

def create_gpt4_prompt(post, summary_a, summary_b):
    prompt_text = f"""
system
You are a helpful assistant, that ranks models by the quality of their answers.

user
Which of the following summaries does a better job of summarizing the most important points in the given forum post, without including unimportant or irrelevant details? A good summary is both precise and concise.
Post: "{post}"
Summary A:
{{
"model": "Mistral7b_SFT",
"summary": "{summary_a}"
}}
Summary B:
{{
"model": "Original",
"summary": "{summary_b}"
}}
Now please rank the models by the quality of their summaries, so that the model with rank 1 has the best summary. Then return a list of the model names and ranks, i.e., produce the following output:
[
{{'model': <model-name>, 'rank': <model-rank>}},
{{'model': <model-name>, 'rank': <model-rank>}}
]
Your response must be a valid Python dictionary and should contain nothing else because we will directly execute it in Python. Please provide the ranking that the majority of humans would give.
"""
    return prompt_text


output_file = "/content/drive/MyDrive/ConceptualGapsInML/cnn/cnn_gpt4_prompts_summary.jsonl"


with open(output_file, 'w') as f:
  for i in tqdm(range(100), desc="Processing queries"):
      prompt = create_gpt4_prompt(cnn_test_data[i]['article'], SFT_summaries_cnn[i]['summary'], cnn_test_data[i]['highlights'])
      json.dump({'prompt': prompt}, f)
      f.write('\n')

print(f"Prompts written to {output_file}")


Processing queries: 100%|██████████| 100/100 [00:00<00:00, 6786.46it/s]

Prompts written to /content/drive/MyDrive/ConceptualGapsInML/cnn/cnn_gpt4_prompts_summary.jsonl





In [48]:
!pip install openai

Collecting openai
  Downloading openai-1.24.0-py3-none-any.whl (312 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 

In [50]:
#Call GPT-4 on the prompts and save the data

import os
from openai import OpenAI
import json

os.environ["OPENAI_API_KEY"]

client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

def callGPT4(prompt):
  response = client.chat.completions.create(
              model="gpt-4",
              messages=[
                  {"role": "system", "content": "You are a helpful assistant, that ranks models by the quality of their answers."},
                  {"role": "user", "content": prompt}
              ]
          )
  return response.choices[0].message.content

GPT4_prompts = load_dataset("json", data_files="/content/drive/MyDrive/ConceptualGapsInML/cnn/cnn_gpt4_prompts_summary.jsonl", split="train")

output_file = "/content/drive/MyDrive/ConceptualGapsInML/cnn/GPT4Responses_Generalization_cnn.jsonl"

def extract_after_user(text):
    index = text.find("user")

    if index != -1:
        return text[index + len("user"):].strip()
    else:
        return text

with open(output_file, 'w') as f:
  for i in tqdm(range(100), desc="Calling GPT4"):
    prompt = extract_after_user(GPT4_prompts[i]['prompt'])
    response = callGPT4(prompt)
    json.dump({'response': response}, f)
    f.write('\n')

print(f"Prompts written to {output_file}")


Calling GPT4: 100%|██████████| 100/100 [09:02<00:00,  5.43s/it]

Prompts written to /content/drive/MyDrive/ConceptualGapsInML/cnn/GPT4Responses_Generalization_cnn.jsonl



