# Platypus2-70B + Wikipedia RAG


In [1]:
# Installing offline dependencies
# !pip install -U --no-deps /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# !pip install -U --no-deps /kaggle/input/datasets-214/datasets-2.14.5-py3-none-any.whl

In [None]:
import gc
import logging
from time import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import ctypes
from functools import partial

import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# For RAG
import faiss
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_from_disk, Dataset

# Number of nearest Wikipedia passages retrieved from the FAISS index for each question.
NUM_TITLES = 5
# Tokenizer truncation length used when embedding the retrieval queries.
MAX_SEQ_LEN = 512
# Directory holding the embedding-model checkpoint and its faiss.index file.
# NOTE(review): this name is re-assigned to the DeBERTa path further down, so the
# retrieval cell only works if it runs before that re-assignment.
MODEL_PATH = "wikipedia/bge-small-faiss/"

# For LLM
from transformers import AutoConfig, AutoModelForCausalLM,AutoModelForMultipleChoice, AutoTokenizer, AutoModel
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file

# Number of chunks each GPU's share of the questions is split into before a layer-by-layer pass.
N_BATCHES = 5
# Context cap: a character limit in the Llama prompt and the tokenizer max_length for DeBERTa.
MAX_CONTEXT = 1956
# Maximum tokenized prompt length; also the side of the square causal attention mask.
MAX_LENGTH = 4096

In [2]:
# Function to clean RAM & vRAM
def clean_memory():
    """Release as much host RAM and GPU memory as possible.

    Runs the Python garbage collector, asks glibc to hand freed heap pages
    back to the operating system, and empties PyTorch's CUDA caching
    allocator. The glibc step is best-effort: where ``libc.so.6`` or
    ``malloc_trim`` is unavailable (e.g. macOS, Windows, musl) it is skipped
    instead of raising.
    """
    gc.collect()
    try:
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (OSError, AttributeError):
        # No glibc on this platform: nothing to trim, carry on.
        pass
    torch.cuda.empty_cache()

# Load data
# df = pd.read_csv("input_data/train_with_context2.csv",index_col='id')

# Questions, answer options and a pre-computed "context" column; the default 0..n-1 index is kept.
df = pd.read_csv("input_data/train_with_context2.csv")
# Synthetic id column (0..n-1), later used to build the submission frame.
df['id'] = range(len(df))

# Variable used to avoid running the notebook for 3 hours when submitting. Credit : CPMP
# When False, the retrieval and Llama cells are skipped and a constant "A B C" prediction is written.
IS_TEST_SET = True

# Uncomment this to see results on the train set
# df = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv", index_col="id")
# IS_TEST_SET = True
# N_BATCHES = 1

In [3]:
import os

In [4]:
# Restrict this process to two physical GPUs (machine-specific ids); the cells below address them as cuda:0 and cuda:1.
# NOTE(review): this only takes effect if it runs before CUDA is initialised — confirm no earlier cell touches the GPU.
os.environ["CUDA_VISIBLE_DEVICES"]= "6,9"

In [5]:
# df.set_index('id',inplace=True)

## 1. Wikipedia Retrieval Augmented Generation (RAG)

The following code is adapted from [the notebook of @MGöksu](https://www.kaggle.com/code/mgoksu/0-807-sharing-my-trained-with-context-model) and [the notebook of @MB](https://www.kaggle.com/code/mbanaei/86-2-with-only-270k-articles/notebook). We use the [bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model to embed the Wikipedia dataset.

In [6]:
# New SentenceTransformer class similar to the one used in @Mgöksu notebook but relying on the transformers library only

class SentenceTransformer:
    """Minimal sentence encoder built directly on a Hugging Face ``AutoModel``.

    Each sentence is prefixed with the retrieval instruction, tokenized with
    truncation at ``MAX_SEQ_LEN``, run through the half-precision model, and
    the L2-normalised ``pooler_output`` is returned as a NumPy array.
    """

    def __init__(self, checkpoint, device="cuda:0"):
        self.device = device
        self.checkpoint = checkpoint
        # Weights are moved to the target device first, then cast to fp16.
        self.model = AutoModel.from_pretrained(checkpoint).to(self.device).half()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def transform(self, batch):
        """Tokenize the ``text`` entries of ``batch`` and move the tensors to the device."""
        encoded = self.tokenizer(
            batch["text"],
            truncation=True,
            padding=True,
            return_tensors="pt",
            max_length=MAX_SEQ_LEN,
        )
        return encoded.to(self.device)

    def get_dataloader(self, sentences, batch_size=32):
        """Wrap ``sentences`` (instruction-prefixed) in an ordered ``DataLoader``."""
        prefixed = ["Represent this sentence for searching relevant passages: " + x for x in sentences]
        text_dataset = Dataset.from_dict({"text": prefixed})
        # Tokenization happens lazily, whenever rows are fetched.
        text_dataset.set_transform(self.transform)
        return DataLoader(text_dataset, batch_size=batch_size, shuffle=False)

    def encode(self, sentences, show_progress_bar=False, batch_size=32):
        """Return one unit-norm embedding row per input sentence."""
        loader = self.get_dataloader(sentences, batch_size=batch_size)
        if show_progress_bar:
            loader = tqdm(loader)

        chunks = []
        with torch.no_grad():
            for batch in loader:
                pooled = self.model(**batch).pooler_output
                unit = F.normalize(pooled, p=2, dim=1)
                chunks.append(unit.detach().cpu().numpy())
        return np.concatenate(chunks, axis=0)

In [7]:
if IS_TEST_SET:
    # Retrieval stage: embed every question together with its answer options,
    # look up the NUM_TITLES closest Wikipedia passages in the FAISS index and
    # store them, joined as a dash-bulleted string, in df["context_new"].
    # NOTE(review): the DeBERTa dataset and the Llama prompt both read
    # df["context"] (the column shipped in the CSV), not "context_new" —
    # confirm that is intended.

    # Load embedding model
    start = time()
    print(f"Starting prompt embedding, t={time() - start :.1f}s")
    model = SentenceTransformer(MODEL_PATH, device="cuda:0")

    # Get embeddings of prompts
    f = lambda row : " ".join([row["prompt"], row["A"], row["B"], row["C"], row["D"], row["E"]])
    inputs = df.apply(f, axis=1).values # better results than prompt only
    prompt_embeddings = model.encode(inputs, show_progress_bar=False)

    # Search closest sentences in the wikipedia index
    print(f"Loading faiss index, t={time() - start :.1f}s")
    # Path joining avoids the doubled slash that string concatenation produced.
    faiss_index = faiss.read_index(str(Path(MODEL_PATH) / "faiss.index"))
    # faiss_index = faiss.index_cpu_to_all_gpus(faiss_index) # causes OOM, and not that long on CPU

    print(f"Starting text search, t={time() - start :.1f}s")
    # search() returns (distances, ids); only the ids are needed.
    search_index = faiss_index.search(np.float32(prompt_embeddings), NUM_TITLES)[1]

    print(f"Starting context extraction, t={time() - start :.1f}s")
    dataset = load_from_disk("wikipedia/all-paraphs-parsed-expanded")
    # Assign the whole column at once: row-by-row df.loc[i, ...] writes are slow
    # and silently append rows whenever the index is not exactly 0..n-1.
    df["context_new"] = [
        "-" + "\n-".join(dataset[int(j)]["text"] for j in neighbours)
        for neighbours in search_index
    ]

    # Free memory
    faiss_index.reset()
    del faiss_index, prompt_embeddings, model, dataset
    clean_memory()
    print(f"Context added, t={time() - start :.1f}s")

Starting prompt embedding, t=0.0s
Loading faiss index, t=5.2s
Starting text search, t=7.9s
Starting context extraction, t=33.3s
Context added, t=34.2s


In [8]:
df

Unnamed: 0,prompt,context,A,B,C,D,E,answer,id,context_new
0,Which of the following statements accurately d...,The presence of a clustered thick disk-like co...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,0,-MOND is an example of a class of theories kno...
1,Which of the following is an accurate definiti...,Many of these systems evolve in a self-similar...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,1,-In such systems we can define a certain time-...
2,Which of the following statements accurately d...,It is possible that this usage is related with...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,2,"-Classical Antiquity The triskeles proper, com..."
3,What is the significance of regularization in ...,Renormalization is distinct from regularizatio...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,3,-Regularization: Classical physics theory brea...
4,Which of the following statements accurately d...,Several qualitative observations can be made o...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,4,-Several qualitative observations can be made ...
...,...,...,...,...,...,...,...,...,...,...
195,What is the relation between the three moment ...,The second equation is more general as it does...,The three moment theorem expresses the relatio...,The three moment theorem is used to calculate ...,The three moment theorem describes the relatio...,The three moment theorem is used to calculate ...,The three moment theorem is used to derive the...,C,195,-In civil engineering and structural analysis ...
196,"What is the throttling process, and why is it ...",A throttle is the mechanism by which fluid flo...,The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,The throttling process is a steady adiabatic f...,The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,B,196,-Throttling One of the simple applications of ...
197,What happens to excess base metal as a solutio...,"Furthermore, this melting may begin at a tempe...","The excess base metal will often solidify, bec...",The excess base metal will often crystallize-o...,"The excess base metal will often dissolve, bec...","The excess base metal will often liquefy, beco...","The excess base metal will often evaporate, be...",B,197,"-Similarly, a hypoeutectoid alloy has two crit..."
198,"What is the relationship between mass, force, ...",Newton first set out the definition of mass Th...,Mass is a property that determines the weight ...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is a property that determines the size of...,D,198,-Mass is (among other properties) an inertial ...


In [9]:
# df.reset_index(inplace=True)

# Deberta V3

In [10]:
import torch.nn as nn
from typing import Optional, Union
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

In [11]:
class DebertaModel(nn.Module):
    """Thin wrapper around a Hugging Face multiple-choice model.

    The wrapped network is stored as ``self.transformer``; ``forward`` returns
    its logits with the sign flipped.
    """

    def __init__(self, modelname_or_path, config, dropout=0.2, pretrained=True):
        super().__init__()

        self.config = config
        # Either load pretrained weights, or build an untrained model from the config alone.
        self.transformer = (
            AutoModelForMultipleChoice.from_pretrained(modelname_or_path, config=config)
            if pretrained
            else AutoModelForMultipleChoice.from_config(config)
        )

    def _init_weights(self, module, config):
        """Draw ``module.weight`` from N(0, ``config.initializer_range``) and zero the bias, if any."""
        module.weight.data.normal_(mean=0.0, std=config.initializer_range)
        bias = module.bias
        if bias is not None:
            bias.data.zero_()

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Return the negated logits of the wrapped model (``labels`` is accepted but unused)."""
        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return -outputs['logits']

In [12]:
def prepare_answering_input_deberta(
        tokenizer, # callable tokenizer accepting paired text lists (DeBERTa tokenizer in this notebook)
        question,  # str
        options,   # List[str]
        context,   # str
        max_seq_length=4096,
    ):
    """Tokenize one multiple-choice question as (context, question + option) pairs.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(first, second, truncation='only_first',
        max_length=..., add_special_tokens=False)``.
    question : str
        The question text.
    options : list of str
        Candidate answers; one sequence pair is produced per option.
    context : str
        Background passage; it is the only part that gets truncated.
    max_seq_length : int, optional
        Maximum tokenized length of each pair, by default 4096.

    Returns
    -------
    Whatever ``tokenizer`` returns for the batch of pairs.
    """
    options = list(options)

    # Special tokens are written by hand, hence add_special_tokens=False below.
    # The context is repeated once per option (previously hard-coded to 5), so the
    # two lists always have matching lengths.
    first_sentence = ["[CLS] " + context] * len(options)
    second_sentences = [" #### " + question + " [SEP] " + option + " [SEP]" for option in options]
    tokenized_examples = tokenizer(first_sentence, second_sentences, truncation='only_first',
                                  max_length=max_seq_length, add_special_tokens=False)

    return tokenized_examples



class LlmseDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one tokenized five-option question per DataFrame row.

    Rows are read positionally and must provide the columns ``prompt``,
    ``context`` and ``A`` to ``E``.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        # Letter -> position lookup ('A' -> 0, ..., 'E' -> 4).
        self.option_to_index = dict(zip('ABCDE', range(5)))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        choices = [row[letter] for letter in 'ABCDE']
        # MAX_CONTEXT is passed as the tokenizer's max_length for each (context, option) pair.
        return prepare_answering_input_deberta(
            tokenizer=self.tokenizer,
            question=row['prompt'],
            options=choices,
            context=row['context'],
            max_seq_length=MAX_CONTEXT,
        )
            


@dataclass
class DataCollatorForMultipleChoice:
    """Pad a list of multiple-choice examples into (batch, choices, seq_len) tensors.

    Each incoming feature maps a key (e.g. ``input_ids``) to one sequence per
    answer choice. All sequences are flattened, padded together by the
    tokenizer, then reshaped back to one row per example.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        n_examples = len(features)
        n_choices = len(features[0]['input_ids'])

        # One dict per (example, choice) pair, in example-major order.
        per_choice = [
            {key: values[choice] for key, values in example.items()}
            for example in features
            for choice in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            per_choice,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure on every returned tensor.
        return {key: tensor.view(n_examples, n_choices, -1) for key, tensor in padded.items()}

In [13]:
# Paths for the DeBERTa-v3-large multiple-choice models.
CONFIG_NAME = 'models/microsoft/deberta-v3-large/'
# NOTE(review): this re-binds MODEL_PATH, which the retrieval cell above uses for the
# bge-small / FAISS directory — re-running that cell after this one loads the wrong path.
MODEL_PATH = 'models/microsoft/deberta-v3-large/'
# Directory prefix prepended to the per-fold checkpoint file passed to get_predictions.
base_checkpoint_path = 'base/output/deberta_w_context_debertaformat/'
TOKENIZER_PATH = 'models/microsoft/deberta-v3-large/'
# Device the DeBERTa model and its batches are moved to.
device = torch.device('cuda')

In [14]:
def get_predictions(checkpoint_path):
    """Score every batch of the global ``valid_dataloader`` with one fine-tuned checkpoint.

    Parameters
    ----------
    checkpoint_path : str
        State-dict file, relative to ``base_checkpoint_path``.

    Returns
    -------
    list of np.ndarray
        One array per batch holding the softmax (over the last axis) of the
        model scores. The list is empty when the dataloader yields nothing.
    """
    model_config = AutoConfig.from_pretrained(CONFIG_NAME)
    model = DebertaModel(MODEL_PATH, config=model_config)
    # Wrapped before loading — presumably because the checkpoint keys carry the
    # DataParallel ``module.`` prefix (TODO confirm against the training script).
    model = nn.DataParallel(model)

    state_dict = torch.load(base_checkpoint_path + checkpoint_path, map_location=torch.device('cuda'))
    model.load_state_dict(state_dict)
    del state_dict
    model.to(device)
    # Make inference mode explicit so dropout is off no matter how the model was built.
    model.eval()

    predictions = []
    with torch.no_grad():
        with tqdm(valid_dataloader, leave=False) as pbar:
            for batch in pbar:
                batch = {k: v.to(device) for k, v in batch.items()}
                output = model(**batch)
                # DebertaModel.forward returns negated logits; negating again restores them.
                probability = torch.softmax(-output, dim=-1)
                predictions.append(probability.detach().cpu().numpy())
                # Dropped per iteration so an empty dataloader no longer breaks the cleanup below.
                del output, probability

    del model, model_config
    clean_memory()
    return predictions

In [None]:
# NOTE: leftover scratch code, disabled so that "Restart & Run All" works. It referenced
# `outputs1`, which is not defined in the visible notebook, and its second line was
# mis-indented (IndentationError).
# losses1 = -outputs1.logits[0].detach().cpu().numpy()
# probability1 = torch.softmax(torch.tensor(-losses1), dim=-1)

In [15]:
# Build the DeBERTa evaluation pipeline: tokenizer -> dataset -> collator -> dataloader.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
# NOTE: despite its name, val_df is an LlmseDataset wrapping df, not a DataFrame.
val_df = LlmseDataset(df, tokenizer)
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
# get_predictions reads this dataloader as a global; shuffle=False and drop_last=False
# keep the predictions in the same order, and of the same count, as the rows of df.
valid_dataloader = DataLoader(
        val_df,
        batch_size=2,
        shuffle=False,
        collate_fn=data_collator,
        pin_memory=True,
        drop_last=False
    )



In [1]:
# Score the questions with each of the three fold checkpoints, one after the other.
probability_fold_0, probability_fold_1, probability_fold_2 = [
    get_predictions(f'checkpoint-fold-{fold}/pytorch_model.bin') for fold in range(3)
]

In [None]:
# Merge each fold's per-batch arrays into a single array with one row per question.
# NOTE: not idempotent — run this cell once per run of the prediction cell above.
probability_fold_0, probability_fold_1, probability_fold_2 = (
    np.concatenate(fold_batches, axis=0)
    for fold_batches in (probability_fold_0, probability_fold_1, probability_fold_2)
)


In [None]:
# Ensemble weights, one per fold.
weights = np.array([0.5, 0.25, 0.25])
# Per-fold probability arrays, in the same order as the weights.
pred_lists = [probability_fold_0, probability_fold_1, probability_fold_2]

# Stack the folds on a new leading axis, scale each fold by its weight, then sum the folds away.
final_prob = (np.stack(pred_lists) * weights.reshape(-1, 1, 1)).sum(axis=0)

In [None]:
predict_labels = np.array(["A", "B", "C", "D", "E"])

# Column indices of each row in ascending order of probability.
sorted_indices = final_prob.argsort(axis=1)

# Keep the three most probable columns, best first.
top3_indices = np.flip(sorted_indices[:, -3:], axis=1)

# Translate column indices into answer letters.
top3_labels = predict_labels[top3_indices]

# Plain nested lists: one [best, second, third] entry per question.
all_predictions = top3_labels.tolist()

In [None]:
# Re-number the rows 0..n-1 in place. drop=True discards the old index instead of
# inserting it as a stray "index" column, which also keeps this cell safe to re-run.
df.reset_index(drop=True, inplace=True)

In [None]:
# Join each top-3 list into a space-separated string, e.g. "D A C".
final_predictions = list(map(" ".join, all_predictions))
# NOTE(review): this DeBERTa-only frame is not written to disk in the cells shown here.
submission = pd.DataFrame({'id': df['id'], 'prediction': final_predictions})

## 2: Run Platypus2-70B

To run such a large model on a single T4 GPU, we run it layer by layer and sample by sample

In [None]:
# Create symlinks from kaggle datasets to fake cached model

# Directory that will look like a local checkpoint folder once the links exist.
checkpoint_path = Path("/root/.cache/")
checkpoint_path.mkdir(exist_ok=True, parents=True)

for part in [1, 2]:
    source_dir = Path(f"/kaggle/input/platypus2-70b-instruct-part{part}")
    for path in source_dir.glob("*"):
        link = checkpoint_path / path.name
        try:
            link.symlink_to(path)
        except FileExistsError:
            # Already linked by a previous run — nothing to do.
            pass
        except OSError as err:
            # Still best-effort, but report the failure instead of hiding it
            # (the former bare `except` also swallowed KeyboardInterrupt).
            logging.warning("Could not link %s -> %s: %s", link, path, err)

In [None]:
# Class for sharded llama

class ShardedLlama:
    """Llama causal LM evaluated one layer at a time, with weights streamed from per-layer safetensors files."""

    def __init__(self, checkpoint_path, device="cuda:0", dtype=torch.float16):
        """
        Sharded version of LlamaForCausalLM : the model is split into layer shards to reduce GPU memory usage.
        During the forward pass, the inputs are processed layer by layer, and the GPU memory is freed after each layer.
        To avoid loading the layers multiple times, we could save all the intermediate activations in RAM, but
        as Kaggle accelerators have more GPU memory than CPU, we simply batch the inputs and keep them on the GPU.

        Parameters
        ----------
        checkpoint_path : str or Path
            path to the checkpoint (a directory with one ``<layer_name>.safetensors`` file per layer)
        device : str, optional
            device, by default "cuda:0"
        dtype : torch.dtype, optional
            dtype, by default torch.float16
        """

        # Save parameters
        self.checkpoint_path = Path(checkpoint_path)
        self.device = device
        self.dtype = dtype

        # Create model
        self.config = AutoConfig.from_pretrained(self.checkpoint_path)
        # For flash attention when Turing architecture will be supported : https://github.com/Dao-AILab/flash-attention/issues/542
        # self.config.auto_map = {"AutoModelForCausalLM" : "togethercomputer/LLaMA-2-7B-32K--modeling_flash_llama.LlamaForCausalLM"} 

        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
        # The EOS token doubles as the padding token, and padding is added on the right;
        # __call__ relies on both to locate the last real token of each suffix.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        self.init_model()
        # Names of the per-layer weight files, in the same order as self.layers.
        self.layer_names = ["model.embed_tokens"] + [f"model.layers.{i}" for i in range(len(self.model.model.layers))] + ["model.norm", "lm_head"]

    def init_model(self):
        """Build the weight-less model skeleton and the ordered list of its layers."""

        # Load meta model (no memory used)
        with init_empty_weights():
            self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=True)
            self.model.tie_weights()

        # Execution order: embeddings, every decoder layer, final norm, LM head.
        self.layers = [self.model.model.embed_tokens] + list(self.model.model.layers) + [self.model.model.norm, self.model.lm_head]

        # Move buffers to device (not that much GPU memory used)
        for buffer_name, buffer in self.model.named_buffers():
            set_module_tensor_to_device(self.model, buffer_name, self.device, value=buffer, dtype=self.dtype)

    def load_layer(self, layer_name):
        """Load ``<layer_name>.safetensors`` onto the device and plug its tensors into the model."""
        state_dict = load_file(self.checkpoint_path / (layer_name + ".safetensors"), device=self.device)
        for param_name, param in state_dict.items():
            assert param.dtype != torch.int8, "int8 not supported (need to add fp16_statistics)"
            set_module_tensor_to_device(self.model, param_name, self.device, value=param, dtype=self.dtype)

    def __call__(self, inputs, output_token):
        """Run every input through all layers and return the logit of ``output_token``.

        Parameters
        ----------
        inputs : sequence of (prefix, suffix) token-id tensor pairs
            inputs = [(prefix, suffix), ...] with prefix.shape[0] = 1 and suffix.shape[0] = 5
        output_token : int
            vocabulary index whose logit is read at the last non-padding position of each suffix

        Returns
        -------
        list of np.ndarray
            one array per input, with one value per suffix row
        """

        # Reboot the model to make sure buffers are loaded and memory is clean
        del self.model
        clean_memory()
        self.init_model()

        # Send batch to device
        batch = [(prefix.to(self.device), suffix.to(self.device)) for prefix, suffix in inputs]
        n_suffixes = len(batch[0][1])
        # Index of the last non-padding token in each suffix row (count of non-pad tokens minus one).
        suffix_eos = [(suffix != self.tokenizer.pad_token_id).sum(1) - 1 for _, suffix in inputs]

        # Create attention mask for the largest input, and position ids to use KV cache
        # Additive causal mask: the dtype minimum strictly above the diagonal, zero elsewhere.
        attention_mask = torch.finfo(self.dtype).min * torch.ones(MAX_LENGTH, MAX_LENGTH)
        attention_mask = attention_mask.triu(diagonal=1)[None, None, ...]
        attention_mask = attention_mask.to(self.device)
        position_ids = torch.arange(MAX_LENGTH, dtype=torch.long, device=self.device)[None, :]

        # NOTE: the executor is currently unused — the asynchronous prefetch below is commented out.
        with ThreadPoolExecutor() as executor, torch.inference_mode():

            # Load first layer
            #future = executor.submit(self.load_layer, "model.embed_tokens")
            self.load_layer("model.embed_tokens")

            for i, (layer_name, layer) in tqdm(enumerate(zip(self.layer_names, self.layers)), desc=self.device, total=len(self.layers)):

                # Wait for previous layer to be loaded and load next layer
                # (done synchronously here, so the current and the next layer are both resident while this one runs)
                #future.result()
                if (i + 1) < len(self.layer_names):
                    #future = executor.submit(self.load_layer, self.layer_names[i + 1])
                    self.load_layer(self.layer_names[i + 1])

                # Run layer
                for j, (prefix, suffix) in enumerate(batch):
                    if layer_name == "model.embed_tokens":
                        batch[j] = (layer(prefix), layer(suffix))
                    elif layer_name == "model.norm":
                        # Only keep the last token at this point
                        batch[j] = (None, layer(suffix[torch.arange(n_suffixes), suffix_eos[j]][:, None]))
                    elif layer_name == "lm_head":
                        # From here on batch[j] is a NumPy array holding one logit per suffix row.
                        batch[j] = layer(suffix)[:, 0, output_token].detach().cpu().numpy()
                    else:
                        # Run prefix
                        len_p, len_s = prefix.shape[1], suffix.shape[1]
                        new_prefix, (k_cache, v_cache) = layer(prefix, use_cache=True, attention_mask=attention_mask[:, :, -len_p:, -len_p:])

                        # Run suffix
                        # The shared prefix is computed once; its key/value cache is repeated for every suffix row.
                        pos = position_ids[:, len_p:len_p + len_s].repeat(n_suffixes, 1)
                        attn = attention_mask[:, :, -len_s:, -len_p - len_s:].repeat(n_suffixes, 1, 1, 1)
                        kv_cache = (k_cache.repeat(n_suffixes, 1, 1, 1), v_cache.repeat(n_suffixes, 1, 1, 1))
                        new_suffix = layer(suffix, past_key_value=kv_cache, position_ids=pos, attention_mask=attn)[0]
                        batch[j] = (new_prefix, new_suffix)

                # Remove previous layer from memory (including buffers)
                layer.to("meta")
                clean_memory() # proposed by CPMP

        # Get scores
        return batch

In [None]:
# Run model on the 2 GPUs

def get_tokens(row, tokenizer):
    """Tokenize one question into a shared prompt prefix and five per-answer suffixes.

    The prefix holds the instruction, the context (cut to ``MAX_CONTEXT``
    characters) and the question; each suffix holds one candidate answer
    followed by the response header. The first token of every suffix row is
    dropped.

    Returns
    -------
    (prefix, suffix)
        ``input_ids`` of the prefix, and the padded ``input_ids`` of the five suffixes.
    """
    # Options shared by both tokenizer calls.
    shared = dict(return_tensors="pt", return_attention_mask=False, truncation=True, max_length=MAX_LENGTH)

    template = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input_prefix}"
    task = "Your task is to analyze the question and answer below. If the answer is correct, respond yes, if it is not correct respond no. As a potential aid to your answer, background context from Wikipedia articles is at your disposal, even if they might not always be relevant."
    question_block = f"Context: {row['context'][:MAX_CONTEXT]}\nQuestion: {row['prompt']}\nProposed answer: "
    prefix_text = template.format(instruction=task, input_prefix=question_block)
    prefix = tokenizer(prefix_text, **shared)["input_ids"]

    answer_texts = [f"{row[letter]}\n\n### Response:\n" for letter in "ABCDE"]
    suffix_ids = tokenizer(answer_texts, padding=True, **shared)["input_ids"]
    return prefix, suffix_ids[:, 1:]

def run_model(device, df):
    """Score every row of ``df`` with a ShardedLlama living on GPU ``device``.

    Parameters
    ----------
    device : int
        CUDA device index; the model is created on ``cuda:{device}``.
    df : pd.DataFrame
        Rows to score, tokenized one by one with ``get_tokens``.

    Returns
    -------
    list
        One entry per row, in row order, as returned by ``ShardedLlama.__call__``.
    """
    model = ShardedLlama(checkpoint_path, device=f"cuda:{device}")
    f = partial(get_tokens, tokenizer=model.tokenizer)
    inputs = df.apply(f, axis=1).values
    batches = np.array_split(inputs, N_BATCHES)
    outputs = []
    for batch in batches:
        if len(batch) == 0:
            # np.array_split yields empty chunks when there are fewer rows than
            # N_BATCHES; the model indexes batch[0] and would raise on them.
            continue
        # Token #4874 is yes.
        outputs += model(batch, output_token=4874)
    return outputs

# Run model
if IS_TEST_SET:
    # Split the rows positionally into two contiguous halves, one per GPU. Splitting an
    # index array yields the same partition as np.array_split(df, 2) without handing the
    # DataFrame itself to NumPy.
    halves = [df.iloc[positions] for positions in np.array_split(np.arange(len(df)), 2)]
    with ThreadPoolExecutor() as executor:
        outputs = list(executor.map(run_model, [0, 1], halves))
        outputs = sum(outputs, [])

    # Save results
    n = len(df)
    # Rank all five options by descending score (only the first three count for MAP@3).
    # Assigning the whole column at once raises if a score is missing for any row,
    # instead of silently leaving gaps.
    df["prediction"] = [
        " ".join("ABCDE"[j] for j in np.argsort(scores)[::-1])
        for scores in outputs
    ]

    # Display performances if train set is used (in this case use IS_TEST_SET=True !)
    if "answer" in df.columns:
        # Letters sit at string positions 0, 2 and 4 of "X Y Z ...".
        for rank, position in zip([1, 2, 3], [0, 2, 4]):
            df[f"top_{rank}"] = df["prediction"].str[position]

        top_i = [(df[f"top_{i}"] == df["answer"]).sum() for i in [1, 2, 3]]
        print(f"top1 : {top_i[0]}/{n}, top2 : {top_i[1]}/{n}, top3 : {top_i[2]}/{n} (total={sum(top_i)} / {n})")
        print(f"Accuracy: {100*top_i[0]/n:.1f}%, map3: {100*(top_i[0] + top_i[1]*1/2 + top_i[2]*1/3).sum()/n:.1f}%")
else:
    df["prediction"] = "A B C"

# Write "id,prediction". The frame's default index is unnamed, so exporting it directly
# produced a header without "id"; use the id column (or name the index) instead.
submission_frame = df.set_index("id") if "id" in df.columns else df.rename_axis("id")
submission_frame[["prediction"]].to_csv("submission.csv")