# LLM + Wikipedia RAG


In [1]:
# # Quick Save
# import os
# if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     ! touch submission.csv
#     import sys
#     sys.exit(0)

In [2]:
! pip install --quiet --use-deprecated=legacy-resolver --no-index /kaggle/input/llm-se-python-wheel/llm_science_exam-0.0.1-py3-none-any.whl --find-links /kaggle/input/llm-se-required-libs-python-wheels

In [3]:
# # Create symlinks from kaggle datasets to fake cached model
# import pathlib
# checkpoint_path = pathlib.Path("/root/.cache/")
# checkpoint_path.mkdir(exist_ok=True, parents=True)

# for part in [1, 2]:
#     source_dir = pathlib.Path(f'/kaggle/input/platypus2-70b-instruct-part{part}')
#     for path in source_dir.glob('*'):
#         try:
#             (checkpoint_path / path.name).symlink_to(path)
#         except:
#             pass

# checkpoint_path = "/kaggle/input/llmse-llama2-13b-layers-es-with-full-td"


In [4]:
import os
import gc
from time import time, sleep
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import ctypes
from functools import partial

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch

# For Platypus2
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file


os.environ["TOKENIZERS_PARALLELISM"] = "false"


def clean_memory():
    gc.collect()
    ctypes.CDLL("libc.so.6").malloc_trim(0)
    torch.cuda.empty_cache()



In [5]:
# Load data

import os

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    dataset_type = "test"
else:
    dataset_type = "train"
#     
# df = pd.read_csv(f"/kaggle/input/kaggle-llm-science-exam/{dataset_type}.csv", index_col="id")
# # df = df.iloc[:2]
# 
# # # +300 validation
# # if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
# #     extra_df = pd.read_csv("/kaggle/input/llm-se-extra-train-datasets/yalickj/dataset-wiki-new-1/dataset_wiki_new_1_balanced.csv")
# #     df = pd.concat([df, extra_df]).reset_index(drop=True)
# #     df.index.name = "id"
# 
# # if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
# #    df = pd.concat([df] * 8).reset_index(drop=True)
# #    df.index.name = "id"
# 
# df

Unnamed: 0_level_0,prompt,A,B,C,D,E,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A


## 1. Wikipedia Retrieval Augmented Generation (RAG)

The following code is adapted from https://www.kaggle.com/code/mbanaei/86-2-with-only-270k-articles

In [7]:
# !cp -r /kaggle/input/all-paraphs-parsed-expanded /kaggle/working/
# 
# import llm_science_exam.open_book_v2
# 
# df["context"] = llm_science_exam.open_book_v2.tf_idf.get_context(
#     df,
#     wiki_dataset_paths=[
#         "/kaggle/working/all-paraphs-parsed-expanded",
# #         "/kaggle/input/llm-se-additional-wiki-stem-articles"
#     ],
#     num_titles=3,
# #     num_titles=4,
# )
# clean_memory()
# 
# print(df.iloc[0])



Map:   0%|          | 0/2101279 [00:00<?, ? examples/s]



length of vectorizer vocab is 224


100%|██████████| 211/211 [04:16<00:00,  1.21s/it]


prompt     Which of the following statements accurately d...
A          MOND is a theory that reduces the observed mis...
B          MOND is a theory that increases the discrepanc...
C          MOND is a theory that explains the missing bar...
D          MOND is a theory that reduces the discrepancy ...
E          MOND is a theory that eliminates the observed ...
answer                                                     D
context    - MOND is an example of a class of theories kn...
Name: 0, dtype: object


In [None]:
df = pd.read_csv(dataset_with_context_path)

In [8]:
system_prefix = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Your task is to analyze the question and answer below. If the answer is correct, respond yes, if it is not correct respond no. As a potential aid to your answer, background context from Wikipedia articles is at your disposal, even if they might not always be pertinent.

### Input:
Context:
{context}

Question:
{prompt}

Proposed answer:
"""

def get_prompts(row):
    prompt_prefix = system_prefix.format(context=row["context"], prompt=row["prompt"])

    prompt_suffix = [f"{row[letter]}\n\n### Response:\n" for letter in 'ABCDE']

    return prompt_prefix, prompt_suffix


prefix, suffixes = get_prompts(df.iloc[0])
print(f"{prefix}{suffixes[0]}")
del prefix, suffixes

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Your task is to analyze the question and answer below. If the answer is correct, respond yes, if it is not correct respond no. As a potential aid to your answer, background context from Wikipedia articles is at your disposal, even if they might not always be pertinent.

### Input:
Context:
- MOND is an example of a class of theories known as modified gravity, and is an alternative to the hypothesis that the dynamics of galaxies are determined by massive, invisible dark matter halos. Since Milgrom's original proposal, proponents of MOND have claimed to successfully predict a variety of galactic phenomena that they state are difficult to understand as consequences of dark matter.Though MOND explains the anomalously great rotational velocities of galaxies at their perimeters, it does not fully explain the velocity d

In [9]:
def get_tokens(row, tokenizer, max_length):
    prompt_prefix, prompt_suffix = get_prompts(row)
    
    prefix = tokenizer(
        prompt_prefix, return_tensors="pt", return_attention_mask=False, 
        truncation=True, max_length=max_length,
    )['input_ids']
    
    suffix = tokenizer(
        prompt_suffix, return_tensors="pt", return_attention_mask=False,
        truncation=True, max_length=max_length, 
        padding=True
    )['input_ids'][:, 1:]
    
    return prefix, suffix

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

inputs = df.apply(lambda row: get_tokens(row, tokenizer, max_length=None), axis=1).values
MAX_LENGTH = max(p.shape[1] + s.shape[1] for p, s in inputs)

del inputs, tokenizer
clean_memory()

MAX_LENGTH = min(MAX_LENGTH, 4096)

print(f"{MAX_LENGTH = }")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


751

## 2: Run

To such a large model on a single T4 GPU, we run it layer by layer and sample by sample

In [10]:
# Class for sharded llama


class ShardedLlama:
    
    def __init__(self, checkpoint_path, device: int, dtype=torch.float16):
        """
        Sharded version of LlamaForCausalLM : the model is splitted into layer shards to reduce GPU memory usage.
        During the forward pass, the inputs are processed layer by layer, and the GPU memory is freed after each layer.
        To avoid loading the layers multiple times, we could save all the intermediate activations in RAM, but
        as Kaggle accelerators have more GPU memory than CPU, we simply batch the inputs and keep them on the GPU.

        Parameters
        ----------
        checkpoint_path : str or Path
            path to the checkpoint
        device : device
        dtype : torch.dtype, optional
            dtype, by default torch.float16
        """
        
        # Save parameters
        self.checkpoint_path = Path(checkpoint_path)
        self.device = f"cuda:{device}"
        self.device_id = device
        self.dtype = dtype

        # Create model
        self.config = AutoConfig.from_pretrained(self.checkpoint_path)
        # For flash attention when Turing architecture will be supported : https://github.com/Dao-AILab/flash-attention/issues/542
        # self.config.auto_map = {"AutoModelForCausalLM" : "togethercomputer/LLaMA-2-7B-32K--modeling_flash_llama.LlamaForCausalLM"} 
        
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = 'right'
        
        self.tokenizer_pad_token_id = self.tokenizer.pad_token_id
        
        self.init_model()        
        self.layer_names = ['model.embed_tokens'] + [f'model.layers.{i}' for i in range(len(self.model.model.layers))] + ['model.norm', 'lm_head']
        
    def init_model(self):
        
        # Load meta model (no memory used)
        with init_empty_weights():
            self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=True)
            self.model.tie_weights()
        
        self.layers = [self.model.model.embed_tokens] + list(self.model.model.layers) + [self.model.model.norm, self.model.lm_head]
        
        # Move buffers to device (not that much GPU memory used)
        for buffer_name, buffer in self.model.named_buffers():
            set_module_tensor_to_device(self.model, buffer_name, self.device, value=buffer, dtype=self.dtype)

    def load_layer(self, layer_name):
        state_dict = load_file(self.checkpoint_path / (layer_name + '.safetensors'), device=self.device)
        for param_name, param in state_dict.items():
            assert param.dtype != torch.int8, 'int8 not supported (need to add fp16_statistics)'
            set_module_tensor_to_device(self.model, param_name, self.device, value=param, dtype=self.dtype)

    def __call__(self, inputs, output_token: int | list[int]):
        # inputs = [(prefix, suffix), ...] with prefix.shape[0] = 1 and suffix.shape[0] = 5
        
        # Reboot the model to make sure buffers are loaded and memory is clean
        del self.model
        clean_memory()
        self.init_model()
        
        # Send batch to device
        batch = [(prefix.to(self.device), suffix.to(self.device)) for prefix, suffix in inputs]
        n_suffixes = len(batch[0][1])
        suffix_eos = [(suffix != self.tokenizer_pad_token_id).sum(1) - 1 for _, suffix in inputs]

        # Create attention mask for the largest input, and position ids to use KV cache
        attention_mask = torch.finfo(self.dtype).min * torch.ones(MAX_LENGTH, MAX_LENGTH)
        attention_mask = attention_mask.triu(diagonal=1)[None, None, ...]
        attention_mask = attention_mask.to(self.device)
        position_ids = torch.arange(MAX_LENGTH, dtype=torch.long, device=self.device)[None, :]

        with ThreadPoolExecutor() as executor, torch.inference_mode():

            # Load first layer
            future = executor.submit(self.load_layer, 'model.embed_tokens')

            for i, (layer_name, layer) in enumerate(zip(tqdm(self.layer_names, desc=f"inference layer by layer on device {self.device}", position=self.device_id+1), self.layers)):

                # Wait for previous layer to be loaded and load next layer
                start = time()
                future.result()
                if (i + 1) < len(self.layer_names):
                    future = executor.submit(self.load_layer, self.layer_names[i + 1])
                load_time = time() - start
                
                # Run layer
                for j, (prefix, suffix) in enumerate(batch):
                    if layer_name == 'model.embed_tokens':
                        batch[j] = (layer(prefix), layer(suffix))
                    elif layer_name == 'model.norm':
                        # Only keep the last hidden state at this point
                        batch[j] = (None, layer(suffix[torch.arange(n_suffixes), suffix_eos[j]][:, None]))
                    elif layer_name == 'lm_head':
                        batch[j] = (None, layer(suffix))
                    else:
                        # Run prefix
                        len_p, len_s = prefix.shape[1], suffix.shape[1]
                        new_prefix, (k_cache, v_cache) = layer(prefix, use_cache=True, attention_mask=attention_mask[:, :, -len_p:, -len_p:])
                        
                        # Run suffix
#                         pos = position_ids[:, len_p:len_p + len_s].repeat(n_suffixes, 1)
#                         attn = attention_mask[:, :, -len_s:, -len_p - len_s:].repeat(n_suffixes, 1, 1, 1)
#                         kv_cache = (k_cache.repeat(n_suffixes, 1, 1, 1), v_cache.repeat(n_suffixes, 1, 1, 1))
                        pos = position_ids[:, len_p:len_p + len_s].expand(n_suffixes, -1)
                        attn = attention_mask[:, :, -len_s:, -len_p - len_s:].expand(n_suffixes, -1, -1, -1)
                        kv_cache = (k_cache.repeat(n_suffixes, 1, 1, 1), v_cache.expand(n_suffixes, -1, -1, -1))

                        new_suffix = layer(suffix, past_key_value=kv_cache, position_ids=pos, attention_mask=attn)[0]
                        batch[j] = (new_prefix, new_suffix)
                
                # Remove previous layer from memory (including buffers)
                layer.to('meta')
                clean_memory() # proposed by CPMP
        
        # Get scores
        if isinstance(output_token, list):
            batch = [torch.softmax(suffix[:, -1, output_token], dim=1)[..., 0].detach().cpu().numpy() for _, suffix in batch]
        else:
            batch = [suffix[:, -1, output_token].detach().cpu().numpy() for _, suffix in batch]

        return batch


In [11]:
# Run model on the 2 GPUs
# N_BATCHES = 4
# N_BATCHES = 8
N_BATCHES = -1

# N_BATCHES = max(int(np.ceil(MAX_LENGTH / 1024)), N_BATCHES)
N_BATCHES = max(int(np.ceil(MAX_LENGTH / 512)), N_BATCHES)
print(f"{N_BATCHES = }")


def run_model(device: int, df):
    sleep(60 * device)
    clean_memory()
        
    model = ShardedLlama(checkpoint_path, device=device, dtype=torch.float16)
    inputs = df.apply(partial(get_tokens, tokenizer=model.tokenizer, max_length=MAX_LENGTH), axis=1).values
    
    del model.tokenizer
    clean_memory()
    
    batches = np.array_split(inputs, N_BATCHES)
    outputs = []
    for i, batch in enumerate(batches):
        print(f"* batch #{i + 1} of {len(batches)} on device {device}")
#         outputs += model(batch, output_token=4874)
        outputs += model(batch, output_token=[4874, 694])
#         outputs += model(batch, output_token=[3582, 1217])
    return outputs


# Run model

with ThreadPoolExecutor() as executor:
    logits = list(executor.map(run_model, [0, 1], np.array_split(df, 2)))
    logits = sum(logits, [])


N_BATCHES = 1
* batch #1 of 1 on device 1
* batch #1 of 1 on device 0


inference layer by layer on device cuda:1:   0%|          | 0/43 [00:00<?, ?it/s]

inference layer by layer on device cuda:0:   0%|          | 0/43 [00:00<?, ?it/s]

In [12]:
clean_memory()
preds = torch.softmax(torch.Tensor(logits), dim=1).numpy()
pd.DataFrame(preds).to_csv("probs.csv", index=False)

  preds = torch.softmax(torch.Tensor(logits), dim=1).numpy()


In [13]:
# Save results
n = len(df)
for i, scores in enumerate(preds):
    top3 = np.argsort(scores)[::-1]
    df.loc[i, 'prediction'] = ' '.join(['ABCDE'[j] for j in top3])
df[['prediction']].to_csv('submission.csv')

In [14]:
def print_map_at_3(df):
    n = len(df)
    for i in range(n):
        df.loc[i, 'top_1'] = df.loc[i, 'prediction'][0]
        df.loc[i, 'top_2'] = df.loc[i, 'prediction'][2]
        df.loc[i, 'top_3'] = df.loc[i, 'prediction'][4]

    top_i = [(df[f'top_{i}'] == df["answer"]).sum() for i in [1, 2, 3]]
    print(f'top1 : {top_i[0]}/{n}, top2 : {top_i[1]}/{n}, top3 : {top_i[2]}/{n} (total={sum(top_i)} / {n})')
    print(f'Accuracy: {100*top_i[0]/n:.1f}%, map3: {100*(top_i[0] + top_i[1]*1/2 + top_i[2]*1/3).sum()/n:.1f}%')

    
if 'answer' in df.columns:
    if len(df) > 200:
        print("Old CV:")
        print_map_at_3(df.iloc[:200])

        print("\nNew CV:")
    else:
        print("CV:")
    print_map_at_3(df)

top1 : 2/2, top2 : 0/2, top3 : 0/2 (total=2 / 2)
Accuracy: 100.0%, map3: 100.0%
