# OpenBook DeBERTaV3-Large with an updated model

This work is based on the great [work](https://www.kaggle.com/code/nlztrk/openbook-debertav3-large-baseline-single-model) of [nlztrk](https://www.kaggle.com/nlztrk).

I trained a model offline using the dataset I shared [here](https://www.kaggle.com/datasets/mgoksu/llm-science-exam-dataset-w-context). I just added my model to the original notebook. The model is available [here](https://www.kaggle.com/datasets/mgoksu/llm-science-run-context-2).

I also addressed the problem of [CSV Not Found at submission](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/434228) with this notebook by clipping the context like so:

`test_df["prompt"] = test_df["context"].apply(lambda x: x[:1500]) + " #### " +  test_df["prompt"]`

You can probably keep more than 1500 characters of context without running into an out-of-memory (OOM) error.

In [1]:
# # installing offline dependencies
# !pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# !cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
# !pip install -U /kaggle/working/sentence-transformers
# !pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl

# !pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
# !pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
# !pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
# !pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

In [2]:
## Number of nearest neighbours requested per prompt from the FAISS index search
## NOTE(review): the recorded outputs further down (search_score.shape == (1024, 10)
## and ten rows for prompt_id 177) suggest the last run used 10 here - confirm the intended value
N_TOP_DOCS = 4
## Parameter to determine how many relevant sentences to include
NUM_SENTENCES_INCLUDE = 20

In [3]:
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf
# from __future__ import annotations

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/viktor/miniconda3/envs/torch-env/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
  warn(msg)
  warn(msg)


In [4]:
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Turn raw documents into a DataFrame of text spans, optionally one row per sentence.

    Every document is first wrapped as a single whole-document section by
    `sectionize_documents`. When `split_sentences` is set, those sections are
    then broken into sentences by `sentencize`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to further split sections into sentences
    :param filter_len: Forwarded to `sentencize`; only sentences strictly longer than
                       this many characters are kept
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
             (an empty DataFrame when no documents are given)
    """

    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    # With no documents there is nothing to split, and the empty frame may not
    # even carry a `text` column, so attribute access below would fail.
    if split_sentences and not df.empty:
        df = sentencize(df.text.values,
                        df.document_id.values,
                        df.offset.values,
                        filter_len,
                        disable_progress_bar)
    return df


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap each document as a single section spanning its full text.

    No actual section splitting is performed: every document produces exactly
    one row whose `offset` is `(0, len(document))`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
             sorted by `document_id` then `offset`
    """
    # The annotation allows any Iterable; generators have no len(), so fall
    # back to a progress bar without a known total instead of raising.
    total = len(documents) if hasattr(documents, '__len__') else None

    processed_documents = [
        {'document_id': document_id, 'text': document, 'offset': (0, len(document))}
        for document_id, document in tqdm(zip(document_ids, documents),
                                          total=total,
                                          disable=disable_progress_bar)
    ]

    # Naming the columns keeps the schema intact even when there are no rows.
    _df = pd.DataFrame(processed_documents, columns=['document_id', 'text', 'offset'])
    if _df.shape[0] > 0:
        return _df.sort_values(['document_id', 'offset']).reset_index(drop=True)
    else:
        return _df


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    Processing is best-effort: a document that raises while being split is
    skipped (sentences already collected from it are kept).

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Only sentences strictly longer than this many characters are kept
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    # The annotation allows any Iterable; generators have no len(), so fall
    # back to a progress bar without a known total instead of raising.
    total = len(documents) if hasattr(documents, '__len__') else None

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=total, disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1]-o[0] > filter_len:
                    sentence = document[o[0]:o[1]]
                    # Shift sentence-local offsets by the section start so they
                    # refer to positions in the original document.
                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])
                    row = {}
                    row['document_id'] = document_id
                    row['text'] = sentence
                    row['offset'] = abs_offsets
                    document_sentences.append(row)
        except Exception:
            # Skip documents that cannot be processed, but (unlike a bare
            # `except:`) let KeyboardInterrupt / SystemExit propagate so a
            # long run can still be interrupted.
            continue
    # Naming the columns keeps the schema intact even when no sentence survives.
    return pd.DataFrame(document_sentences, columns=['document_id', 'text', 'offset'])

In [5]:
# SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'
# Model identifier passed to SentenceTransformer
SIM_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
# Device argument passed to model.encode()
DEVICE = 0
# Assigned to model.max_seq_length
MAX_LENGTH = 512
# Batch size passed to model.encode()
BATCH_SIZE = 16

# Kaggle location; immediately overridden by the local path on the next line
WIKI_PATH = "/kaggle/input/wikipedia-20230701"
WIKI_PATH = "/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-datasets/wikipedia-2023-07-faiss-index"
# NOTE(review): wiki_files is not referenced by any of the visible cells
wiki_files = os.listdir(WIKI_PATH)

# Relevant Title Retrieval

In [6]:
trn = pd.read_csv("/home/viktor/Documents/kaggle/kaggle_llm/data/data_dumps/more_questions/more_questions_raw_questions_wiki_sci_2.csv")
# trn = trn[:1024]
trn

Unnamed: 0.1,Unnamed: 0,id,prompt,A,B,C,D,E,answer,cluster number,round
0,0,1,What is the significance of HD 87883 b?,It is the closest extrasolar planet to Earth.,It is the first planet discovered in the const...,It has the longest orbital period of any known...,It was the first planet detected by the radial...,It has a highly eccentric orbit around its star.,E,0,0
1,1,2,Which of the following statements about HD 878...,It orbits a K-type main sequence star.,It takes less than a year to complete one orbi...,It was discovered using the transit method.,It has an average distance from its star of 1....,It was detected in 2020.,A,0,0
2,2,3,What is the definition of a long-period planet?,A planet that takes more than 10 years to comp...,A planet that takes less than a year to comple...,A planet that has an average distance from its...,A planet that has a highly eccentric orbit.,A planet that orbits a K-type main sequence star.,A,0,0
3,3,4,What is the difference between the closest and...,0.20 AU,1.82 AU,3.82 AU,4.82 AU,6.82 AU,D,0,0
4,4,5,What is the reason behind the eccentric orbit ...,Gravitational interactions with other planets ...,The tidal forces exerted by the star on the pl...,The presence of a second star in the system.,The planet's initial velocity at the time of f...,Radiation pressure from the star.,A,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1019,1019,1020,What was the year when dextromethorphan was ap...,1949,1950,1952,1953,1954,D,105,0
1020,1020,1021,What is SC-17599?,A drug discovered in 1968 that acts as a selec...,A drug discovered in 1968 that acts as a selec...,A drug discovered in 1968 that acts as a selec...,A drug discovered in 1968 that acts as a non-s...,A drug discovered in 1968 that acts as an opio...,A,105,1
1021,1021,1022,What is the potency of SC-17599 compared to co...,More potent than codeine or pethidine,Less potent than codeine or pethidine,Equally potent as codeine or pethidine,The same potency as codeine or pethidine,Potency cannot be determined,A,105,1
1022,1022,1023,What are the effects of SC-17599?,Similar to morphine with more sedation,Similar to morphine with less sedation,Similar to codeine with more sedation,Similar to codeine with less sedation,None of the above,B,105,1


In [7]:
# # trn = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv").drop("id", 1)
# trn = pd.read_csv("/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-llm-science-exam/train.csv")


# trn.head()

In [8]:
# Load the sentence-embedding model named by SIM_MODEL onto the GPU
model = SentenceTransformer(SIM_MODEL, device='cuda')
# Cap the model's input sequence length at MAX_LENGTH
model.max_seq_length = MAX_LENGTH
# Convert the model to half precision
model = model.half()

In [9]:
# sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")
sentence_index = read_index("/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-datasets/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [10]:
# Embed every question prompt (normalized embeddings, returned as a tensor)
prompt_embeddings = model.encode(trn.prompt.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
# Move the embeddings to the CPU as a NumPy array for the FAISS search below
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
_ = gc.collect()

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

In [11]:
## Get the top N_TOP_DOCS pages that are likely to contain the topic of interest
search_score, search_index = sentence_index.search(prompt_embeddings, N_TOP_DOCS)

In [12]:
search_score.shape

(1024, 10)

In [13]:
## Save memory - delete sentence_index since it is no longer necessary
del sentence_index
del prompt_embeddings
_ = gc.collect()
libc.malloc_trim(0)

1

# Getting Sentences from the Relevant Titles

In [14]:
# df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet", columns=['id', 'file'])
df = pd.read_parquet("/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-datasets/wikipedia-2023-07-faiss-index/wiki_2023_index.parquet", columns=['id', 'file'])

In [15]:
## Look up, for every prompt, the article id and parquet file of each retrieved
## index row, and tag the rows with the position of the prompt they belong to
wikipedia_file_data = pd.concat(
    [
        df.loc[idx].assign(prompt_id=i)
        for i, (_, idx) in tqdm(enumerate(zip(search_score, search_index)), total=len(search_score))
    ]
).reset_index(drop=True)

## One row per (article, prompt) pair, ordered by file so each parquet file
## only has to be read once later on
wikipedia_file_data = (
    wikipedia_file_data[['id', 'prompt_id', 'file']]
    .drop_duplicates()
    .sort_values(['file', 'id'])
    .reset_index(drop=True)
)

## Save memory - the index table is no longer needed once the lookup is done
del df
_ = gc.collect()
libc.malloc_trim(0)

  0%|          | 0/1024 [00:00<?, ?it/s]

1

In [16]:
wikipedia_file_data

Unnamed: 0,id,prompt_id,file
0,10031022,561,a.parquet
1,10037951,575,a.parquet
2,10046097,382,a.parquet
3,106240,776,a.parquet
4,1085262,449,a.parquet
...,...,...,...
10235,53909617,196,z.parquet
10236,697584,164,z.parquet
10237,697584,194,z.parquet
10238,9517201,981,z.parquet


In [17]:
wikipedia_file_data[wikipedia_file_data['prompt_id'] == 177]

Unnamed: 0,id,prompt_id,file
2285,46190717,177,d.parquet
2517,17710184,177,e.parquet
5170,58533682,177,j.parquet
6196,1042053,177,n.parquet
6231,1840220,177,n.parquet
6238,21506869,177,n.parquet
6269,28922127,177,n.parquet
6374,5700894,177,n.parquet
6383,6093957,177,n.parquet
6422,879282,177,n.parquet


In [18]:
wikipedia_file_data.prompt_id.unique().shape

(1024,)

In [19]:
## Load the article text for every retrieved id, one parquet file at a time
wiki_text_data = []

wiki_file_names = wikipedia_file_data.file.unique()
for file in tqdm(wiki_file_names, total=len(wiki_file_names)):
    # ids are compared as strings against the parquet file's `id` column
    ids_in_file = wikipedia_file_data.loc[wikipedia_file_data['file'] == file, 'id']
    _id = list(map(str, ids_in_file.tolist()))
    _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])

    # keep only the requested articles, then release the full file right away
    wiki_text_data.append(_df[_df['id'].isin(_id)].copy())
    del _df
    _ = gc.collect()
    libc.malloc_trim(0)
wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
_ = gc.collect()

  0%|          | 0/28 [00:00<?, ?it/s]

In [None]:
wiki_text_data

In [None]:
# # merge wikipedia_file_data with wiki_text_data in id
# merged_df = pd.merge(wikipedia_file_data, wiki_text_data, on='id', how='inner')

# # take only prompt_id and text
# merged_df = merged_df[['prompt_id', 'text']]

# # group by prompt_id, concatenate all the text
# merged_df = merged_df.groupby('prompt_id')['text'].apply(lambda x: ' '.join(x)).reset_index()

# # merge trn with merged_df
# merged_df = pd.merge(trn, merged_df, left_index=True, right_on='prompt_id', how='inner')

# merged_df['context'] = merged_df['text']

# # save to train_with_dense_context.csv
# merged_df.to_csv('train_with_dense_context.csv', index=False)


# merged_df

In [None]:
# def split_text(text, word_limit=1000):

    
#     chunks = [text[i:i + word_limit] for i in range(0, len(text), word_limit//2)]
#     return chunks


#     words = text.split()
#     chunks = [text[i:i + word_limit] for i in range(0, len(words), word_limit//2)]
#     return chunks
#     return [' '.join(chunk) for chunk in chunks]

# merged_df['context_splitted'] = merged_df['context'].apply(split_text)
# merged_df = merged_df.explode('context_splitted')
# merged_df['context'] = merged_df['context_splitted']#.apply(lambda x: x.strip())

# merged_df.to_csv("train_with_dense_context_exploded.csv")


# merged_df

In [None]:
trn

In [None]:
# wiki_text_data['text_len'] = wiki_text_data['text'].str.len()
# wiki_text_data

In [None]:
## Parse documents into sentences
processed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)

In [None]:
processed_wiki_text_data

In [None]:
## Get embeddings of the wiki text data
wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
                                    batch_size=BATCH_SIZE,
                                    device=DEVICE,
                                    show_progress_bar=True,
                                    convert_to_tensor=True,
                                    normalize_embeddings=True)#.half()
wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()

In [None]:
_ = gc.collect()

In [None]:
## Combine all answers into one string per question.
## fillna/astype make the join safe when an option was parsed as NaN by
## read_csv (e.g. a literal "NA" or "null" answer) or as a number; a plain
## " ".join over the raw values raises TypeError in those cases.
answer_columns = ['A', 'B', 'C', 'D', 'E']
trn['answer_all'] = trn[answer_columns].fillna("").astype(str).apply(" ".join, axis=1)

## Search using the prompt and answers to guide the search
trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']

In [None]:
# Embed "prompt + all answer options" for every question (normalized embeddings)
question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
# Move the embeddings to the CPU as a NumPy array for the per-prompt FAISS searches
question_embeddings = question_embeddings.detach().cpu().numpy()

# Extracting Matching Prompt-Sentence Pairs

In [None]:


## Build one retrieved-context string per question and write each chunk of
## questions to its own CSV file.
chunk_size = 10000
n_chunks = len(trn) // chunk_size + 1
# Split row positions rather than the DataFrame itself: the partition is the
# same, but np.array_split on a DataFrame is deprecated in recent pandas, and
# taking an explicit copy lets us add the `context` column without a
# SettingWithCopyWarning.
chunk_positions = np.array_split(np.arange(len(trn)), n_chunks)
for chunk_id, positions in tqdm(enumerate(chunk_positions), total=n_chunks):
    chunk = trn.iloc[positions].copy()

    contexts = []

    for r in tqdm(chunk.itertuples(), total=len(chunk)):

        # NOTE: this value is used both as the `prompt_id` label and as the row
        # position in `question_embeddings`, so `trn` must keep its default
        # 0..n-1 index.
        prompt_id = r.Index

        # Reset for every question so that a question without any candidate
        # sentence gets an empty context instead of the previous question's.
        context = ""

        doc_ids = wikipedia_file_data.loc[wikipedia_file_data['prompt_id'] == prompt_id, 'id'].values
        prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(doc_ids)].index.values

        if prompt_indices.shape[0] > 0:
            prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
            prompt_index.add(wiki_data_embeddings[prompt_indices])

            # Candidate sentences in the same order as the vectors added above
            candidate_sentences = processed_wiki_text_data.loc[prompt_indices, 'text']

            ## Get the top matches for this question only (searching every
            ## question against every per-prompt index is quadratic)
            _, ii = prompt_index.search(question_embeddings[prompt_id:prompt_id + 1], NUM_SENTENCES_INCLUDE)

            # FAISS pads the result with -1 when fewer than
            # NUM_SENTENCES_INCLUDE sentences exist; skip those slots, since
            # .iloc[-1] would silently repeat the last sentence.
            context = "".join(candidate_sentences.iloc[_i] + " " for _i in ii[0] if _i >= 0)

        contexts.append(context)

    chunk['context'] = contexts

    # Keep the answer column only when the input actually has one
    output_columns = ["prompt", "context", "A", "B", "C", "D", "E"]
    if "answer" in chunk.columns:
        output_columns.append("answer")
    chunk[output_columns].to_csv(f"./train_context_{chunk_id}.csv", index=False)