# OpenBook DeBERTaV3-Large with an updated model

This work is based on the great [work](https://www.kaggle.com/code/nlztrk/openbook-debertav3-large-baseline-single-model) of [nlztrk](https://www.kaggle.com/nlztrk).

I trained a model offline using the dataset I shared [here](https://www.kaggle.com/datasets/mgoksu/llm-science-exam-dataset-w-context). I just added my model to the original notebook. The model is available [here](https://www.kaggle.com/datasets/mgoksu/llm-science-run-context-2).

I also addressed the problem of [CSV Not Found at submission](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/434228) with this notebook by clipping the context like so:

`test_df["prompt"] = test_df["context"].apply(lambda x: x[:1500]) + " #### " +  test_df["prompt"]`

You can probably keep more than 1500 characters of context without running out of memory (OOM).

In [2]:
# # installing offline dependencies
# !pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# !cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
# !pip install -U /kaggle/working/sentence-transformers
# !pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl

# !pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
# !pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
# !pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
# !pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

In [3]:
N_TOP_DOCS = 10
## Parameter to determine how many relevant sentences to include
NUM_SENTENCES_INCLUDE = 400

In [4]:
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf
from __future__ import annotations

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/viktor/miniconda3/envs/torch-env/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
  warn(msg)
  warn(msg)


In [5]:
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Turn raw documents into a DataFrame, optionally split into sentences.

    The documents are first passed through `sectionize_documents`, which
    yields one row per document. When `split_sentences` is set, those rows
    are then handed to `sentencize` and its result is returned instead.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to split each document into sentences
    :param filter_len: Minimum character length of a sentence (forwarded to `sentencize`)
    :param disable_progress_bar: Flag to disable the tqdm progress bars of both helpers
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    sections = sectionize_documents(documents, document_ids, disable_progress_bar)

    # Nothing more to do when the caller wants whole documents.
    if not split_sentences:
        return sections

    return sentencize(sections.text.values,
                      sections.document_id.values,
                      sections.offset.values,
                      filter_len,
                      disable_progress_bar)


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap every document in a single row that spans the whole document.

    No real sectioning happens here: each document becomes one row whose
    `text` is the full document and whose `offset` is `(0, len(document))`.

    :param documents: Documents which are strings; must support `len()` (used for the progress bar total)
    :param document_ids: Iterable containing document unique identifiers, paired positionally with `documents`
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
             sorted by `document_id` then `offset`; an empty DataFrame when there are no documents
    """
    paired = tqdm(zip(document_ids, documents), total=len(documents), disable=disable_progress_bar)
    records = [
        {'document_id': doc_id, 'text': doc, 'offset': (0, len(doc))}
        for doc_id, doc in paired
    ]

    frame = pd.DataFrame(records)
    # An empty frame has no columns to sort on, so hand it back untouched.
    if frame.shape[0] == 0:
        return frame
    return frame.sort_values(['document_id', 'offset']).reset_index(drop=True)


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    Documents that fail during splitting are skipped (best effort); rows
    already collected for such a document before the failure are kept.

    :param documents: Documents which are strings; must support `len()` (used for the progress bar total)
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Sentences spanning `filter_len` offset units or fewer are dropped
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
             (the columns are present even when no sentence was produced)
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            # NOTE(review): the offsets are used directly as str indices below;
            # confirm blingfire reports character offsets rather than UTF-8 byte offsets.
            for start, end in sentence_offsets:
                if end - start > filter_len:
                    document_sentences.append({
                        'document_id': document_id,
                        'text': document[start:end],
                        # Shift by the section start so the offset is relative to the original document
                        'offset': (start + offset[0], end + offset[0]),
                    })
        except Exception:
            # Deliberately best-effort: skip documents that cannot be split.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            continue
    # Explicit columns keep the schema stable when nothing was collected.
    return pd.DataFrame(document_sentences, columns=['document_id', 'text', 'offset'])

In [6]:
## Sentence-embedding model used for retrieval (Kaggle offline copy kept for reference)
# SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'
SIM_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
## Passed as `device=` to model.encode
DEVICE = 0
## Assigned to model.max_seq_length
MAX_LENGTH = 512
## Passed as `batch_size=` to model.encode
BATCH_SIZE = 16

## Directory holding the per-file Wikipedia parquet files (Kaggle path kept for reference)
# WIKI_PATH = "/kaggle/input/wikipedia-20230701"
WIKI_PATH = "/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-datasets/wikipedia-2023-07-faiss-index"
wiki_files = os.listdir(WIKI_PATH)

# Relevant Title Retrieval

In [7]:
# trn = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv").drop("id", 1)
trn = pd.read_csv("/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-llm-science-exam/train.csv")
trn.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [8]:
model = SentenceTransformer(SIM_MODEL, device='cuda')
model.max_seq_length = MAX_LENGTH
model = model.half()

In [9]:
# sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")
sentence_index = read_index("/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-datasets/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [10]:
prompt_embeddings = model.encode(trn.prompt.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
_ = gc.collect()

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [11]:
## Get the top N_TOP_DOCS pages that are likely to contain the topic of interest
search_score, search_index = sentence_index.search(prompt_embeddings, N_TOP_DOCS)

In [12]:
search_score.shape

(200, 10)

In [13]:
## Save memory - delete sentence_index since it is no longer necessary
del sentence_index
del prompt_embeddings
_ = gc.collect()
libc.malloc_trim(0)

1

# Getting Sentences from the Relevant Titles

In [14]:
# df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet", columns=['id', 'file'])
df = pd.read_parquet("/home/viktor/Documents/kaggle/kaggle_llm/data/kaggle-datasets/wikipedia-2023-07-faiss-index/wiki_2023_index.parquet", columns=['id', 'file'])

In [15]:
## Resolve every prompt's retrieved index rows to their article id and parquet file,
## tagging each hit with the position of the prompt it was retrieved for
wikipedia_file_data = pd.concat([
    df.loc[hit_rows].assign(prompt_id=prompt_pos)
    for prompt_pos, hit_rows in tqdm(enumerate(search_index), total=len(search_score))
])

## One row per unique (article, prompt, file) triple, ordered by file then article id
wikipedia_file_data = (
    wikipedia_file_data
    .reset_index(drop=True)[['id', 'prompt_id', 'file']]
    .drop_duplicates()
    .sort_values(['file', 'id'])
    .reset_index(drop=True)
)

## Save memory - delete df since it is no longer necessary
del df
_ = gc.collect()
libc.malloc_trim(0)

  0%|          | 0/200 [00:00<?, ?it/s]

1

In [16]:
wikipedia_file_data

Unnamed: 0,id,prompt_id,file
0,1059683,177,a.parquet
1,1141,36,a.parquet
2,1141,151,a.parquet
3,11963992,185,a.parquet
4,11963992,191,a.parquet
...,...,...,...
1995,262861,32,z.parquet
1996,27554141,16,z.parquet
1997,3014017,3,z.parquet
1998,34527,103,z.parquet


In [17]:
wikipedia_file_data[wikipedia_file_data['prompt_id'] == 177]

Unnamed: 0,id,prompt_id,file
0,1059683,177,a.parquet
253,25453500,177,c.parquet
398,1939808,177,d.parquet
565,915761,177,e.parquet
907,18472,177,l.parquet
984,585883,177,l.parquet
1775,655358,177,s.parquet
1918,41835,177,u.parquet
1920,4825999,177,u.parquet
1980,82759,177,w.parquet


In [18]:
wikipedia_file_data.prompt_id.unique().shape

(200,)

In [19]:
## Load the full article text of every retrieved article, one parquet file at a time
files_to_read = wikipedia_file_data.file.unique()
article_frames = []

for file in tqdm(files_to_read, total=len(files_to_read)):
    in_this_file = wikipedia_file_data['file'] == file
    wanted_ids = list(map(str, wikipedia_file_data.loc[in_this_file, 'id'].tolist()))
    shard = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])

    ## Keep only the requested articles, then release the full file before reading the next one
    article_frames.append(shard[shard['id'].isin(wanted_ids)].copy())
    del shard
    _ = gc.collect()
    libc.malloc_trim(0)

wiki_text_data = pd.concat(article_frames).drop_duplicates().reset_index(drop=True)
del article_frames
_ = gc.collect()

  0%|          | 0/28 [00:00<?, ?it/s]

In [20]:
wiki_text_data

Unnamed: 0,id,text
0,5259071,A Briefer History of Time is a 2006 popular-sc...
1,65293114,A History of the Theories of Aether and Electr...
2,41452537,A Slower Speed of Light is a freeware video ga...
3,1550261,"The American Petroleum Institute gravity, or A..."
4,4389619,"In superconductivity, fluxon (also called a Ab..."
...,...,...
1868,40874497,z8_GND_5296 is a dwarf galaxy discovered in Oc...
1869,27554141,thumb|Theoretical illustration of the zero-cur...
1870,262861,The zeroth law of thermodynamics is one of the...
1871,3014017,"In mathematics and theoretical physics, zeta f..."


In [21]:
# pair every (prompt, article) hit with the article text; hits without loaded text are dropped
prompt_texts = wikipedia_file_data.merge(wiki_text_data, on='id', how='inner')[['prompt_id', 'text']]

# one row per prompt: all of its article texts joined with a single space
prompt_texts = (
    prompt_texts
    .groupby('prompt_id')['text']
    .apply(lambda texts: ' '.join(texts))
    .reset_index()
)

# attach the joined text to the questions (trn index matched against prompt_id)
merged_df = trn.merge(prompt_texts, left_index=True, right_on='prompt_id', how='inner')
merged_df['context'] = merged_df['text']

# save to train_with_dense_context.csv
merged_df.to_csv('train_with_dense_context.csv', index=False)

merged_df

Unnamed: 0,id,prompt,A,B,C,D,E,answer,prompt_id,text,context
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,0,"In astronomy and cosmology, baryonic dark matt...","In astronomy and cosmology, baryonic dark matt..."
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,1,"In computer architecture, dynamic voltage scal...","In computer architecture, dynamic voltage scal..."
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,2,[[Image:Triskele-Symbol1.svg|right|thumb|200px...,[[Image:Triskele-Symbol1.svg|right|thumb|200px...
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,3,"In physics and applied mathematics, analytical...","In physics and applied mathematics, analytical..."
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,4,"In astronomy, angular diameter distance is a d...","In astronomy, angular diameter distance is a d..."
...,...,...,...,...,...,...,...,...,...,...,...
195,195,What is the relation between the three moment ...,The three moment theorem expresses the relatio...,The three moment theorem is used to calculate ...,The three moment theorem describes the relatio...,The three moment theorem is used to calculate ...,The three moment theorem is used to derive the...,C,195,"In solid mechanics, a bending moment is the re...","In solid mechanics, a bending moment is the re..."
196,196,"What is the throttling process, and why is it ...",The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,The throttling process is a steady adiabatic f...,The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,B,196,Bandwidth throttling consists in the intention...,Bandwidth throttling consists in the intention...
197,197,What happens to excess base metal as a solutio...,"The excess base metal will often solidify, bec...",The excess base metal will often crystallize-o...,"The excess base metal will often dissolve, bec...","The excess base metal will often liquefy, beco...","The excess base metal will often evaporate, be...",B,197,"In metallurgy, cold forming or cold working is...","In metallurgy, cold forming or cold working is..."
198,198,"What is the relationship between mass, force, ...",Mass is a property that determines the weight ...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is a property that determines the size of...,D,198,Accelerations in special relativity (SR) follo...,Accelerations in special relativity (SR) follo...


In [22]:
def split_text(text: str, word_limit: int = 1000) -> list[str]:
    """
    Split text into overlapping fixed-size character windows.

    Despite its name, `word_limit` counts characters, not words: every chunk
    is a slice of at most `word_limit` characters, and consecutive chunks
    start `word_limit // 2` characters apart, so neighbouring chunks overlap
    by roughly half a window.

    :param text: String to split
    :param word_limit: Maximum chunk length in characters (must be >= 1)
    :return: List of chunks in document order; empty list for an empty string
    :raises ValueError: If `word_limit` is smaller than 1
    """
    if word_limit < 1:
        raise ValueError("word_limit must be at least 1")

    # word_limit == 1 would give a stride of 0, which range() rejects.
    stride = max(1, word_limit // 2)
    return [text[start:start + word_limit] for start in range(0, len(text), stride)]

## Chunk every context with split_text, then emit one row per chunk
merged_df = merged_df.assign(
    context_splitted=merged_df['context'].apply(split_text)
).explode('context_splitted')

## Each row's context is now its own chunk (kept as-is, not stripped)
merged_df['context'] = merged_df['context_splitted']

merged_df.to_csv("train_with_dense_context_exploded.csv")


merged_df

Unnamed: 0,id,prompt,A,B,C,D,E,answer,prompt_id,text,context,context_splitted
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,0,"In astronomy and cosmology, baryonic dark matt...","In astronomy and cosmology, baryonic dark matt...","In astronomy and cosmology, baryonic dark matt..."
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,0,"In astronomy and cosmology, baryonic dark matt...",uding non-emitting ordinary atoms. ==Presence=...,uding non-emitting ordinary atoms. ==Presence=...
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,0,"In astronomy and cosmology, baryonic dark matt...",amount of baryonic dark matter is much smaller...,amount of baryonic dark matter is much smaller...
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,0,"In astronomy and cosmology, baryonic dark matt...","rium were somehow generated, but large efforts...","rium were somehow generated, but large efforts..."
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,0,"In astronomy and cosmology, baryonic dark matt...",References == Category:Dark matter Category:B...,References == Category:Dark matter Category:B...
...,...,...,...,...,...,...,...,...,...,...,...,...
199,199,What did Arthur Eddington discover about two o...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,C,199,Sir Arthur Stanley Eddington (28 December 1882...,me it was accelerated (the retarded time and p...,me it was accelerated (the retarded time and p...
199,199,What did Arthur Eddington discover about two o...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,C,199,Sir Arthur Stanley Eddington (28 December 1882...,"e mass's actual position at constant velocity,...","e mass's actual position at constant velocity,..."
199,199,What did Arthur Eddington discover about two o...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,C,199,Sir Arthur Stanley Eddington (28 December 1882...,"the object (mass). Only gravitational waves, c...","the object (mass). Only gravitational waves, c..."
199,199,What did Arthur Eddington discover about two o...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,Arthur Eddington showed that two of Einstein's...,C,199,Sir Arthur Stanley Eddington (28 December 1882...,"gnetically charged objects), there is little o...","gnetically charged objects), there is little o..."


In [23]:
trn

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D
...,...,...,...,...,...,...,...,...
195,195,What is the relation between the three moment ...,The three moment theorem expresses the relatio...,The three moment theorem is used to calculate ...,The three moment theorem describes the relatio...,The three moment theorem is used to calculate ...,The three moment theorem is used to derive the...,C
196,196,"What is the throttling process, and why is it ...",The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,The throttling process is a steady adiabatic f...,The throttling process is a steady flow of a f...,The throttling process is a steady adiabatic f...,B
197,197,What happens to excess base metal as a solutio...,"The excess base metal will often solidify, bec...",The excess base metal will often crystallize-o...,"The excess base metal will often dissolve, bec...","The excess base metal will often liquefy, beco...","The excess base metal will often evaporate, be...",B
198,198,"What is the relationship between mass, force, ...",Mass is a property that determines the weight ...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is an inertial property that determines a...,Mass is a property that determines the size of...,D


In [24]:
wiki_text_data['text_len'] = wiki_text_data['text'].str.len()
wiki_text_data

Unnamed: 0,id,text,text_len
0,5259071,A Briefer History of Time is a 2006 popular-sc...,729
1,65293114,A History of the Theories of Aether and Electr...,51324
2,41452537,A Slower Speed of Light is a freeware video ga...,3575
3,1550261,"The American Petroleum Institute gravity, or A...",5953
4,4389619,"In superconductivity, fluxon (also called a Ab...",2431
...,...,...,...
1868,40874497,z8_GND_5296 is a dwarf galaxy discovered in Oc...,2643
1869,27554141,thumb|Theoretical illustration of the zero-cur...,1064
1870,262861,The zeroth law of thermodynamics is one of the...,14299
1871,3014017,"In mathematics and theoretical physics, zeta f...",9818


In [25]:
## Parse documents into sentences
processed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)

  0%|          | 0/1873 [00:00<?, ?it/s]

  0%|          | 0/1873 [00:00<?, ?it/s]

In [26]:
processed_wiki_text_data

Unnamed: 0,document_id,text,offset
0,1008471,The Wigner–Eckart theorem is a theorem of repr...,"(0, 86)"
1,1008471,It states that matrix elements of spherical te...,"(87, 345)"
2,1008471,The name derives from physicists Eugene Wigner...,"(346, 658)"
3,1008471,"Mathematically, the Wigner–Eckart theorem is g...","(659, 742)"
4,1008471,Given a tensor operator T^{(k)} and two states...,"(743, 1225)"
...,...,...,...
88192,997690,"The Dielectric Constant of Liquid Helium, Phys...","(955, 1055)"
88193,997690,"Quote: : ""The temperature scale used was the 1...","(1056, 1120)"
88194,997690,"(emphasis added) *Awbery, J. H.; Heat, Rep. Pr...","(1121, 1203)"
88195,997690,"Quote: : ""It should be mentioned that below −1...","(1204, 1412)"


In [27]:
## Get embeddings of the wiki text data
wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
                                    batch_size=BATCH_SIZE,
                                    device=DEVICE,
                                    show_progress_bar=True,
                                    convert_to_tensor=True,
                                    normalize_embeddings=True)#.half()
wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/5513 [00:00<?, ?it/s]

In [28]:
_ = gc.collect()

In [29]:
## Join the five answer options of every question into one space-separated string
option_columns = ['A', 'B', 'C', 'D', 'E']
trn['answer_all'] = trn.apply(lambda row: " ".join(row[col] for col in option_columns), axis=1)

## Query text for the sentence search: the question followed by all of its options
trn['prompt_answer_stem'] = trn['prompt'].str.cat(trn['answer_all'], sep=" ")

In [30]:
question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
question_embeddings = question_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

# Extracting Matching Prompt-Sentence Pairs

In [31]:


## List containing just Context: one string per prompt, built from the
## best-matching sentences of the articles retrieved for that prompt
contexts = []

for r in tqdm(trn.itertuples(), total=len(trn)):

    prompt_id = r.Index

    ## Rows of processed_wiki_text_data that belong to this prompt's articles
    prompt_doc_ids = wikipedia_file_data[wikipedia_file_data['prompt_id'] == prompt_id]['id'].values
    prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(prompt_doc_ids)].index.values

    ## Reset for every prompt so a prompt without sentences gets an empty
    ## context instead of a NameError or the previous prompt's context
    context = ""

    if prompt_indices.shape[0] > 0:
        prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
        prompt_index.add(wiki_data_embeddings[prompt_indices])

        prompt_sentences = processed_wiki_text_data.loc[prompt_indices, 'text'].values

        ## Get the top matches for this prompt only. Never ask for more hits
        ## than there are sentences: faiss pads missing results with -1,
        ## which would otherwise index the last sentence over and over.
        n_hits = min(NUM_SENTENCES_INCLUDE, prompt_indices.shape[0])
        _, ii = prompt_index.search(question_embeddings[prompt_id:prompt_id + 1], n_hits)

        ## Same layout as before: every sentence is followed by one space
        context = "".join(prompt_sentences[_i] + " " for _i in ii[0] if _i >= 0)

    contexts.append(context)

  0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
trn['context'] = contexts

In [None]:
## Write the prompt, its context and the answer options; the `answer`
## column is included only when the input data provides it
output_columns = ["prompt", "context", "A", "B", "C", "D", "E"]
if "answer" in trn.columns:
    output_columns.append("answer")
trn[output_columns].to_csv("./train_context.csv", index=False)