# 1. IMPORTING LIBRARIES, FUNCTION AND DEFINING GLOBAL VARIABLES

## 1.1 Importing Libraries

In [1]:
# Library needed to load the pdf
import fitz

# Library needed to process the text using regular expressions
import re

# Library needed to display or process the data in forms of dataframes
import pandas as pd

# Library needed to handle the operations in deep learning
import torch

# Library needed to convert the data into arrays for faster processing
import numpy as np

# Library to handle operating system related operations
import os

## 1.2 Importing Functions

In [2]:
# (OPTIONAL) Function to beautify the waiting process with a loading bar
from tqdm.notebook import tqdm as tqdm

# Function to process the text in English
from spacy.lang.en import English

# Function to convert paragraphs to sentences
from sentence_transformers import SentenceTransformer

# Function to provide utility services to process the text such as tokenization, sentencizer
from sentence_transformers import util

# Functions for loading the LLM model
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig

# Function for fetching the paths to pdfs
from glob import glob

## 1.3 Defining Global Variables

In [98]:
# Global variable consisting of all the stop words
from spacy.lang.en import STOP_WORDS

# Global variable telling about the number of sentences in each chunk stored in the dictionary
SENTENCE_CHUNKS = 10

# Global variable storing the name of the model that is used for the embedding
EMBEDDING_MODEL = 'all-mpnet-base-v2'

# Global variable storing the names of the pdfs that are to be loaded to be fed into the RAG model
PDF_PATHS = list()

# Global variable storing the integer telling to fetch the top k similar records for further processing
K = 50

# Global variable storing the name of the LLM model that will be used for augmenting the similar data
LLM_MODEL = 'google/gemma-2b-it'

# (FOR TESTING) Global variable storing the query that user wants to ask
QUERY = "What is overfitting in machine learning? Explain in 200 words"

# Setting up the device agnostic code
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Global variable for setting the temperature of the LLM model i.e how much data should LLM generate
TEMPERATURE = 0.3

## Global variable defining the length of tokens that the LLM has to generate
MAX_NEW_TOKENS = 512

# 2. DATA ACQUISITION

## 2.1 Getting the paths to all the pdfs in the `Dataset` folder

In [4]:
PDF_PATHS = glob('.\\Dataset\\*.pdf')

In [5]:
for idx, path in tqdm(enumerate(PDF_PATHS), total=len(PDF_PATHS)):
    print(f"{idx+1}. {path[10:-4]}")

  0%|          | 0/20 [00:00<?, ?it/s]

1. 120_24pgs_mlinterviewquestions
2. 4.41_faq_dl_8pgs
3. 40 Artificial Intelligence Interview Questions & Answers 
4. 41 Essential Machine Learning Interview Questions
5. 5.AI_40Questions
6. 6.45_ml_important_questions_21pgs
7. 7.beginner_nn_deep_learning
8. 8.Data Science Interview Questions
9. 9.interview_preparation_50
10. bagging_ensemblelearning
11. Data Science Interview Questions
12. Deep Learning Interview Questions
13. Interview Questions about Python Programming
14. Machine Learning Interview
15. ML FINAL (1)
16. MLBOOK
17. Top 100 Machine Learning Questions & Answers
18. Top 100 NLP Questions
19. Top 100 Python Interview Questions You Must Prepare In 2019
20. Top 40 Python Interview Questions & Answers


## 2.2 Opening all the documents

In [6]:
documents = list()
for path in tqdm(PDF_PATHS, total=len(PDF_PATHS)):
    doc = fitz.open(path)
    documents.append(doc)

  0%|          | 0/20 [00:00<?, ?it/s]

In [7]:
for doc in tqdm(documents, total=len(documents)):
    print(doc)

  0%|          | 0/20 [00:00<?, ?it/s]

Document('.\Dataset\120_24pgs_mlinterviewquestions.pdf')
Document('.\Dataset\4.41_faq_dl_8pgs.pdf')
Document('.\Dataset\40 Artificial Intelligence Interview Questions & Answers .pdf')
Document('.\Dataset\41 Essential Machine Learning Interview Questions.pdf')
Document('.\Dataset\5.AI_40Questions.pdf')
Document('.\Dataset\6.45_ml_important_questions_21pgs.pdf')
Document('.\Dataset\7.beginner_nn_deep_learning.pdf')
Document('.\Dataset\8.Data Science Interview Questions.pdf')
Document('.\Dataset\9.interview_preparation_50.pdf')
Document('.\Dataset\bagging_ensemblelearning.pdf')
Document('.\Dataset\Data Science Interview Questions.pdf')
Document('.\Dataset\Deep Learning Interview Questions.pdf')
Document('.\Dataset\Interview Questions about Python Programming.pdf')
Document('.\Dataset\Machine Learning Interview.pdf')
Document('.\Dataset\ML FINAL (1).pdf')
Document('.\Dataset\MLBOOK.pdf')
Document('.\Dataset\Top 100 Machine Learning Questions & Answers.pdf')
Document('.\Dataset\Top 100 NLP 

## 2.3 Getting the text from all the documents

In [8]:
pages = dict()
for doc in tqdm(documents, total=len(documents)):
    for page_number, page in tqdm(enumerate(doc), total=len(doc)):
        page_number = len(pages)
        pages[page_number] = page.get_text()

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [9]:
for page_number, page in tqdm(pages.items(), total=len(documents)):
    print(f"{page_number}. {pages[page_number]}")
    print()

  0%|          | 0/20 [00:00<?, ?it/s]

0. Data Science Interview Questions Pdf 
1. What is meant by selection bias? 
Answer: Selection bias is a type of error that arises when the researcher decides on 
whom he is going to conduct the study. It happens when the selection of participants 
takes place not randomly. Selection bias is also sometimes referred to as a selection 
effect. It works more effectively and sometimes if the selection bias is not taken into 
account, the conclusions of the study may go wrong. 
2. What is a Boltzmann machine? 
Answer: Boltzmann developed with simple learning algorithms that allow them to find 
the important information that was presented in the complex regularities in the data. 
These machines are generally used to optimize the quantity and the weights of the 
given problem. The learning program works very slow in networks due to many layers of 
feature detectors. When we consider Restricted Boltzmann Machines, this has a single 
algorithm feature detectors that make it faster compared to 

## 2.4 Getting the metadata of each page

In [10]:
pages_and_metadata = list()
for page_number, page in tqdm(pages.items(), total=len(pages)):
    metadata = dict()
    metadata['page_number'] = page_number
    metadata['raw_text'] = page
    metadata['number_of_characters'] = len(page)
    metadata['number_of_tokens'] = len(page)/4
    metadata['number_of_words'] = len(page.split())
    pages_and_metadata.append(metadata)

  0%|          | 0/641 [00:00<?, ?it/s]

In [11]:
df = pd.DataFrame(pages_and_metadata)

In [12]:
df.describe().round(2)

Unnamed: 0,page_number,number_of_characters,number_of_tokens,number_of_words
count,641.0,641.0,641.0,641.0
mean,320.0,1604.09,401.02,263.04
std,185.19,811.66,202.91,136.5
min,0.0,3.0,0.75,1.0
25%,160.0,1094.0,273.5,180.0
50%,320.0,1601.0,400.25,260.0
75%,480.0,2128.0,532.0,347.0
max,640.0,3653.0,913.25,843.0


In [13]:
print(f"The number of tokens this model is being trained on are: {df["number_of_tokens"].sum()/1000000:.2f} million tokens")

The number of tokens this model is being trained on are: 0.26 million tokens


In [14]:
print(f"The number of pages in the database are: {len(pages_and_metadata)}")

The number of pages in the database are: 641


## 2.4 Preprocessing the `raw_text` from metadata

In [15]:
def convert_to_lowercase(text):
    new_text = text.lower()
    return new_text

In [16]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in STOP_WORDS:
            new_text.append(word)
    return " ".join(new_text)

In [17]:
def remove_html_tags(text):
    new_text = re.sub(r"<!--.*?-->", "", text)
    return new_text

In [18]:
def remove_newlines(text):
    new_text = re.sub(r"\n+", " ", text)
    return new_text

In [19]:
def remove_multiple_spaces(text):
    new_text = text.replace("  ", " ")
    return new_text

In [20]:
def remove_comments(text):
    new_text = re.sub(r"<!--.*?-->", "", text)
    return new_text

In [21]:
def remove_unnecessary_text(text):
    new_text = text.replace("answer:","").replace("question", "").replace(":","").replace("  "," ")
    return new_text

In [22]:
def preprocess_text(text):
    text = convert_to_lowercase(text)
    text = remove_stopwords(text)
    text = remove_html_tags(text)
    text = remove_newlines(text)
    text = remove_multiple_spaces(text)
    text = remove_comments(text)
    text = remove_unnecessary_text(text)
    return text

In [23]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    page["formatted_text"] = preprocess_text(page["raw_text"])

  0%|          | 0/641 [00:00<?, ?it/s]

In [24]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["formatted_text"])
    print()

  0%|          | 0/641 [00:00<?, ?it/s]

data science interview s pdf 1. meant selection bias? selection bias type error arises researcher decides going conduct study. happens selection participants takes place randomly. selection bias referred selection effect. works effectively selection bias taken account, conclusions study wrong. 2. boltzmann machine? boltzmann developed simple learning algorithms allow find important information presented complex regularities data. machines generally optimize quantity weights given problem. learning program works slow networks layers feature detectors. consider restricted boltzmann machines, single algorithm feature detectors faster compared others. 3. difference cluster systematic sampling? cluster sampling technique difficult study target population spread wide area simple random sampling applied. cluster sample probability sample sampling unit collection cluster elements. systematic sampling statistical technique elements selected ordered sampling frame. systematic sampling, list prog

## 2.5 Converting the paragraphs to sentences

In [63]:
nlp = English()
nlp.add_pipe('sentencizer')
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    sentences = nlp(page["formatted_text"]).sents
    sentences = list(set([str(sentence).strip() for sentence in sentences if len(str(sentence).split())>10]))
    pages_and_metadata[page["page_number"]]["sentences"] = sentences

  0%|          | 0/641 [00:00<?, ?it/s]

In [64]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["sentences"])
    print()

  0%|          | 0/641 [00:00<?, ?it/s]

['boltzmann developed simple learning algorithms allow find important information presented complex regularities data.', 'banking industry giving loans primary source making money time repayment rate good profit, risk huge losses.', 'says sample means, sample variance sample standard deviation converge trying estimate.', 'cluster sampling technique difficult study target population spread wide area simple random sampling applied.', 'systematic sampling, list progressed circular manner reach end list, progressed again.', 'banks don’t want lose good customers point time, don’t want acquire bad customers.', 'consider restricted boltzmann machines, single algorithm feature detectors faster compared others.']

['predictor variables money spent election campaigning particular candidate, time spent campaigning, etc.', 'studying target population spread wide area difficult applying simple random sampling ineffective, technique cluster sampling used.', 'couple layers added input output size lay

## 2.6 Update the metadata

In [65]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    pages_and_metadata[page["page_number"]]["number_of_sentences"] = len(page["sentences"])

  0%|          | 0/641 [00:00<?, ?it/s]

In [66]:
for key in pages_and_metadata[0].keys():
    print(key)

page_number
raw_text
number_of_characters
number_of_tokens
number_of_words
formatted_text
sentences
number_of_sentences
sentence_chunk
embeddings


## 2.7 Converting sentences to sentence_chunks

In [67]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    sentences = pages_and_metadata[page["page_number"]]["sentences"]
    sentence_chunk = [sentences[i : i+SENTENCE_CHUNKS] for i in range(0, len(sentences), SENTENCE_CHUNKS)]
    page["sentence_chunk"] = sentence_chunk

  0%|          | 0/641 [00:00<?, ?it/s]

In [68]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["sentence_chunk"])
    print()

  0%|          | 0/641 [00:00<?, ?it/s]

[['boltzmann developed simple learning algorithms allow find important information presented complex regularities data.', 'banking industry giving loans primary source making money time repayment rate good profit, risk huge losses.', 'says sample means, sample variance sample standard deviation converge trying estimate.', 'cluster sampling technique difficult study target population spread wide area simple random sampling applied.', 'systematic sampling, list progressed circular manner reach end list, progressed again.', 'banks don’t want lose good customers point time, don’t want acquire bad customers.', 'consider restricted boltzmann machines, single algorithm feature detectors faster compared others.']]

[['predictor variables money spent election campaigning particular candidate, time spent campaigning, etc.', 'studying target population spread wide area difficult applying simple random sampling ineffective, technique cluster sampling used.', 'couple layers added input output size 

## 2.8 Converting sentence_chunks into sentence embeddings

In [69]:
embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL).to(device)

In [70]:
for page in pages_and_metadata:
    embeddings = list()
    print(f"Processing Page {page['page_number']}")
    for sentence in page["sentences"]:
        # sentence = sentence.to(device)
        embedding = embedding_model.encode(sentence, batch_size=512, convert_to_tensor=True, show_progress_bar=False, device=device)
        embedding = np.stack(embedding.tolist(), axis=0)
        embedding = torch.tensor(embedding)
        embedding = embedding.type(torch.float32)
        embeddings.append(embedding)
    sentence_embeddings = [np.array(embedding) for embedding in embeddings]
    pages_and_metadata[page["page_number"]]["embeddings"] = sentence_embeddings

Processing Page 0
Processing Page 1
Processing Page 2
Processing Page 3
Processing Page 4
Processing Page 5
Processing Page 6
Processing Page 7
Processing Page 8
Processing Page 9
Processing Page 10
Processing Page 11
Processing Page 12
Processing Page 13
Processing Page 14
Processing Page 15
Processing Page 16
Processing Page 17
Processing Page 18
Processing Page 19
Processing Page 20
Processing Page 21
Processing Page 22
Processing Page 23
Processing Page 24
Processing Page 25
Processing Page 26
Processing Page 27
Processing Page 28
Processing Page 29
Processing Page 30
Processing Page 31
Processing Page 32
Processing Page 33
Processing Page 34
Processing Page 35
Processing Page 36
Processing Page 37
Processing Page 38
Processing Page 39
Processing Page 40
Processing Page 41
Processing Page 42
Processing Page 43
Processing Page 44
Processing Page 45
Processing Page 46
Processing Page 47
Processing Page 48
Processing Page 49
Processing Page 50
Processing Page 51
Processing Page 52
Pro

In [71]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(np.array(page["embeddings"]).shape)

  0%|          | 0/641 [00:00<?, ?it/s]

(7, 768)
(5, 768)
(10, 768)
(6, 768)
(9, 768)
(9, 768)
(13, 768)
(11, 768)
(9, 768)
(7, 768)
(8, 768)
(8, 768)
(8, 768)
(6, 768)
(11, 768)
(8, 768)
(8, 768)
(11, 768)
(6, 768)
(5, 768)
(8, 768)
(9, 768)
(12, 768)
(5, 768)
(7, 768)
(9, 768)
(8, 768)
(13, 768)
(7, 768)
(12, 768)
(2, 768)
(6, 768)
(3, 768)
(5, 768)
(2, 768)
(2, 768)
(2, 768)
(2, 768)
(4, 768)
(3, 768)
(1, 768)
(3, 768)
(3, 768)
(3, 768)
(5, 768)
(3, 768)
(3, 768)
(1, 768)
(4, 768)
(3, 768)
(3, 768)
(5, 768)
(3, 768)
(3, 768)
(1, 768)
(2, 768)
(3, 768)
(2, 768)
(5, 768)
(0,)
(5, 768)
(6, 768)
(6, 768)
(5, 768)
(5, 768)
(4, 768)
(5, 768)
(4, 768)
(6, 768)
(6, 768)
(4, 768)
(6, 768)
(7, 768)
(7, 768)
(6, 768)
(5, 768)
(3, 768)
(4, 768)
(7, 768)
(7, 768)
(3, 768)
(5, 768)
(2, 768)
(2, 768)
(2, 768)
(2, 768)
(4, 768)
(3, 768)
(1, 768)
(3, 768)
(3, 768)
(3, 768)
(5, 768)
(3, 768)
(3, 768)
(1, 768)
(4, 768)
(3, 768)
(3, 768)
(5, 768)
(3, 768)
(3, 768)
(1, 768)
(2, 768)
(3, 768)
(2, 768)
(5, 768)
(0,)
(5, 768)
(6, 768)
(6, 768)
(

## 2.9 Checking the metadata present for use

In [72]:
for key in pages_and_metadata[0].keys():
    print(key)

page_number
raw_text
number_of_characters
number_of_tokens
number_of_words
formatted_text
sentences
number_of_sentences
sentence_chunk
embeddings


# 3. FETCHING SIMILAR CONTENT

## 3.1 Getting the data embeddings

In [73]:
pages_and_metadata_embeddings = []

for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    page_embeddings = []
    for chunk_embedding in pages_and_metadata[page["page_number"]]["embeddings"]:
        if isinstance(chunk_embedding, torch.Tensor):
            chunk_embedding = chunk_embedding.tolist()
        page_embeddings.append(chunk_embedding)
    pages_and_metadata_embeddings.append(page_embeddings)

  0%|          | 0/641 [00:00<?, ?it/s]

## 3.2 Converting each embedding into the same dimensions

In [74]:
if pages_and_metadata_embeddings:
    embedding_dim = len(pages_and_metadata_embeddings[0][0])
    pages_and_metadata_embeddings = [
            [np.pad(chunk, (0, max(0, embedding_dim - len(chunk))), mode='constant')[:embedding_dim]
             for chunk in page]
            for page in pages_and_metadata_embeddings
        ]

## 3.3 Flattening the nested list of embeddings and the sentence to fetch by index

In [75]:
flat_embeddings = [chunk for page in pages_and_metadata_embeddings for chunk in page]
flat_data = [sentence for page in pages_and_metadata for sentence in page["sentences"]]

## 3.4 Saving the flattened embeddings and the flattened data

In [76]:
df = pd.DataFrame(flat_embeddings)
df.to_csv("embeddings_v3.csv", index=False)

df = pd.DataFrame(flat_data)
df.to_csv("data_v3.csv", index=False)

## 3.5 Loading the flattened embeddings and flattened data

In [77]:
flat_embeddings = pd.read_csv("embeddings_v3.csv").to_numpy()
flat_data = pd.read_csv("data_v3.csv")["0"].tolist()

In [78]:
print(f"The size of the flat_embeddings is {len(flat_embeddings)}")
print(f"The size of the flat_data is {len(flat_data)}")

The size of the flat_embeddings is 3531
The size of the flat_data is 3531


## 3.6 Converting embeddings to numpy array

In [79]:
pages_and_metadata_embeddings = np.array(flat_embeddings, dtype=np.float32)

## 3.7 Converting the numpy array embeddings to torch tensors

In [80]:
pages_and_metadata_embeddings = torch.tensor(pages_and_metadata_embeddings, dtype=torch.float32).to(device)

## 3.8 Getting the similarity score by query

In [99]:
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
query_embeddings = embedding_model.encode(QUERY, convert_to_tensor=True).to(device)
dot_score = util.dot_score(query_embeddings, pages_and_metadata_embeddings)[0]

## 3.9 Getting the top k similar scores

In [100]:
top_scores, top_indices = torch.topk(dot_score, k=K)

In [101]:
print(f"Top scores: {top_scores}")
print(f"Top indices: {top_indices}")

Top scores: tensor([0.7078, 0.6232, 0.6208, 0.6178, 0.6178, 0.6054, 0.5918, 0.5918, 0.5917,
        0.5829, 0.5771, 0.5741, 0.5741, 0.5692, 0.5692, 0.5648, 0.5648, 0.5575,
        0.5546, 0.5524, 0.5505, 0.5386, 0.5386, 0.5329, 0.5311, 0.5233, 0.5233,
        0.5229, 0.5227, 0.5187, 0.5187, 0.5187, 0.5140, 0.5137, 0.5114, 0.5081,
        0.5041, 0.5022, 0.4989, 0.4967, 0.4951, 0.4950, 0.4946, 0.4920, 0.4920,
        0.4920, 0.4920, 0.4919, 0.4918, 0.4915], device='cuda:0')
Top indices: tensor([  60,  895, 3149,  766, 1234, 2405,   33,  896, 3221,  191, 3114,  763,
        1231,  400,  587,  352,  539, 2380,  183, 3219,   44,  403,  590, 3125,
        1694,  447,  634,  193, 1745,  108,  142,  926, 2383,  867,  144, 3112,
        1079,   68, 1753, 2778, 3116, 1798,   64,   30,  889,  321,  508, 3229,
         893,  103], device='cuda:0')


## 3.10 Getting the top k content based on the scores

In [102]:
context = list()
for index in top_indices:
    print(f"Fetching data from page {index}")
    context.append(flat_data[index.item()])

Fetching data from page 60
Fetching data from page 895
Fetching data from page 3149
Fetching data from page 766
Fetching data from page 1234
Fetching data from page 2405
Fetching data from page 33
Fetching data from page 896
Fetching data from page 3221
Fetching data from page 191
Fetching data from page 3114
Fetching data from page 763
Fetching data from page 1231
Fetching data from page 400
Fetching data from page 587
Fetching data from page 352
Fetching data from page 539
Fetching data from page 2380
Fetching data from page 183
Fetching data from page 3219
Fetching data from page 44
Fetching data from page 403
Fetching data from page 590
Fetching data from page 3125
Fetching data from page 1694
Fetching data from page 447
Fetching data from page 634
Fetching data from page 193
Fetching data from page 1745
Fetching data from page 108
Fetching data from page 142
Fetching data from page 926
Fetching data from page 2383
Fetching data from page 867
Fetching data from page 144
Fetching da

In [103]:
print(context)

['overfitting comes light data associated complexity, means associated parameters relative number observations.', 'follow steve nouri ai data science posts https//lnkd.in/gzu463x overfitting, statistical model describes random error noise instead underlying relationship.', 'simple restatement fundamental problem machine learning possibility overfitting training data carrying noise data test set, providing inaccurate generalizations.', 'lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.', 'lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.', 'overfitting evaluation 83 (from weiss, s., kulikowski, c., computer systems learn, morgan kaufmann, 1991) training errors validation errors 1 2 3 4 5 6 7 8 9 0.2 0.4 0.6 0.8 1.0 0 0 error rate number terminal nodes iris data decision tree figure 6.8 determining overﬁtting begins stopping growth decision tree, grow size prune away leaf nodes ancestors cross- validation accurac

# 4. Augmentation

## 4.1 Login to HuggingFace CLI

In [104]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 4.2 Loading the LLM model

In [105]:
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=LLM_MODEL,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=False,
).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 4.3 Augmenting the prompt for instructing the LLM in a better way

In [106]:
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
context = "\n -".join(context)
base_prompt = f'''Bases on the following context items, please answer the query
Context Items:
{context}
Query:
{QUERY}
Answer:'''

In [107]:
base_prompt = base_prompt.format(context=context, query=QUERY)

## 4.4 Creating the dialogue template for the LLM

In [108]:
dialogue_template = [{
    "role": "user",
    "content": base_prompt,
}]

## 4.5 Applying the prompt to the dialogue template

In [109]:
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

## 4.6 Providing the prompt and retrieving the answer from the LLM model

In [110]:
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**input_ids, temperature=TEMPERATURE, do_sample=True, max_new_tokens=MAX_NEW_TOKENS)
output_text = tokenizer.decode(outputs[0])

In [111]:
print(output_text)

<bos><bos><start_of_turn>user
Bases on the following context items, please answer the query
Context Items:
overfitting comes light data associated complexity, means associated parameters relative number observations.
 -follow steve nouri ai data science posts https//lnkd.in/gzu463x overfitting, statistical model describes random error noise instead underlying relationship.
 -simple restatement fundamental problem machine learning possibility overfitting training data carrying noise data test set, providing inaccurate generalizations.
 -lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.
 -lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.
 -overfitting evaluation 83 (from weiss, s., kulikowski, c., computer systems learn, morgan kaufmann, 1991) training errors validation errors 1 2 3 4 5 6 7 8 9 0.2 0.4 0.6 0.8 1.0 0 0 error rate number terminal nodes iris data decision tree figure 6.8 determining overﬁtting begin

In [112]:
idx = output_text.find("Answer")

In [113]:
answer = output_text[idx+7:]

In [114]:
answer = answer.replace("**", "")
answer = answer.replace("<start_of_turn>model","")
answer = re.sub("<.*?>", "", answer)
# answer = answer[]

In [115]:
print(f"The cleaned answer is: {answer}")

The cleaned answer is: 

Sure, here's a 200-word explanation of overfitting in machine learning:

Overfitting is a phenomenon where a machine learning model becomes too closely fit to the training data and fails to generalize well to new, unseen data. This results in poor predictive performance, even on data similar to the training set.

Overfitting occurs when a model is trained on a relatively small dataset and is unable to generalize its knowledge to a larger, unseen dataset. This can happen when the model has too few parameters to capture the underlying relationship between the features and the target variable. As a result, the model makes random errors that are not representative of the true underlying relationship.

Overfitting can be addressed by increasing the size of the training dataset, using regularization techniques, or by using cross-validation to select the optimal model parameters.
