# 1. IMPORTING LIBRARIES, FUNCTION AND DEFINING GLOBAL VARIABLES

## 1.1 Importing Libraries

In [1]:
# Library needed to load the pdf
import fitz

# Library needed to process the text using regular expressions
import re

# Library needed to display or process the data in forms of dataframes
import pandas as pd

# Library needed to handle the operations in deep learning
import torch

# Library needed to convert the data into arrays for faster processing
import numpy as np

# Library to handle operating system related operations
import os

## 1.2 Importing Functions

In [2]:
# (OPTIONAL) Function to beautify the waiting process with a loading bar
from tqdm import tqdm as tqdm

# Function to process the text in English
from spacy.lang.en import English

# Function to convert paragraphs to sentences
from sentence_transformers import SentenceTransformer

# Function to provide utility services to process the text such as tokenization, sentencizer
from sentence_transformers import util

# Functions for loading the LLM model
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import AutoModelForSeq2SeqLM

# Function for fetching the paths to pdfs
from glob import glob

## 1.3 Defining Global Variables

In [62]:
# Global variable consisting of all the stop words
from spacy.lang.en import STOP_WORDS

# Global variable telling about the number of sentences in each chunk stored in the dictionary
SENTENCE_CHUNKS = 10

# Global variable storing the name of the model that is used for the embedding
EMBEDDING_MODEL = 'all-MiniLM-L12-v2'

# Global variable storing the names of the pdfs that are to be loaded to be fed into the RAG model
PDF_PATHS = list()

# Global variable storing the integer telling to fetch the top k similar records for further processing
K = 10

# Global variable storing the name of the LLM model that will be used for augmenting the similar data
LLM_MODEL = 'google/gemma-2b-it'

# (FOR TESTING) Global variable storing the query that user wants to ask
QUERY = "What are some good practices in machine learning?"

# Setting up the device agnostic code
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Global variable for setting the temperature of the LLM model i.e how much data should LLM generate
TEMPERATURE = 0.1

## Global variable defining the length of tokens that the LLM has to generate
MAX_NEW_TOKENS = 1024

# 2. DATA ACQUISITION

## 2.1 Getting the paths to all the pdfs in the `Dataset` folder

In [4]:
PDF_PATHS = glob('.\\Dataset\\*.pdf')

In [5]:
for idx, path in tqdm(enumerate(PDF_PATHS), total=len(PDF_PATHS)):
    print(f"{idx+1}. {path[10:-4]}")

100%|██████████████████████████████████████████████████████████████████████████████| 71/71 [00:00<00:00, 108644.87it/s]

1. 1. Introduction to machine learning Author Nils J. Nilsson
2. 120_24pgs_mlinterviewquestions
3. 2. Machine Learning Author Jaydip Sen
4. 3. Undergraduate Fundamentals of Machine Learning Author William J. Deuschle
5. 4. Machine Learning - Supervised Techniques Author Sepp Hochreiter
6. 4.41_faq_dl_8pgs
7. 40 Artificial Intelligence Interview Questions & Answers 
8. 40. Supervised Learning - An Introduction Author Michael Biehl
9. 41 Essential Machine Learning Interview Questions
10. 43. Supervised Machine Learning - A Brief Introduction Author Seemant TIWARI
11. 44. Supervised Machine Learning Author Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten and Thomas B. Schön
12. 45. Unsupervised learning - a systematic literature review Author Salim Dridi
13. 46. Unsupervised Learning Author Wei Wu
14. 48. Unsupervised learning Author Hannah Van Santvliet
15. 5. Machine learning - The power and promise of computers that learn by example Author Royal Society
16. 5.AI_40Questions
17. 50.




## 2.2 Opening all the documents

In [6]:
documents = list()
for path in tqdm(PDF_PATHS, total=len(PDF_PATHS)):
    doc = fitz.open(path)
    documents.append(doc)

100%|██████████████████████████████████████████████████████████████████████████████████| 71/71 [00:00<00:00, 80.24it/s]


In [7]:
for doc in tqdm(documents, total=len(documents)):
    print(doc)

100%|██████████████████████████████████████████████████████████████████████████████| 71/71 [00:00<00:00, 138703.11it/s]

Document('.\Dataset\1. Introduction to machine learning Author Nils J. Nilsson.pdf')
Document('.\Dataset\120_24pgs_mlinterviewquestions.pdf')
Document('.\Dataset\2. Machine Learning Author Jaydip Sen.pdf')
Document('.\Dataset\3. Undergraduate Fundamentals of Machine Learning Author William J. Deuschle.pdf')
Document('.\Dataset\4. Machine Learning - Supervised Techniques Author Sepp Hochreiter.pdf')
Document('.\Dataset\4.41_faq_dl_8pgs.pdf')
Document('.\Dataset\40 Artificial Intelligence Interview Questions & Answers .pdf')
Document('.\Dataset\40. Supervised Learning - An Introduction Author Michael Biehl.pdf')
Document('.\Dataset\41 Essential Machine Learning Interview Questions.pdf')
Document('.\Dataset\43. Supervised Machine Learning - A Brief Introduction Author Seemant TIWARI.pdf')
Document('.\Dataset\44. Supervised Machine Learning Author Andreas Lindholm, Niklas Wahlström, Fredrik Lindsten and Thomas B. Schön.pdf')
Document('.\Dataset\45. Unsupervised learning - a systematic lite




## 2.3 Getting the text from all the documents

In [8]:
pages = dict()
for doc in tqdm(documents, total=len(documents)):
    for page_number, page in enumerate(doc):
        if(page_number<15): 
            continue
        page_number = len(pages)
        pages[page_number] = page.get_text()

 76%|██████████████████████████████████████████████████████████████▎                   | 54/71 [00:12<00:04,  3.45it/s]

MuPDF error: format error: object out of range (4392 0 R); xref size 4383

MuPDF error: format error: object out of range (4395 0 R); xref size 4383

MuPDF error: format error: object out of range (4395 0 R); xref size 4383

MuPDF error: format error: object out of range (4395 0 R); xref size 4383

MuPDF error: format error: object out of range (4395 0 R); xref size 4383

MuPDF error: format error: object out of range (4405 0 R); xref size 4383

MuPDF error: format error: object out of range (4408 0 R); xref size 4383

MuPDF error: format error: object out of range (4408 0 R); xref size 4383

MuPDF error: format error: object out of range (4408 0 R); xref size 4383

MuPDF error: format error: object out of range (4408 0 R); xref size 4383

MuPDF error: format error: object out of range (4419 0 R); xref size 4383

MuPDF error: format error: object out of range (4422 0 R); xref size 4383

MuPDF error: format error: object out of range (4422 0 R); xref size 4383

MuPDF error: format error

100%|██████████████████████████████████████████████████████████████████████████████████| 71/71 [00:21<00:00,  3.26it/s]


In [9]:
for page_number, page in tqdm(pages.items(), total=len(documents)):
    print(f"{page_number}. {pages[page_number]}")
    print()

14218it [00:00, 129868.58it/s]                                                                                         IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




## 2.4 Getting the metadata of each page

In [10]:
pages_and_metadata = list()
for page_number, page in tqdm(pages.items(), total=len(pages)):
    metadata = dict()
    metadata['page_number'] = page_number
    metadata['raw_text'] = page
    metadata['number_of_characters'] = len(page)
    metadata['number_of_tokens'] = len(page)/4
    metadata['number_of_words'] = len(page.split())
    pages_and_metadata.append(metadata)

100%|█████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 74841.70it/s]


In [11]:
df = pd.DataFrame(pages_and_metadata)

In [12]:
df.describe().round(2)

Unnamed: 0,page_number,number_of_characters,number_of_tokens,number_of_words
count,14218.0,14218.0,14218.0,14218.0
mean,7108.5,1845.25,461.31,299.41
std,4104.53,924.49,231.12,156.26
min,0.0,0.0,0.0,0.0
25%,3554.25,1189.0,297.25,185.0
50%,7108.5,1899.0,474.75,310.0
75%,10662.75,2475.75,618.94,410.0
max,14217.0,8994.0,2248.5,1429.0


In [13]:
print(f"The number of tokens this model is being trained on are: {df["number_of_tokens"].sum()/1000000:.2f} million tokens")

The number of tokens this model is being trained on are: 6.56 million tokens


In [14]:
print(f"The number of pages in the database are: {len(pages_and_metadata)}")

The number of pages in the database are: 14218


## 2.4 Preprocessing the `raw_text` from metadata

In [15]:
def convert_to_lowercase(text):
    new_text = text.lower()
    return new_text

In [16]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in STOP_WORDS:
            new_text.append(word)
    return " ".join(new_text)

In [17]:
def remove_html_tags(text):
    new_text = re.sub(r"<!--.*?-->", "", text)
    return new_text

In [18]:
def remove_newlines(text):
    new_text = re.sub(r"\n+", " ", text)
    return new_text

In [19]:
def remove_multiple_spaces(text):
    new_text = text.replace("  ", " ")
    return new_text

In [20]:
def remove_comments(text):
    new_text = re.sub(r"<!--.*?-->", "", text)
    return new_text

In [21]:
def remove_unnecessary_text(text):
    new_text = text.replace("answer:","").replace("question", "").replace(":","").replace("  "," ")
    return new_text

In [22]:
def preprocess_text(text):
    text = convert_to_lowercase(text)
    text = remove_stopwords(text)
    text = remove_html_tags(text)
    text = remove_newlines(text)
    text = remove_multiple_spaces(text)
    text = remove_comments(text)
    text = remove_unnecessary_text(text)
    return text

In [23]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    page["formatted_text"] = preprocess_text(page["raw_text"])

100%|█████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 16655.68it/s]


In [24]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["formatted_text"])
    print()

100%|████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 164870.40it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




## 2.5 Converting the paragraphs to sentences

In [27]:
nlp = English()
nlp.add_pipe('sentencizer')
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    sentences = nlp(page["formatted_text"]).sents
    sentences = list(set([str(sentence).strip() for sentence in sentences if len(str(sentence).split())>10]))
    pages_and_metadata[page["page_number"]]["sentences"] = sentences

100%|███████████████████████████████████████████████████████████████████████████| 14218/14218 [00:19<00:00, 716.97it/s]


In [28]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["sentences"])
    print()

100%|█████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 99991.81it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




## 2.6 Update the metadata

In [29]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    pages_and_metadata[page["page_number"]]["number_of_sentences"] = len(page["sentences"])

100%|███████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 2826017.17it/s]


In [30]:
for key in pages_and_metadata[0].keys():
    print(key)

page_number
raw_text
number_of_characters
number_of_tokens
number_of_words
formatted_text
sentences
number_of_sentences


## 2.7 Converting sentences to sentence_chunks

In [31]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    sentences = pages_and_metadata[page["page_number"]]["sentences"]
    sentence_chunk = [sentences[i : i+SENTENCE_CHUNKS] for i in range(0, len(sentences), SENTENCE_CHUNKS)]
    page["sentence_chunk"] = sentence_chunk

100%|████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 591249.57it/s]


In [32]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["sentence_chunk"])
    print()

100%|████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 100221.02it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




## 2.8 Converting sentence_chunks into sentence embeddings

In [33]:
embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL).to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [34]:
for page in tqdm(pages_and_metadata):
    embeddings = list()
    # print(f"Processing Page {page['page_number']}")
    for sentence in page["sentences"]:
        # sentence = sentence.to(device)
        embedding = embedding_model.encode(sentence, batch_size=1024, convert_to_tensor=True, show_progress_bar=False, device=device)
        embedding = np.stack(embedding.tolist(), axis=0)
        embedding = torch.tensor(embedding)
        embedding = embedding.type(torch.float32)
        embeddings.append(embedding)
    sentence_embeddings = [np.array(embedding) for embedding in embeddings]
    pages_and_metadata[page["page_number"]]["embeddings"] = sentence_embeddings

100%|████████████████████████████████████████████████████████████████████████████| 14218/14218 [23:19<00:00, 10.16it/s]


In [35]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(np.array(page["embeddings"]).shape)

100%|████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 179869.38it/s]

(6, 384)
(8, 384)
(13, 384)
(8, 384)
(6, 384)
(7, 384)
(5, 384)
(0,)
(6, 384)
(5, 384)
(6, 384)
(14, 384)
(6, 384)
(2, 384)
(4, 384)
(11, 384)
(12, 384)
(2, 384)
(1, 384)
(0,)
(7, 384)
(5, 384)
(4, 384)
(5, 384)
(7, 384)
(8, 384)
(11, 384)
(10, 384)
(10, 384)
(9, 384)
(7, 384)
(4, 384)
(5, 384)
(11, 384)
(4, 384)
(8, 384)
(8, 384)
(11, 384)
(5, 384)
(9, 384)
(4, 384)
(2, 384)
(10, 384)
(6, 384)
(8, 384)
(6, 384)
(10, 384)
(8, 384)
(6, 384)
(4, 384)
(6, 384)
(4, 384)
(11, 384)
(6, 384)
(9, 384)
(0,)
(9, 384)
(3, 384)
(8, 384)
(1, 384)
(4, 384)
(4, 384)
(3, 384)
(6, 384)
(8, 384)
(1, 384)
(7, 384)
(7, 384)
(5, 384)
(7, 384)
(4, 384)
(3, 384)
(8, 384)
(7, 384)
(7, 384)
(10, 384)
(6, 384)
(11, 384)
(9, 384)
(3, 384)
(1, 384)
(0,)
(7, 384)
(10, 384)
(7, 384)
(8, 384)
(9, 384)
(5, 384)
(5, 384)
(7, 384)
(7, 384)
(8, 384)
(2, 384)
(11, 384)
(12, 384)
(18, 384)
(4, 384)
(1, 384)
(1, 384)
(0,)
(7, 384)
(11, 384)
(9, 384)
(7, 384)
(9, 384)
(8, 384)
(13, 384)
(8, 384)
(13, 384)
(9, 384)
(8, 384)





## 2.9 Checking the metadata present for use

In [36]:
for key in pages_and_metadata[0].keys():
    print(key)

page_number
raw_text
number_of_characters
number_of_tokens
number_of_words
formatted_text
sentences
number_of_sentences
sentence_chunk
embeddings


# 3. FETCHING SIMILAR CONTENT

## 3.1 Getting the data embeddings

In [37]:
pages_and_metadata_embeddings = []

for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    page_embeddings = []
    for chunk_embedding in pages_and_metadata[page["page_number"]]["embeddings"]:
        if isinstance(chunk_embedding, torch.Tensor):
            chunk_embedding = chunk_embedding.tolist()
        page_embeddings.append(chunk_embedding)
    pages_and_metadata_embeddings.append(page_embeddings)

100%|████████████████████████████████████████████████████████████████████████| 14218/14218 [00:00<00:00, 474658.06it/s]


## 3.2 Converting each embedding into the same dimensions

In [38]:
if pages_and_metadata_embeddings:
    embedding_dim = len(pages_and_metadata_embeddings[0][0])
    pages_and_metadata_embeddings = [
            [np.pad(chunk, (0, max(0, embedding_dim - len(chunk))), mode='constant')[:embedding_dim]
             for chunk in page]
            for page in pages_and_metadata_embeddings
        ]

## 3.3 Flattening the nested list of embeddings and the sentence to fetch by index

In [39]:
flat_embeddings = [chunk for page in pages_and_metadata_embeddings for chunk in page]
flat_data = [sentence for page in pages_and_metadata for sentence in page["sentences"]]

## 3.4 Saving the flattened embeddings and the flattened data

In [40]:
df = pd.DataFrame(flat_embeddings)
df.to_csv("embeddings_v5.csv", index=False)

df = pd.DataFrame(flat_data)
df.to_csv("data_v5.csv", index=False)

## 3.5 Loading the flattened embeddings and flattened data

In [41]:
flat_embeddings = pd.read_csv("embeddings_v5.csv").to_numpy()
flat_data = pd.read_csv("data_v5.csv")["0"].tolist()

In [42]:
print(f"The size of the flat_embeddings is {len(flat_embeddings)}")
print(f"The size of the flat_data is {len(flat_data)}")

The size of the flat_embeddings is 92230
The size of the flat_data is 92230


## 3.6 Converting embeddings to numpy array

In [63]:
pages_and_metadata_embeddings = np.array(flat_embeddings, dtype=np.float32)

## 3.7 Converting the numpy array embeddings to torch tensors

In [64]:
pages_and_metadata_embeddings = torch.tensor(pages_and_metadata_embeddings, dtype=torch.float32).to(device)

## 3.8 Getting the similarity score by query

In [65]:
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
query_embeddings = embedding_model.encode(QUERY, convert_to_tensor=True).to(device)
dot_score = util.dot_score(query_embeddings, pages_and_metadata_embeddings)[0]

## 3.9 Getting the top k similar scores

In [66]:
top_scores, top_indices = torch.topk(dot_score, k=K)

In [67]:
print(f"Top scores: {top_scores}")
print(f"Top indices: {top_indices}")

Top scores: tensor([0.6711, 0.6603, 0.6594, 0.6514, 0.6495, 0.6474, 0.6468, 0.6301, 0.6280,
        0.6271])
Top indices: tensor([48851, 23529, 10543, 66547,  3542,  6514, 15708, 64971, 63383, 75427])


## 3.10 Getting the top k content based on the scores

In [68]:
context = list()
for index in top_indices:
    print(f"Fetching data from page {index}")
    context.append(flat_data[index.item()])

Fetching data from page 48851
Fetching data from page 23529
Fetching data from page 10543
Fetching data from page 66547
Fetching data from page 3542
Fetching data from page 6514
Fetching data from page 15708
Fetching data from page 64971
Fetching data from page 63383
Fetching data from page 75427


In [69]:
print(context)

['machine learning set methods allow computers learn data improve predictions (for example cancer, weekly sales, credit default).', 'principles mind considering machine learning problem coming later, finally fist glance practical issues.', 'course, we’ve ideas, too making use sufﬁciently large data sets (to help avoid overﬁtting); right cost function (to avoid learning slowdown); good weight initializations (also avoid learning slowdown, neuron saturation); algorithmically expanding training data.', 'leave explore possibilities machine learning offers, want final words advice, point additional resources, suggestions improve machine learning data science skills.', 'example, supervised learning techniques (forms regression, neural networks, support vector machines, etc.)', 'so, ﬁnd interesting concepts, practical tools evaluating understanding supervised machine learning methods better.', 'understandings, ensure machine learning fairly or, rather, generate unfair harmful consequences.', 

# 4. Augmentation

## 4.1 Login to HuggingFace CLI

In [70]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 4.2 Loading the LLM model

In [71]:
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=LLM_MODEL,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=False,
).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 4.3 Augmenting the prompt for instructing the LLM in a better way

In [72]:
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
context = "\n -".join(context)
base_prompt = f'''Based on the following context items, please answer the query
Context Items:
{context}
Query:
{QUERY}
Answer:'''

In [73]:
base_prompt = base_prompt.format(context=context, query=QUERY)

## 4.4 Creating the dialogue template for the LLM

In [74]:
dialogue_template = [{
    "role": "user",
    "content": base_prompt,
}]

## 4.5 Applying the prompt to the dialogue template

In [75]:
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

## 4.6 Providing the prompt and retrieving the answer from the LLM model

In [76]:
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**input_ids, temperature=TEMPERATURE, do_sample=True, max_new_tokens=MAX_NEW_TOKENS)
output_text = tokenizer.decode(outputs[0])

In [77]:
print(output_text)

<bos><bos><start_of_turn>user
Based on the following context items, please answer the query
Context Items:
machine learning set methods allow computers learn data improve predictions (for example cancer, weekly sales, credit default).
 -principles mind considering machine learning problem coming later, finally fist glance practical issues.
 -course, we’ve ideas, too making use sufﬁciently large data sets (to help avoid overﬁtting); right cost function (to avoid learning slowdown); good weight initializations (also avoid learning slowdown, neuron saturation); algorithmically expanding training data.
 -leave explore possibilities machine learning offers, want final words advice, point additional resources, suggestions improve machine learning data science skills.
 -example, supervised learning techniques (forms regression, neural networks, support vector machines, etc.)
 -so, ﬁnd interesting concepts, practical tools evaluating understanding supervised machine learning methods better.
 -

In [78]:
idx = output_text.find("Answer")

In [79]:
answer = output_text[idx+7:]

In [80]:
answer = answer.replace("**", "")
answer = answer.replace("<start_of_turn>model","")
answer = re.sub("<.*?>", "", answer)
# answer = answer[]

In [81]:
print(f"The cleaned answer is: {answer}")

The cleaned answer is: 

The context items provide several good practices in machine learning, including:

- Data preparation:
 - Choosing the right cost function to avoid learning slowdown.
 - Ensuring data is sufficiently large to avoid overfitting.
 - Using good initial weight initialization to avoid neuron saturation.


- Algorithm selection:
 - Exploring different machine learning methods to find the one that best fits the problem.


- Model understanding:
 - Understanding the algorithm in detail to gain insights into its workings.


- Fairness and bias:
 - Ensuring the algorithm is fair and does not generate unfair or harmful consequences.


- Data exploration:
 - Spending time exploring different data sets to find the best one for the task.


- Evaluation and validation:
 - Using appropriate evaluation metrics to assess the performance of the model.


- Continuous learning:
 - Continuously monitoring the model's performance and making adjustments as needed.
