In [1]:
import os
# import openai
from langchain.llms import HuggingFaceHub
import sys
import numpy as np

# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv())
# Read the Hugging Face API key from the environment; a missing variable raises KeyError here.
hf_key = os.environ['HUGGING_FACE_API_KEY']
# openai.api_key  = os.environ['OPENAI_API_KEY']

### Importing LangChain packages

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

## Loading Data

In [12]:
# Location of the source PDF. Defaults to the original local path; set the
# CAREER_AI_PDF_PATH environment variable to point at a copy elsewhere
# without editing the notebook.
pdfFilePath = os.environ.get(
    "CAREER_AI_PDF_PATH",
    "D:\\eBook-How-to-Build-a-Career-in-AI.pdf",
)

In [14]:
### Loading the book

# Build a PDF loader for the configured path and read the file into `pages`.
# In the recorded run this produced 41 entries.
loader = PyPDFLoader(pdfFilePath)
pages = loader.load()

In [16]:
# Number of entries returned by the loader.
print(len(pages))

41


In [20]:
# Take the entry at index 3 as a working sample and keep its raw text.
page = pages[3]
text = page.page_content

In [22]:
### Character count of the sample text, then the metadata of the sample page
print(len(text), page.metadata, sep="\n")

2445
{'source': 'D:\\eBook-How-to-Build-a-Career-in-AI.pdf', 'page': 3}


In [24]:
# Print the sample page object; its page_content appears in the output.
print(page)

page_content='PAGE 4Coding AI Is the New Literacy
Today we take it for granted that many people know how to read and write. Someday, I hope, 
it will be just as common that people know how to write code, specifically for AI.
Several hundred years ago, society didn’t view language literacy as a necessary skill. A small 
number of people learned to read and write, and everyone else let them do the reading and 
writing. It took centuries for literacy to spread, and now society is far richer for it.
Words enable deep human-to-human communication. Code is the deepest form of human-to-
machine communication. As machines become more central to daily life, that communication 
becomes ever more important.
Traditional software engineering — writing programs that explicitly tell a computer sequences 
of steps to execute — has been the main path to code literacy. Many introductory programming 
classes use creating a video game or building a website as examples. But AI, machine learning, 
and data 

### Splitting the text optimally

In [26]:
### Initialising the recursive text splitter

# Separators are listed from coarsest to finest: paragraph break, line break,
# sentence end, single space, then individual characters.
# The sentence-end entry is a regex lookbehind, so is_separator_regex=True is
# required: with the default (False) every separator is escaped and matched as
# literal text, and the pattern could never match. The dot is escaped as well
# so that only a real full stop followed by a space counts as a sentence end.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)

# Split the sample page text and show the chunk count and the chunks.
Rsplit = r_splitter.split_text(text)
print(len(Rsplit))
print(Rsplit)

25
['PAGE 4Coding AI Is the New Literacy\nToday we take it for granted that many people know how to read and write. Someday, I hope,', 'it will be just as common that people know how to write code, specifically for AI.', 'Several hundred years ago, society didn’t view language literacy as a necessary skill. A small', 'number of people learned to read and write, and everyone else let them do the reading and', 'writing. It took centuries for literacy to spread, and now society is far richer for it.', 'Words enable deep human-to-human communication. Code is the deepest form of human-to-', 'machine communication. As machines become more central to daily life, that communication \nbecomes ever more important.', 'Traditional software engineering — writing programs that explicitly tell a computer sequences', 'of steps to execute — has been the main path to code literacy. Many introductory programming', 'classes use creating a video game or building a website as examples. But AI, machine learn

In [28]:
# Plain character splitter that breaks on newlines, configured with
# chunk_size=1000 and chunk_overlap=150.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=150)

# Split the sample page text, then report the chunk count and the chunks.
Tsplit = text_splitter.split_text(text)
print(len(Tsplit), Tsplit, sep="\n")

3
['PAGE 4Coding AI Is the New Literacy\nToday we take it for granted that many people know how to read and write. Someday, I hope, \nit will be just as common that people know how to write code, specifically for AI.\nSeveral hundred years ago, society didn’t view language literacy as a necessary skill. A small \nnumber of people learned to read and write, and everyone else let them do the reading and \nwriting. It took centuries for literacy to spread, and now society is far richer for it.\nWords enable deep human-to-human communication. Code is the deepest form of human-to-\nmachine communication. As machines become more central to daily life, that communication \nbecomes ever more important.\nTraditional software engineering — writing programs that explicitly tell a computer sequences \nof steps to execute — has been the main path to code literacy. Many introductory programming \nclasses use creating a video game or building a website as examples. But AI, machine learning,', 'classe

In [30]:
# Run the newline-based splitter over every loaded page to build the chunk list.
docs = text_splitter.split_documents(pages)

In [32]:
# Compare the number of chunks with the number of loaded pages.
print(len(docs), len(pages))

80 41


In [34]:
print(docs[2].page_content)  # Text of the chunk at index 2; each printed line is one "\n"-delimited piece ("\n" is the separator given to text_splitter)

PAGE 3Table of 
ContentsIntroduction: Coding AI is the New Literacy.
Chapter 1: Three Steps to Career Growth.
Chapter 2: Learning Technical Skills for a 
Promising AI Career.
Chapter 3: Should You Learn Math to Get a Job 
in AI?
Chapter 4: Scoping Successful AI Projects.
Chapter 5: Finding Projects that Complement 
Your Career Goals.
Chapter 6: Building a Portfolio of Projects that 
Shows Skill Progression.
Chapter 7: A Simple Framework for Starting Your AI 
Job Search.
Chapter 8: Using Informational Interviews to Find 
the Right Job.
Chapter 9: Finding the Right AI Job for You.
Chapter 10: Keys to Building a Career in AI.
Chapter 11: Overcoming Imposter Syndrome.
Final Thoughts: Make Every Day Count.LEARNING
PROJECTS
JOB


## Vector Database and Text Embeddings

In [42]:
# Identifier of the embedding model to load.
model_name = "mixedbread-ai/mxbai-embed-large-v1"

# normalize_embeddings=True is forwarded to the encoder through encode_kwargs.
embedding = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=dict(normalize_embeddings=True))

In [44]:
### Pick three chunks from the recursive split to test the embedding and similarity scores
sentence1, sentence2, sentence3 = Rsplit[1], Rsplit[3], Rsplit[5]

# One sentence per output line.
print(sentence1, sentence2, sentence3, sep="\n")

it will be just as common that people know how to write code, specifically for AI.
number of people learned to read and write, and everyone else let them do the reading and
Words enable deep human-to-human communication. Code is the deepest form of human-to-


In [46]:
# Embed each sample sentence as a query, in order.
embedding1, embedding2, embedding3 = (
    embedding.embed_query(sentence)
    for sentence in (sentence1, sentence2, sentence3)
)

In [48]:
# Pairwise dot products of the three embeddings, in the order (1·2, 2·3, 3·1).
tuple(
    np.dot(left, right)
    for left, right in (
        (embedding1, embedding2),
        (embedding2, embedding3),
        (embedding3, embedding1),
    )
)

(0.5538325901583627, 0.5539500231029587, 0.6178851590517445)