# Indexing: Text Embedding with OpenAI

In [None]:
# Run the line of code below to check the version of langchain in the current environment.
# Replace "langchain" with any other package name to check that package's version.

In [None]:
pip show langchain

In [None]:
# Load the IPython extension named "dotenv", then run its %dotenv line magic.
# NOTE(review): presumably this reads a local .env file so the OpenAI API key
# is available to OpenAIEmbeddings further down — confirm the file is present.
%load_ext dotenv
%dotenv

In [None]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np

In [None]:
# Load the Word document. Only the first Document returned by the loader
# (pages[0]) is used by the markdown split below.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# First pass: split the text on "#" and "##" markdown headers, which are
# paired with the names "Course Title" and "Lecture Title" respectively.
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)

pages_md_split = md_splitter.split_text(pages[0].page_content)

# Collapse every run of whitespace (newlines included) into a single space.
# The documents are mutated in place, so iterate over them directly rather
# than indexing through range(len(...)).
for section in pages_md_split:
    section.page_content = " ".join(section.page_content.split())

# Second pass: break each header-level section into smaller chunks, splitting
# on "." with chunk_size=500 and an overlap of 50 between neighbouring chunks.
char_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=500,
    chunk_overlap=50,
)

pages_char_split = char_splitter.split_documents(pages_md_split)

In [None]:
# Display the full list of chunks produced by the character splitter.
pages_char_split

In [None]:
# Embedding client configured for the "text-embedding-ada-002" model.
# NOTE(review): presumably picks up the OpenAI API key from the environment
# populated by the %dotenv cell — confirm.
embedding = OpenAIEmbeddings(model = "text-embedding-ada-002")

In [None]:
# Inspect the chunk at index 18, one of the three chunks embedded below.
pages_char_split[18]

In [None]:
# Embed the text of three chunks (indices 3, 5 and 18), issuing one
# embed_query call per chunk in that order.
vector1, vector2, vector3 = (
    embedding.embed_query(pages_char_split[idx].page_content)
    for idx in (3, 5, 18)
)

In [None]:
# Number of components in each embedding vector.
# NOTE(review): all three should presumably be equal, since they come from the
# same model — confirm from the output.
len(vector1), len(vector2), len(vector3)

In [None]:
# Pairwise dot products of the three embeddings: (1,2), (1,3), (2,3).
# NOTE(review): a dot product equals cosine similarity only when the vectors
# have unit length — check the norms in the next cell.
np.dot(vector1, vector2), np.dot(vector1, vector3), np.dot(vector2, vector3)

In [None]:
# Norm of each embedding, computed with np.linalg.norm's default arguments.
# NOTE(review): values of ~1.0 would mean the dot products computed earlier
# are already cosine similarities — confirm from the output.
np.linalg.norm(vector1), np.linalg.norm(vector2), np.linalg.norm(vector3)