# Indexing Text Embeddings with Azure OpenAI

In [3]:
# Register the IPython extension named `dotenv`, then run its `%dotenv` line magic.
# NOTE(review): presumably this loads a local .env file holding the Azure OpenAI
# settings needed by AzureOpenAIEmbeddings further down — confirm against the .env.
%load_ext dotenv
%dotenv

In [4]:
import numpy as np
from langchain_chroma import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter

In [5]:
# Load the course .docx from the working directory.
loader_docx = Docx2txtLoader("./Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# Each pair is (markdown heading marker, metadata key under which that
# heading's text is recorded on the resulting documents).
header_levels = [
    ("#", "course title"),
    ("##", "section title"),
]
md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)

# Only the first loaded document is split.
first_page_text = pages[0].page_content
pages_md_split = md_splitter.split_text(first_page_text)

In [6]:
# Debug aid: uncomment the next line to display the header-level split result.
# pages_md_split

In [7]:
# Collapse every run of whitespace (newlines, tabs, repeated spaces) in each
# header-level document into a single space. The documents are updated in place.
for doc in pages_md_split:
    words = doc.page_content.split()
    doc.page_content = " ".join(words)

# Number of header-level documents.
print(len(pages_md_split))

# Second pass: split on "." with chunk_size=500 and chunk_overlap=50.
char_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=500,
    chunk_overlap=50,
)
pages_char_split = char_splitter.split_documents(pages_md_split)

# Number of chunks after the character-level split.
print(len(pages_char_split))

2
20


In [9]:
# Embedding client used by every later cell.
# NOTE(review): no endpoint, key or API version is passed here, so they must
# come from elsewhere — presumably environment variables; confirm.
embedding_model_name = "text-embedding-ada-002"
embedding = AzureOpenAIEmbeddings(model=embedding_model_name)

In [10]:
# Embed three sample chunks (indices 3, 5 and 18) so they can be compared in
# the following cells. The calls are made one at a time, in index order.
vector3, vector5, vector18 = [
    embedding.embed_query(pages_char_split[chunk_index].page_content)
    for chunk_index in (3, 5, 18)
]


In [11]:
# Dimensionality of each sample embedding, displayed as a tuple.
tuple(len(vec) for vec in (vector3, vector5, vector18))

(1536, 1536, 1536)

In [12]:
# Pairwise dot products of the three sample embeddings, in the order
# (3·5, 3·18, 5·18). A larger value means the two chunks point in a more
# similar direction; the norm check in the next cell shows the vectors have
# length ≈ 1, in which case the dot product equals the cosine similarity.
np.dot(vector3, vector5), np.dot(vector3, vector18), np.dot(vector5, vector18)

(np.float64(0.879128449794393),
 np.float64(0.8000235828747095),
 np.float64(0.7934993700101878))

In [None]:
# Euclidean (L2) norm, i.e. the length, of each sample embedding.
tuple(np.linalg.norm(vec) for vec in (vector3, vector5, vector18))

(np.float64(0.999999951896922),
 np.float64(0.9999999432048747),
 np.float64(0.9999999688261214))

In [None]:
# Build a Chroma vector store from the chunked documents, embedding each chunk
# with `embedding`, and persist it under ./data.
#
# Explicit, deterministic ids are supplied so that re-running this cell
# (e.g. Restart & Run All) rewrites the same 'chunk-N' entries instead of
# appending a second copy of every chunk under fresh random ids.
# Caveats: entries written by earlier runs without explicit ids are not
# removed, and ids are positional, so delete ./data and rebuild if the source
# document or the splitter settings change.
chunk_ids = [f"chunk-{position}" for position in range(len(pages_char_split))]
vectorstore = Chroma.from_documents(
    documents=pages_char_split,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory="./data",
)

In [None]:
# Re-open the vector store persisted under ./data, passing the same embedding
# client as the store's embedding function.
vectorstore_from_directory = Chroma(
    persist_directory="./data",
    embedding_function=embedding,
)