# Loading Text, Chunking, Embedding and Upserting into Pinecone Index

Got most of these from James Briggs' notebook: https://www.pinecone.io/learn/langchain-retrieval-augmentation/

### 1. Load Text

In [7]:
import os
from pathlib import Path
import re
from llama_index import download_loader
import docx2txt
#doc_path = (r"hr_policy.txt")
directory = 'files'
contents = ''

PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_API_ENV = os.environ['PINECONE_API_ENV']
OPENAI_API_KEY   = os.environ['OPENAI_API_KEY']


#DocxReader = download_loader("DocxReader")
#loader = DocxReader()
    
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    print(file_path)
    
    #documents = loader.load_data(file=Path(file_downloaded_path))
    #content = "".join(doc.text for doc in documents)
    content = docx2txt.process(file_path)
    content = re.sub(r'Page \d+ of \d+', '', content)
    #print(content)
    
    contents = contents + '\n' + content
    print('..............\n\n')
print(contents)

files/Late-Night-Conveyance-Employees-v0.1.docx
..............


files/Leave-Policy-v0.1.docx
..............


files/Prevention-Prohibition-Redressal-Sexual-Harassment-v0.1.docx
..............


files/Employee-WorkPlaceHarrasmment-v0.1.docx
..............


files/Employee-Ethics-Code-of-Conduct-v0.1.docx
..............


files/Late-Night-Conveyance-Women-Employees-v0.1.docx
..............


files/AttendanceManagement-Policy-v0.1.docx
..............



Policy: Late Night Conveyance for EMPLOYEES

Objective

This Policy aims to ensure the safety of women employees who stay back late at office to meet project or department deadlines, by providing company transportation for them to return to their residence. This policy also addresses conveyance reimbursement for employees who stay late due to project deadlines. 

applicaBILITY

This policy is applicable to all employees of Protechsoft Systems Pvt. Ltd, India, hereafter referred to as the “Company”.

POLICY

The Company will arrange convey

In [8]:
# set up tokenizer
import tiktoken
tokenizer = tiktoken.get_encoding('p50k_base')
# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

# sample
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

28

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
model_name = 'text-embedding-ada-002'
OPENAI_API_KEY = OPENAI_API_KEY # there is a free tier. still trying to figure out how to use the azure deployment instead


def create_vectorstore(documents):
	text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=20,
        separators=["\n\n", "\n", " ", ""]
        )
	documents 		= text_splitter.split_text(documents)
    
    embeddings 	= OpenAIEmbeddings(
        model = model_name,
        openai_api_key=OPENAI_API_KEY
    )
	
    vectorstore =  Pinecone.from_documents(
		documents, 
		embeddings,
		index_name=PINECONE_INDEX_NAME,
		namespace=bot_id
	)
	print('........................\n\n')
	print(vectorstore)
	return vectorstore

def fetch_vectorstore(bot_id):
    embeddings = OpenAIEmbeddings()
    return Pinecone.from_existing_index(
        index_name=PINECONE_INDEX_NAME, 
        embedding=embeddings,
        namespace=bot_id
    )

### 2. Create chunking function

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    separators=["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_text(contents)
chunks[0]

'Policy: Late Night Conveyance for EMPLOYEES\n\nObjective\n\nThis Policy aims to ensure the safety of women employees who stay back late at office to meet project or department deadlines, by providing company transportation for them to return to their residence. This policy also addresses conveyance reimbursement for employees who stay late due to project deadlines. \n\napplicaBILITY'

### 3. Create Embeddings

In [10]:
# initialize embedding function
from langchain.embeddings.openai import OpenAIEmbeddings
import os
# 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
OPENAI_API_KEY = OPENAI_API_KEY # there is a free tier. still trying to figure out how to use the azure deployment instead

model_name = 'text-embedding-ada-002'

# set embeddings function
embed = OpenAIEmbeddings(
    model = model_name,
    openai_api_key=OPENAI_API_KEY
)

In [12]:
# create data format from chunked text for upserting into Pinecone index. Format: id, embeddings, metadata
from uuid import uuid4
import time
vectors = []
i = 0
for text in chunks:
    vectors.append((str(uuid4()), embed.embed_documents([text])[0], {"text": text}))
    i = i + 1
    if i == 3:
        time.sleep(65)
        i = 0
#vectors = [(str(uuid4()), embed.embed_documents([text])[0], {"text": text}) for text in chunks]


#### How the 'vectors' or embeddings look when printed. 
There are 1536 elements to the vector representing each chunk of data.

In [13]:
vectors

[('f1dbad00-b7d6-4e53-a390-8d09b2f3e133',
  [0.0037828231746120902,
   -0.00980396185475544,
   0.001554967003555466,
   -0.03776547643776502,
   -0.002454476869708631,
   -0.00920428884348397,
   0.002593935904467184,
   0.01888273821888251,
   -0.003211041644329434,
   -0.008597642088850393,
   -0.003022771993971516,
   0.010431527045507646,
   0.007642348678643,
   0.023749854154739947,
   -0.01681874487698495,
   0.006610352939016786,
   0.008151373736416563,
   -0.03042993661113601,
   0.016191180617555312,
   0.008869587346327956,
   -0.014894212432491076,
   -0.0099992043171529,
   -0.00551908690007061,
   0.03148982266759783,
   0.005051899180195587,
   -0.0067323794780152,
   0.009462287545559882,
   -0.03536678066738888,
   0.012969679987997203,
   -0.0021999645736524905,
   0.014433998455978161,
   -0.017892580282816116,
   0.001388487856482402,
   -0.00499960215857645,
   -0.01870144138056413,
   -0.03285651990438006,
   0.008235049343536208,
   0.004954277948996857,
   0.0

![vectors](assets/vectors.png)

### 4. Prep Pinecone Index

In [14]:
import pinecone

index_name = 'tk-policy'
dimension=1536

pinecone.init(
        api_key=PINECONE_API_KEY,  # get yours from pinecone.io. there is a free tier.
        environment=PINECONE_API_ENV 
)

# delete index if it exists
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# create index
pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=dimension       
)

### 5. Upsert vectors to index

In [15]:
# connect to index
index = pinecone.Index(index_name)

# upsert vectors to pinecone
index.upsert(
    vectors=vectors,
    #namespace=index_name, 
    values=True, 
    include_metadata=True
    )

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 155}},
 'total_vector_count': 155}