In [14]:
from dotenv.main import load_dotenv
import os
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
import nltk
import ssl

In [12]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/luqman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load Your Data

In [15]:
loader = DirectoryLoader('txt files', glob='**/*.txt', loader_cls=TextLoader)
data = loader.load()
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[1].page_content)} characters in your document')

You have 8 document(s) in your data
There are 36367 characters in your document


## Chunk into Smaller Documents

In [16]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
texts = text_splitter.split_documents(data)

print (f'Now you have {len(texts)} documents')

Now you have 343 documents


## Create Embeddings for Semantic Search

In [17]:
from langchain.vectorstores import Pinecone, Chroma
from langchain.embeddings import OpenAIEmbeddings
import pinecone

load_dotenv()
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])
# docsearch = Chroma.from_documents(texts, embeddings)

In [18]:
load_dotenv()
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ['PINECONE_API_ENV']
)

index_name = 'agriculturevaluechain'

In [None]:
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [25]:
from langchain import OpenAI, VectorDBQA
qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type='stuff', vectorstore=docsearch, return_source_documents=True)



In [35]:
query = 'What is data science?'
result = qa({'query':query})

In [36]:
result['result']

" I don't know."

In [37]:
result['source_documents']

[Document(page_content='6. The information or document obtained under these Regulations shall be handled in accordance with any relevant provision of the Data Protection Act, 2019 Protection of information 7. A dairy business operator who — (a) willfully neglects or fails to submit a return; (b) fails to supply the information or a document required in regulation 5; (c) knowingly or recklessly submits false, incorrect return, report or estimate; or (d) supplies any such particulars or documents, containing a matter which the operator knows or ought to know that it is materially false, commits an offence and is liable on conviction to a fine not exceeding ten thousand shillings or to an imprisonment for a term not exceeding three months, or both. Offences', metadata={'source': 'txt files/dairyindustryregulations2021.txt'}),
 Document(page_content='14. The information or document obtained under these Regulations shall be handled in accordance with any relevant provision of the Data Prote

## Debug this area

In [9]:
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', document_model_name='text-embedding-ada-002', query_model_name='text-embedding-ada-002', embedding_ctx_length=-1, openai_api_key='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', chunk_size=1000, max_retries=6)

In [6]:
load_dotenv()
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ['PINECONE_API_ENV']
)

index_name = 'agriculturevaluechain'


In [7]:
pinecone.list_indexes()

['agriculturevaluechain']

In [8]:
pinecone.GRPCIndex('agriculturevaluechain')
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

PineconeProtocolError: Failed to connect; did you specify the correct index name?

## Alternative approach

In [None]:
import pandas as pd
import tiktoken
import openai

In [None]:
from uuid import uuid4
contents = []
tiktoken_encoding = tiktoken.get_encoding('gpt2')
for file in os.listdir('txt files'):
    with open(f'txt files/{file}', 'r') as f:
        file_content = f.read()
        tokens = tiktoken_encoding.encode(file_content)
        total_tokens = len(tokens)
        contents.append((file, file_content, tokens))
df = pd.DataFrame(contents, columns=['filename', 'file_content', 'tokens'])

df['embeddings'] = df.file_content.apply(lambda x:openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
df['id'] = [str(uuid4()) for _ in range(len(df))]

InvalidRequestError: This model's maximum context length is 8191 tokens, however you requested 8465 tokens (8465 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.

In [None]:
df.head()

Unnamed: 0,filename,file_content,tokens,embeddings,id
0,filename,file_content,tokens,"[-0.008951187133789062, -0.003505569649860263,...",dd31127c-0093-47ae-b10a-c5186f6cbe51
1,filename,file_content,tokens,"[-0.008905792608857155, -0.0035540778189897537...",bfa5eb3c-428a-4ca9-a755-3553d2fe8a34
2,filename,file_content,tokens,"[-0.008841133676469326, -0.003426781389862299,...",ee62a134-94c7-4dd3-9970-ac45531b61b0
3,filename,file_content,tokens,"[-0.008951187133789062, -0.003505569649860263,...",9acf2ffe-7435-4e41-a06c-e302b7362d0d
4,filename,file_content,tokens,"[-0.008905792608857155, -0.0035540778189897537...",07132749-ffb8-4384-9e53-ca0aa438d432


In [None]:
from tqdm.auto import tqdm

batch_size = 100
chunks = texts

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    meta_batch = chunks[i:i_end]
    ids