In [3]:
import os
import langchain
import tiktoken

In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.llms import OpenAI

In [5]:
# Pull both service credentials from the process environment; a variable that
# is not set yields None rather than raising.
openai_api_key = os.environ.get("OPENAI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

In [6]:
# LangChain OpenAI embedding wrapper, authenticated with the key held in
# `openai_api_key`.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import ReadTheDocsLoader

In [8]:
# Load tiktoken's 'p50k_base' encoding; `tiktoken_len` uses it to count tokens.
# NOTE(review): OpenAI documents cl100k_base as the encoding for
# text-embedding-ada-002 and gpt-3.5-turbo, so lengths measured with p50k_base
# may only approximate what those models see -- confirm this choice is intended.
tokenizer = tiktoken.get_encoding('p50k_base')

In [9]:
def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the global `tokenizer`.

    `disallowed_special=()` switches off tiktoken's check that raises when the
    text happens to contain special-token strings.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [10]:
# Recursive splitter whose chunk_size / chunk_overlap are measured with
# `tiktoken_len` (token counts) instead of the default character counts.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [14]:
# Read the source text file into LangChain documents, then cut those documents
# into chunks with `text_splitter`.
source_path = './txt/01-1.txt'
loader = TextLoader(source_path)
documents = loader.load()
docs = text_splitter.split_documents(documents)

In [15]:
from uuid import uuid4
from tqdm.auto import tqdm

# One record per split document: a freshly generated UUID string, the chunk
# text, the chunk's position within `docs`, and the 'source' metadata value.
chunks = [
    {
        'id': str(uuid4()),
        'text': record.page_content,
        'chunk': idx,
        'source': record.metadata['source'],
    }
    for idx, record in enumerate(tqdm(docs))
]

  0%|          | 0/759 [00:00<?, ?it/s]

In [18]:
# Sanity check: display the type of the second record in `chunks`.
type(chunks[1])

dict

In [270]:

# `openai` is used from this cell onward but no visible earlier cell imports
# it; import it here so the notebook survives Restart Kernel -> Run All.
import openai

# Authenticate the OpenAI client with the key read from the environment
# (keys are issued at platform.openai.com).
openai.api_key = openai_api_key

# Embedding model used for both the document chunks and the query.
embed_model = "text-embedding-ada-002"

# Embed two sample strings. The response is kept in `res` because the index
# creation cell reads the length of its first embedding as the dimension.
res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ],
    engine=embed_model
)

In [255]:
import pinecone

# NOTE(review): Pinecone's documentation restricts index names to lowercase
# alphanumerics and '-'; confirm that a name containing '_' is accepted.
index_name = 'try_langchain_tools'

# Open the Pinecone connection; the API key and the environment string are
# both shown in the console at app.pinecone.io.
pinecone.init(api_key=pinecone_api_key, environment="us-central1-gcp")

# Create the index only when it is not listed yet (expected on the first run).
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # The vector dimension is taken from the sample embedding held in `res`.
    embedding_dim = len(res['data'][0]['embedding'])
    pinecone.create_index(index_name, dimension=embedding_dim, metric='dotproduct')

In [260]:
# Get a client handle for the index named by `index_name`.
index = pinecone.Index(index_name)
# Last expression of the cell: the index statistics are shown as cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 759}},
 'total_vector_count': 759}

In [257]:
from tqdm.auto import tqdm
import datetime
from time import sleep

batch_size = 100  # how many embeddings we create and insert at once
max_retries = 10  # retries allowed per batch before the error is re-raised
retry_delay = 5   # seconds to wait before each retry

# Only failures that a later attempt can plausibly cure are retried. Anything
# else (bad API key, invalid request, KeyboardInterrupt, ...) propagates at
# once instead of being swallowed and retried forever.
retryable_errors = (
    openai.error.RateLimitError,
    openai.error.APIConnectionError,
    openai.error.ServiceUnavailableError,
)

for i in tqdm(range(0, len(chunks), batch_size)):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    meta_batch = chunks[i:i + batch_size]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings, retrying a bounded number of times on transient errors
    for attempt in range(max_retries + 1):
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            break
        except retryable_errors:
            if attempt == max_retries:
                # Out of retries: surface the real error to the user.
                raise
            sleep(retry_delay)
    embeds = [record['embedding'] for record in res['data']]
    # keep only the fields that should be stored as vector metadata
    meta_batch = [{
        'text': x['text'],
        'chunk': x['chunk'],
        'source': x['source']
    } for x in meta_batch]
    # each upserted item is an (id, embedding, metadata) tuple
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

100%|██████████| 8/8 [01:31<00:00, 11.46s/it]


In [262]:
query = "赫羅喜歡吃什麼 ?"

# Embed the question with the same model that embedded the chunks.
query_embedding = openai.Embedding.create(input=[query], engine=embed_model)
xq = query_embedding['data'][0]['embedding']

# Fetch the five nearest vectors from Pinecone together with their metadata.
res = index.query(xq, top_k=5, include_metadata=True)

In [264]:
# Collect the stored chunk text of every match, join the chunks with '---'
# rules, and append the question after a final '-----' rule.
contexts = [match['metadata']['text'] for match in res['matches']]
context_block = "\n\n---\n\n".join(contexts)
augmented_query = f"{context_block}\n\n-----\n\n{query}"

In [268]:
# System prompt: answer only from the context supplied above the question and
# say "I don't know" when the context does not contain the answer.
primer = """You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

# The context-augmented question is sent as the single user turn.
chat_messages = [
    {"role": "system", "content": primer},
    {"role": "user", "content": augmented_query},
]
res = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=chat_messages)


In [269]:
from IPython.display import Markdown

# Render the first choice's message content as Markdown.
answer_markdown = res['choices'][0]['message']['content']
display(Markdown(answer_markdown))

根據提供的文本，赫蘿喜歡吃隻果，但她也嘗試了羅倫斯提供的黑麥面包。至於其他食物方面的偏好並沒有提到。

In [278]:
# Query the index again with the same query vector `xq`, this time asking for
# the top 3 matches, and print the raw response for inspection.
ans_sample = index.query(xq, top_k=3, include_metadata=True)
print(ans_sample)

{'matches': [{'id': 'a388acbf-3a45-4131-84af-0880b692fb7c',
              'metadata': {'chunk': 241.0,
                           'source': './txt/01-1.txt',
                           'text': '赫蘿拉了拉羅倫斯的衣服，指著攤販說。 \n'
                                   ' \n'
                                   '    在往返不斷的馬車和行人的另一邊，有著堆積如山的隻果。 \n'
                                   ' \n'
                                   '    「喔，很漂亮的隻果。」 \n'
                                   ' \n'
                                   '    「是吧!」 \n'
                                   ' \n'
                                   '    '
                                   '外套底下的赫蘿露出閃耀著光芒的眼神。不曉得赫蘿本人有沒有察覺.她藏在腰巾里頭的尾巴正像狗兒一樣發出唰唰唰的聲音。或許赫蘿是真的喜歡吃隻果，「看起來很好吃的樣子，是吧?」 \n'
                                   ' \n'
                                   '    「是啊。」'},
              'score': 0.854759574,
              'values': []},
             {'id': '110d7991-7a3e-49fa-afde-74df2a7feb51',
              'metadata': {'chunk': 453.0,
                 

In [272]:
# LangChain completion-model wrapper for "text-davinci-003".
# NOTE(review): OpenAI has announced the retirement of text-davinci-003 --
# confirm the model is still served before relying on this cell.
llm = OpenAI(model_name="text-davinci-003", openai_api_key=openai_api_key)

In [273]:
# Send the bare `query` (no retrieved context attached) to the completion
# model and print its reply.
print(llm(query))



赫羅喜歡吃各式各樣的食物，尤其是墨西哥菜和中國菜。
