In [1]:
# You need to load your OPENAI_API_KEY into your environment
with open('../../my_key.txt') as f:
    key_list = f.readlines()

key_list = [key.strip() for key in key_list]

keys = {}
for key in key_list:
    key = key.split(':')
    if (key != ['']):
        keys[key[0]] = key[1]

In [2]:
import os

os.environ['OPENAI_API_KEY'] = keys['OPENAI_API_KEY']

# VectorStore Construction
- I/O
    - Input: text.txt
    - Output: text_chunks (with document insides)

In [17]:
with open('text.txt', 'r') as f:
    text = f.read()

texts = text.split('++-------------------++\n')

In [18]:
# Change the text string into a Document object
from langchain.schema import Document

docs = []
titles = []
for text in texts:
    titles.append(text.split('\n')[0])
    docs.append(Document(page_content=''.join(text.split('\n')[1:])))

print(len(docs))

3


In [19]:
titles

['[this text is from url0]', '[from pdf0]', '[from docx0]']

## Tiktoken

In [3]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [5]:
print(len(str(text)))
print(len(encoding.encode(str(text))))

135660
32736


## Text Splitter
- I/O
    - Input: text.txt
    - Output: text_chunks (with document insides)

### RecursiveCharacterTextSplitter
- Using len() to count chunk_size -> 0.003s

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 30,
    length_function = len
)

In [12]:
start = time.time()
text_chunks = text_splitter.split_documents([text])
end = time.time()
print(end-start)

0.003742218017578125


In [None]:
print(type(text_chunks), type(text_chunks[0]), len(text_chunks))
for chunk in text_chunks:
    print('---', chunk.page_content)

### TiktokenSplitter
- Using tiktoken to count chunk_size -> 0.01s

In [20]:
from langchain.text_splitter import TokenTextSplitter
import time

token_text_splitter = TokenTextSplitter(
    chunk_size = 200,
    chunk_overlap = 0,
)

In [21]:
start = time.time()
text_chunks = []
for i in range(len(docs)):
    chunks = token_text_splitter.split_documents([docs[i]])
    for chunk in chunks:
        chunk.page_content = f'{titles[i]}\n' + chunk.page_content
        text_chunks.append(chunk)
end = time.time()
print(end-start)

0.018354177474975586


In [22]:
print(type(text_chunks), type(text_chunks[0]), len(text_chunks))
for chunk in text_chunks:
    print('---', chunk.page_content)

<class 'list'> <class 'langchain.schema.Document'> 158
--- [this text is from url0]
Pokémon[a][1][2][3] (an abbreviation for Pocket Monsters[b] in Japanese) is a Japanese media franchise managed by The Pokémon Company, founded by Nintendo, Game Freak, and Creatures. The franchise was created by Satoshi Tajiri in 1996,[4] and is centered around fictional creatures called "Pokémon". In Pokémon, Pokémon Trainers are people who catch, train, care for, and battle with Pokémon. The English slogan for the franchise is "Gotta Catch 'Em All!".[5][6] There are currently 1015 Pokémon species.[7]The franchise began as Pocket Monsters: Red and Green (later released outside of Japan as Pokémon Red and Blue), a pair of video games for the original Game Boy handheld system that were developed by Game Freak and published by Nintendo in February 1996. Pokémon soon became a media mix franchise adapted into various different media.[8] Pokémon is one of the highest-grossing media franchises of all time. Th

## Embdedding
- I/O
    - Input: text_chunks 
    - Output: Search Space

- Performance
    - Using openai_embeddings -> ~4s

In [23]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import time

In [24]:
openai_embeddings = OpenAIEmbeddings(
    model = "text-embedding-ada-002"
)

In [25]:
start = time.time()
search_space = Chroma.from_documents(text_chunks, openai_embeddings)
end = time.time()

print(end-start)

Using embedded DuckDB without persistence: data will be transient


4.259869575500488


## Similarity Search Test

In [26]:
import time

query = "align the vision task to language model"

In [27]:
start = time.time()
search_result = search_space.similarity_search(query)
end = time.time()

print(end-start)

0.3242034912109375


In [28]:
len(search_result)

4

In [29]:
for result in search_result:
    print('---', result.page_content, '\n')

--- [from pdf0]
 provides a consistent interface for vision-centric task definition and customization;(2) a language-guided image tokenizer, which encodes visual information in alignment with the givenlanguage prompt, enabling the model to comprehend and parse the visual content effectively; and(3) an LLM-based open-task decoder, which utilizes the encoded visual information and languageinstructions to generate satisfactory predictions or outputs. The three designs work together to achievea flexible and open-ended framework that can handle various vision-centric tasks at different levels oftask customization through language instructions.Different from previous interactive systems [ 68,73,50,35,30] that rely on APIs, our VisionLLMpresents a more flexible and end-to-end pipeline. Given language instructions that describe the currenttasks and an input image, the model first uses a language-guided image tokenizer to encode the imagetokens based on the given prompt. Then, the image tokens 