In [1]:
import os
import pprint
import datasets
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings

# Load dataset

In [2]:
language = "english"
data = datasets.load_dataset("wiki_lingua", name=language, split="train[:8000]")

In [3]:
len(data)

8000

In [4]:
data["article"][0]

{'section_name': ['Finding Other Transportation',
  'Designating a Driver',
  'Staying Safe'],
 'document': ['make sure that the area is a safe place, especially if you plan on walking home at night.  It’s always a good idea to practice the buddy system.  Have a friend meet up and walk with you. Research the bus, train, or streetcar routes available in your area to find safe and affordable travel to your destination.  Make sure you check the schedule for your outgoing and return travel.  Some public transportation will cease to run late at night.  Be sure if you take public transportation to the venue that you will also be able to get home late at night. Check the routes.  Even if some public transit is still running late at night, the routing may change.  Some may run express past many of the stops, or not travel all the way to the ends.  Be sure that your stop will still be available when you need it for your return trip. If you are taking public transit in a vulnerable state after d

In [5]:
def flatten(example):
    return {
        "document": example["article"]["document"],
        "summary": example["article"]["summary"],
    }

def list2samples(example):
    documents = []
    summaries = []
    for sample in zip(example["document"], example["summary"]):
        if len(sample[0]) > 0:
            documents += sample[0]
            summaries += sample[1]
    return {"document": documents, "summary": summaries}


data = data.map(flatten, remove_columns=["article", "url"])
data = data.map(list2samples, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [6]:
train_split, validation_split = data.train_test_split(test_size=0.1).values()

In [7]:
len(train_split), len(validation_split)

(17508, 1946)

In [8]:
#save the data 
os.makedirs("../data", exist_ok=True)
train_split.save_to_disk("../data/train_split")
validation_split.save_to_disk("../data/validation_split")

Saving the dataset (0/1 shards):   0%|          | 0/17508 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1946 [00:00<?, ? examples/s]

# Create VectorStore

In [9]:
model_name = 'sentence-transformers/sentence-t5-base'
model_kwargs = {'device': 'cuda'}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,

)

In [12]:
documents = []
for index,doc in enumerate(train_split):
    documents.append(Document(page_content=doc["document"], metadata={"source": "local","id":index} ))

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=20)

In [15]:
splitted_documents = text_splitter.split_documents(documents[:])

In [14]:
len(splitted_documents)

41624

In [16]:
vectorstore = FAISS.from_documents(splitted_documents, hf)

In [17]:
vectorstore.save_local("../data/faiss_index")

In [19]:
vectorstore = FAISS.load_local("../data/faiss_index",hf)

In [20]:
similar_docs = vectorstore.similarity_search("The quick brown fox jumps over the lazy dog", k=5)

In [29]:
for doc in similar_docs:
    pprint.pprint(doc.page_content)

('a fox. Foxes eat small animals in the wild so do not keep a fox around '
 'smaller animals especially birds or rodents. Never leave a fox alone with '
 'other animals. Foxes are omnivores, so they can eat a large variety of '
 'foods. Give them a high quality medium sized dog food and consult a vet '
 'about other food options.  Fox diets can include meats and vegetables to '
 'supplement basic dog food. Check with your vet. Leave out some dry dog food '
 'for the fox during the day. Try adding a can of wet cat food mixed with')
'in quickly, staying alert for potential dog movement.'
("kill other animals even if they aren't hungry, including neighborhood pets. "
 'Give your fox a collar and tags so if it does get away it will not be '
 'mistaken for a wild fox. Foxes are very high energy animals, so they need a '
 'lot of exercise. About an hour or two a day will wear your pet down. Always '
 'use a harness for walks. The collar is for identification, but a leash '
 'should be attach

In [30]:
vectorstore.similarity_search_with_score("The quick brown fox jumps over the lazy dog", k=5)

[(Document(page_content='a fox. Foxes eat small animals in the wild so do not keep a fox around smaller animals especially birds or rodents. Never leave a fox alone with other animals. Foxes are omnivores, so they can eat a large variety of foods. Give them a high quality medium sized dog food and consult a vet about other food options.  Fox diets can include meats and vegetables to supplement basic dog food. Check with your vet. Leave out some dry dog food for the fox during the day. Try adding a can of wet cat food mixed with', metadata={'source': 'local', 'id': 833}),
  0.38002527),
 (Document(page_content='in quickly, staying alert for potential dog movement.', metadata={'source': 'local', 'id': 3689}),
  0.39186782),
 (Document(page_content="kill other animals even if they aren't hungry, including neighborhood pets. Give your fox a collar and tags so if it does get away it will not be mistaken for a wild fox. Foxes are very high energy animals, so they need a lot of exercise. Abou

In [31]:
val_index = 0
validation_sample = validation_split[0]
validation_document = Document(page_content=validation_sample["document"], metadata={"source": "local","id":val_index})

splitted_validation_document = text_splitter.split_documents([validation_document])

In [32]:
splitted_validation_document[:5]

[Document(page_content='Draw a number line, but place your zero to the far right of the line. Then, number your line toward your left using negative numbers. Mark the first negative number you want to add on the number line. Then, count out your second negative number, moving to the left. This gives you your answer. For example, let’s say you’re adding -4+-6. You’d circle -4 on your number line. Then, count 6 spaces to the left. You’ll arrive at -10, which is your answer. This is because you are moving the same number of places on', metadata={'source': 'local', 'id': 0}),
 Document(page_content='number of places on the number line, just toward the negative side. This means your final answer will be negative. For example, when adding -12+-21, you could add 12+21=33. However, since your numbers were negative, you’d make your answer -33. Draw a number line with zero in the middle. Number to the left moving from -1 to -10, then number to the right 1 to 10. Circle the positive number on you

In [33]:
similar_documents = []
for doc in splitted_validation_document:
    similar_documents.append(vectorstore.similarity_search_with_score(doc.page_content, k=5))
similar_documents = [(item[1],item[0]) for sublist in similar_documents for item in sublist]

In [38]:
#orignal document
pprint.pprint(validation_document.page_content)

('Draw a number line, but place your zero to the far right of the line. Then, '
 'number your line toward your left using negative numbers. Mark the first '
 'negative number you want to add on the number line. Then, count out your '
 'second negative number, moving to the left. This gives you your answer. For '
 'example, let’s say you’re adding -4+-6. You’d circle -4 on your number line. '
 'Then, count 6 spaces to the left. You’ll arrive at -10, which is your '
 'answer. This is because you are moving the same number of places on the '
 'number line, just toward the negative side. This means your final answer '
 'will be negative. For example, when adding -12+-21, you could add 12+21=33. '
 'However, since your numbers were negative, you’d make your answer -33. Draw '
 'a number line with zero in the middle. Number to the left moving from -1 to '
 '-10, then number to the right 1 to 10. Circle the positive number on your '
 'number line. Then, count out your negative number to find 

In [37]:
pprint.pprint(similar_documents[0][1].page_content)

('Draw a line, then write numbers along the line from 0-15. Circle the first '
 'number you want to add. Start at that number. Then, count down your number '
 'line, moving the same number of spaces as the second number you’re adding. '
 'You’ll land on your answer.  Let’s say you want to add 4+5. Circle 4 on your '
 'number line, then count 5 spaces down the line. You will land on 9, which is '
 'your answer. Use your number line to add these numbers:  5+3 1+7 6+2 4+4 '
 'Start with a basic math problem, such as 4+6. Create two')


In [40]:
#full text of the similar document
pprint.pprint(documents[similar_documents[0][1].metadata["id"]].page_content)

('Draw a line, then write numbers along the line from 0-15. Circle the first '
 'number you want to add. Start at that number. Then, count down your number '
 'line, moving the same number of spaces as the second number you’re adding. '
 'You’ll land on your answer.  Let’s say you want to add 4+5. Circle 4 on your '
 'number line, then count 5 spaces down the line. You will land on 9, which is '
 'your answer. Use your number line to add these numbers:  5+3 1+7 6+2 4+4 '
 'Start with a basic math problem, such as 4+6. Create two piles of beans to '
 'represent your problem, including one group of 4 beans and one group of 6 '
 'beans. Next, you can combine your two sets of beans to get the answer to '
 'your addition problem. Count your beans to see that you now have a total of '
 '10 beans.  If you don’t have beans, you can use any small item that fits in '
 'your hand to practice addition! For example, you can use blocks, candies, '
 'coins, or legos. Use your beans to add these numbe

In [41]:
similar_documents.sort(key=lambda tup: tup[0],reverse=False)

In [42]:
top_k = 2
top_k_documents = [item[1].metadata['id'] for item in similar_documents[:top_k]]

In [43]:
top_k_documents

[11087, 11087]