In [1]:
import os
import pprint
import datasets
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings


# Load dataset

In [3]:
# Pull only the first 4000 training rows of the selected WikiLingua language
# subset; the slice is expressed in the `split` string.
language = "english"
data = datasets.load_dataset(
    "wiki_lingua",
    name=language,
    split="train[:4000]",
)

In [4]:
# Peek at the first article. Index the row first and the column second:
# `data["article"][0]` would read the entire "article" column just to show
# one element, whereas row-first indexing only touches that single row.
data[0]["article"]

{'section_name': ['Finding Other Transportation',
  'Designating a Driver',
  'Staying Safe'],
 'document': ['make sure that the area is a safe place, especially if you plan on walking home at night.  It’s always a good idea to practice the buddy system.  Have a friend meet up and walk with you. Research the bus, train, or streetcar routes available in your area to find safe and affordable travel to your destination.  Make sure you check the schedule for your outgoing and return travel.  Some public transportation will cease to run late at night.  Be sure if you take public transportation to the venue that you will also be able to get home late at night. Check the routes.  Even if some public transit is still running late at night, the routing may change.  Some may run express past many of the stops, or not travel all the way to the ends.  Be sure that your stop will still be available when you need it for your return trip. If you are taking public transit in a vulnerable state after d

In [5]:
def flatten(example):
    """Lift the nested ``article`` fields up to top-level columns.

    Args:
        example: One dataset row whose ``"article"`` entry is a mapping that
            contains ``"document"`` and ``"summary"``.

    Returns:
        A dict holding only the ``"document"`` and ``"summary"`` values taken
        from ``example["article"]``.

    Raises:
        KeyError: If ``"article"``, ``"document"`` or ``"summary"`` is absent.
    """
    article = example["article"]
    return {field: article[field] for field in ("document", "summary")}


def list2samples(example):
    """Explode a batch of per-article lists into one flat list per column.

    Intended for ``Dataset.map(..., batched=True)``: each entry of
    ``example["document"]`` / ``example["summary"]`` is itself a sequence, and
    the items of those sequences are concatenated in order.

    Args:
        example: A batch mapping with parallel ``"document"`` and ``"summary"``
            sequences.

    Returns:
        A dict with flattened ``"document"`` and ``"summary"`` lists. Entries
        whose document sequence is empty are skipped, along with their
        summaries.
    """
    flat_documents, flat_summaries = [], []
    for sections, section_summaries in zip(example["document"], example["summary"]):
        # Articles without any section contribute nothing to either column.
        if len(sections) == 0:
            continue
        flat_documents.extend(sections)
        flat_summaries.extend(section_summaries)
    return {"document": flat_documents, "summary": flat_summaries}


# Reshape in two passes: first hoist document/summary out of "article"
# (dropping the now-unneeded columns), then expand each article's list of
# sections into individual rows via the batched mapper.
data = (
    data
    .map(flatten, remove_columns=["article", "url"])
    .map(list2samples, batched=True)
)

In [6]:
# Hold out 10% of the rows for validation.
# A fixed seed keeps the shuffle identical across kernel restarts; this matters
# because the ids stored alongside the indexed chunks are row positions in
# `train_split`, so a different shuffle would silently change their meaning.
dataset_splits = data.train_test_split(test_size=0.1, seed=42)
# Select the partitions by name rather than relying on `.values()` ordering.
train_split = dataset_splits["train"]
validation_split = dataset_splits["test"]

In [7]:
# Sanity check: row counts of the training and validation partitions.
len(train_split), len(validation_split)

(8755, 973)

# Create VectorStore

In [9]:
# Embedding model used both to build the index and to embed queries.
model_name = "sentence-transformers/sentence-t5-base"
# NOTE: hard-wired to GPU; assumes a CUDA device is available on this machine.
model_kwargs = dict(device="cuda")

hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [10]:
# Wrap every training row as a Document. The `id` metadata is the row's
# position in `train_split`, so a retrieved chunk can be traced back to its
# source row.
documents = [
    Document(
        page_content=row["document"],
        metadata={"source": "local", "id": position},
    )
    for position, row in enumerate(train_split)
]

In [11]:

# Splitter shared by the training corpus and the validation queries:
# chunks of at most 512 units with a 20-unit overlap between neighbours
# (units are presumably characters, the splitter's default measure; confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
)

In [12]:
# Number of unsplit training documents (one per row of `train_split`).
len(documents)

8755

In [13]:
# Break every training document into overlapping chunks. The list is passed
# directly: the former `documents[:]` only made a throwaway shallow copy.
splitted_documents = text_splitter.split_documents(documents)

In [14]:
# Number of chunks produced from the training documents.
len(splitted_documents)

41624

In [15]:
# Build the FAISS vector store from all chunks, using `hf` as the embedder.
# NOTE(review): this presumably embeds every chunk and is likely the slowest
# cell in the notebook — confirm before re-running casually.
vectorstore = FAISS.from_documents(splitted_documents, hf)

In [16]:
# Persist the store under the relative path "faiss_index" (resolved against
# the kernel's current working directory).
vectorstore.save_local("faiss_index")

In [17]:
# Reload the store from disk, replacing the in-memory one; the same embedder
# `hf` is supplied so later queries are embedded consistently.
# NOTE(review): LangChain's FAISS persistence is understood to rely on pickle
# for the docstore — only load index folders you created yourself; confirm
# against the installed LangChain version.
vectorstore = FAISS.load_local("faiss_index",hf)

In [18]:
# Smoke test: retrieve the 5 chunks closest to an arbitrary free-text query.
similar_docs = vectorstore.similarity_search("The quick brown fox jumps over the lazy dog", k=5)

In [19]:
# Inspect the retrieved chunks and their source ids.
similar_docs

[Document(page_content='a fox. Foxes eat small animals in the wild so do not keep a fox around smaller animals especially birds or rodents. Never leave a fox alone with other animals. Foxes are omnivores, so they can eat a large variety of foods. Give them a high quality medium sized dog food and consult a vet about other food options.  Fox diets can include meats and vegetables to supplement basic dog food. Check with your vet. Leave out some dry dog food for the fox during the day. Try adding a can of wet cat food mixed with', metadata={'source': 'local', 'id': 3871}),
 Document(page_content='in quickly, staying alert for potential dog movement.', metadata={'source': 'local', 'id': 7}),
 Document(page_content="kill other animals even if they aren't hungry, including neighborhood pets. Give your fox a collar and tags so if it does get away it will not be mistaken for a wild fox. Foxes are very high energy animals, so they need a lot of exercise. About an hour or two a day will wear yo

In [20]:
# Same query, but each hit is returned as a (Document, score) pair.
# NOTE(review): the score looks like a distance (smaller = closer), judging by
# the ascending order of the results — confirm against the FAISS store docs.
vectorstore.similarity_search_with_score("The quick brown fox jumps over the lazy dog", k=5)

[(Document(page_content='a fox. Foxes eat small animals in the wild so do not keep a fox around smaller animals especially birds or rodents. Never leave a fox alone with other animals. Foxes are omnivores, so they can eat a large variety of foods. Give them a high quality medium sized dog food and consult a vet about other food options.  Fox diets can include meats and vegetables to supplement basic dog food. Check with your vet. Leave out some dry dog food for the fox during the day. Try adding a can of wet cat food mixed with', metadata={'source': 'local', 'id': 3871}),
  0.38002527),
 (Document(page_content='in quickly, staying alert for potential dog movement.', metadata={'source': 'local', 'id': 7}),
  0.39186782),
 (Document(page_content="kill other animals even if they aren't hungry, including neighborhood pets. Give your fox a collar and tags so if it does get away it will not be mistaken for a wild fox. Foxes are very high energy animals, so they need a lot of exercise. About 

In [21]:
# Pick one validation row and chunk it exactly like the training corpus, so
# each chunk can be used as a retrieval query.
val_index = 0
# Index with `val_index` (not a literal 0) so the sample and the id recorded
# in the metadata can never drift apart when the index is changed.
validation_sample = validation_split[val_index]
validation_document = Document(
    page_content=validation_sample["document"],
    metadata={"source": "local", "id": val_index},
)

splitted_validation_document = text_splitter.split_documents([validation_document])

In [22]:
# Preview the first few chunks of the validation document.
splitted_validation_document[:5]

[Document(page_content='This is very important, as it will protect your hair from heat damage. Heat-damaged hair can often look dry, frizzy, or ratted. To prevent this from happening, apply a heat protectant spray to your hair, focusing on the ends. You should have the bottom third or bottom fourth of your hair loose. The thicker you hair is, the more sections you will need to work with. Take a small section of hair, and place a round brush under it, as close to your roots as you can. Turn on your hairdryer, and place it right', metadata={'source': 'local', 'id': 0}),
 Document(page_content='and place it right over the brush. Keep the hairdryer directly over the brush at all times. Do not pull the brush completely past the ends of your hair. If your hair is not smooth, pull the brush completely past your hair and start again. When your hair is smooth, rotate the brush back towards your scalp, rolling the hair around it. Leave the brush in your hair as it cools. Once it is cool to the t

In [23]:
# Query the index once per validation chunk (5 neighbours each) and collect
# all hits in a single flat list of (score, Document) pairs. The store returns
# (Document, score), so the pair is swapped to put the score first.
similar_documents = [
    (score, match)
    for chunk in splitted_validation_document
    for match, score in vectorstore.similarity_search_with_score(chunk.page_content, k=5)
]

In [24]:
# Preview the first ten (score, Document) pairs gathered so far.
similar_documents[:10]


[(0.08791785,
  Document(page_content="You can apply more heat protectant serum as you go if your hair looks like it's getting dried out.  Don't go over the same shaft of hair too many times. This can make your hair start to get brittle. Doing a little at a time gives you control over how much heat you're applying to your hair, reducing the risk of damage. Take your time and go over each strand of hair until your entire mane is straight, sleek and glossy.", metadata={'source': 'local', 'id': 1116})),
 (0.091248035,
  Document(page_content='your hair. Before blow drying your hair you should apply a heat protecting spray. Hold the spray bottle about a foot from your head and spritz your wet hair. Make sure to cover the entire surface area.  Heat protection spray can be found in the haircare aisle of most general and beauty stores. Select a heat protection spray that can be used on wet and dry hair.', metadata={'source': 'local', 'id': 7865})),
 (0.10462737,
  Document(page_content="don't

In [25]:
# Order the hits in place, smallest score first. Sorting on the score alone
# means ties are left in their existing order and the Document objects are
# never compared with each other.
similar_documents.sort(key=lambda pair: pair[0])

In [26]:
# Number of distinct source documents to keep.
top_k = 2
# `similar_documents` holds chunk-level hits (expected to be ordered
# best-first already), so several leading entries may carry the same
# source-document id. dict.fromkeys() drops repeats while preserving
# first-seen order, so the result is up to `top_k` *distinct* ids rather
# than possibly the same id twice.
ranked_ids = dict.fromkeys(item[1].metadata['id'] for item in similar_documents)
top_k_documents = list(ranked_ids)[:top_k]


In [27]:
# Show the selected ids (each is the `id` metadata of a retrieved chunk).
top_k_documents

[1116, 7865]