1. Install/Import Necessary Libraries

In [1]:
# Install dependencies
!pip install -q faiss-cpu   # Facebook AI Similarity Search - used for efficient similarity search

In [2]:
import faiss    # Facebook AI Similarity Search - used for efficient similarity search
import numpy as np  # For numerical operations, especially arrays and matrices
import torch   # PyTorch - for loading and running deep learning models
from transformers import AutoModel, AutoTokenizer   # Hugging Face transformers for loading pretrained models

2. Check if GPU is available and use it; otherwise fall back to CPU

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


3. Sample Corpus

In [4]:
corpus = [
    "Chai at Sharma Ji's tea stall in Hazratganj hits different after lectures",
    "Biryani from Tunday Kababi is a weekend ritual for foodies",
    "Evening walks at Gomti Riverfront are refreshing after a long day",
    "Studying in AKTU comes with its own charm",
    "Assignments are best done the night before deadline",
    "Gomti Nagar cafes are favorite hangout spots for students",
    "During exams, everyone suddenly becomes religious",
    "Street food near Kapoorthala is always buzzing with students",
    "Hostel life teaches you more than any textbook can",
    "Python coding becomes fun when done with friends at night",
    "College fests in Lucknow are full of energy and dance",
    "Local coaching centers are packed during entrance exam season",
    "Rainy days in Lucknow make chai and pakoras mandatory",
    "Instagram reels take more time than actual assignments",
    "Lucknow students are pros at last-minute exam prep",
    "The best part of college is hanging out at the campus gate",
    "Everyone becomes a motivational speaker during exams"
]

In [5]:
import os
os.environ['HF_TOKEN'] = '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'

4. Tokenization and Embedding model

In [6]:
# Load the tokenizer for the INSTRUCTOR model - this breaks text into tokens for the model to process
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-xl")
# Load the encoder part of the INSTRUCTOR model and move it to GPU/CPU (to generate contextual embeddings)
model = AutoModel.from_pretrained("hkunlp/instructor-xl").encoder.to(device)

# ignore warnings

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))
Some weights of T5Model were not initialized from the model checkpoint at hkunlp/instructor-xl and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'deco

In [7]:
# Pre tokenization step
# Prepare instruction + text pairs
instruction = "Represent sentence for semantic similarity"
inputs = [[instruction, text] for text in corpus]


# we do this step because
# The INSTRUCTOR model is a transformer-based language model developed to generate task-specific sentence embeddings
# that means it turns a sentence into a vector that captures the sentence’s meaning with respect to a specific instruction or task.

In [8]:
# Output of instruction + text pairs
inputs

[['Represent sentence for semantic similarity',
  "Chai at Sharma Ji's tea stall in Hazratganj hits different after lectures"],
 ['Represent sentence for semantic similarity',
  'Biryani from Tunday Kababi is a weekend ritual for foodies'],
 ['Represent sentence for semantic similarity',
  'Evening walks at Gomti Riverfront are refreshing after a long day'],
 ['Represent sentence for semantic similarity',
  'Studying in AKTU comes with its own charm'],
 ['Represent sentence for semantic similarity',
  'Assignments are best done the night before deadline'],
 ['Represent sentence for semantic similarity',
  'Gomti Nagar cafes are favorite hangout spots for students'],
 ['Represent sentence for semantic similarity',
  'During exams, everyone suddenly becomes religious'],
 ['Represent sentence for semantic similarity',
  'Street food near Kapoorthala is always buzzing with students'],
 ['Represent sentence for semantic similarity',
  'Hostel life teaches you more than any textbook can'],
 

In [9]:
# Tokenize and generate embeddings
with torch.no_grad():
    # Tokenize the inputs
    tokenized_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt").to(device)
    # padding=True: Adds padding tokens to shorter sentences so that all sequences are the same length.
    # truncation=True: Truncates longer sentences to fit the model’s max input length.
    # return_tensors="pt": Returns PyTorch tensors instead of plain lists or NumPy arrays.
    # .to(device): Moves the data to GPU (if available) or CPU, depending on earlier device setting.


    # Passes the tokenized inputs to the transformer model (to generate embeddings)
    outputs = model(**tokenized_inputs)
    # selects the first token across all sequences and extracts the embedding corresponding to it
    # this token caputures the summary of the entire sentence
    embeddings = outputs.last_hidden_state[:, 0, :]
    # Normalize embeddings
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    # Convert to numpy array
    embeddings = embeddings.cpu().numpy()

In [10]:
embeddings

array([[-0.05981033,  0.05593438, -0.01728095, ..., -0.02245244,
         0.04297271, -0.0030313 ],
       [-0.03933581,  0.06761319, -0.00771267, ..., -0.0242745 ,
         0.04034788,  0.00161839],
       [-0.05593694,  0.06834757, -0.01870932, ..., -0.01145083,
         0.01680318,  0.01874093],
       ...,
       [-0.06866862,  0.06265455, -0.02622236, ..., -0.0175017 ,
         0.04128272,  0.00350707],
       [-0.05939035,  0.07108221, -0.01770738, ..., -0.00785265,
         0.01533323,  0.01536106],
       [-0.04476127,  0.06080019, -0.01977132, ..., -0.01784521,
         0.00703757,  0.0265498 ]], dtype=float32)

5. Create FAISS Index

In [11]:
# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

index

<faiss.swigfaiss_avx512.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x7fefe703c630> >

6. Define a function to generate embeddings for input pairs using the model

In [12]:
# Function to query the index
def search_faiss(query, top_k=3):
    # Tokenize and encode the query
    with torch.no_grad():
        # tokenize the query
        tokenized_query = tokenizer([[instruction, query]], padding=True, truncation=True, return_tensors="pt").to(device)
        # generate query embedding
        query_output = model(**tokenized_query)
        query_vector = query_output.last_hidden_state[:, 0, :]
        query_vector = torch.nn.functional.normalize(query_vector, p=2, dim=1)
        query_vector = query_vector.cpu().numpy()

    # Search the index
    scores, indices = index.search(query_vector, top_k)
    print(f"\nQuery: {query}\n")
    for i, idx in enumerate(indices[0]):
        print(f"{i+1}. {corpus[idx]} (Score: {scores[0][i]:.4f})")

7. Example search

In [13]:
# Example query
search_faiss("Where do students relax in Lucknow?")


Query: Where do students relax in Lucknow?

1. College fests in Lucknow are full of energy and dance (Score: 0.9080)
2. Gomti Nagar cafes are favorite hangout spots for students (Score: 0.9011)
3. Street food near Kapoorthala is always buzzing with students (Score: 0.8970)
