[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/toobajaved/smuDistilBert/blob/main/chatbot.ipynb)


In [1]:
!pip install transformers datasets faiss-cpu




In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import faiss

In [3]:
# Load your dataset from Hugging Face
dataset = load_dataset("tootooba/SMU_FAQDataset")['train']

# Extract questions and answers
questions = dataset['question']
answers = dataset['answer']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

faq_data_cleaned.csv:   0%|          | 0.00/127k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250 [00:00<?, ? examples/s]

In [4]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'  # A lightweight model for embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # Set the model to evaluation mode


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [5]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of output contains token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, dim=1) / torch.clamp(
        input_mask_expanded.sum(dim=1), min=1e-9
    )

def encode_questions(questions):
    encoded_input = tokenizer(questions, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return embeddings.numpy()

# Generate embeddings
question_embeddings = encode_questions(questions)


In [6]:
# Initialize FAISS index
embedding_dim = question_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# Add question embeddings to the index
index.add(question_embeddings)


In [7]:
def find_best_answer(user_question, top_k=1):
    # Encode the user's question
    encoded_input = tokenizer(user_question, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    user_embedding = mean_pooling(model_output, encoded_input['attention_mask']).numpy()

    # Search for similar questions
    distances, indices = index.search(user_embedding, top_k)
    best_answers = [answers[idx] for idx in indices[0]]
    return best_answers[0]  # Return the most similar answer


In [8]:
# Function to interact with the chatbot
def chatbot():
    print("Welcome to the SMU FAQ Chatbot! Type 'exit' to quit.")
    while True:
        user_question = input("You: ")
        if user_question.lower() in ['exit', 'quit']:
            print("Chatbot: Goodbye!")
            break
        answer = find_best_answer(user_question)
        print(f"Chatbot: {answer}\n")


In [None]:
chatbot()


Welcome to the SMU FAQ Chatbot! Type 'exit' to quit.
You: what is academic calendar?
Chatbot: The academic calendar that arrives with acceptance letters is the rulebook for the University. In addition to listing the courses offered at Saint Mary's, the academic calendar contains valuable information on such things as programs, admission requirements, schedules, costs, regulations and student support services. When selecting courses, the first step is to take a look at the course descriptions and prerequisites outlined in the academic calendar. To see when these courses are offered, check the timetable on the Self Service Banner. Undergraduate calendars are available by request from either Admissions or the Service Centre; Graduate calendar are available from the Faculty of Graduate Studies and Research in Room AT 210. For a digital version of the current academic calendar check: Academic Calendar.

You: what is co-op
Chatbot: The Co-operative Education program is a method of learning w

KeyboardInterrupt: Interrupted by user