In [2]:
# Mount google drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Install

!pip install sentence_transformers faiss-gpu transformers datasets

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownlo

In [4]:
# Imports

from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
import faiss
import json
import glob
import os
from datasets import load_dataset

In [5]:
# Define base path

BASE_PATH = '/content/drive/MyDrive/AI/tcfase3'

In [6]:
# Load the fine-tuned FLAN-T5 model and tokenizer

tokenizer = T5Tokenizer.from_pretrained(BASE_PATH + '/fine_tuned_model')
model = T5ForConditionalGeneration.from_pretrained(BASE_PATH + '/fine_tuned_model')

In [7]:
# Load the embedding model

embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# Loading data

json_files = glob.glob(os.path.join(BASE_PATH, '*.json'))
dataset = load_dataset('json', data_files=json_files)['train']

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['title', 'content'],
    num_rows: 200000
})


In [9]:
# Create embeddings for the questions (title)

title_embeddings = embedding_model.encode(dataset['title'])

In [10]:
# Index the embeddings with FAISS

embedding_dim = title_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(title_embeddings)

In [11]:
# Embedding the Query (product)

def retrieve_relevant_content(query, top_n=2):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, top_n)
    retrieved_contents = dataset.select(indices[0])['content']
    return retrieved_contents

In [12]:
# Generate the answer

def generate_answer(product, retrieved_contents):
    # Concatenate the retrieved content as context
    context = " ".join(retrieved_contents)
    prompt = f"Please describe this product: {product} Context: {context}"

    # Tokenize and generate the answer
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True)
    output = model.generate(**inputs, max_length=256)

    # Decode the generated answer
    answer = tokenizer.decode(output[0], skip_special_tokens=True)

    # Printing
    print(f"Please describe this product: {product}")
    print(f"Answer: {answer}")
    print("--------------------------------------\n")

In [13]:
# Execute the questions

product = "A Day in the Life of China"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "On Happiness, U.S. Edition"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "The Official CIA Manual of Trickery and Deception"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "The Devils of Loudun"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "The Big Book of Stress Relief Games: Quick, Fun Activities for Feeling Better"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "Coloring for Grown-Ups Holiday Fun Book"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "Brothers of the Knight (Picture Puffin Books)"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "Dearest Love (Best of Betty Neels)"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "The Dream Unfolds (Crosslyn Rise, Book 2)"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "eBay PowerSeller Business Practices For Dummies"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "Essentials of Banking"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "TKO Hiring!: Ten Knockout Strategies for Recruiting, Interviewing, and Hiring Great People"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

product = "The Bleeding Land"
retrieved_contents = retrieve_relevant_content(product)
generate_answer(product, retrieved_contents)

Please describe this product: A Day in the Life of China
Answer: A Day in the Life of China is a dazzling and engrossing book. It is a mustread for anyone interested in the history of China.
--------------------------------------

Please describe this product: On Happiness, U.S. Edition
Answer: This is a book that is a mustread for anyone interested in the development of the Western concept of happiness.
--------------------------------------

Please describe this product: The Official CIA Manual of Trickery and Deception
Answer: The CIAs secret agents were a sleazy, snobby, and snobby group of spies who spied on the CIAs spies. The CIAs secret agents were a sleazy, snobby, and snobby group of spies who spied on the CIAs spies. The CIAs secret agents were a sleazy, snobby, and snobby group of spies who spied on the CIAs spies. The CIAs secret agents were a sleazy, snobby, and snobby group of spies who spied on the CIAs spies. The CIAs secret agents were a sleazy, snobby, and snobby gro