In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install faiss-cpu

import os
import faiss
import warnings
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from langchain.text_splitter import RecursiveCharacterTextSplitter

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [3]:
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*tf.losses.sparse_softmax_cross_entropy.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*Torch was not compiled with flash attention.*")

In [4]:
class CodeEmbedder:
    def __init__(self, model_name="microsoft/codebert-base", chunk_size=1024):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        self.model = AutoModel.from_pretrained(model_name)
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

        self.chunk_size = chunk_size
        self.chunk_overlap = 50
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )

    def _chunk_text(self, text):
        return self.text_splitter.split_text(text) if text.strip() else []

    def generate_embedding(self, text):

        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
        return embeddings.numpy()


In [5]:
class VectorStore:
    def __init__(self, dimension=None):
        self.index = None
        self.file_mapping = []
        self.dimension = dimension

    def _initialize_index(self, dimension):
        if dimension is None:
            raise ValueError("Embedding dimension cannot be None")
        self.index = faiss.IndexFlatL2(dimension)

    def add_vectors(self, embeddings, file_paths):
        """Add vectors and corresponding metadata (file paths)."""
        embeddings_array = np.array(embeddings, dtype="float32")

        if self.index is None or embeddings_array.shape[1] != self.index.d:
            self._initialize_index(embeddings_array.shape[1])

        embeddings_array /= np.linalg.norm(embeddings_array, axis=1, keepdims=True)
        self.index.add(embeddings_array)
        self.file_mapping.extend(file_paths)

    def search(self, query_vector, top_k=3):
        query_array = query_vector.reshape(1, -1).astype("float32")
        query_array /= np.linalg.norm(query_array, axis=1, keepdims=True)
        distances, indices = self.index.search(query_array, top_k)
        results = [self.file_mapping[i] for i in indices[0] if i >= 0 and i < len(self.file_mapping)]
        return results, distances[0]


In [6]:
class RAGRetriever:
    def __init__(self, embedder, vector_store, similarity_threshold=0.9):
        self.embedder = embedder
        self.vector_store = vector_store
        self.similarity_threshold = similarity_threshold

    def retrieve(self, query, top_k=3):
        query_embedding = self.embedder.generate_embedding(query)

        results, distances = self.vector_store.search(query_embedding, top_k=top_k)


        relevant_chunks = []
        unique_chunks = set()

        for result in results:
            with open(result, 'r', encoding='utf-8') as file:
                content = file.read()

                chunks = self.embedder._chunk_text(content)
                for chunk in chunks:
                    chunk_embedding = self.embedder.generate_embedding(chunk)
                    similarity = np.dot(query_embedding, chunk_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(chunk_embedding))

                    if similarity > self.similarity_threshold:
                        if chunk not in unique_chunks:
                            relevant_chunks.append(chunk)
                            unique_chunks.add(chunk)

        if not relevant_chunks:
            return []

        return relevant_chunks[:top_k]

In [7]:
class HuggingFaceChatbot:
    def __init__(self, retriever, system_message, model_name="distilgpt2"):
        self.retriever = retriever
        self.system_message = system_message
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

    def chat(self, query):
        try:

            prompt = f"Query: {query}\n\nAnswer:"


            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512, padding=True)
            attention_mask = inputs.attention_mask if "attention_mask" in inputs else None


            outputs = self.model.generate(
                inputs.input_ids,
                attention_mask=attention_mask,
                max_new_tokens=100,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )

            return self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        except Exception as e:
            return "Sorry, there was an issue processing the request."


In [8]:
folder_path = r"/content/drive/MyDrive/data/editorials"

embedder = CodeEmbedder(model_name="microsoft/codebert-base", chunk_size=1024)
vector_store = VectorStore()

file_paths = []
embeddings = []

for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            file_paths.append(file_path)

    with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                chunks = embedder._chunk_text(content)

for chunk in chunks:
        embedding = embedder.generate_embedding(chunk)
        embeddings.append(embedding)
        vector_store.file_mapping.append(file_path)

vector_store.add_vectors(embeddings, file_paths)

system_message = "I am solving a Competitive Programming problem, and I need help understanding the problem."
retriever = RAGRetriever(embedder, vector_store)
chatbot = HuggingFaceChatbot(retriever, system_message, model_name="distilgpt2")

query = "soltion 1844B"
try:
        response = chatbot.chat(query)
        print(f"Chatbot Response: {response}")
except Exception as e:
        print(f"An error occurred: {e}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot Response: Query: soltion 1844B

Answer:

There is no reason to stop working to solve a problem that has no impact on the user experience.
The problem is that there are no solutions that can solve it in real time. It is possible to solve it in real time, but the problem is that the problem is not solved in real time.
The problem is that there are no solutions that can solve it in real time. It is possible to solve it in real time, but the problem is that the problem is not solved
