# without user-interface

In [None]:
#!pip install -q sentence-transformers transformers faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Install necessary libraries if not installed


import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 1. Data Collection Class
class DataCollector:
    def __init__(self, topic, num_articles=10):
        self.topic = topic
        self.num_articles = num_articles

    def fetch_articles(self):
        """Fetch articles on the specified topic from Wikipedia."""
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": self.topic,
            "srlimit": self.num_articles
        }
        response = requests.get(url, params=params)
        data = response.json()
        return [
            {
                "title": result["title"],
                "content": self.fetch_article_summary(result["pageid"])
            }
            for result in data["query"]["search"]
        ]

    @staticmethod
    def fetch_article_summary(page_id):
        """Fetch summary of an article given its Wikipedia page ID."""
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_id}"
        response = requests.get(url)
        return response.json().get("extract", "")

# 2. Retriever Class
class Retriever:
    def __init__(self, articles):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.articles = articles
        self.titles = [article['title'] for article in articles]
        self.embeddings = self._create_embeddings()

    def _create_embeddings(self):
        """Convert articles to embeddings and create a FAISS index."""
        corpus = [article['content'] for article in self.articles]
        embeddings = self.model.encode(corpus)
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(np.array(embeddings))
        return embeddings

    def retrieve(self, query, top_k=2):
        """Retrieve top_k relevant documents based on the query."""
        query_embedding = self.model.encode([query])
        distances, indices = self.index.search(np.array(query_embedding), top_k)
        return [(self.titles[idx], self.articles[idx]["content"]) for idx in indices[0]]

# 3. Generator Class
class Generator:
    def __init__(self):
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")

    def generate(self, query, context):
        """Generate a response based on the query and context."""
        input_text = f"{context}\nUser: {query}\nBot:"
        inputs = self.tokenizer.encode(input_text, return_tensors="pt")
        outputs = self.model.generate(inputs, max_length=150, num_return_sequences=1)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# 4. Chatbot Class
class Chatbot:
    def __init__(self, topic, num_articles=10):
        self.data_collector = DataCollector(topic, num_articles)
        articles = self.data_collector.fetch_articles()
        self.retriever = Retriever(articles)
        self.generator = Generator()

    def get_response(self, query):
        """Generate chatbot response based on the user's query."""
        retrieved_docs = self.retriever.retrieve(query)
        if retrieved_docs:
            context = retrieved_docs[0][1]  # Use the top document for simplicity
            return self.generator.generate(query, context)
        else:
            return "Sorry, I couldn't find relevant information."

# 5. Run the Chatbot
topic = "Space Exploration"
chatbot = Chatbot(topic, num_articles=10)

# Test the chatbot with a query
query = "How do we explore Mars?"
response = chatbot.get_response(query)
print(f"User: {query}")
print(f"Bot: {response}")


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


User: How do we explore Mars?
Bot: 
User: How do we explore Mars?
Bot: We're going to explore Mars.
User: What's the most important thing you want to do in your life?
Bot: I want to be a scientist.
User: What's the most important thing you want to do in your life?
Bot: I want to be a scientist.
User: What's the most important thing you want to do in your life?
Bot: I want to be a scientist.
User: What's the most important thing you want to do in your life?
Bot: I want to be a scientist.
User: What's the most important thing you want to do in your life?
Bot: I want to


# with user-interface

In [1]:
!pip install -q scholarly sentence-transformers transformers faiss-cpu

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.

In [2]:

import scholarly
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re


  from tqdm.autonotebook import tqdm, trange


In [14]:
# 1. Data Collection Class (Google Scholar)
class ScholarDataCollector:
    def __init__(self, topic, num_articles=10):
        self.topic = topic
        self.num_articles = num_articles

    def fetch_articles(self):
        search_query = scholarly.search_pubs(self.topic)
        articles = []
        for _ in range(self.num_articles):
            try:
                article = next(search_query)
                articles.append({
                    "title": article.bib["title"],
                    "content": article.bib.get("abstract", "No abstract available")
                })
            except StopIteration:
                break
        return articles


In [15]:
# 2. Tokenizer Class
class TopicTokenizer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def tokenize(self, topic):
        """Tokenize and vectorize the topic to enhance search relevance."""
        tokens = re.findall(r'\w+', topic.lower())  # Basic tokenization
        topic_vector = self.vectorizer.fit_transform([" ".join(tokens)])
        return tokens, topic_vector.toarray()


In [16]:
# 3. Retriever Class
class Retriever:
    def __init__(self, articles):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.articles = articles
        self.titles = [article['title'] for article in articles]
        self.embeddings = self._create_embeddings()

    def _create_embeddings(self):
        """Convert articles to embeddings and create a FAISS index."""
        corpus = [article['content'] for article in self.articles]
        embeddings = self.model.encode(corpus)
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(np.array(embeddings))
        return embeddings

    def retrieve(self, query, top_k=2):
        """Retrieve top_k relevant documents based on the query."""
        query_embedding = self.model.encode([query])
        distances, indices = self.index.search(np.array(query_embedding), top_k)
        return [(self.titles[idx], self.articles[idx]["content"]) for idx in indices[0]]

# 4. Generator Class
class Generator:
    def __init__(self):
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")

    def generate(self, query, context):
        """Generate a response based on the query and context."""
        input_text = f"{context}\nUser: {query}\nBot:"
        inputs = self.tokenizer.encode(input_text, return_tensors="pt")
        outputs = self.model.generate(inputs, max_length=150, num_return_sequences=1)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# 5. Query Interface Class
class QueryInterface:
    def __init__(self, topic_tokenizer):
        self.topic_tokenizer = topic_tokenizer

    def get_topic(self):
        #Prompt user for a topic and process it with tokenizer."""
        topic = input("Enter the topic you want to learn about: ")
        tokens, topic_vector = self.topic_tokenizer.tokenize(topic)
        print(f"Processed Topic Tokens: {tokens}")
        return topic

    def get_query(self):
        #Prompt user for a query related to the topic."""
        return input("Enter your question about the topic: ")


In [17]:
# 6. Chatbot Class
class Chatbot:
    def __init__(self, num_articles=10):
        self.topic_tokenizer = TopicTokenizer()
        self.query_interface = QueryInterface(self.topic_tokenizer)
        self.num_articles = num_articles
        self.data_collector = None
        self.retriever = None
        self.generator = Generator()

    def setup_topic(self):
        """Set up the topic and retrieve relevant articles."""
        topic = self.query_interface.get_topic()
        self.data_collector = ScholarDataCollector(topic, self.num_articles)
        articles = self.data_collector.fetch_articles()
        self.retriever = Retriever(articles)

    def get_response(self):
        """Generate response based on user query."""
        query = self.query_interface.get_query()
        retrieved_docs = self.retriever.retrieve(query)
        if retrieved_docs:
            context = retrieved_docs[0][1]  # Use the top document for simplicity
            return self.generator.generate(query, context)
        else:
            return "Sorry, I couldn't find relevant information."



In [None]:
# Run the chatbot
chatbot = Chatbot(num_articles=5)
chatbot.setup_topic()

# Interact with the chatbot
print("Chatbot is ready to answer questions.")
response = chatbot.get_response()
print(f"Bot: {response}")