In [1]:
# Install the NLTK library
!pip install nltk

# Import the library and download the required resources
import nltk
nltk.download('punkt')
nltk.download('wordnet')






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
import re
import json
from collections import Counter
import pandas as pd
from nltk.chat.util import Chat, reflections

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

class DocumentFetcher:
    """Download a list of web pages and keep their text content, lower-cased.

    Every link gets the id ``doc_<n>`` (1-based, in input order); the mapping
    from id to link is available as ``doc_ids_to_links``.

    Args:
        links: Sequence of URLs to download.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` may block forever on a stalled
            server, which would hang the whole pipeline.
    """

    def __init__(self, links, timeout=10):
        self.links = links
        self.timeout = timeout
        self.docs = {}
        self.doc_ids_to_links = {
            f"doc_{i}": link for i, link in enumerate(links, start=1)
        }

    def fetch(self):
        """Fetch every link and store its text in ``self.docs``.

        A page that cannot be downloaded (network error, timeout, non-200
        status) or parsed is stored as an empty string so that every doc id
        is always present in the result.

        Returns:
            Tuple ``(docs, doc_ids_to_links)`` where ``docs`` maps a doc id to
            the lower-cased page text.
        """
        for i, url in enumerate(self.links, start=1):
            doc_id = f"doc_{i}"
            # Start from the failure value; it is only overwritten on success.
            self.docs[doc_id] = ""
            try:
                response = requests.get(url, timeout=self.timeout)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    self.docs[doc_id] = soup.get_text(separator=' ').lower()
            except Exception:
                # Deliberately best-effort: one broken page must not abort
                # the crawl, so the document simply stays empty.
                self.docs[doc_id] = ""
        return self.docs, self.doc_ids_to_links

class TextProcessor:
    """Count lemma frequencies across a collection of documents.

    Args:
        stop_words: Iterable of words that are ignored while counting.
    """

    def __init__(self, stop_words):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stop_words)

    def process(self, docs):
        """Return a ``Counter`` of lemmas over all document texts.

        Only runs of two or more lower-case ASCII letters are considered
        words; stop words are dropped before lemmatization.

        Args:
            docs: Mapping whose values are the document texts.
        """
        tally = Counter()
        for text in docs.values():
            tally.update(
                self.lemmatizer.lemmatize(token)
                for token in re.findall(r'\b[a-z]{2,}\b', text)
                if token not in self.stop_words
            )
        return tally

class Indexer:
    """Build an inverted index (lemma -> occurrence count and document ids).

    Args:
        stop_words: Iterable of words that are left out of the index.
    """

    def __init__(self, stop_words):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stop_words)

    def build_index(self, docs):
        """Index every document in ``docs``.

        Args:
            docs: Mapping of doc id to document text.

        Returns:
            Dict mapping each lemma to ``{'count': int, 'DocIDs': set}``,
            where ``count`` is the total number of occurrences over all
            documents and ``DocIDs`` holds the ids of documents containing it.
        """
        inverted = {}
        for doc_id, text in docs.items():
            for token in re.findall(r'\b[a-z]{2,}\b', text):
                if token in self.stop_words:
                    continue
                key = self.lemmatizer.lemmatize(token)
                # setdefault creates the posting entry on first sight only.
                posting = inverted.setdefault(key, {'count': 0, 'DocIDs': set()})
                posting['count'] += 1
                posting['DocIDs'].add(doc_id)
        return inverted

class FirebaseUploader:
    """Upload an inverted index to a Firebase Realtime Database over REST.

    Args:
        db_url: Base URL of the database location; ``".json"`` is appended to
            it to form the REST endpoint.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the caller.
    """

    def __init__(self, db_url, timeout=10):
        self.db_url = db_url
        self.timeout = timeout

    def _build_payload(self, index, doc_ids_to_links):
        """Convert the in-memory index into a JSON-serializable dict.

        Doc ids are replaced by their links. The links are sorted because the
        ids are kept in a set, whose iteration order differs between runs;
        sorting makes the uploaded data reproducible.
        """
        return {
            term: {
                'term': term,
                'DocIDs': sorted(doc_ids_to_links[doc] for doc in data['DocIDs']),
                'count': data['count'],
            }
            for term, data in index.items()
        }

    def upload(self, index, doc_ids_to_links):
        """Replace the remote data with ``index`` using an HTTP PUT.

        Args:
            index: Mapping of term to ``{'count': int, 'DocIDs': iterable}``.
            doc_ids_to_links: Mapping of doc id to the link stored remotely.

        Returns:
            Tuple ``(success, message)``: ``success`` is True only for an
            HTTP 200 response; ``message`` is the response body, or the error
            text when the request itself failed.

        Raises:
            KeyError: If a doc id in ``index`` is missing from
                ``doc_ids_to_links``.
        """
        data_to_send = self._build_payload(index, doc_ids_to_links)
        try:
            response = requests.put(
                self.db_url + ".json",
                data=json.dumps(data_to_send),
                timeout=self.timeout,
            )
            return response.status_code == 200, response.text
        except Exception as e:
            # Report any transport failure to the caller instead of raising.
            return False, str(e)

class LogicalSearch:
    """Boolean (AND / OR) search over an inverted index.

    Args:
        index: Mapping of lemma to ``{'count': int, 'DocIDs': set}``.
        stop_words: Iterable of words that are ignored in queries.
    """

    # Query tokens recognised as boolean operators (case-insensitive).
    _OPERATORS = ('AND', 'OR')

    def __init__(self, index, stop_words):
        self.index = index
        self.stop_words = set(stop_words)
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_term(self, term):
        """Lower-case and lemmatize ``term``; return None for a stop word."""
        term = term.lower()
        if term in self.stop_words:
            return None
        return self.lemmatizer.lemmatize(term)

    def _parse(self, query):
        """Split ``query`` into ``(operator, term)`` pairs.

        The operator of a pair is the one that joins the term to everything
        on its left (it is ignored for the first pair). Operators are
        recognised only as whole whitespace-separated tokens, so words that
        merely contain "and"/"or" (e.g. "broker", "standard") are terms.
        Terms with no operator between them are joined by OR. A stop word is
        dropped together with the operator that introduced it.
        """
        pairs = []
        pending = None
        for token in query.split():
            upper = token.upper()
            if upper in self._OPERATORS:
                pending = upper
                continue
            term = self.preprocess_term(token)
            if term is None:
                pending = None
                continue
            pairs.append((pending or 'OR', term))
            pending = None
        return pairs

    def search(self, query):
        """Evaluate ``query`` left to right and rank the matching documents.

        A term that is not in the index is reported and treated as matching
        no document (so it empties an AND and is neutral for an OR).

        Args:
            query: Terms separated by whitespace, optionally joined by the
                operators AND / OR.

        Returns:
            Dict mapping each matching doc id to its score, i.e. the number
            of query terms that occur in that document. Empty when nothing
            matches or the query has no usable terms.
        """
        pairs = self._parse(query)
        if not pairs:
            print("⚠️ אין מונחים תקפים לחיפוש לאחר עיבוד.")
            return {}

        matches = set()
        for position, (op, term) in enumerate(pairs):
            entry = self.index.get(term)
            if entry is None:
                print(f"⚠️ המונח '{term}' לא נמצא באינדקס.")
                postings = set()
            else:
                # Copy so the set operations below never mutate the index.
                postings = set(entry.get('DocIDs', ()))

            if position == 0:
                matches = postings
            elif op == 'AND':
                matches &= postings
            else:
                matches |= postings

        return {
            doc_id: sum(
                1
                for _, term in pairs
                if term in self.index and doc_id in self.index[term]['DocIDs']
            )
            for doc_id in matches
        }

def build_chatbot_patterns(index):
    """Build ``(pattern, responses)`` pairs for ``nltk.chat.util.Chat``.

    One pattern is produced per index term, in index order. The pattern
    matches any input that contains the term (case-insensitively). The
    response is a canned definition for a few known MQTT terms, otherwise a
    sentence stating in how many documents the term appears.

    Args:
        index: Mapping of term to a dict with a ``'DocIDs'`` collection.

    Returns:
        List of ``(regex_string, [response])`` tuples.
    """
    knowledge_base = {
        "mqtt": "MQTT is a lightweight messaging protocol for IoT devices, designed for low bandwidth and unreliable networks.",
        "broker": "A broker is a server that receives all messages, filters them, and distributes them to subscribers in MQTT.",
        "subscribe": "Subscribe means a client expresses interest in receiving messages about a specific topic in MQTT.",
        "publish": "Publish means a client sends a message to a topic on the MQTT broker, making it available to subscribers.",
    }

    patterns = []
    for term, data in index.items():
        # Escape the term so regex metacharacters in it are matched literally
        # instead of producing an invalid or unintended pattern.
        pattern = rf"(?i).*({re.escape(term)}).*"
        response = knowledge_base.get(
            term,
            f"{term.capitalize()} appears in {len(data['DocIDs'])} document(s).",
        )
        patterns.append((pattern, [response]))
    return patterns

class SearchEngineCoordinator:
    """Run the whole pipeline: fetch, index, upload, chat, then search.

    Args:
        links: URLs of the documents to index.
        db_url: Firebase Realtime Database URL the index is uploaded to.
    """

    def __init__(self, links, db_url):
        self.links = links
        self.db_url = db_url
        self.stop_words = [
            'the', 'is', 'at', 'which', 'on', 'and', 'a', 'an', 'to', 'for', 'of',
            'with', 'by', 'from', 'in', 'this', 'that', 'it', 'as', 'be', 'are', 'was',
        ]

    def start_chatbot(self, index):
        """Start an interactive chatbot whose patterns come from ``index``."""
        print("\n💬 Starting Chatbot. Type 'exit' to stop.")
        patterns = build_chatbot_patterns(index)
        chatbot = Chat(patterns, reflections)
        # converse() stops on "quit" by default; pass the word the banner
        # above actually tells the user to type.
        chatbot.converse(quit="exit")

    def run(self):
        """Execute the pipeline and then serve search queries until 'exit'."""
        print("⏳ Fetching documents...")
        fetcher = DocumentFetcher(self.links)
        docs, doc_ids = fetcher.fetch()

        print("⏳ Processing text...")
        processor = TextProcessor(self.stop_words)
        # NOTE(review): the word counts returned here are not used anywhere
        # below; the index built next carries the same counts.
        processor.process(docs)

        print("\n⏳ Building full index (all terms)...")
        indexer = Indexer(self.stop_words)
        index = indexer.build_index(docs)

        # Explicit columns keep the frame sortable even when the index is
        # empty (e.g. every download failed); otherwise 'count' is missing.
        df = pd.DataFrame(
            [
                {'term': term, 'count': data['count'], 'DocIDs': [doc_ids[doc] for doc in data['DocIDs']]}
                for term, data in index.items()
            ],
            columns=['term', 'count', 'DocIDs'],
        )
        print("\n📝 Index (sample):")
        print(df.sort_values(by='count', ascending=False).head(20))

        print("\n⏳ Uploading index to Firebase...")
        uploader = FirebaseUploader(self.db_url)
        success, msg = uploader.upload(index, doc_ids)
        if success:
            print("\n✅ Uploaded successfully to Firebase!")
        else:
            print("\n❌ Upload failed:", msg)

        self.start_chatbot(index)

        searcher = LogicalSearch(index, self.stop_words)
        while True:
            query = input("\n🔎 Enter search query (AND/OR), or 'exit': ")
            # Ignore surrounding whitespace so "exit " also leaves the loop.
            if query.strip().lower() == 'exit':
                break
            ranking = searcher.search(query)
            if ranking:
                print("\n📄 Matching documents (sorted by relevance):")
                sorted_docs = sorted(ranking.items(), key=lambda x: x[1], reverse=True)
                for doc_id, score in sorted_docs:
                    print(f"🔹 {doc_ids[doc_id]} (Score: {score})")
            else:
                print("📄 No matching documents found.")

# Script entry point: index the MQTT documentation pages, upload the index
# to Firebase, then start the chatbot and the interactive search loop.
if __name__ == "__main__":
    # Pages to download and index; doc ids are assigned in this order.
    doc_links = [
        "https://mqtt.org/",
        "https://mqtt.org/getting-started/",
        "https://mqtt.org/mqtt-specification/",
        "https://mqtt.org/software/",
        "https://mqtt.org/use-cases/",
        "https://mqtt.org/faq/"
    ]
    # Firebase Realtime Database URL that receives the index.
    firebase_url = "https://testtragil6-default-rtdb.firebaseio.com/"

    coordinator = SearchEngineCoordinator(doc_links, firebase_url)
    coordinator.run()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


⏳ Fetching documents...
⏳ Processing text...

⏳ Building full index (all terms)...

📝 Index (sample):
              term  count                                             DocIDs
0             mqtt    357  [https://mqtt.org/faq/, https://mqtt.org/getti...
45          client     89    [https://mqtt.org/software/, https://mqtt.org/]
91          broker     81  [https://mqtt.org/software/, https://mqtt.org/...
82         support     59  [https://mqtt.org/software/, https://mqtt.org/...
52         message     41  [https://mqtt.org/software/, https://mqtt.org/...
2              iot     35  [https://mqtt.org/software/, https://mqtt.org/...
5         protocol     34  [https://mqtt.org/faq/, https://mqtt.org/getti...
40             use     34  [https://mqtt.org/faq/, https://mqtt.org/getti...
60           cloud     31    [https://mqtt.org/software/, https://mqtt.org/]
17          device     30  [https://mqtt.org/software/, https://mqtt.org/...
229         source     29                       [ht