In [None]:
# התקנת ספריות
!pip install nltk
!pip install beautifulsoup4

# -----------------------------
# יבוא ספריות
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
import json
from collections import Counter

# -----------------------------
# Download NLTK resources: the stop-word corpus, WordNet, and the
# 'omw-1.4' package.
# NOTE(review): the 'stopwords' corpus is not referenced anywhere else in the
# visible code (a hand-written stop list is used instead) - confirm whether
# the download is still needed.
for resource_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# -----------------------------
# Step 1: the stop-word list
# Tokens in this set are skipped before lemmatization and counting.

# Common English function words.
_function_words = {
    'the', 'is', 'at', 'which', 'on', 'and', 'a', 'an', 'to', 'for', 'of',
    'with', 'by', 'from', 'in', 'this', 'that', 'it', 'as', 'be', 'are', 'was',
}

# Corpus-specific terms to ignore.
# NOTE(review): these look like MQTT/networking vocabulary, while the only
# page listed in this script is a history article - confirm they are still
# intended.
_corpus_terms = {
    'mqtt', 'org', 'data', 'information', 'protocol', 'message', 'client',
    'server',
}

custom_stop_words = _function_words | _corpus_terms

# -----------------------------
# Step 2: page links
# URLs of the pages to index, in processing order.
doc_links = [
    "https://en.wikipedia.org/wiki/Battle_of_Carrhae",
]

# Map sequential document IDs ("doc_1", "doc_2", ...) to their source URLs.
doc_ids_to_links = {
    f"doc_{position}": link
    for position, link in enumerate(doc_links, start=1)
}

# -----------------------------
# Step 3: download the text of every page
# `docs` maps "doc_<n>" (1-based position in doc_links) to the lower-cased
# text BeautifulSoup extracts from the page. A page that cannot be fetched is
# stored as "" so every document ID is still present for the later steps.

# requests waits indefinitely unless a timeout is given explicitly.
REQUEST_TIMEOUT_SECONDS = 30

docs = {}
for i, url in enumerate(doc_links, start=1):
    doc_id = f"doc_{i}"
    failure = None
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # Connection errors, timeouts, invalid URLs, ...: keep the
        # best-effort behaviour instead of aborting the whole run.
        failure = str(error)
    else:
        if response.status_code != 200:
            failure = f"HTTP {response.status_code}"

    if failure is not None:
        # Report the problem so an empty document is not mistaken for a page
        # that simply contained no text.
        print(f"⚠️ לא ניתן להוריד את {url}: {failure}")
        docs[doc_id] = ""
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    docs[doc_id] = soup.get_text(separator=' ').lower()

# -----------------------------
# Steps 4-6: count lemma frequencies, pick the 10 most frequent lemmas and
# build the final index for them.
lemmatizer = WordNetLemmatizer()

# Words are runs of at least two lowercase ASCII letters (the document text
# was lower-cased when it was downloaded).
_WORD_PATTERN = re.compile(r'\b[a-z]{2,}\b')


def _extract_lemmas(text):
    """Return the lemma of every non-stop-word token in *text*, in order.

    A token is dropped when either the token itself or its lemma is in
    ``custom_stop_words``. The second check matters because the lemmatizer
    can map a word that is not on the stop list onto one that is (for
    example a plural such as "messages" -> "message").
    """
    lemmas = []
    for token in _WORD_PATTERN.findall(text):
        if token in custom_stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        if lemma in custom_stop_words:
            continue
        lemmas.append(lemma)
    return lemmas


# Tokenize and lemmatize each document once; both the frequency count and
# the index below are derived from the same lemma lists.
lemmas_by_doc = {
    doc_id: _extract_lemmas(content) for doc_id, content in docs.items()
}

# Step 4: lemma frequencies across all documents.
word_counts = Counter()
for doc_lemmas in lemmas_by_doc.values():
    word_counts.update(doc_lemmas)

# Step 5: choose the 10 most frequent lemmas.
top_10 = [word for word, _ in word_counts.most_common(10)]
print("🔝 10 מילים שנבחרו:", top_10)

# Step 6: build the final index.
# index[lemma] = {'count': total occurrences, 'DocIDs': set of document IDs}
top_terms = set(top_10)  # set membership instead of scanning the list per token
index = {}
for doc_id, doc_lemmas in lemmas_by_doc.items():
    for lemma in doc_lemmas:
        if lemma not in top_terms:
            continue
        if lemma not in index:
            index[lemma] = {'count': 0, 'DocIDs': set()}
        index[lemma]['count'] += 1
        index[lemma]['DocIDs'].add(doc_id)

# -----------------------------
# Step 7: build the results table
# One row per indexed term. The 'DocIDs' column holds the source URLs of the
# documents that contain the term (looked up through doc_ids_to_links).
index_data = []
for term, data in index.items():
    # Walk the ID -> URL mapping instead of the set so the URLs come out in
    # document order rather than in hash-dependent set order.
    links = [
        link for doc, link in doc_ids_to_links.items() if doc in data['DocIDs']
    ]
    index_data.append({'term': term, 'count': data['count'], 'DocIDs': links})

# Naming the columns explicitly keeps the frame well-formed when the index is
# empty (e.g. every download failed); without them sort_values raises
# KeyError because the 'count' column does not exist.
df = pd.DataFrame(index_data, columns=['term', 'count', 'DocIDs'])
df = df.sort_values(by='count', ascending=False).reset_index(drop=True)
print("📝 אינדקס סופי:")
print(df)

# -----------------------------
# Step 8: upload to Firebase
# The payload is keyed by term; each value repeats the term together with its
# document links and its occurrence count.
database_url = "https://testtragil6-default-rtdb.firebaseio.com/"

# requests waits indefinitely unless a timeout is given explicitly.
UPLOAD_TIMEOUT_SECONDS = 30

data_to_send = {
    row['term']: {
        'term': row['term'],
        'DocIDs': row['DocIDs'],
        # Force a plain int so json.dumps never sees a NumPy integer.
        'count': int(row['count']),
    }
    for _, row in df.iterrows()
}

if not data_to_send:
    # NOTE(review): the PUT below targets the database root, so it presumably
    # replaces everything stored there - confirm that is intended. Sending an
    # empty payload would leave nothing behind, so skip the upload instead.
    print("⚠️ האינדקס ריק - אין מה להעלות.")
else:
    try:
        response = requests.put(
            database_url + ".json",
            data=json.dumps(data_to_send),
            timeout=UPLOAD_TIMEOUT_SECONDS,
        )
    except requests.RequestException as error:
        # Connection failure or timeout: report it like any other upload
        # error instead of ending the run with a traceback.
        print("❌ שגיאה בהעלאה:", error)
    else:
        if response.status_code == 200:
            print("✅ האינדקס הועלה בהצלחה ל-Firebase Realtime Database!")
        else:
            print("❌ שגיאה בהעלאה:", response.text)
