# This notebook will
- pull a wiki information for a qid
- converts it to embedded glove vectors

## Prerequisites
- In terminal run
```
pip install wikidata
pip install numpy
```
- Go to https://nlp.stanford.edu/projects/glove/
- Download 2024 Wikipedia + Gigaword 5 (11.9B tokens, 1.2M vocab, uncased, 50d vectors, 290 MB download): glove.2024.wikigiga.50d.zip
- Extract to your work directory

In [52]:
# glove_file = "../models/glove/wiki_giga_2024_50_MFT20_vectors_seed_123_alpha_0.75_eta_0.075_combined.txt"
glove_file = "/Users/saikrishna/Desktop/wiki_giga_2024_50_MFT20_vectors_seed_123_alpha_0.75_eta_0.075_combined.txt"

In [53]:
import requests
from wikidata.client import Client

In [54]:
def get_wikipedia_title_from_wikidata(qid, lang="en"):
    """Fetch Wikipedia page title linked to a Wikidata entity."""
    client = Client()
    entity = client.get(qid, load=True)
    sitelinks = entity.data.get("sitelinks", {})
    wiki_key = f"{lang}wiki"

    if wiki_key in sitelinks:
        return lang, sitelinks[wiki_key]["title"]
    elif sitelinks:
        # Return the first available Wikipedia sitelink
        first_key = next(iter(sitelinks))
        fallback_lang = first_key.replace("wiki", "")
        return fallback_lang, sitelinks[first_key]["title"]
    else:
        raise ValueError(f"No Wikipedia sitelinks available for {qid}")


In [55]:
def get_wikipedia_summary(title, lang="en"):
    """Fetch short summary (1–2 sentences) from Wikipedia."""
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{title}"
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.json().get("extract")

In [56]:
def get_wikipedia_intro_paragraphs(title, lang="en"):
    """Fetch the full lead/intro section (multiple paragraphs)."""
    url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "extracts",
        "exintro": True,       # only the lead section
        "explaintext": True,   # plain text (no HTML)
        "titles": title,
        "format": "json"
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    pages = resp.json()["query"]["pages"]
    page = next(iter(pages.values()))

    raw_text = page.get("extract", "")
    # Normalize paragraphs: split by double newlines or single breaks
    paragraphs = [p.strip() for p in raw_text.split("\n") if p.strip()]
    return "\n\n".join(paragraphs)

In [57]:
def load_wiki_qid(qid, lang="en"):
    lang, title = get_wikipedia_title_from_wikidata(qid, lang)
    summary = get_wikipedia_summary(title, lang)
    details = get_wikipedia_intro_paragraphs(title, lang)
    source_url = f"https://{lang}.wikipedia.org/wiki/"+title.replace(" ", "_")
    return f"# {title}\n" + summary + "\n## Details\n"+ details + "\n## Sources\n" + source_url

In [58]:
import numpy as np
import string

In [59]:
embedding_dim = 50
embeddings_index = {}

In [60]:
def load_embedding_vectors():
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector_values = values[1:]
            if len(vector_values) != embedding_dim:
                # Skip lines with wrong dimensions
                continue
            try:
                vector = np.array(vector_values, dtype='float32')
                embeddings_index[word] = vector
            except ValueError:
                continue

In [61]:
def embed_to_glove(text):
    if not embeddings_index:
        load_embedding_vectors()

    words = text.lower().split()
    # Remove punctuation as they are apparently not present in Glove
    words = [w.strip(string.punctuation) for w in words]
    embedded_words = []
    for word in words:
        vector = embeddings_index.get(word)
        if vector is not None:
            embedded_words.append(vector)
        else:
            # Assign small random vector for out of vocabulary words
            random_vector = np.random.uniform(-0.05, 0.05, embedding_dim)
            embedded_words.append(random_vector)
    return np.vstack(embedded_words)

In [62]:
qid = "Q249674"
text = load_wiki_qid(qid)
print(text)

# Spike (телеканал)
Spike — бывший американский кабельный телевизионный канал, принадлежавший компании Paramount Global. Начал своё вещание 11 августа 2003 года, заменив телеканал TNN.
## Details
Spike (ранее назывался Spike TV) — бывший американский кабельный телевизионный канал, принадлежавший компании Paramount Global. Начал своё вещание 11 августа 2003 года, заменив телеканал TNN.

В феврале 2017 года компания Paramount Global объявила, что Spike будет закрыт в 2018 году и его место займёт новый телеканал — Paramount Network. Spike официально прекратил своё вещание 17 января 2018 года.
## Sources
https://ru.wikipedia.org/wiki/Spike_(телеканал)


In [63]:
embedding_vector = embed_to_glove(text)
print(embedding_vector)

[[ 0.0447343  -0.04316366  0.03124164 ... -0.02293803 -0.02282768
   0.04378978]
 [ 0.378654   -0.26602501  0.012204   ... -0.011233   -0.333691
  -0.47293299]
 [ 0.02511937 -0.03469301  0.03230286 ... -0.0055438  -0.03173322
   0.01404396]
 ...
 [-0.01896668  0.04940063 -0.04128322 ... -0.00386988  0.00339797
   0.04798447]
 [ 0.92886001 -0.53292501  0.12852199 ...  0.027593    0.56147301
   0.173931  ]
 [-0.03391121 -0.01740433  0.03745184 ...  0.00170603  0.04450943
   0.04793962]]
