# This notebook will
- pull a wiki information for a qid
- converts it to embedded glove vectors

## Prerequisites
- In terminal run
```
pip install wikidata
pip install numpy
```
- Go to https://nlp.stanford.edu/projects/glove/
- Download 2024 Wikipedia + Gigaword 5 (11.9B tokens, 1.2M vocab, uncased, 50d vectors, 290 MB download): glove.2024.wikigiga.50d.zip
- Extract to your work directory

In [1]:
glove_file = "../models/glove/wiki_giga_2024_50_MFT20_vectors_seed_123_alpha_0.75_eta_0.075_combined.txt"

In [2]:
import requests
from wikidata.client import Client

In [3]:
def get_wikipedia_title_from_wikidata(qid, lang="en"):
    """Fetch Wikipedia page title linked to a Wikidata entity."""
    client = Client()
    entity = client.get(qid, load=True)
    sitelinks = entity.data.get("sitelinks", {})
    wiki_key = f"{lang}wiki"
    if wiki_key not in sitelinks:
        raise ValueError(f"No {lang} Wikipedia sitelink for {qid}")
    return sitelinks[wiki_key]["title"]

In [4]:
def get_wikipedia_summary(title, lang="en"):
    """Fetch short summary (1–2 sentences) from Wikipedia."""
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{title}"
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.json().get("extract")

In [5]:
def get_wikipedia_intro_paragraphs(title, lang="en"):
    """Fetch the full lead/intro section (multiple paragraphs)."""
    url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "extracts",
        "exintro": True,       # only the lead section
        "explaintext": True,   # plain text (no HTML)
        "titles": title,
        "format": "json"
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    pages = resp.json()["query"]["pages"]
    page = next(iter(pages.values()))

    raw_text = page.get("extract", "")
    # Normalize paragraphs: split by double newlines or single breaks
    paragraphs = [p.strip() for p in raw_text.split("\n") if p.strip()]
    return "\n\n".join(paragraphs)

In [6]:
def load_wiki_qid(qid, lang="en"):
    title = get_wikipedia_title_from_wikidata(qid, lang)
    summary = get_wikipedia_summary(title, lang)
    details = get_wikipedia_intro_paragraphs(title, lang)
    source_url = "https://en.wikipedia.org/wiki/"+title.replace(" ", "_")
    return f"# {title}\n" + summary + "\n## Details\n"+ details + "\n## Sources\n" + source_url

In [7]:
import numpy as np
import string

In [8]:
embedding_dim = 50
embeddings_index = {}

In [9]:
def load_embedding_vectors():
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector_values = values[1:]
            if len(vector_values) != embedding_dim:
                # Skip lines with wrong dimensions
                continue
            try:
                vector = np.array(vector_values, dtype='float32')
                embeddings_index[word] = vector
            except ValueError:
                continue

In [10]:
def embed_to_glove(text):
    if not embeddings_index:
        load_embedding_vectors()

    words = text.lower().split()
    # Remove punctuation as they are apparently not present in Glove
    words = [w.strip(string.punctuation) for w in words]
    embedded_words = []
    for word in words:
        vector = embeddings_index.get(word)
        if vector is not None:
            embedded_words.append(vector)
        else:
            # Assign small random vector for out of vocabulary words
            random_vector = np.random.uniform(-0.05, 0.05, embedding_dim)
            embedded_words.append(random_vector)
    return np.vstack(embedded_words)

In [11]:
qid = "Q2685"
text = load_wiki_qid(qid)
print(text)

# Arnold Schwarzenegger
Arnold Alois Schwarzenegger is an Austrian and American actor, businessman, former politician, and former professional bodybuilder, known for his roles in high-profile action films. He served as the 38th governor of California from 2003 to 2011.
## Details
Arnold Alois Schwarzenegger (born July 30, 1947) is an Austrian and American actor, businessman, former politician, and former professional bodybuilder, known for his roles in high-profile action films. He served as the 38th governor of California from 2003 to 2011.

Schwarzenegger began lifting weights at age 15 and won the Mr. Universe title aged 20, and subsequently the Mr. Olympia title seven times. He is tied with Phil Heath for the joint-second number of all-time Mr. Olympia wins, behind Ronnie Coleman and Lee Haney, who are joint-first with eight wins each. Nicknamed the "Austrian Oak" in his bodybuilding days, he is regarded as one of the greatest bodybuilders of all time. He has written books and arti

In [73]:
embedding_vector = embed_to_glove(text)
print(embedding_vector)

[[-1.43524297e-02 -4.69134421e-02 -3.12519383e-02 ... -1.86913536e-02
  -4.62052201e-05 -1.27419314e-02]
 [-3.72239985e-02  8.71483028e-01  5.16359985e-01 ... -3.66555005e-01
  -4.81440991e-01  3.89647990e-01]
 [ 3.09415996e-01  7.08140016e-01  9.55354989e-01 ...  7.57193029e-01
  -3.66930008e-01  1.36133999e-01]
 ...
 [ 1.87169884e-02 -3.94320069e-02  4.05645802e-02 ... -6.45872973e-03
  -3.31812777e-02  4.66582018e-03]
 [ 9.28860009e-01 -5.32925010e-01  1.28521994e-01 ...  2.75929999e-02
   5.61473012e-01  1.73931003e-01]
 [ 4.15888220e-02 -2.00259266e-02  3.55523590e-02 ... -3.04718027e-02
   2.39847658e-02 -2.84495522e-02]]
