In [1]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

subjects = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus"
]

datasets = [load_dataset("EleutherAI/hendrycks_math", subject) for subject in subjects]

train_datasets = [ds["train"] for ds in datasets]
test_datasets = [ds["test"] for ds in datasets]

combined_train = concatenate_datasets(train_datasets)
combined_test = concatenate_datasets(test_datasets)

combined_dataset = DatasetDict({
    "train": combined_train,
    "test": combined_test
})


combined_dataset["train"] = combined_dataset["train"].shuffle(seed=42)
combined_dataset["test"] = combined_dataset["test"].shuffle(seed=42)
combined_dataset["train"][0]


{'problem': 'What is the number of units in the distance between $(2,5)$ and $(-6,-1)$?',
 'level': 'Level 2',
 'type': 'Algebra',
 'solution': 'We use the distance formula: $\\sqrt{(-6 - 2)^2 + (-1 - 5)^2},$ so then we find that $\\sqrt{64 + 36} = \\boxed{10}$.\n\n- OR -\n\nWe note that the points $(2, 5)$, $(-6, -1)$, and $(2, -1)$ form a right triangle with legs of length 6 and 8. This is a Pythagorean triple, so the length of the hypotenuse must be $\\boxed{10}$.'}

In [2]:
import wikipedia

topics = [
    "Algebra", "Calculus", "Derivative", "Integral",
    "Matrix (mathematics)", "Probability", "Statistics",
    "Geometry", "Trigonometry", "Number theory", "linear algebra", "Linear Regression"
]

wiki_corpus = []

for topic in topics:
    try:
        summary = wikipedia.page(topic, auto_suggest=False).content
        wiki_corpus.append({"title": topic, "content": summary})
        print(f"Collected: {topic}")
    except Exception as e:
        print(f"Skipped {topic}: {e}")

print(len(wiki_corpus), "articles collected.")


Collected: Algebra
Collected: Calculus
Collected: Derivative
Collected: Integral
Collected: Matrix (mathematics)
Collected: Probability
Collected: Statistics
Collected: Geometry
Collected: Trigonometry
Collected: Number theory
Collected: linear algebra
Collected: Linear Regression
12 articles collected.


In [3]:
wiki_corpus[2]

{'title': 'Derivative',
 'content': 'In mathematics, the derivative is a fundamental tool that quantifies the sensitivity to change of a function\'s output with respect to its input. The derivative of a function of a single variable at a chosen input value, when it exists, is the slope of the tangent line to the graph of the function at that point. The tangent line is the best linear approximation of the function near that input value. The derivative is often described as the instantaneous rate of change, the ratio of the instantaneous change in the dependent variable to that of the independent variable. The process of finding a derivative is called differentiation.\nThere are multiple different notations for differentiation. Leibniz notation, named after Gottfried Wilhelm Leibniz, is represented as the ratio of two differentials, whereas prime notation is written by adding a prime mark. Higher order notations represent repeated differentiation, and they are usually denoted in Leibniz 

In [4]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
nltk.download('punkt')


sentences = [word_tokenize(doc["content"].lower()) for doc in wiki_corpus]

c = Word2Vec(
    sentences,
    vector_size=200,
    window=5,
    min_count=3,
    workers=4,
    sg=1 
)

def doc_vector(model, text):
    words = [w for w in word_tokenize(text.lower()) if w in model.wv]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

doc_vectors = np.array([doc_vector(c, doc["content"]) for doc in wiki_corpus])
titles = [doc["title"] for doc in wiki_corpus]
contents = [doc["content"] for doc in wiki_corpus]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/wangyuning/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import faiss

d = c.vector_size 
index = faiss.IndexFlatL2(d)
index.add(doc_vectors.astype('float32'))

print(f"Indexed {index.ntotal} documents.")


Indexed 12 documents.


In [6]:
query = "Explain the difference between derivative and integral."
query_vec = doc_vector(c, query).astype('float32')

k = 5 
D, I = index.search(np.array([query_vec]), k)

for i, idx in enumerate(I[0]):
    print(f"Rank {i+1}: {titles[idx]} (distance={D[0][i]:.4f})")


Rank 1: Calculus (distance=0.1930)
Rank 2: Statistics (distance=0.2092)
Rank 3: Geometry (distance=0.2277)
Rank 4: Algebra (distance=0.2853)
Rank 5: Number theory (distance=0.3375)


In [7]:
retrieved_docs = [wiki_corpus[i]["content"] for i in I[0]]
retrieved_titles = [titles[i] for i in I[0]]

In [8]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")


In [9]:
query = "Explain the difference between derivative and integral."
context = "\n\n".join(
    [f"[{retrieved_titles[i]}]\n{retrieved_docs[i]}" for i in range(len(retrieved_docs))]
)
prompt = f"""
Answer the question using only the context below.

Context:
{contents}

Question: {query}
"""

inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
outputs = model.generate(**inputs, max_new_tokens=150)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\nLLM Answer:\n", answer)


LLM Answer:
 is the main form of algebra taught in schools


In [10]:
context

'[Calculus]\nCalculus is the mathematical study of continuous change, in the same way that geometry is the study of shape, and algebra is the study of generalizations of arithmetic operations.\nOriginally called infinitesimal calculus or "the calculus of infinitesimals", it has two major branches, differential calculus and integral calculus. The former concerns instantaneous rates of change, and the slopes of curves, while the latter concerns accumulation of quantities, and areas under or between curves. These two branches are related to each other by the fundamental theorem of calculus. They make use of the fundamental notions of convergence of infinite sequences and infinite series to a well-defined limit. It is the "mathematical backbone" for dealing with problems where variables change with time or another reference variable.\nInfinitesimal calculus was formulated separately in the late 17th century by Isaac Newton and Gottfried Wilhelm Leibniz. Later work, including codifying the 