In [None]:
%pip install peft transformers accelerate bitsandbytes

In [None]:
# Fine-tuning Gemma-2B (google/gemma-2b-it) with LoRA — base for the Knowledge Graph experiment

In [None]:
# Imports & setup

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from preprocess import preprocess
from peft import LoraConfig, get_peft_model, TaskType
from manual_eval import manual_eval   
import torch

import importlib
import manual_eval
importlib.reload(manual_eval)
from manual_eval import manual_eval

In [None]:
# Fetch WebQuestions and discard examples that carry no answers, then carve
# a seeded 80/20 train/test split out of the filtered "train" portion.
dataset = load_dataset("stanfordnlp/web_questions").filter(
    lambda ex: len(ex["answers"]) > 0
)
split = dataset["train"].train_test_split(test_size=0.2, seed=42)
raw_train_dataset, raw_test_dataset = split["train"], split["test"]


In [None]:
# Dataset: stanfordnlp/web_questions from the Hugging Face Hub

In [None]:
# Tokenize (keeping raw_* intact)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
if tokenizer.pad_token is None:
    # Reuse EOS for padding when the tokenizer ships without a pad token.
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
    # update model's pad_token_id after loading model…

# map with preprocess(), which returns input_ids, attention_mask, labels.
# Both splits are mapped in the same batched mode so preprocess() always
# receives the same input shape (a dict of column lists) for train and test.
tokenized_train = raw_train_dataset.map(
    lambda ex: preprocess(ex, tokenizer),
    batched=True,
    batch_size=256,
    remove_columns=raw_train_dataset.column_names,
)

tokenized_test = raw_test_dataset.map(
    lambda ex: preprocess(ex, tokenizer),
    batched=True,
    batch_size=256,
    remove_columns=raw_test_dataset.column_names,
)


In [None]:
# Model, LoRA, Trainer setup
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    torch_dtype=torch.float16,
    device_map="auto",
    # Ask for eager attention through the public loading argument. Assigning
    # config._attn_implementation after loading is not reliably honoured,
    # because the attention implementation may already be chosen at construction.
    attn_implementation="eager",
)
# ensure pad token is set
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA config: rank-8 adapters on every attention and MLP projection
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # confirm only LoRA params are trainable
model.gradient_checkpointing_disable()

# NOTE(review): with mlm=False this collator derives labels from input_ids, so
# the labels produced by preprocess() are presumably replaced at collation
# time — confirm that this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# NOTE(review): the weights are loaded in float16 while fp16 mixed precision is
# switched off below — confirm the loss stays finite on the target hardware.
training_args = TrainingArguments(
    output_dir="./gemma-lora-webq",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,   # safe eval
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


In [None]:
# Baseline manual eval on 100 examples (before fine-tuning)
torch.cuda.empty_cache()

# Evaluate in inference mode so the LoRA dropout (lora_dropout=0.1) is inactive.
# Trainer.train() puts the model back into training mode by itself.
model.eval()

baseline_metrics = manual_eval(
    model,
    tokenizer,
    raw_test_dataset,  # use the RAW split so ex["question"] exists
    slice_size=100     # fast sanity‐check on 100 samples
)

print("Baseline (100 ex):", baseline_metrics)

In [None]:
# [To run later!!] Cell: Full evaluation on entire test set
# (torch is already imported in the setup cell at the top of the notebook.)
torch.cuda.empty_cache()

# Evaluate in inference mode so the LoRA dropout (lora_dropout=0.1) is inactive.
model.eval()

full_metrics = manual_eval(
    model,
    tokenizer,
    raw_test_dataset,
    slice_size=len(raw_test_dataset)
)
print("Full evaluation:", full_metrics)

In [None]:
# Rebuild the Trainer right before training, wiring together the model,
# the training arguments, both tokenized splits, the collator and the tokenizer.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=tokenized_train, eval_dataset=tokenized_test,
    data_collator=data_collator, tokenizer=tokenizer,
)

In [None]:
# Run LoRA fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Post-training manual eval on same 100 examples
# (torch is already imported in the setup cell at the top of the notebook.)
torch.cuda.empty_cache()

# Training leaves the model in training mode; switch to inference mode so the
# LoRA dropout (lora_dropout=0.1) does not perturb the generated answers.
model.eval()

finetuned_metrics = manual_eval(
    model,
    tokenizer,
    raw_test_dataset,
    slice_size=100
)
print("After fine-tuning (100 ex): ", finetuned_metrics)


In [None]:
# (Optional) Full evaluation on entire test set
# (torch is already imported in the setup cell at the top of the notebook.)
torch.cuda.empty_cache()

# Training leaves the model in training mode; switch to inference mode so the
# LoRA dropout (lora_dropout=0.1) does not perturb the generated answers.
model.eval()

full_metrics = manual_eval(
    model,
    tokenizer,
    raw_test_dataset,
    slice_size=len(raw_test_dataset)
)
print("Full evaluation: ", full_metrics)

In [None]:
# Write the model (through the Trainer) and the tokenizer into one shared directory
finetuned_dir = "gemma-lora-webq-finetuned"
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

In [None]:
# Knowledge Graph Experiment [Exploring - will need to make it more robust]

In [None]:
%pip install SPARQLWrapper pandas

In [None]:
%pip install spacy
%pip install spacy-wikidata
%python -m spacy download en_core_web_sm

In [None]:
import spacy
from spacy_wikidata import WikidataEntityLinker

# Load the small English spaCy pipeline; the Wikidata component is attached to it below.
nlp = spacy.load("en_core_web_sm")

# Entity linker pointed at the public Wikidata SPARQL endpoint.
# NOTE(review): `linker` is not referenced again in the cells visible here — the
# component is added to the pipeline by name instead. Confirm whether this
# instance is still needed (e.g. for a registration side effect).
linker = WikidataEntityLinker(
    name="wikidata",
    url="https://query.wikidata.org/sparql",
    entity_linker="wikidata",
    resolve_entities=True,
)

# Append the component registered under the name "wikidata", passing use_cache=True in its config.
nlp.add_pipe("wikidata", config={"use_cache": True})

In [None]:
# Run the pipeline on a sample question.
doc = nlp("What is electricity?")

# Start with an empty QID; the entity loop in the next cell fills it in
# when an entity carries a knowledge-base id.
qid = ""

In [None]:
# Report every entity found in `doc` and keep the QID of the FIRST linked one.
# Previously each linked entity overwrote `qid`, so the last one won.
linked_qids = []
for e in doc.ents:
    if e.kb_id_:
        print(f"Entity: {e.text}, Wikidata ID: {e.kb_id_}, Label: {e.label_}")
        # Keep only the last "/"-separated segment so a URL-style id reduces to the bare QID.
        linked_qids.append(e.kb_id_.split("/")[-1])
    else:
        print(f"Entity: {e.text} (no Wikidata ID found)")

# Leave `qid` untouched when nothing in this document was linked.
if linked_qids:
    qid = linked_qids[0]

In [None]:
#SPARQL query to get related entities

from SPARQLWrapper import SPARQLWrapper, JSON

def query_wikidata_by_qid(qid):
    """Build and run a Wikidata "instance of" (P31) lookup for ``qid``.

    The query is sent to the public Wikidata SPARQL endpoint, but the response
    is not returned: the function hands back the SPARQL text it submitted.

    Args:
        qid: Wikidata item identifier such as ``"Q42"``.

    Returns:
        The SPARQL query string that was executed.

    Raises:
        ValueError: if ``qid`` is not a string of the form ``Q<digits>``.
            This is checked before any request is made, because the value is
            interpolated directly into the query text.
    """
    import re

    # Reject empty or malformed identifiers up front: an empty qid would yield
    # the invalid term "wd:", and arbitrary text could alter the query.
    if not isinstance(qid, str) or re.fullmatch(r"Q[1-9][0-9]*", qid) is None:
        raise ValueError(f"Expected a Wikidata QID such as 'Q42', got {qid!r}")

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(f"""
    SELECT ?item ?itemLabel WHERE {{
        wd:{qid} wdt:P31 ?item .
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """)
    sparql.setReturnFormat(JSON)
    # Execute the query so endpoint errors still surface here; the parsed
    # response itself is intentionally not kept, matching the existing contract.
    sparql.query().convert()

    return sparql.queryString

In [None]:
# The helper returns the SPARQL text it submitted (not the result rows),
# so this prints the query itself.
query = query_wikidata_by_qid(qid)
print("SPARQL Query:", query)

In [None]:
# Need to work on SPARQL + knowledge graph integration

In [None]:
# We can reuse Gemma and its tokenizer