In [None]:
# Mount Google Drive at /content/drive; the dataset CSV loaded later in this
# notebook is read from a path under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ===================== INSTALL =====================
!pip install sentence-transformers transformers datasets faiss-cpu rouge-score tabulate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl

In [None]:
from nltk.translate.meteor_score import single_meteor_score


In [None]:
!pip install git+https://github.com/salaniz/pycocoevalcap


Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-kpqz952j
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-kpqz952j
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycocoevalcap
  Building wheel for pycocoevalcap (setup.py) ... [?25l[?25hdone
  Created wheel for pycocoevalcap: filename=pycocoevalcap-1.2-py3-none-any.whl size=104312245 sha256=8e0572e01cdce7052f9660b622699187ab4e15140695d59aef0832a27fec0902
  Stored in directory: /tmp/pip-ephem-wheel-cache-4ha5lthg/wheels/d2/1f/44/6485e566f8ae3d42b56e7c05fd50a3bbb70a50b0e6e7c55212
Successfully built pycocoevalcap
Installing collected packages: pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [None]:
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

In [None]:
import nltk
# Fetch the WordNet corpora into the runtime's nltk_data directory.
# NOTE(review): presumably required by the METEOR scorer imported in this
# notebook (single_meteor_score) — confirm against the NLTK documentation.
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional but recommended for WordNet synonyms


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# ===================== IMPORTS =====================
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, load_from_disk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

# CIDEr dependencies (after installing pycocoevalcap)
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

# ===================== STEP 1: Load & Structure CSV =====================
# Read the combined Q/A dataset from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Sem content/Capstone/files/combined_dataset01.csv")

# Nested lookup: data_dict["week_<w>"]["slide_<s>"] -> list of Q/A records,
# in the order the rows appear in the CSV.
data_dict = {}
for _, record in df.iterrows():
    week_key = f"week_{record['week']}"
    slide_key = f"slide_{record['slide']}"

    if week_key not in data_dict:
        data_dict[week_key] = {}
    slides_of_week = data_dict[week_key]

    if slide_key not in slides_of_week:
        slides_of_week[slide_key] = []

    entry = {
        "question": record["question"],
        "answer": record["answer"],
        "transcript": record["transcript"],
        "content_type": record["content type"],
        "image_path": record["Image Path"],
    }
    slides_of_week[slide_key].append(entry)

# ===================== STEP 2: Embedding + HF Dataset =====================
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# One passage per distinct transcript, with columns "text" and a constant "title".
df_passages = (
    df[["transcript"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(title="Slide")
    .rename(columns={"transcript": "text"})
)
rag_dataset = Dataset.from_pandas(df_passages)

# Encode every passage and attach the vectors as an "embeddings" column.
embeddings = embedding_model.encode(rag_dataset["text"], show_progress_bar=True)
embedding_rows = [list(vector) for vector in embeddings]
rag_dataset = rag_dataset.add_column("embeddings", embedding_rows)

# Persist the dataset, reload it from disk, then build a FAISS index on the vectors.
rag_dataset.save_to_disk("rag_custom_dataset")
rag_dataset = load_from_disk("rag_custom_dataset")
rag_dataset.add_faiss_index(column="embeddings")

# ===================== STEP 3: Load FLAN-T5 & BART Models =====================
# Both models go to the GPU when one is available, otherwise they stay on the CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer_flan = AutoTokenizer.from_pretrained("google/flan-t5-base")
model_flan = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model_flan = model_flan.to(_device)

tokenizer_bart = AutoTokenizer.from_pretrained("facebook/bart-large")
model_bart = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
model_bart = model_bart.to(_device)

# ===================== STEP 4: Utility Functions =====================
def get_best_matching_context(question, sentences, top_k=2):
    """Return the sentences most similar to `question`, joined by single spaces.

    Args:
        question: The query string.
        sentences: List of candidate sentence strings.
        top_k: Maximum number of sentences to keep. It is capped at
            ``len(sentences)`` because ``torch.topk`` raises when asked for
            more elements than exist.

    Returns:
        The selected sentences ordered from most to least similar, joined by
        spaces; an empty string when there is nothing to select.
    """
    if not sentences:
        return ""
    k = min(top_k, len(sentences))
    if k < 1:
        return ""

    q_emb = embedding_model.encode([question], convert_to_tensor=True)
    s_emb = embedding_model.encode(sentences, convert_to_tensor=True)
    # cos_sim yields a (1, len(sentences)) matrix; row 0 holds the question's scores.
    scores = util.cos_sim(q_emb, s_emb)[0]
    top_idxs = torch.topk(scores, k=k).indices.tolist()
    return " ".join(sentences[i] for i in top_idxs)

def generate_answer_model(question, week, slide, model_type="flan"):
    """Generate an answer to `question` from the transcript of one slide.

    Args:
        question: The question to answer.
        week: Week identifier, looked up as ``week_<week>`` in ``data_dict``.
        slide: Slide identifier, looked up as ``slide_<slide>`` in ``data_dict``.
        model_type: ``"flan"`` or ``"bart"``.

    Returns:
        The decoded model output, or ``"No relevant content found."`` when the
        slide has no entries or no usable transcript sentences.

    Raises:
        ValueError: If `model_type` is neither ``"flan"`` nor ``"bart"``.
    """
    # Validate first so a bad model_type fails even when the slide has no content.
    if model_type == "flan":
        tokenizer, model = tokenizer_flan, model_flan
    elif model_type == "bart":
        tokenizer, model = tokenizer_bart, model_bart
    else:
        raise ValueError("Model type must be 'flan' or 'bart'.")

    docs = data_dict.get(f"week_{week}", {}).get(f"slide_{slide}", [])
    if not docs:
        return "No relevant content found."

    # Each Q/A entry of a slide carries its own transcript field. When entries
    # repeat the same text, the duplicated sentences would compete for the
    # top-k slots, so keep each sentence once (first-seen order). Non-string
    # transcripts (e.g. missing CSV cells) are skipped.
    transcripts = [d["transcript"] for d in docs if isinstance(d["transcript"], str)]
    sentences = list(dict.fromkeys(
        s.strip() for text in transcripts for s in text.split('.') if s.strip()
    ))
    if not sentences:
        return "No relevant content found."

    # Cap top_k here as well so a single-sentence transcript cannot overflow topk.
    context = get_best_matching_context(question, sentences, top_k=min(2, len(sentences)))

    if model_type == "flan":
        prompt = f"question: {question} context: {context}"
    else:
        prompt = f"{question}\n\n{context}"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    # do_sample=True: repeated calls may return different answers for the same input.
    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=30, temperature=0.7, num_beams=2)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def retrieve_best_answer(question, week, slide):
    """Return the stored answer whose question is most similar to `question`.

    Args:
        question: The query string.
        week: Week identifier, looked up as ``week_<week>`` in ``data_dict``.
        slide: Slide identifier, looked up as ``slide_<slide>`` in ``data_dict``.

    Returns:
        The ``"answer"`` of the best-matching entry for that slide, or
        ``"No relevant content found."`` when the slide has no entries.
    """
    docs = data_dict.get(f"week_{week}", {}).get(f"slide_{slide}", [])
    if not docs:
        return "No relevant content found."

    questions = [d["question"] for d in docs]
    # Unit-normalize both sides so the dot product below is cosine similarity,
    # matching the measure used by get_best_matching_context.
    q_emb = embedding_model.encode([question], convert_to_numpy=True, normalize_embeddings=True)
    db_embs = embedding_model.encode(questions, convert_to_numpy=True, normalize_embeddings=True)
    similarities = np.dot(db_embs, q_emb.T).flatten()
    best = int(np.argmax(similarities))
    return docs[best]["answer"]

# Shared CIDEr helpers, created a single time and reused by every
# evaluate_generated_output call.
cider_scorer = Cider()
tokenizer_cider = PTBTokenizer()

def evaluate_generated_output(pred, gt):
    """Score a predicted answer against a ground-truth answer.

    Args:
        pred: The predicted answer string.
        gt: The ground-truth answer string.

    Returns:
        A dict with the keys "BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L",
        "Cosine Similarity", "Exact Match", "F1 Score", "METEOR" and "CIDEr".
    """
    # Lower-cased whitespace tokens, shared by BLEU and the token-set F1.
    gt_tokens = gt.lower().split()
    pred_tokens = pred.lower().split()

    bleu = sentence_bleu([gt_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)

    rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = rouge.score(gt, pred)
    rouge1, rouge2, rougel = scores["rouge1"].fmeasure, scores["rouge2"].fmeasure, scores["rougeL"].fmeasure

    # Sentence-embedding similarity between the two answers.
    emb = embedding_model.encode([pred, gt])
    cosine = cosine_similarity([emb[0]], [emb[1]])[0][0]

    exact = int(pred.strip().lower() == gt.strip().lower())

    # F1 over the sets of unique tokens.
    true, pred_ = set(gt_tokens), set(pred_tokens)
    overlap = len(true & pred_)
    prec = overlap / len(pred_) if pred_ else 0
    rec = overlap / len(true) if true else 0
    # The zero denominator is already guarded, so no epsilon is added: a
    # perfect overlap must score exactly 1.0.
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0

    meteor = single_meteor_score(gt.split(), pred.split())

    # NOTE(review): CIDEr weights n-grams by document frequency over the set of
    # references passed in. Only one reference is supplied here, which
    # presumably makes the weighting degenerate (the recorded run shows 0) —
    # confirm, and consider scoring the whole evaluation set in one call.
    gts = {0: [{'caption': gt}]}
    res = {0: [{'caption': pred}]}
    gts_tok = tokenizer_cider.tokenize(gts)
    res_tok = tokenizer_cider.tokenize(res)
    cider_score, _ = cider_scorer.compute_score(gts_tok, res_tok)

    return {
        "BLEU": bleu, "ROUGE-1": rouge1, "ROUGE-2": rouge2, "ROUGE-L": rougel,
        "Cosine Similarity": cosine, "Exact Match": exact, "F1 Score": f1,
        "METEOR": meteor, "CIDEr": cider_score
    }

def compare_selected_models(question, week, slide):
    """Answer `question` with every configured approach and score each answer.

    Args:
        question: The question to answer.
        week: Week identifier, looked up as ``week_<week>`` in ``data_dict``.
        slide: Slide identifier, looked up as ``slide_<slide>`` in ``data_dict``.

    Returns:
        A dict mapping model name to a dict holding the generated "Answer"
        plus every metric returned by ``evaluate_generated_output``.

    Raises:
        KeyError: If there is no content for the given week/slide.
    """
    docs = data_dict.get(f"week_{week}", {}).get(f"slide_{slide}", [])
    if not docs:
        raise KeyError(f"No content found for week {week}, slide {slide}.")

    # Score against the answer that belongs to the asked question. A slide can
    # hold several Q/A pairs, so the first entry is only used as a fallback
    # when the question is not one of the stored ones.
    asked = str(question).strip().lower()
    ground_truth = next(
        (d["answer"] for d in docs if str(d["question"]).strip().lower() == asked),
        docs[0]["answer"],
    )

    results = {
        "FLAN-T5": generate_answer_model(question, week, slide, "flan"),
        "BART-Large": generate_answer_model(question, week, slide, "bart"),
        "Hybrid-Retriever": retrieve_best_answer(question, week, slide),
    }
    return {model: {"Answer": ans, **evaluate_generated_output(ans, ground_truth)} for model, ans in results.items()}

# ===================== STEP 5: Run Evaluation =====================
# Slide under evaluation and the question asked about it.
week, slide = 1, 1
question = "How is text data used for social analysis?"

results = compare_selected_models(question, week, slide)

# ===================== STEP 6: Show Table + Save to CSV =====================
headers = ["Model", "BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "Cosine", "F1", "Exact", "METEOR", "CIDEr", "Answer (Preview)"]
# Metric keys in the order they appear in both the printed table and the CSV.
metric_keys = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "Cosine Similarity", "F1 Score", "Exact Match", "METEOR", "CIDEr"]

table = []
csv_rows = []
for model, metrics in results.items():
    answer = metrics['Answer']
    # The printed table shows at most 80 characters of the answer.
    preview = answer[:80] + "..." if len(answer) > 80 else answer

    # Display row: four-decimal strings, except "Exact Match" which stays a raw 0/1.
    row = [model]
    for key in metric_keys:
        row.append(metrics[key] if key == "Exact Match" else f"{metrics[key]:.4f}")
    row.append(preview)
    table.append(row)

    # CSV row: unformatted metric values and the full answer text.
    csv_rows.append([model, *(metrics[key] for key in metric_keys), answer])

print(tabulate(table, headers=headers, tablefmt="fancy_grid"))

# Save to CSV
df_results = pd.DataFrame(csv_rows, columns=["Model", *metric_keys, "Answer"])
df_results.to_csv("evaluation_results.csv", index=False)


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/135 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

╒══════════════════╤════════╤═══════════╤═══════════╤═══════════╤══════════╤════════╤═════════╤══════════╤═════════╤═════════════════════════════════════════════════════════════════════════════════════╕
│ Model            │   BLEU │   ROUGE-1 │   ROUGE-2 │   ROUGE-L │   Cosine │     F1 │   Exact │   METEOR │   CIDEr │ Answer (Preview)                                                                    │
╞══════════════════╪════════╪═══════════╪═══════════╪═══════════╪══════════╪════════╪═════════╪══════════╪═════════╪═════════════════════════════════════════════════════════════════════════════════════╡
│ FLAN-T5          │ 0      │    0.0364 │         0 │    0.0364 │   0.2403 │ 0.0476 │       0 │   0.0108 │       0 │ income of citizens                                                                  │
├──────────────────┼────────┼───────────┼───────────┼───────────┼──────────┼────────┼─────────┼──────────┼─────────┼────────────────────────────────────────────────────────────────────────