In [6]:
import os
import fitz       # PyMuPDF
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from docx2pdf import convert

def preprocess_docs(input_dir: str):
    """
    Convert every Word document (.docx/.doc) found directly inside
    `input_dir` into a PDF with the same base filename, written next to
    the source document.

    Parameters
    ----------
    input_dir : str
        Directory to scan. Only its immediate entries are considered
        (no recursion).

    Returns
    -------
    None
        Progress and failures are reported via ``print``.

    Notes
    -----
    * Conversion is best-effort: a failure on one file is printed and the
      loop moves on to the next file.
    * Entries that merely *look* like documents are skipped: Word's
      ``~$name.docx`` lock files (created while a document is open) and
      directories whose name happens to end in ``.docx``/``.doc``.
    """
    for filename in os.listdir(input_dir):
        # Word lock/owner files share the .docx suffix but are not documents.
        if filename.startswith("~$"):
            continue
        if not filename.lower().endswith((".docx", ".doc")):
            continue

        doc_path = os.path.join(input_dir, filename)
        # A directory named "something.docx" cannot be converted.
        if not os.path.isfile(doc_path):
            continue

        pdf_name = os.path.splitext(filename)[0] + ".pdf"
        pdf_path = os.path.join(input_dir, pdf_name)
        try:
            print(f"🔧 Converting {filename} → {os.path.basename(pdf_path)} …")
            convert(doc_path, pdf_path)
        except Exception as e:
            # Keep going: one broken document must not abort the whole batch.
            print(f"⚠️ Failed to convert {filename}: {e}")


🔧 Converting vidyalatanvi_LATE_218146_14937211_COGS 160 A1-1.docx → vidyalatanvi_LATE_218146_14937211_COGS 160 A1-1.pdf …


100%|██████████| 1/1 [00:28<00:00, 28.76s/it]



📄 Extracting images from mainayardaniel_127050_14924649_COGS 160 Le Corbusier Doc.pdf (56 pages)…


In [7]:
import os
import fitz      # PyMuPDF
from PIL import Image
import io

def compress_all_pdfs(input_dir, output_dir, dpi=100, downscale_factor=2):
    """
    Compress all PDF files in `input_dir` by rendering each page to an RGB
    image, shrinking it, and reassembling the page images into a new PDF of
    the same filename inside `output_dir`.

    Parameters
    ----------
    input_dir : str
        Directory scanned (non-recursively) for files ending in ``.pdf``
        (case-insensitive).
    output_dir : str
        Destination directory; created if it does not exist.
    dpi : int, default 100
        Resolution used to rasterise each page, and the resolution recorded
        when the page image is written back out as PDF.
    downscale_factor : int or float, default 2
        Each rendered page's pixel width and height are floor-divided by
        this value before being embedded. The result is clamped to at least
        1 pixel per side so tiny pages / large factors do not fail.

    Returns
    -------
    None
        Progress and failures are reported via ``print``.

    Notes
    -----
    * The output pages are pure raster images; text is no longer selectable.
    * Processing is best-effort: a failure on one file is printed and the
      loop continues with the next file.
    """
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(".pdf"):
            continue
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)
        print(f"🔄 Compressing {filename} ...")
        try:
            # Context managers guarantee both documents are closed even when
            # a page fails part-way through.
            with fitz.open(input_path) as doc, fitz.open() as new_pdf:
                for page in doc:
                    pix = page.get_pixmap(dpi=dpi)
                    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                    # int() keeps float factors usable; max(1, ...) avoids a
                    # zero-sized target, which resize() rejects.
                    new_size = (
                        max(1, int(pix.width // downscale_factor)),
                        max(1, int(pix.height // downscale_factor)),
                    )
                    img = img.resize(new_size, Image.LANCZOS)
                    # NOTE(review): the shrunken image is still tagged with
                    # `dpi`, so the output page's physical size appears to
                    # shrink by `downscale_factor` too — confirm intended.
                    buffer = io.BytesIO()
                    img.save(buffer, format="PDF", resolution=dpi)
                    buffer.seek(0)
                    # The single-page PDF is only needed until it is copied.
                    with fitz.open("pdf", buffer) as img_pdf:
                        new_pdf.insert_pdf(img_pdf)
                new_pdf.save(output_path)
            print(f"✅ Saved compressed PDF: {output_path}")
        except Exception as e:
            print(f"❌ Failed to compress {filename}: {e}")


🔄 Compressing mainayardaniel_127050_14924649_COGS 160 Le Corbusier Doc.pdf ...
✅ Saved compressed PDF: compressed_files/mainayardaniel_127050_14924649_COGS 160 Le Corbusier Doc.pdf


In [13]:
import pandas as pd
from difflib import get_close_matches

# ── 1) Load CSVs ──────────────────────────────────
# image_metadata.csv must provide a "pdf_file" column; student_info.csv must
# provide the "SIS Login ID", "Student" and "SIS User ID" columns renamed below.
meta_df   = pd.read_csv("image_metadata.csv")
roster_df = pd.read_csv("student_info.csv")

# ── 2) Extract login_id from PDF filenames ────────
# The id is taken to be everything before the first underscore of the filename.
meta_df["login_id"] = meta_df["pdf_file"].str.split("_").str[0]

# ── 3) Tidy roster columns ────────────────────────
roster_df = roster_df.rename(columns={
    "SIS Login ID": "login_id",
    "Student":      "student_name",
    "SIS User ID":  "pid"
})
roster_df["student_name"] = roster_df["student_name"].fillna("").astype(str).str.strip()

# ── 4) Exact-match merge ──────────────────────────
# Left join: every metadata row is kept; rows without a roster hit get NaN pid.
merged = pd.merge(
    meta_df,
    roster_df[["login_id","student_name","pid"]],
    on="login_id",
    how="left"
)

# ── 5) Find unmatched IDs ─────────────────────────
# dropna(): a missing login_id would otherwise be fuzzy-matched as the literal
# string "nan" further down.
unmatched = merged.loc[merged["pid"].isna(), "login_id"].dropna().unique()
print("🔍 Unmatched login_ids:", unmatched)

# ── 6) Prepare fuzzy matching ─────────────────────
# Normalised name = lowercase with everything except a-z / 0-9 removed.
roster_df["norm_name"] = (
    roster_df["student_name"].str.lower().str.replace(r"[^a-z0-9]", "", regex=True)
)
# norm_name → list of (login_id, student_name, pid). A list (not a single
# tuple) so that two students sharing a normalised name are surfaced as an
# ambiguous match instead of one silently overwriting the other.
roster_map = {}
for norm_name, login_id, student_name, pid in zip(
    roster_df["norm_name"],
    roster_df["login_id"],
    roster_df["student_name"],
    roster_df["pid"],
):
    # Blank names must be skipped: "" is a substring of every key, so a
    # blank roster row would "match" every unmatched id in step 7.
    if not norm_name:
        continue
    roster_map.setdefault(norm_name, []).append((login_id, student_name, pid))

# ── 7) Fuzzy-match suggestions ────────────────────
# Strategy per id, first hit wins: exact normalised-name match, then
# substring containment in either direction, then difflib's closest name.
suggestions = {}
for uid in unmatched:
    key = str(uid).lower()
    if not key:
        # An empty key is a substring of every name; treat it as no match.
        suggestions[uid] = []
        continue
    if key in roster_map:
        suggestions[uid] = list(roster_map[key])
        continue
    hits = [
        entry
        for n, entries in roster_map.items()
        if key in n or n in key
        for entry in entries
    ]
    if hits:
        suggestions[uid] = hits
        continue
    best = get_close_matches(key, list(roster_map), n=1, cutoff=0.6)
    suggestions[uid] = list(roster_map[best[0]]) if best else []

# ── 8) Auto-fill unique matches ───────────────────
# Only unambiguous suggestions are written back; the rest are reported below.
for uid, matches in suggestions.items():
    if len(matches) == 1:
        _, name, pid = matches[0]
        rows = merged["login_id"] == uid
        merged.loc[rows, "student_name"] = name
        merged.loc[rows, "pid"]          = pid

# ── 9) Report ambiguous/no matches ───────────────
for uid, matches in suggestions.items():
    if len(matches)>1:
        print(f"⚠️ {uid!r} HAS MULTIPLE MATCHES:")
        for _, name, pid in matches:
            print(f"    {name} → {pid}")
    elif not matches:
        print(f"❌ {uid!r} HAS NO CLOSE MATCH")

# ──10) Save result ────────────────────────────────
merged.to_csv("image_metadata_with_name_pid.csv", index=False)
print("✅ Done! Check → image_metadata_with_name_pid.csv")


🔍 Unmatched login_ids: ['mainayardaniel' 'emralinolalaine' 'krukjulia' 'spavenchristine' 'khirwadkarisha' 'liangmichael' 'yangheiman' 'dasilvatheo' 'davidmatthew' 'marvanalicia' 'vidyalatanvi']

✅ Done! Check → image_metadata_with_name_pid.csv
