<a href="https://colab.research.google.com/github/wtrekell/soylent-army/blob/main/colab/ai_vs_human_v1.4c.2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# AI → Human Edit Analyzer (Updated)
Analyze 4 versions of an article — **draft → refined → edited → final** — and quantify how humans modified the AI-authored draft.

**Key improvements vs. prior version**  
- Batched & cached embeddings (dramatic speedup).  
- Robust sentence segmentation (spaCy → NLTK → regex fallback).  
- Gentler Markdown → text conversion (keeps list markers and inline code text).  
- Tunable thresholds & weights, exported with results.  
- Generalized file discovery (no hard-coded names).  
- Rich outputs: per-sentence CSV mapping + summary JSONs.  
- Optional plots to visualize similarity distributions.


## 1) Setup: install dependencies (run once per environment)

In [1]:
# If you're in a fresh environment, uncomment the pip installs.
# They are separated to avoid re-installing heavy deps repeatedly.
# You can safely run them all if unsure.

# %pip install -q --upgrade pip
%pip install -q sentence-transformers scikit-learn pandas numpy tqdm rapidfuzz markdown-it-py[linkify] beautifulsoup4 lxml #                 spacy nltk

# Try to fetch spaCy small English model if not present.
# If this fails (no internet), the code will fall back to NLTK/regex automatically.
# import spacy
# try:
#     spacy.load("en_core_web_sm")
# except Exception:
#     try:
#         import sys, subprocess
#         subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
#     except Exception as e:
#         print("Could not download spaCy model. Will fall back to NLTK/regex at runtime if needed.", e)

## 2) Imports & Config

In [2]:
import json
import math
import re
from dataclasses import asdict, dataclass
from pathlib import Path

import numpy as np
import pandas as pd

# Text / parsing
from bs4 import BeautifulSoup
from markdown_it import MarkdownIt

# Attempt to import linkify_it directly and also handle the extension import
linkify_it = None
try:
    from markdown_it.extensions.linkify import linkify_it
    print("Successfully imported markdown_it.extensions.linkify")
except ImportError:
    try:
        from linkify_it import linkify_it
        print("Successfully imported linkify_it directly")
    except ImportError:
        print("Could not import linkify_it from either source.")

import unicodedata

# Optional / fallback token similarity
from rapidfuzz import fuzz

# Embeddings
from sentence_transformers import SentenceTransformer

# Similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# NLP fallback layers (loaded lazily)
try:
    import spacy
except Exception:
    spacy = None

try:
    import nltk
except Exception:
    nltk = None


@dataclass
class Config:
    # File discovery
    data_dir: str = "./data"              # path with article version files
    draft_glob: str = "draft*.md"
    refined_glob: str = "refined*.md"
    edited_glob: str = "edited*.md"
    final_glob: str = "final*.md"
    article_name: str | None = None    # used in output filenames; defaults to dir name

    # Sentence filtering
    min_sentence_len: int = 10
    max_sentence_len: int = 200

    # NLP segmentation
    prefer_spacy: bool = True
    spacy_model: str = "en_core_web_sm"

    # Similarity thresholds
    semantic_origin_threshold: float = 0.30     # min cosine(sim) to count as "derived from earlier"
    unchanged_threshold: float = 0.80
    minor_change_threshold: float = 0.50

    # Combined score weighting
    weight_semantic: float = 0.60
    weight_tfidf: float = 0.40

    # Candidate pruning for expensive metrics (jaccard, edit distance)
    topk_candidates: int = 5

    # Output
    out_dir: str = "./output"

CFG = Config()
print("Current config:", CFG)

Could not import linkify_it from either source.
Current config: Config(data_dir='./data', draft_glob='draft*.md', refined_glob='refined*.md', edited_glob='edited*.md', final_glob='final*.md', article_name=None, min_sentence_len=10, max_sentence_len=200, prefer_spacy=True, spacy_model='en_core_web_sm', semantic_origin_threshold=0.3, unchanged_threshold=0.8, minor_change_threshold=0.5, weight_semantic=0.6, weight_tfidf=0.4, topk_candidates=5, out_dir='./output')


## 3) Utilities: file discovery, markdown → text, sentence splitting

In [3]:

def discover_files(cfg: Config) -> dict[str, Path]:
    # Return the best matching file per version key.
    d = Path(cfg.data_dir)
    assert d.exists(), f"Data dir not found: {cfg.data_dir}"
    matches = {}
    for key, glob in [("draft", cfg.draft_glob),
                      ("refined", cfg.refined_glob),
                      ("edited", cfg.edited_glob),
                      ("final", cfg.final_glob)]:
        # pick the first match in sorted order to be deterministic
        found = sorted(d.glob(glob))
        if found:
            matches[key] = found[0]
    if "draft" not in matches or "final" not in matches:
        raise FileNotFoundError("At minimum, a draft* and final* file must exist in data_dir.")
    return matches


_md = MarkdownIt("commonmark", {"breaks": False, "html": False})
_md.enable("strikethrough")
_md.enable("table")
# Add linkify cautiously (we won't preserve URLs in final text though)
try:
    _md.linkify = linkify_it()
except Exception:
    pass


def markdown_to_text(md: str) -> str:
    # Convert Markdown → HTML → visible text, preserving list markers and inline code text.
    md = unicodedata.normalize("NFC", md)

    # Preserve list bullets before HTML conversion
    md_preserved = re.sub(r"^(\s*)[-*+]\s+", r"\1• ", md, flags=re.MULTILINE)

    # Render to HTML, then strip tags but keep code text
    html = _md.render(md_preserved)
    soup = BeautifulSoup(html, "lxml")

    # Replace code tags with their text content wrapped in backticks to keep lexical signal
    for code in soup.find_all(["code", "tt"]):
        code.replace_with("`" + code.get_text(" ", strip=True) + "`")

    # Remove links but keep the text
    for a in soup.find_all("a"):
        a.replace_with(a.get_text(" ", strip=True))

    text = soup.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text


def try_load_spacy(model_name: str):
    if not spacy:
        return None
    try:
        return spacy.load(model_name)
    except Exception:
        return None


def ensure_nltk_punkt():
    if not nltk:
        return False
    try:
        nltk.download("punkt", quiet=True)
        return True
    except Exception:
        return False


class SentenceSegmenter:
    def __init__(self, cfg: Config):
        self.cfg = cfg
        self._nlp = try_load_spacy(cfg.spacy_model) if cfg.prefer_spacy else None
        self._nltk_ready = ensure_nltk_punkt()

    def split(self, text: str) -> list[str]:
        sents = []
        if self._nlp:
            doc = self._nlp(text)
            sents = [s.text.strip() for s in doc.sents]
        elif nltk and self._nltk_ready:
            from nltk.tokenize import sent_tokenize
            sents = sent_tokenize(text)
        else:
            # Simple regex fallback
            sents = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9`])", text.strip())

        out = []
        for s in sents:
            s = s.strip()
            if not s:
                continue
            if len(s) < self.cfg.min_sentence_len or len(s) > self.cfg.max_sentence_len:
                continue
            out.append(s)
        return out


def split_paragraphs(text: str) -> list[str]:
    # Split on two or more newlines; keep list bullets merged with their text.
    paras = re.split(r"\n\s*\n", text)
    paras = [re.sub(r"\s+", " ", p.strip()) for p in paras if p.strip()]
    return paras


## 4) Load & preprocess all versions

In [4]:

def load_versions(cfg: Config) -> dict[str, dict]:
    files = discover_files(cfg)
    print("Discovered:", {k: str(v.name) for k,v in files.items()})
    art_name = cfg.article_name or Path(cfg.data_dir).resolve().name

    seg = SentenceSegmenter(cfg)
    data = {}
    for vkey in ["draft", "refined", "edited", "final"]:
        if vkey not in files:
            continue
        raw = Path(files[vkey]).read_text(encoding="utf-8", errors="ignore")
        plain = markdown_to_text(raw)
        sents = seg.split(plain)
        paras = split_paragraphs(plain)
        data[vkey] = {
            "file": str(files[vkey]),
            "raw_text_len": len(raw),
            "plain_text_len": len(plain),
            "sentences": sents,
            "paragraphs": paras,
        }
        print(f"{vkey}: {len(sents)} sentences, {len(paras)} paragraphs")
    return {"article_name": art_name, "versions": data}


## 5) Vectorize: embeddings (batched) + TF‑IDF (global vocab)

In [5]:

class VectorStores:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.embeddings: dict[str, np.ndarray] = {}
        self.tfidf: dict[str, scipy.sparse.csr_matrix] = {}
        self.vectorizer: TfidfVectorizer | None = None

    def encode_version(self, key: str, sentences: list[str], batch_size: int = 128):
        if key in self.embeddings:
            return
        if not sentences:
            self.embeddings[key] = np.zeros((0, 384), dtype=np.float32)
            return
        emb = self.model.encode(sentences, batch_size=batch_size, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
        self.embeddings[key] = emb.astype(np.float32)

    def build_tfidf(self, corpus: dict[str, list[str]]):
        # build a single vocabulary across all versions to ensure comparable vectors
        all_sents = []
        for key, sents in corpus.items():
            all_sents.extend(sents)
        if not all_sents:
            self.vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, norm="l2")
            self.vectorizer.fit(["placeholder"])
        else:
            self.vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, norm="l2")
            self.vectorizer.fit(all_sents)

        for key, sents in corpus.items():
            if not sents:
                from scipy.sparse import csr_matrix
                self.tfidf[key] = csr_matrix((0, len(self.vectorizer.vocabulary_)), dtype=np.float32)
            else:
                self.tfidf[key] = self.vectorizer.transform(sents)


def jaccard_tokens(a: str, b: str) -> float:
    A = set(re.findall(r"\w+", a.lower()))
    B = set(re.findall(r"\w+", b.lower()))
    if not A and not B:
        return 1.0
    if not A or not B:
        return 0.0
    return len(A & B) / len(A | B)


## 6) Sentence-level similarity matrices & origin attribution

In [6]:

def cosine_matrix_from_embeddings(emb_a: np.ndarray, emb_b: np.ndarray) -> np.ndarray:
    if emb_a.size == 0 or emb_b.size == 0:
        return np.zeros((emb_a.shape[0], emb_b.shape[0]), dtype=np.float32)
    # embeddings are normalized → cosine = dot product
    return np.dot(emb_a, emb_b.T).astype(np.float32)


def cosine_matrix_from_tfidf(tfidf_a, tfidf_b) -> np.ndarray:
    if tfidf_a.shape[0] == 0 or tfidf_b.shape[0] == 0:
        return np.zeros((tfidf_a.shape[0], tfidf_b.shape[0]), dtype=np.float32)
    sim = (tfidf_a @ tfidf_b.T)
    return sim.toarray().astype(np.float32)


def attribute_final_sentences(cfg: Config, data: dict, vecs: VectorStores) -> pd.DataFrame:
    versions = data["versions"]
    assert "final" in versions, "Final version is required."
    order = [v for v in ["draft", "refined", "edited"] if v in versions] + ["final"]

    finals = versions["final"]["sentences"]

    # Precompute similarity matrices final↔prev
    sem_mats = {}
    lex_mats = {}
    for prev in [v for v in order if v != "final"]:
        sem_mats[prev] = cosine_matrix_from_embeddings(vecs.embeddings["final"], vecs.embeddings[prev])  # [n_final, n_prev]
        lex_mats[prev] = cosine_matrix_from_tfidf(vecs.tfidf["final"], vecs.tfidf[prev])                 # [n_final, n_prev]

    rows = []
    for i_f, f_sent in enumerate(finals):
        # find the earliest previous version that passes the semantic threshold with its best match
        chosen = None
        best_prev = None
        best_idx = -1
        best_sem = 0.0
        best_lex = 0.0

        for prev in [v for v in order if v != "final"]:
            Ssem = sem_mats[prev][i_f]  # vector [n_prev]
            if Ssem.size == 0:
                continue
            j = int(np.argmax(Ssem))
            sem_val = float(Ssem[j])
            if sem_val >= cfg.semantic_origin_threshold:
                chosen = prev
                best_prev = prev
                best_idx = j
                best_sem = sem_val
                best_lex = float(lex_mats[prev][i_f, j]) if lex_mats[prev].shape[1] > j else 0.0
                break

        # If no previous version cleared threshold → mark as NEW
        if not chosen:
            rows.append({
                "final_index": i_f,
                "final_sentence": f_sent,
                "origin_version": "new",
                "origin_index": None,
                "origin_sentence": None,
                "semantic_sim": 0.0,
                "tfidf_sim": 0.0,
                "jaccard": 0.0,
                "levenshtein": 0.0,
                "combined_sim": 0.0,
                "modification_label": "new",
            })
            continue

        prev_sents = versions[best_prev]["sentences"]
        o_sent = prev_sents[best_idx] if 0 <= best_idx < len(prev_sents) else ""

        # Compute expensive similarities on a small candidate set (top-K semantic from this prev)
        Ssem_vec = sem_mats[best_prev][i_f]
        if Ssem_vec.size:
            topk = min(cfg.topk_candidates, Ssem_vec.size)
            cand_idx = np.argpartition(-Ssem_vec, topk-1)[:topk]
        else:
            cand_idx = np.array([best_idx]) if best_idx >= 0 else np.array([], dtype=int)

        # refine best using combined score of semantic + tfidf (and tiebreak with jaccard/edit)
        best_comb = -1.0
        best_tuple = (best_idx, best_sem, best_lex, 0.0, 0.0, -1.0)  # (idx, sem, lex, jac, lev, comb)
        for j in cand_idx:
            sem_val = float(Ssem_vec[j])
            lex_val = float(lex_mats[best_prev][i_f, j]) if lex_mats[best_prev].shape[1] > j else 0.0
            # jaccard and levenshtein
            jac = jaccard_tokens(f_sent, prev_sents[j])
            lev = fuzz.ratio(f_sent, prev_sents[j]) / 100.0
            comb = cfg.weight_semantic * sem_val + cfg.weight_tfidf * lex_val
            if (comb > best_comb) or (math.isclose(comb, best_comb) and (lev, jac) > (best_tuple[4], best_tuple[3])):
                best_comb = comb
                best_tuple = (int(j), sem_val, lex_val, jac, lev, comb)

        best_idx, best_sem, best_lex, jac, lev, comb = best_tuple
        o_sent = prev_sents[best_idx] if 0 <= best_idx < len(prev_sents) else ""

        # Modification label from combined similarity
        if comb >= cfg.unchanged_threshold:
            mod = "unchanged_or_minor"
        elif comb >= cfg.minor_change_threshold:
            mod = "minor_change"
        else:
            mod = "major_change"

        rows.append({
            "final_index": i_f,
            "final_sentence": f_sent,
            "origin_version": best_prev,
            "origin_index": int(best_idx),
            "origin_sentence": o_sent,
            "semantic_sim": round(best_sem, 4),
            "tfidf_sim": round(best_lex, 4),
            "jaccard": round(jac, 4),
            "levenshtein": round(lev, 4),
            "combined_sim": round(comb, 4),
            "modification_label": mod,
        })

    df = pd.DataFrame(rows)
    return df


## 7) Main pipeline

In [7]:

def run_pipeline(cfg: Config) -> dict:
    # Load versions
    data = load_versions(cfg)
    art = data["article_name"]
    versions = data["versions"]

    # Build vectors
    vecs = VectorStores()
    for k, v in versions.items():
        vecs.encode_version(k, v["sentences"])
    vecs.build_tfidf({k: v["sentences"] for k, v in versions.items()})

    # Attribution
    df_map = attribute_final_sentences(cfg, data, vecs)

    # Aggregates
    origin_dist = df_map["origin_version"].value_counts(dropna=False).to_dict()
    mod_dist = df_map["modification_label"].value_counts(dropna=False).to_dict()

    summary = {
        "article_name": art,
        "config": asdict(cfg),
        "file_map": {k: versions[k]["file"] for k in versions},
        "counts": {k: len(versions[k]["sentences"]) for k in versions},
        "origin_distribution": origin_dist,
        "modification_distribution": mod_dist,
    }

    # Prepare output paths
    out_dir = Path(cfg.out_dir) / art
    out_dir.mkdir(parents=True, exist_ok=True)

    csv_path = out_dir / f"{art}_final_sentence_attribution.csv"
    json_path = out_dir / f"{art}_complete_summary.json"
    footer_path = out_dir / f"{art}_footer_metrics.json"

    df_map.to_csv(csv_path, index=False, encoding="utf-8")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({"summary": summary, "rows": df_map.to_dict(orient="records")}, f, ensure_ascii=False, indent=2)
    with open(footer_path, "w", encoding="utf-8") as f:
        json.dump({
            "article_name": art,
            "origin_distribution": origin_dist,
            "modification_distribution": mod_dist,
            "config": {
                "semantic_origin_threshold": cfg.semantic_origin_threshold,
                "unchanged_threshold": cfg.unchanged_threshold,
                "minor_change_threshold": cfg.minor_change_threshold,
                "weights": {"semantic": cfg.weight_semantic, "tfidf": cfg.weight_tfidf},
            }
        }, f, ensure_ascii=False, indent=2)

    print("Wrote:")
    print(" -", csv_path)
    print(" -", json_path)
    print(" -", footer_path)

    return {
        "summary": summary,
        "csv_path": str(csv_path),
        "json_path": str(json_path),
        "footer_path": str(footer_path),
        "mapping_df": df_map,
    }


## 8) (Optional) Plots

In [8]:

# These plots help visually inspect distributions.
# Matplotlib only; no seaborn. Run after you've produced df via run_pipeline().

import matplotlib.pyplot as plt


def plot_similarity_hist(df: pd.DataFrame):
    if df.empty:
        print("No data to plot.")
        return
    plt.figure()
    df["combined_sim"].hist(bins=30)
    plt.title("Combined similarity distribution")
    plt.xlabel("combined_sim")
    plt.ylabel("count")
    plt.show()

def plot_origin_pie(df: pd.DataFrame):
    if df.empty:
        print("No data to plot.")
        return
    counts = df["origin_version"].value_counts()
    plt.figure()
    counts.plot.pie(autopct="%1.1f%%")
    plt.title("Origin distribution (final sentences)")
    plt.ylabel("")
    plt.show()


## 9) Run it

In [13]:
# Configure your paths / thresholds here, then run this cell.
# Expects files like: draft*.md, refined*.md, edited*.md, final*.md in CFG.data_dir.
CFG.data_dir = "/content/"            # <- change me
CFG.out_dir = "./output"           # <- change me (will create subdir per article)
CFG.article_name = None            # <- optionally override

# Thresholds/weights (tune per domain)
CFG.semantic_origin_threshold = 0.30
CFG.unchanged_threshold = 0.80
CFG.minor_change_threshold = 0.50
CFG.weight_semantic = 0.60
CFG.weight_tfidf = 0.40

# Optional sentence length filter
CFG.min_sentence_len = 10
CFG.max_sentence_len = 200

# Prefer spaCy if available; fallback handled internally
CFG.prefer_spacy = True

# Run the pipeline
result = run_pipeline(CFG)
# display(result["mapping_df"].head(20))

Discovered: {'draft': 'draft-article.md', 'refined': 'refined-article.md', 'edited': 'edited-article.md', 'final': 'final-article.md'}
draft: 85 sentences, 1 paragraphs
refined: 70 sentences, 1 paragraphs
edited: 104 sentences, 1 paragraphs
final: 105 sentences, 1 paragraphs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Wrote:
 - output/content/content_final_sentence_attribution.csv
 - output/content/content_complete_summary.json
 - output/content/content_footer_metrics.json



---

### Notes
- **Attribution rule:** We classify each final sentence as (a) *derived* from the **earliest** prior version where semantic similarity ≥ `semantic_origin_threshold`; otherwise it's **new**.  
- **Modification label:** Based on a weighted combination of semantic & TF‑IDF similarity. Tune thresholds per writing style.
- **Performance:** Embeddings & TF‑IDF are computed **once per version** and reused. Expensive token metrics (Jaccard/Edit) are computed only on top‑K candidates.
- **Auditing:** Use the CSV to review `final_sentence → origin_sentence` with all scores. Adjust thresholds and re‑run as needed.


In [10]:
# Check installed version of markdown-it-py and its dependencies
%pip show markdown-it-py
%pip show linkify-it-py

Name: markdown-it-py
Version: 4.0.0
Summary: Python port of markdown-it. Markdown parsing, done right!
Home-page: https://github.com/executablebooks/markdown-it-py
Author: 
Author-email: Chris Sewell <chrisj_sewell@hotmail.com>
License: 
Location: /usr/local/lib/python3.12/dist-packages
Requires: mdurl
Required-by: jupytext, mdit-py-plugins, panel, rich
Name: linkify-it-py
Version: 2.0.3
Summary: Links recognition library with FULL unicode support.
Home-page: https://github.com/tsutsu3/linkify-it-py
Author: tsutsu3
Author-email: 
License: MIT
Location: /usr/local/lib/python3.12/dist-packages
Requires: uc-micro-py
Required-by: panel


In [11]:
try:
    from markdown_it.extensions.linkify import linkify_it
    print("Successfully imported markdown_it.extensions.linkify")
except ImportError as e:
    print(f"Failed to import from markdown_it.extensions: {e}")
    try:
        from linkify_it import linkify_it
        print("Successfully imported linkify_it directly")
    except ImportError as e:
        print(f"Failed to import linkify_it directly: {e}")

Failed to import from markdown_it.extensions: No module named 'markdown_it.extensions'
Failed to import linkify_it directly: cannot import name 'linkify_it' from 'linkify_it' (/usr/local/lib/python3.12/dist-packages/linkify_it/__init__.py)
