# spaCy Basics for AI Tells Detection

An interactive reference for spaCy concepts used in this project. Run cells to remind yourself how things work.

## Setup and Model Loading

In [1]:
# Ensure the spaCy model is installed
import subprocess
import sys

try:
    import spacy

    _ = spacy.load("en_core_web_sm")
except OSError:
    _ = subprocess.check_call(  # noqa: S603
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    )

In [2]:
import spacy
from spacy import displacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# See what pipeline components are active
print("Pipeline components:", nlp.pipe_names)

Pipeline components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [3]:
# Disable components you don't need for faster processing
# For detection, we typically need: tok2vec, tagger, parser
# We can disable: ner, lemmatizer, attribute_ruler

nlp_fast = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
print("Fast pipeline:", nlp_fast.pipe_names)

Fast pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler']


## Doc, Span, Token Basics

In [4]:
doc = nlp("The quick brown fox jumps over the lazy dog.")

# Iterate tokens and inspect attributes
print(f"{'Token':<12} {'POS':<8} {'Dep':<12} {'Head'}")
print("-" * 44)
for token in doc:
    print(f"{token.text:<12} {token.pos_:<8} {token.dep_:<12} {token.head.text}")

Token        POS      Dep          Head
--------------------------------------------
The          DET      det          fox
quick        ADJ      amod         fox
brown        ADJ      amod         fox
fox          NOUN     nsubj        jumps
jumps        VERB     ROOT         jumps
over         ADP      prep         jumps
the          DET      det          dog
lazy         ADJ      amod         dog
dog          NOUN     pobj         over
.            PUNCT    punct        jumps


In [5]:
# Iterate sentences
text = "First sentence here. Second sentence follows. Third completes the trio."
doc = nlp(text)

for i, sent in enumerate(doc.sents):
    print(f"Sentence {i}: {sent.text}")

Sentence 0: First sentence here.
Sentence 1: Second sentence follows.
Sentence 2: Third completes the trio.


In [6]:
# Spans are views into the Doc, not copies
doc = nlp("The quick brown fox jumps.")
span = doc[1:4]  # "quick brown fox"

print(f"Span text: {span.text}")
print(f"Span tokens share Doc: {span[0].doc is doc}")

Span text: quick brown fox
Span tokens share Doc: True


## Dependency Parsing

In [7]:
doc = nlp("The experienced developer quickly reviewed the complex codebase.")

print(f"{'Token':<15} {'Dep':<12} {'Head':<15} {'Children'}")
print("-" * 60)
for token in doc:
    children = [child.text for child in token.children]
    print(f"{token.text:<15} {token.dep_:<12} {token.head.text:<15} {children}")

Token           Dep          Head            Children
------------------------------------------------------------
The             det          developer       []
experienced     amod         developer       []
developer       nsubj        reviewed        ['The', 'experienced']
quickly         advmod       reviewed        []
reviewed        ROOT         reviewed        ['developer', 'quickly', 'codebase', '.']
the             det          codebase        []
complex         amod         codebase        []
codebase        dobj         reviewed        ['the', 'complex']
.               punct        reviewed        []


In [8]:
# token.subtree gives all descendants (the token + its recursive children)
doc = nlp("The experienced developer quickly reviewed the complex codebase.")
verb = doc[4]  # "reviewed"

print(f"Verb: {verb.text}")
print(f"Subtree: {[t.text for t in verb.subtree]}")

Verb: reviewed
Subtree: ['The', 'experienced', 'developer', 'quickly', 'reviewed', 'the', 'complex', 'codebase', '.']


In [9]:
# Visualize the dependency tree
doc = nlp("The experienced developer quickly reviewed the complex codebase.")
_ = displacy.render(doc, style="dep", jupyter=True)  # pyright: ignore[reportUnknownMemberType]

## Finding Conjunctions (Triads Preview)

AI text often uses "rule of three" lists. Conjuncts attach via the `conj` dependency relation.

In [10]:
doc = nlp("I like apples, oranges, and bananas.")

print(f"{'Token':<12} {'Dep':<12} {'Head':<12} {'Conj children'}")
print("-" * 50)
for token in doc:
    conj_children = [c.text for c in token.children if c.dep_ == "conj"]
    print(f"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {conj_children}")

Token        Dep          Head         Conj children
--------------------------------------------------
I            nsubj        like         []
like         ROOT         like         []
apples       dobj         like         ['oranges']
,            punct        apples       []
oranges      conj         apples       ['bananas']
,            punct        oranges      []
and          cc           oranges      []
bananas      conj         oranges      []
.            punct        like         []


In [11]:
_ = displacy.render(doc, style="dep", jupyter=True)  # pyright: ignore[reportUnknownMemberType]

In [12]:
from spacy.tokens import Doc, Token


def find_triads(doc: Doc) -> list[list[Token]]:
    """Find tokens with exactly two conj children (three-item lists)."""
    triads: list[list[Token]] = []
    for token in doc:
        conj_children = [c for c in token.children if c.dep_ == "conj"]
        if len(conj_children) == 2:
            items = [token, *conj_children]
            triads.append(items)
    return triads


# Test on a sentence with a triad
doc = nlp("Good code is readable, maintainable, and testable.")
triads = find_triads(doc)

for triad in triads:
    print(f"Triad found: {[t.text for t in triad]}")

In [13]:
# Test on a sentence without a triad
doc = nlp("I prefer coffee and tea.")
triads = find_triads(doc)
print(f"Triads in 'I prefer coffee and tea.': {triads}")

# Test on a longer list
doc = nlp("We need speed, accuracy, reliability, and cost-effectiveness.")
triads = find_triads(doc)
print(f"Triads in four-item list: {triads}")

Triads in 'I prefer coffee and tea.': []
Triads in four-item list: []


## POS Patterns

Part-of-speech sequences can reveal structural patterns. Sentence-initial POS is relevant for anaphora detection.

In [14]:
from spacy.tokens import Span


def get_pos_sequence(span: Span) -> tuple[str, ...]:
    """Extract POS tags as a tuple."""
    return tuple(token.pos_ for token in span)


doc = nlp("The quick fox jumps. It lands gracefully. It runs away.")

for sent in doc.sents:
    pos_seq = get_pos_sequence(sent)
    print(f"{sent.text}")
    print(f"  POS: {pos_seq}")
    print(f"  Initial: {pos_seq[0]}")
    print()

The quick fox jumps.
  POS: ('DET', 'ADJ', 'NOUN', 'NOUN', 'PUNCT')
  Initial: DET

It lands gracefully.
  POS: ('PRON', 'VERB', 'ADV', 'PUNCT')
  Initial: PRON

It runs away.
  POS: ('PRON', 'VERB', 'ADV', 'PUNCT')
  Initial: PRON



In [15]:
def analyze_sentence_starts(doc: Doc) -> list[tuple[str, str]]:
    """Analyze the first token POS of each sentence."""
    starts: list[tuple[str, str]] = []
    for sent in doc.sents:
        first_token = next(iter(sent))
        starts.append((first_token.text, first_token.pos_))
    return starts


# AI text often has repetitive sentence starts
ai_style = nlp(
    "This is important. This helps understand. This makes sense. This works well."
)
human_style = nlp(
    "Consider the implications. Many factors contribute. "
    + "Results vary widely. Experts disagree."
)

print("AI-style starts:", analyze_sentence_starts(ai_style))
print("Human-style starts:", analyze_sentence_starts(human_style))

AI-style starts: [('This', 'PRON'), ('This', 'PRON'), ('This', 'PRON'), ('This', 'PRON')]
Human-style starts: [('Consider', 'VERB'), ('Many', 'ADJ'), ('Results', 'NOUN'), ('Experts', 'NOUN')]


## Batch Processing

Use `nlp.pipe()` for processing multiple texts. It's significantly faster than calling `nlp()` in a loop.

In [16]:
import time

texts = [
    "First document to process.",
    "Second document with more content here.",
    "Third document completes our test set.",
] * 100  # 300 texts

# Method 1: Loop (slower)
start = time.perf_counter()
docs_loop = [nlp(text) for text in texts]
loop_time = time.perf_counter() - start

# Method 2: nlp.pipe (faster)
start = time.perf_counter()
docs_pipe = list(nlp.pipe(texts))
pipe_time = time.perf_counter() - start

print(f"Loop: {loop_time:.3f}s")
print(f"Pipe: {pipe_time:.3f}s")
print(f"Speedup: {loop_time / pipe_time:.1f}x")

Loop: 2.394s
Pipe: 0.800s
Speedup: 3.0x


In [17]:
# nlp.pipe with batch_size tuning
# Default batch_size is 1000, but smaller batches use less memory

docs = list(nlp.pipe(texts, batch_size=50))
print(f"Processed {len(docs)} documents")

Processed 300 documents


## Loading Sample Texts

In [18]:
from pathlib import Path

# Get notebook directory for relative paths (not a true constant, computed at runtime)
notebook_dir = Path(__file__).parent if "__file__" in dir() else Path.cwd()
if notebook_dir.name != "notebooks":
    notebook_dir = Path("notebooks")


def load_samples(directory: str) -> dict[str, str]:
    """Load all .txt files from a directory."""
    samples: dict[str, str] = {}
    sample_dir = notebook_dir / directory
    for txt_file in sample_dir.glob("*.txt"):
        samples[txt_file.stem] = txt_file.read_text()
    return samples


# Load AI-generated samples
ai_samples = load_samples("samples/ai_generated")
print(f"AI samples: {list(ai_samples.keys())}")

# Load human-written samples
human_samples = load_samples("samples/human_written")
print(f"Human samples: {list(human_samples.keys())}")

AI samples: ['technical_ai', 'explanation_ai', 'essay_ai']
Human samples: []


In [19]:
# Process all samples
ai_docs = {name: nlp(text) for name, text in ai_samples.items()}
human_docs = {name: nlp(text) for name, text in human_samples.items()}

# Quick stats
for name, doc in ai_docs.items():
    sents = list(doc.sents)
    print(f"AI/{name}: {len(sents)} sentences, {len(doc)} tokens")

AI/technical_ai: 12 sentences, 238 tokens
AI/explanation_ai: 10 sentences, 218 tokens
AI/essay_ai: 11 sentences, 204 tokens


In [20]:
# Example: find triads in all AI samples
print("Triads in AI samples:")
print("-" * 40)
for name, doc in ai_docs.items():
    triads = find_triads(doc)
    if triads:
        print(f"\n{name}:")
        for triad in triads:
            print(f"  {[t.text for t in triad]}")

Triads in AI samples:
----------------------------------------
