In [None]:
# --- Coded with @ChatGPT ---
!pip -q install transformers torch

import re
from typing import List, Tuple

def remove_last_word_per_line(poem: str) -> Tuple[str, List[str]]:
    """
    Treat each line as a phrase.
    Removes the last word of each non-empty line.
    Returns (new_poem, removed_words).

    Notes:
    - If line ends with punctuation, we remove the last word but keep trailing punctuation.
      Example: "hello, world!" -> "hello, !" (you can change this behavior below)
    - Lines with 0 or 1 word become "" (or just punctuation if it existed).
    """
    lines = poem.splitlines()
    new_lines = []
    removed = []

    # Pattern: capture everything up to last "word", then that last word, then trailing non-word chars
    # "word" here = letters/digits/underscore; works well for most poems.
    pat = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

    for line in lines:
        if line.strip() == "":
            new_lines.append(line)   # keep blank line as-is
            removed.append("")
            continue

        m = pat.match(line)
        if not m:
            # If nothing matches (e.g., line is only punctuation), keep as-is
            new_lines.append(line)
            removed.append("")
            continue

        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        # Remove possible extra spaces before trailing punctuation nicely
        new_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
        new_lines.append(new_line)
        removed.append(last_word)

    return "\n".join(new_lines), removed


# --- Paste the poem here ---
poem = """Minha terra tem palmeiras,
Onde canta o Sabiá;
As aves, que aqui gorjeiam,
Não gorjeiam como lá.

Nosso céu tem mais estrelas,
Nossas várzeas têm mais flores,
Nossos bosques têm mais vida,
Nossa vida mais amores.

Em  cismar, sozinho, à noite,
Mais prazer eu encontro lá;
Minha terra tem palmeiras,
Onde canta o Sabiá.

Minha terra tem primores,
Que tais não encontro eu cá;
Em cismar –sozinho, à noite–
Mais prazer eu encontro lá;
Minha terra tem palmeiras,
Onde canta o Sabiá.

Não permita Deus que eu morra,
Sem que eu volte para lá;
Sem que disfrute os primores
Que não encontro por cá;
Sem qu'inda aviste as palmeiras,
Onde canta o Sabiá. """

new_poem, removed_words = remove_last_word_per_line(poem)

print("=== Original ===")
print(poem)
print("\n=== Without last word per line ===")
print(new_poem)
print("\n=== Removed last words ===")
print(removed_words)


=== Original ===
Minha terra tem palmeiras,
Onde canta o Sabiá;
As aves, que aqui gorjeiam,
Não gorjeiam como lá.

Nosso céu tem mais estrelas,
Nossas várzeas têm mais flores,
Nossos bosques têm mais vida,
Nossa vida mais amores.

Em  cismar, sozinho, à noite,
Mais prazer eu encontro lá;
Minha terra tem palmeiras,
Onde canta o Sabiá.

Minha terra tem primores,
Que tais não encontro eu cá;
Em cismar –sozinho, à noite–
Mais prazer eu encontro lá;
Minha terra tem palmeiras,
Onde canta o Sabiá.

Não permita Deus que eu morra,
Sem que eu volte para lá;
Sem que disfrute os primores
Que não encontro por cá;
Sem qu'inda aviste as palmeiras,
Onde canta o Sabiá. 

=== Without last word per line ===
Minha terra tem ,
Onde canta o ;
As aves, que aqui ,
Não gorjeiam como .

Nosso céu tem mais ,
Nossas várzeas têm mais ,
Nossos bosques têm mais ,
Nossa vida mais .

Em  cismar, sozinho, à ,
Mais prazer eu encontro ;
Minha terra tem ,
Onde canta o .

Minha terra tem ,
Que tais não encontro eu ;
Em cis

In [None]:
# =========================
# This script: CODE 1

# 1. Takes a poem

# 2. Treats each line as a phrase

# 3. Removes the last word of each line, but keep the punctution

# 4. Asks GPT-2:“Given this incomplete line, what is the most likely next word?”

# 5. Prints the top 7 most probable next words per line, according to GPT-2

#This is a "probabilistic poetry".
# =========================

!pip -q install transformers torch

#re: regular expressions (used to manipulate text)

#AutoTokenizer: converts text → tokens (numbers)

#AutoModelForCausalLM: GPT-2 model that predicts the next token

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- 1) Paste the poem here ---
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

# --- 2) Remove the last word of each line (each line = one phrase) ---
def remove_last_word_per_line(poem: str):
    lines = poem.splitlines()
    new_lines = []
    removed = []

    pat = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

    for line in lines:
        if line.strip() == "":
            new_lines.append(line)
            removed.append("")
            continue

        m = pat.match(line)
        if not m:
            new_lines.append(line)
            removed.append("")
            continue

        before, last_word, trailing = m.group(1), m.group(2), m.group(3)

        # keep trailing punctuation (if any) but remove the last word
        new_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
        new_lines.append(new_line)
        removed.append(last_word)

    return "\n".join(new_lines), removed

prompts_poem, removed_words = remove_last_word_per_line(poem)
prompts_lines = prompts_poem.splitlines()

print("=== Prompts (each line missing last word) ===")
print(prompts_poem)
print("\n=== Removed last words (for reference) ===")
print(removed_words)

# --- 3) Load GPT-2 ("openai-community/gpt2") ---
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# --- 4) Get the 7 highest-probability "next-word" candidates per line ---
# Note: GPT-2 predicts next *token*, so it filters for "word-like" tokens and return 7 of them.
word_like = re.compile(r"^[A-Za-zÀ-ÖØ-öø-ÿ]+(?:[-'][A-Za-zÀ-ÖØ-öø-ÿ]+)*$")

def top7_next_words(prompt: str, k: int = 7):
    if prompt.strip() == "":
        return []

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        out = model(**inputs)
        logits = out.logits[0, -1, :]  # next-token logits

    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)

    candidates = []
    for tid in sorted_ids.tolist():
        piece = tokenizer.decode([tid])  # decoded token string (often begins with a space)
        candidate = piece.strip()

        # filter for word-like candidates
        if word_like.match(candidate):
            candidates.append((candidate, float(probs[tid].cpu())))
            if len(candidates) == k:
                break

    return candidates

print("\n=== GPT-2 top-7 next-word predictions per line ===")
for i, line in enumerate(prompts_lines, start=1):
    if line.strip() == "":
        print(f"\nLine {i}: (blank)")
        continue

    preds = top7_next_words(line, k=7)
    print(f"\nLine {i} prompt: {line!r}")
    for rank, (w, p) in enumerate(preds, start=1):
        print(f"  {rank}. {w}   (p≈{p:.6f})")


=== Prompts (each line missing last word) ===
One must have a mind of
To regard the frost and the
Of the pine-trees crusted with ;
And have been cold a long
To behold the junipers shagged with ,
The spruces rough in the distant
Of the January sun; and not to
Of any misery in the sound of the ,
In the sound of a few ,
Which is the sound of the
Full of the same
That is blowing in the same bare
For the listener, who listens in the ,
And, nothing himself,
Nothing that is not there and the nothing that .

=== Removed last words (for reference) ===
['winter', 'boughs', 'snow', 'time', 'ice', 'glitter', 'think', 'wind', 'leaves', 'land', 'wind', 'place', 'snow', 'beholds', 'is']

=== GPT-2 top-7 next-word predictions per line ===

Line 1 prompt: 'One must have a mind of'
  1. their   (p≈0.327742)
  2. its   (p≈0.134323)
  3. his   (p≈0.130412)
  4. your   (p≈0.024782)
  5. a   (p≈0.020237)
  6. our   (p≈0.016856)
  7. her   (p≈0.016227)

Line 2 prompt: 'To regard the frost and the'
  1. cold 

In [None]:
#Choose and print P7.txt
#This is a "probabilistic poetry".
!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

# -----------------------------------------------------
# 1) Build prompts AND keep structured parts per line:
#    before (everything before last word),
#    last_word (removed),
#    trailing (punctuation after last word)
# -----------------------------------------------------
def split_last_word_per_line(poem: str):
    lines = poem.splitlines()
    parts = []  # list of dicts: {original, before, last_word, trailing}
    pat = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

    for line in lines:
        if line.strip() == "":
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue

        m = pat.match(line)
        if not m:
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue

        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        parts.append({"original": line, "before": before, "last_word": last_word, "trailing": trailing})

    return parts

parts = split_last_word_per_line(poem)

# Prompt must be EXACTLY "before + trailing" (this matches your original logic/output)
prompts_lines = []
removed_words = []
for d in parts:
    before, trailing = d["before"], d["trailing"]
    prompt_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
    prompts_lines.append(prompt_line)
    removed_words.append(d["last_word"])

print("=== Prompts (each line missing last word) ===")
print("\n".join(prompts_lines))
print("\n=== Removed last words (for reference) ===")
print(removed_words)

# -----------------------------------------------------
# 2) Load GPT-2
# -----------------------------------------------------
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# -----------------------------------------------------
# 3) Top-7 predictions (word-like tokens)
# -----------------------------------------------------
word_like = re.compile(r"^[A-Za-zÀ-ÖØ-öø-ÿ]+(?:[-'][A-Za-zÀ-ÖØ-öø-ÿ]+)*$")

def top7_next_words(prompt: str, k: int = 7):
    if prompt.strip() == "":
        return []

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(**inputs)
        logits = out.logits[0, -1, :]

    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)

    candidates = []
    for tid in sorted_ids.tolist():
        candidate = tokenizer.decode([tid]).strip()
        if word_like.match(candidate):
            candidates.append((candidate, float(probs[tid].cpu())))
            if len(candidates) == k:
                break
    return candidates

# -----------------------------------------------------
# 4) Rebuild CORRECTLY:
#    [before] + chosen_word + [trailing]
#    (word goes BEFORE punctuation)
# -----------------------------------------------------
def rebuild_from_parts(before: str, chosen_word: str, trailing: str):
    base = before.rstrip()
    if base == "":
        return f"{chosen_word}{trailing}"
    return f"{base} {chosen_word}{trailing}"

# -----------------------------------------------------
# 5) Interactive: print top-7, choose 1-7, rebuild poem, save txt
# -----------------------------------------------------
new_lines = []

print("\n=== GPT-2 top-7 next-word predictions per line (and choose 1-7) ===")
for i, prompt_line in enumerate(prompts_lines, start=1):
    if prompt_line.strip() == "":
        print(f"\nLine {i}: (blank)")
        new_lines.append(prompt_line)
        continue

    preds = top7_next_words(prompt_line, k=7)

    print(f"\nLine {i} prompt: {prompt_line!r}")
    for rank, (w, p) in enumerate(preds, start=1):
        print(f"  {rank}. {w}   (p≈{p:.6f})")

    while True:
        raw = input("Choose an option (1-7): ").strip()
        if raw.isdigit():
            n = int(raw)
            if 1 <= n <= 7:
                chosen_word = preds[n - 1][0]
                break
        print("Invalid choice. Type a number from 1 to 7.")

    before = parts[i - 1]["before"]
    trailing = parts[i - 1]["trailing"]
    new_lines.append(rebuild_from_parts(before, chosen_word, trailing))

new_poem = "\n".join(new_lines)

print("\n=== NEW POEM ===")
print(new_poem)

out_path = "P7_poem_probabilistic.txt"
with open(out_path, "w", encoding="utf-8") as f:
    f.write(new_poem)

print(f"\nSaved to: {out_path}")


=== Prompts (each line missing last word) ===
One must have a mind of
To regard the frost and the
Of the pine-trees crusted with ;
And have been cold a long
To behold the junipers shagged with ,
The spruces rough in the distant
Of the January sun; and not to
Of any misery in the sound of the ,
In the sound of a few ,
Which is the sound of the
Full of the same
That is blowing in the same bare
For the listener, who listens in the ,
And, nothing himself,
Nothing that is not there and the nothing that .

=== Removed last words (for reference) ===
['winter', 'boughs', 'snow', 'time', 'ice', 'glitter', 'think', 'wind', 'leaves', 'land', 'wind', 'place', 'snow', 'beholds', 'is']

=== GPT-2 top-7 next-word predictions per line (and choose 1-7) ===

Line 1 prompt: 'One must have a mind of'
  1. their   (p≈0.327742)
  2. its   (p≈0.134323)
  3. his   (p≈0.130412)
  4. your   (p≈0.024782)
  5. a   (p≈0.020237)
  6. our   (p≈0.016856)
  7. her   (p≈0.016227)
Choose an option (1-7): 7

Line 2 promp

In [None]:
# =========================
# VERSION 1 — For each line: show TOP-7 next-word candidates in ENGLISH + TOP-7 in PORTUGUESE
# (same GPT-2 model; we just filter the ranked next-token list into 2 language buckets)
#NOTE: discovered that GPT-2 model does not work in Portuguese
# =========================
!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- 1) Paste your poem here ---
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

# --- 2) Remove the last word of each line (each line = one phrase) ---
def remove_last_word_per_line(poem: str):
    lines = poem.splitlines()
    new_lines = []
    removed = []

    pat = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

    for line in lines:
        if line.strip() == "":
            new_lines.append(line)
            removed.append("")
            continue

        m = pat.match(line)
        if not m:
            new_lines.append(line)
            removed.append("")
            continue

        before, last_word, trailing = m.group(1), m.group(2), m.group(3)

        # keep trailing punctuation (if any) but remove the last word
        new_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
        new_lines.append(new_line)
        removed.append(last_word)

    return "\n".join(new_lines), removed


# --- 3) Load GPT-2 ---
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# --- 4) Language-ish filters (heuristics) ---
# "word-like": letters (incl. accents) + optional internal - or '
WORDLIKE = re.compile(r"^[A-Za-zÀ-ÖØ-öø-ÿ]+(?:[-'][A-Za-zÀ-ÖØ-öø-ÿ]+)*$")

# English-ish: only basic Latin letters (no accents)
ENGLISHISH = re.compile(r"^[A-Za-z]+(?:[-'][A-Za-z]+)*$")

# Portuguese-ish: either has a Portuguese diacritic or contains common PT letter combos
# (still a heuristic; GPT-2 is not a Portuguese-specialized model)
PT_DIACRITIC = re.compile(r"[áàâãéêíóôõúçÁÀÂÃÉÊÍÓÔÕÚÇ]")
PT_COMBOS = re.compile(r"(nh|lh|ção|ções|mente|ões|ões|que|pra|não|uma|uma|para|com|dos|das|ção)$", re.IGNORECASE)

def is_portugueseish(word: str) -> bool:
    return bool(WORDLIKE.match(word) and (PT_DIACRITIC.search(word) or PT_COMBOS.search(word)))

def is_englishish(word: str) -> bool:
    return bool(ENGLISHISH.match(word))

# --- 5) Get ranked next-token list once, then filter into EN/PT buckets ---
def get_ranked_next_tokens(prompt: str, top_n: int = 500):
    if prompt.strip() == "":
        return []

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]

    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)[:top_n]

    ranked = []
    for tid in sorted_ids.tolist():
        piece = tokenizer.decode([tid])
        candidate = piece.strip()
        if candidate and WORDLIKE.match(candidate):
            ranked.append((candidate, float(probs[tid].cpu())))
    return ranked

def pick_top_k_by_lang(ranked, k=7):
    en = []
    pt = []
    for w, p in ranked:
        if len(en) < k and is_englishish(w):
            en.append((w, p))
        if len(pt) < k and is_portugueseish(w):
            pt.append((w, p))
        if len(en) >= k and len(pt) >= k:
            break
    return en, pt

# --- Run ---
prompts_poem, removed_words = remove_last_word_per_line(poem)
prompts_lines = prompts_poem.splitlines()

print("=== Prompts (each line missing last word) ===")
print(prompts_poem)
print("\n=== Removed last words (reference) ===")
print(removed_words)

for i, line in enumerate(prompts_lines, start=1):
    print("\n" + "="*80)
    if line.strip() == "":
        print(f"Line {i}: (blank)")
        continue

    ranked = get_ranked_next_tokens(line, top_n=1500)  # increase if PT bucket is too sparse
    top_en, top_pt = pick_top_k_by_lang(ranked, k=7)

    print(f"Line {i} prompt: {line!r}")

    print("\nTop-7 (English-ish) next words:")
    if top_en:
        for r, (w, p) in enumerate(top_en, start=1):
            print(f"  {r}. {w:<18} p≈{p:.6f}")
    else:
        print("  (none found — try increasing top_n)")

    print("\nTop-7 (Portuguese-ish) next words:")
    if top_pt:
        for r, (w, p) in enumerate(top_pt, start=1):
            print(f"  {r}. {w:<18} p≈{p:.6f}")
    else:
        print("  (none found — try increasing top_n)")


=== Prompts (each line missing last word) ===
One must have a mind of
To regard the frost and the
Of the pine-trees crusted with ;
And have been cold a long
To behold the junipers shagged with ,
The spruces rough in the distant
Of the January sun; and not to
Of any misery in the sound of the ,
In the sound of a few ,
Which is the sound of the
Full of the same
That is blowing in the same bare
For the listener, who listens in the ,
And, nothing himself,
Nothing that is not there and the nothing that .

=== Removed last words (reference) ===
['winter', 'boughs', 'snow', 'time', 'ice', 'glitter', 'think', 'wind', 'leaves', 'land', 'wind', 'place', 'snow', 'beholds', 'is']

Line 1 prompt: 'One must have a mind of'

Top-7 (English-ish) next words:
  1. their              p≈0.327749
  2. its                p≈0.134322
  3. his                p≈0.130409
  4. your               p≈0.024783
  5. a                  p≈0.020238
  6. our                p≈0.016856
  7. her                p≈0.016227

To

In [None]:
# =========================
# This script: CODE 2
# TOP-7 "SEMANTIC" next-word candidates per line (GPT-2)
# Variant 1: filter out function words (articles, pronouns, conjunctions, etc.)
# NOTE: GPT-2 doesn't provide POS (Part-of-Speech tags) tags,
#so we use a strong stopword (remove very common function words as I, you, of, in, on,and, or, but, etc)
#+ heuristic filter (Keep only tokens that look like real words removing very short tokensnumbers or symbols) .
# =========================
!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- 1) Paste your poem here ---
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

# --- 2) Remove last word per line (each line = one phrase) ---
def remove_last_word_per_line(poem: str):
    lines = poem.splitlines()
    new_lines, removed = [], []
    pat = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

    for line in lines:
        if line.strip() == "":
            new_lines.append(line)
            removed.append("")
            continue

        m = pat.match(line)
        if not m:
            new_lines.append(line)
            removed.append("")
            continue

        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        new_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
        new_lines.append(new_line)
        removed.append(last_word)

    return "\n".join(new_lines), removed

prompts_poem, removed_words = remove_last_word_per_line(poem)
prompts_lines = prompts_poem.splitlines()

print("=== Prompts (each line missing last word) ===")
print(prompts_poem)
print("\n=== Removed last words (reference) ===")
print(removed_words)

# --- 3) Load GPT-2 ---
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# --- 4) Filters: word-like + "content word" heuristic ---
WORDLIKE = re.compile(r"^[A-Za-z]+(?:[-'][A-Za-z]+)*$")  # keep simple English word forms

# A compact but strong English stopword set to remove function words.
# (Includes: articles, pronouns, conjunctions, auxiliaries, prepositions, etc.)
STOPWORDS = {
    # articles / determiners
    "a","an","the","this","that","these","those","some","any","each","every","either","neither",
    "no","many","much","few","several","such","what","which","whose",
    # pronouns
    "i","me","my","mine","myself","we","us","our","ours","ourselves",
    "you","your","yours","yourself","yourselves",
    "he","him","his","himself","she","her","hers","herself",
    "it","its","itself","they","them","their","theirs","themselves",
    "one","ones","someone","somebody","anyone","anybody","everyone","everybody","nothing","something",
    # conjunctions
    "and","or","but","nor","so","yet","for","although","though","because","since","unless","while","if","than",
    # common prepositions
    "of","to","in","on","at","by","with","from","into","onto","over","under","between","among","through","during","before","after",
    "above","below","about","against","around","across","toward","towards","within","without","upon",
    # auxiliaries / modals / copulas (often non-content)
    "am","is","are","was","were","be","been","being",
    "do","does","did","doing",
    "have","has","had","having",
    "can","could","may","might","must","shall","should","will","would",
    # misc function-ish
    "not","no","yes","very","too","also","just","only","even","still","then","there","here","when","where","why","how",
    "as","up","down","out","off","again","more","most","less","least"
}

# Additional heuristic: reject very short tokens & common suffix-only tokens that slip in
def is_content_word(w: str) -> bool:
    w_low = w.lower()
    if not WORDLIKE.match(w):
        return False
    if len(w_low) < 3:
        return False
    if w_low in STOPWORDS:
        return False
    return True

def top_k_content_words(prompt: str, k: int = 7, oversample: int = 5000):
    """
    GPT-2 predicts next TOKEN. We'll rank all tokens by probability, then filter
    to "content words" and return the first k.
    oversample: how many top tokens to scan to find enough content words.
    """
    if prompt.strip() == "":
        return []

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]  # next-token logits

    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)[:oversample]

    out = []
    seen = set()
    for tid in sorted_ids.tolist():
        cand = tokenizer.decode([tid]).strip()
        if not cand:
            continue

        # keep unique lowercase words (avoid repeats with casing)
        key = cand.lower()
        if key in seen:
            continue

        if is_content_word(cand):
            out.append((cand, float(probs[tid].cpu())))
            seen.add(key)
            if len(out) == k:
                break

    return out

print("\n=== GPT-2 top-7 CONTENT-WORD (semantic-ish) predictions per line ===")
for i, line in enumerate(prompts_lines, start=1):
    print("\n" + "="*80)
    if line.strip() == "":
        print(f"Line {i}: (blank)")
        continue

    preds = top_k_content_words(line, k=7, oversample=20000)

    print(f"Line {i} prompt: {line!r}")
    if preds:
        for rank, (w, p) in enumerate(preds, start=1):
            print(f"  {rank}. {w:<18} p≈{p:.6f}")
    else:
        print("  (No content-word candidates found; try increasing oversample.)")


=== Prompts (each line missing last word) ===
One must have a mind of
To regard the frost and the
Of the pine-trees crusted with ;
And have been cold a long
To behold the junipers shagged with ,
The spruces rough in the distant
Of the January sun; and not to
Of any misery in the sound of the ,
In the sound of a few ,
Which is the sound of the
Full of the same
That is blowing in the same bare
For the listener, who listens in the ,
And, nothing himself,
Nothing that is not there and the nothing that .

=== Removed last words (reference) ===
['winter', 'boughs', 'snow', 'time', 'ice', 'glitter', 'think', 'wind', 'leaves', 'land', 'wind', 'place', 'snow', 'beholds', 'is']

=== GPT-2 top-7 CONTENT-WORD (semantic-ish) predictions per line ===

Line 1 prompt: 'One must have a mind of'
  1. balance            p≈0.005845
  2. good               p≈0.003427
  3. steel              p≈0.002962
  4. self               p≈0.002643
  5. order              p≈0.002605
  6. thy                p≈0.002361
 

In [None]:
# =========================
# TOP-7 "SEMANTIC" next-word candidates per line (GPT-2)
# + choose 1..7 for each line
# + rebuild line correctly (word BEFORE punctuation)
# + save new poem to P7_poem_semantic.txt
# =========================

!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- 1) Paste your poem here ---
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

# -----------------------------------------------------
# 2) Split each line into: before + last_word + trailing punctuation
#    Build prompts exactly like your original: prompt = before + trailing
# -----------------------------------------------------
def split_last_word_per_line(poem: str):
    lines = poem.splitlines()
    parts = []
    pat = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

    for line in lines:
        if line.strip() == "":
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue

        m = pat.match(line)
        if not m:
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue

        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        parts.append({"original": line, "before": before, "last_word": last_word, "trailing": trailing})

    return parts

parts = split_last_word_per_line(poem)

prompts_lines = []
removed_words = []
for d in parts:
    before, trailing = d["before"], d["trailing"]
    prompt_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
    prompts_lines.append(prompt_line)
    removed_words.append(d["last_word"])

print("=== Prompts (each line missing last word) ===")
print("\n".join(prompts_lines))
print("\n=== Removed last words (reference) ===")
print(removed_words)

# -----------------------------------------------------
# 3) Load GPT-2
# -----------------------------------------------------
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# -----------------------------------------------------
# 4) Filters: word-like + "content word" heuristic
# -----------------------------------------------------
WORDLIKE = re.compile(r"^[A-Za-z]+(?:[-'][A-Za-z]+)*$")

STOPWORDS = {
    # articles / determiners
    "a","an","the","this","that","these","those","some","any","each","every","either","neither",
    "no","many","much","few","several","such","what","which","whose",
    # pronouns
    "i","me","my","mine","myself","we","us","our","ours","ourselves",
    "you","your","yours","yourself","yourselves",
    "he","him","his","himself","she","her","hers","herself",
    "it","its","itself","they","them","their","theirs","themselves",
    "one","ones","someone","somebody","anyone","anybody","everyone","everybody","nothing","something",
    # conjunctions
    "and","or","but","nor","so","yet","for","although","though","because","since","unless","while","if","than",
    # common prepositions
    "of","to","in","on","at","by","with","from","into","onto","over","under","between","among","through","during","before","after",
    "above","below","about","against","around","across","toward","towards","within","without","upon",
    # auxiliaries / modals / copulas
    "am","is","are","was","were","be","been","being",
    "do","does","did","doing",
    "have","has","had","having",
    "can","could","may","might","must","shall","should","will","would",
    # misc function-ish
    "not","no","yes","very","too","also","just","only","even","still","then","there","here","when","where","why","how",
    "as","up","down","out","off","again","more","most","less","least"
}

def is_content_word(w: str) -> bool:
    w_low = w.lower()
    if not WORDLIKE.match(w):
        return False
    if len(w_low) < 3:
        return False
    if w_low in STOPWORDS:
        return False
    return True

def top_k_content_words(prompt: str, k: int = 7, oversample: int = 20000):
    if prompt.strip() == "":
        return []

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]

    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)[:oversample]

    out = []
    seen = set()
    for tid in sorted_ids.tolist():
        cand = tokenizer.decode([tid]).strip()
        if not cand:
            continue

        key = cand.lower()
        if key in seen:
            continue

        if is_content_word(cand):
            out.append((cand, float(probs[tid].cpu())))
            seen.add(key)
            if len(out) == k:
                break

    return out

# -----------------------------------------------------
# 5) Rebuild line correctly: before + chosen_word + trailing
# -----------------------------------------------------
def rebuild_from_parts(before: str, chosen_word: str, trailing: str):
    base = before.rstrip()
    if base == "":
        return f"{chosen_word}{trailing}"
    return f"{base} {chosen_word}{trailing}"

# -----------------------------------------------------
# 6) Interactive choose 1..7, build poem, save txt
# -----------------------------------------------------
new_lines = []

print("\n=== GPT-2 top-7 CONTENT-WORD (semantic-ish) predictions per line (choose 1-7) ===")
for i, prompt_line in enumerate(prompts_lines, start=1):
    print("\n" + "="*80)

    if prompt_line.strip() == "":
        print(f"Line {i}: (blank)")
        new_lines.append(prompt_line)
        continue

    preds = top_k_content_words(prompt_line, k=7, oversample=20000)

    print(f"Line {i} prompt: {prompt_line!r}")
    if preds:
        for rank, (w, p) in enumerate(preds, start=1):
            print(f"  {rank}. {w:<18} p≈{p:.6f}")
    else:
        print("  (No content-word candidates found; try increasing oversample.)")
        # keep original line if nothing found
        new_lines.append(parts[i - 1]["original"])
        continue

    while True:
        raw = input("Choose an option (1-7): ").strip()
        if raw.isdigit():
            n = int(raw)
            if 1 <= n <= 7:
                chosen_word = preds[n - 1][0]
                break
        print("Invalid choice. Type a number from 1 to 7.")

    before = parts[i - 1]["before"]
    trailing = parts[i - 1]["trailing"]
    new_lines.append(rebuild_from_parts(before, chosen_word, trailing))

new_poem = "\n".join(new_lines)

print("\n=== NEW POEM (SEMANTIC FILTER) ===")
print(new_poem)

out_path = "P7_poem_semantic.txt"
with open(out_path, "w", encoding="utf-8") as f:
    f.write(new_poem)

print(f"\nSaved to: {out_path}")


=== Prompts (each line missing last word) ===
One must have a mind of
To regard the frost and the
Of the pine-trees crusted with ;
And have been cold a long
To behold the junipers shagged with ,
The spruces rough in the distant
Of the January sun; and not to
Of any misery in the sound of the ,
In the sound of a few ,
Which is the sound of the
Full of the same
That is blowing in the same bare
For the listener, who listens in the ,
And, nothing himself,
Nothing that is not there and the nothing that .

=== Removed last words (reference) ===
['winter', 'boughs', 'snow', 'time', 'ice', 'glitter', 'think', 'wind', 'leaves', 'land', 'wind', 'place', 'snow', 'beholds', 'is']

=== GPT-2 top-7 CONTENT-WORD (semantic-ish) predictions per line (choose 1-7) ===

Line 1 prompt: 'One must have a mind of'
  1. balance            p≈0.005845
  2. good               p≈0.003427
  3. steel              p≈0.002962
  4. self               p≈0.002643
  5. order              p≈0.002605
  6. thy               

In [None]:
# =========================
# This script: CODE 3
#  WORDLIKE ranks 23–43, but ONLY "content-word" candidates
# (filters out common function words: articles, pronouns, conjunctions, etc.)
#
# IMPORTANT:
# - GPT-2 doesn't output POS tags, so "nouns/verbs/adjectives" is approximated by
#   removing function words via a stopword list + simple heuristics.
# - Ranks here are computed AFTER filtering to word-like tokens, then we keep only
#   those that pass the content-word filter, and report the ones whose content-rank
#   falls in 23..43.
# =========================
!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---- Paste poem here ----
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

MODEL_NAME = "openai-community/gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Remove last word per line (each line = one phrase) ----
pat_remove = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

def remove_last_word_per_line(poem: str):
    lines = poem.splitlines()
    new_lines, removed = [], []
    for line in lines:
        if line.strip() == "":
            new_lines.append(line); removed.append(""); continue
        m = pat_remove.match(line)
        if not m:
            new_lines.append(line); removed.append(""); continue
        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        new_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
        new_lines.append(new_line); removed.append(last_word)
    return new_lines, removed

# ---- Word-like tokens (single "word") ----
WORDLIKE = re.compile(r"^[A-Za-z]+(?:[-'][A-Za-z]+)*$")

# ---- Function-word filter (remove articles/pronouns/conjunctions + more) ----
STOPWORDS = {
    # articles/determiners
    "a","an","the","this","that","these","those","some","any","each","every","either","neither",
    "no","many","much","few","several","such","what","which","whose",
    # pronouns
    "i","me","my","mine","myself","we","us","our","ours","ourselves",
    "you","your","yours","yourself","yourselves",
    "he","him","his","himself","she","her","hers","herself",
    "it","its","itself","they","them","their","theirs","themselves",
    "one","ones","someone","somebody","anyone","anybody","everyone","everybody","nothing","something",
    # conjunctions
    "and","or","but","nor","so","yet","for","although","though","because","since","unless","while","if","than",
    # (extra common function words—optional but improves “semantic” feel)
    "of","to","in","on","at","by","with","from","into","onto","over","under","between","among","through","during",
    "before","after","above","below","about","against","around","across","toward","towards","within","without","upon",
    "am","is","are","was","were","be","been","being",
    "do","does","did","doing","have","has","had","having",
    "can","could","may","might","must","shall","should","will","would",
    "not","very","too","also","just","only","even","still","then","there","here","when","where","why","how","as",
}

def is_content_word(w: str) -> bool:
    wl = w.lower()
    if not WORDLIKE.match(w):
        return False
    if len(wl) < 3:
        return False
    if wl in STOPWORDS:
        return False
    return True

def ranked_content_window(prompt: str, tokenizer, model, start_rank: int = 23, end_rank: int = 43, max_wordlike_scan: int = 5000):
    """
    1) Get GPT-2 next-token probabilities
    2) Walk tokens by probability, keep WORDLIKE tokens
    3) Among WORDLIKE tokens, keep only content-word ones
    4) Return those whose *content-word rank* is in [start_rank, end_rank]
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]
    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)

    out = []
    content_rank = 0
    wordlike_seen = 0

    for tid in sorted_ids.tolist():
        piece = tokenizer.decode([tid])
        w = piece.strip()

        if w and WORDLIKE.match(w):
            wordlike_seen += 1
            if is_content_word(w):
                content_rank += 1
                if start_rank <= content_rank <= end_rank:
                    out.append((content_rank, w, float(probs[tid].cpu())))
                if content_rank > end_rank:
                    break

            if wordlike_seen >= max_wordlike_scan and content_rank < end_rank:
                # scanned a lot but not enough content words
                break

    return out, content_rank, wordlike_seen

# ---- Load model ----
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()

# ---- Run per line ----
lines, removed = remove_last_word_per_line(poem)
START_R, END_R = 23, 43

print("=== Prompts (missing last word) ===")
for i, l in enumerate(lines):
    print(f"{i}: {l}")

print(f"\n=== CONTENT-WORD ranks {START_R}–{END_R} per line ===")
for i, line in enumerate(lines):
    print("\n" + "="*80)
    if line.strip() == "":
        print(f"Line {i}: (blank)")
        continue

    window, total_content_found, total_wordlike_scanned = ranked_content_window(
        line, tokenizer, model, start_rank=START_R, end_rank=END_R, max_wordlike_scan=20000
    )

    print(f"Line {i} prompt: {line!r}")
    if not window:
        print(f"  (No content-word candidates in ranks {START_R}–{END_R}. "
              f"Found {total_content_found} content words after scanning {total_wordlike_scanned} word-like tokens.)")
    else:
        for r, w, p in window:
            print(f"  content-rank {r:>2}: {w:<18} p≈{p:.6e}")


=== Prompts (missing last word) ===
0: One must have a mind of
1: To regard the frost and the
2: Of the pine-trees crusted with ;
3: And have been cold a long
4: To behold the junipers shagged with ,
5: The spruces rough in the distant
6: Of the January sun; and not to
7: Of any misery in the sound of the ,
8: In the sound of a few ,
9: Which is the sound of the
10: Full of the same
11: That is blowing in the same bare
12: For the listener, who listens in the ,
13: And, nothing himself,
14: Nothing that is not there and the nothing that .

=== CONTENT-WORD ranks 23–43 per line ===

Line 0 prompt: 'One must have a mind of'
  content-rank 23: integrity          p≈1.157843e-03
  content-rank 24: common             p≈1.150411e-03
  content-rank 25: humour             p≈1.136575e-03
  content-rank 26: power              p≈1.088183e-03
  content-rank 27: two                p≈1.083188e-03
  content-rank 28: stone              p≈1.028237e-03
  content-rank 29: matter             p≈9.090302e-04

In [None]:
# =========================
# CONTENT-WORD window ranks 23–43 per line (GPT-2)
# Choose by CONTENT-RANK number (23..43)
# Rebuild line word BEFORE punctuation
# Save to "PX_poem_semantic_r23_43.txt"
# =========================

!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---- Paste poem here ----
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

MODEL_NAME = "openai-community/gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

# -----------------------------------------------------
# 1) Split each line into: before + last_word + trailing punctuation
#    (used to rebuild correctly later)
# -----------------------------------------------------
pat_remove = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

def split_last_word_per_line(poem: str):
    lines = poem.splitlines()
    parts = []
    for line in lines:
        if line.strip() == "":
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue
        m = pat_remove.match(line)
        if not m:
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue
        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        parts.append({"original": line, "before": before, "last_word": last_word, "trailing": trailing})
    return parts

parts = split_last_word_per_line(poem)

# Prompts must match your original logic: prompt = before + trailing (punctuation kept)
prompts_lines = []
removed_words = []
for d in parts:
    before, trailing = d["before"], d["trailing"]
    prompt_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
    prompts_lines.append(prompt_line)
    removed_words.append(d["last_word"])

print("=== Prompts (missing last word) ===")
for i, l in enumerate(prompts_lines, start=1):
    print(f"Line {i}: {l}")

print("\n=== Removed last words (reference) ===")
print(removed_words)

# -----------------------------------------------------
# 2) Word-like + stopword filter (semantic-ish)
# -----------------------------------------------------
WORDLIKE = re.compile(r"^[A-Za-z]+(?:[-'][A-Za-z]+)*$")

STOPWORDS = {
    # articles/determiners
    "a","an","the","this","that","these","those","some","any","each","every","either","neither",
    "no","many","much","few","several","such","what","which","whose",
    # pronouns
    "i","me","my","mine","myself","we","us","our","ours","ourselves",
    "you","your","yours","yourself","yourselves",
    "he","him","his","himself","she","her","hers","herself",
    "it","its","itself","they","them","their","theirs","themselves",
    "one","ones","someone","somebody","anyone","anybody","everyone","everybody","nothing","something",
    # conjunctions
    "and","or","but","nor","so","yet","for","although","though","because","since","unless","while","if","than",
    # extra common function words
    "of","to","in","on","at","by","with","from","into","onto","over","under","between","among","through","during",
    "before","after","above","below","about","against","around","across","toward","towards","within","without","upon",
    "am","is","are","was","were","be","been","being",
    "do","does","did","doing","have","has","had","having",
    "can","could","may","might","must","shall","should","will","would",
    "not","very","too","also","just","only","even","still","then","there","here","when","where","why","how","as",
}

def is_content_word(w: str) -> bool:
    wl = w.lower()
    if not WORDLIKE.match(w):
        return False
    if len(wl) < 3:
        return False
    if wl in STOPWORDS:
        return False
    return True

# -----------------------------------------------------
# 3) Get content-word candidates whose CONTENT-RANK is in [start_rank, end_rank]
# -----------------------------------------------------
def ranked_content_window(prompt: str, tokenizer, model, start_rank: int = 23, end_rank: int = 43, max_wordlike_scan: int = 20000):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]
    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)

    out = []  # list of (content_rank, word, prob)
    content_rank = 0
    wordlike_seen = 0

    for tid in sorted_ids.tolist():
        w = tokenizer.decode([tid]).strip()

        if w and WORDLIKE.match(w):
            wordlike_seen += 1
            if is_content_word(w):
                content_rank += 1
                if start_rank <= content_rank <= end_rank:
                    out.append((content_rank, w, float(probs[tid].cpu())))
                if content_rank > end_rank:
                    break

            if wordlike_seen >= max_wordlike_scan and content_rank < end_rank:
                break

    return out, content_rank, wordlike_seen

# -----------------------------------------------------
# 4) Rebuild correctly: before + chosen_word + trailing
# -----------------------------------------------------
def rebuild_from_parts(before: str, chosen_word: str, trailing: str):
    base = before.rstrip()
    if base == "":
        return f"{chosen_word}{trailing}"
    return f"{base} {chosen_word}{trailing}"

# -----------------------------------------------------
# 5) Load model
# -----------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()

# -----------------------------------------------------
# 6) Interactive: choose by CONTENT-RANK number (23..43)
# -----------------------------------------------------
START_R, END_R = 23, 43
new_lines = []

print(f"\n=== Choose by CONTENT-RANK number ({START_R}–{END_R}) ===")

for i, prompt_line in enumerate(prompts_lines, start=1):
    print("\n" + "="*80)

    if prompt_line.strip() == "":
        print(f"Line {i}: (blank)")
        new_lines.append(prompt_line)
        continue

    window, total_content_found, total_wordlike_scanned = ranked_content_window(
        prompt_line, tokenizer, model, start_rank=START_R, end_rank=END_R, max_wordlike_scan=20000
    )

    print(f"Line {i} prompt: {prompt_line!r}")

    if not window:
        print(f"  (No content-word candidates in ranks {START_R}–{END_R}. "
              f"Found {total_content_found} content words after scanning {total_wordlike_scanned} word-like tokens.)")
        new_lines.append(parts[i - 1]["original"])
        continue

    # show all ranks 23..43 that exist
    rank_to_word = {}
    for r, w, p in window:
        rank_to_word[r] = (w, p)
        print(f"  content-rank {r:>2}: {w:<18} p≈{p:.6e}")

    valid_ranks = sorted(rank_to_word.keys())

    while True:
        raw = input(f"Type the CONTENT-RANK you want ({valid_ranks[0]}–{valid_ranks[-1]}): ").strip()
        if raw.isdigit():
            chosen_rank = int(raw)
            if chosen_rank in rank_to_word:
                chosen_word = rank_to_word[chosen_rank][0]
                break
        print(f"Invalid. Choose one of these ranks: {valid_ranks}")

    before = parts[i - 1]["before"]
    trailing = parts[i - 1]["trailing"]
    new_lines.append(rebuild_from_parts(before, chosen_word, trailing))

new_poem = "\n".join(new_lines)

print("\n=== NEW POEM (CONTENT-WINDOW FILTER) ===")
print(new_poem)

out_path = "PX_poem_semantic_r23_43.txt"
with open(out_path, "w", encoding="utf-8") as f:
    f.write(new_poem)

print(f"\nSaved to: {out_path}")


=== Prompts (missing last word) ===
Line 1: One must have a mind of
Line 2: To regard the frost and the
Line 3: Of the pine-trees crusted with ;
Line 4: And have been cold a long
Line 5: To behold the junipers shagged with ,
Line 6: The spruces rough in the distant
Line 7: Of the January sun; and not to
Line 8: Of any misery in the sound of the ,
Line 9: In the sound of a few ,
Line 10: Which is the sound of the
Line 11: Full of the same
Line 12: That is blowing in the same bare
Line 13: For the listener, who listens in the ,
Line 14: And, nothing himself,
Line 15: Nothing that is not there and the nothing that .

=== Removed last words (reference) ===
['winter', 'boughs', 'snow', 'time', 'ice', 'glitter', 'think', 'wind', 'leaves', 'land', 'wind', 'place', 'snow', 'beholds', 'is']

=== Choose by CONTENT-RANK number (23–43) ===

Line 1 prompt: 'One must have a mind of'
  content-rank 23: integrity          p≈1.157843e-03
  content-rank 24: common             p≈1.150411e-03
  content-ra

In [None]:
# =========================
#This script: CODE 4
# Semantic CONTENT-WORD ranks 150–160 per line (GPT-2)
# Variant 1: filter out function words (articles/pronouns/conjunctions/etc.)
#
# IMPORTANT:
# - GPT-2 doesn't output tags, so "nouns/verbs/adjectives" is approximated by:
#   (a) keep WORDLIKE tokens, (b) remove common function words via STOPWORDS + heuristics.
# - The ranks 150–160 are computed among CONTENT-WORD candidates only.
# =========================
!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---- Paste poem here ----
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

MODEL_NAME = "openai-community/gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Remove last word per line (each line = one phrase) ----
pat_remove = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

def remove_last_word_per_line(poem: str):
    lines = poem.splitlines()
    new_lines, removed = [], []
    for line in lines:
        if line.strip() == "":
            new_lines.append(line); removed.append(""); continue
        m = pat_remove.match(line)
        if not m:
            new_lines.append(line); removed.append(""); continue
        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        new_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
        new_lines.append(new_line); removed.append(last_word)
    return new_lines, removed

# ---- Word-like tokens (single "word") ----
WORDLIKE = re.compile(r"^[A-Za-z]+(?:[-'][A-Za-z]+)*$")

# ---- Function-word filter ----
STOPWORDS = {
    # articles/determiners
    "a","an","the","this","that","these","those","some","any","each","every","either","neither",
    "no","many","much","few","several","such","what","which","whose",
    # pronouns
    "i","me","my","mine","myself","we","us","our","ours","ourselves",
    "you","your","yours","yourself","yourselves",
    "he","him","his","himself","she","her","hers","herself",
    "it","its","itself","they","them","their","theirs","themselves",
    "one","ones","someone","somebody","anyone","anybody","everyone","everybody","nothing","something",
    # conjunctions
    "and","or","but","nor","so","yet","for","although","though","because","since","unless","while","if","than",
    # (extra common function words—optional but improves “semantic” feel)
    "of","to","in","on","at","by","with","from","into","onto","over","under","between","among","through","during",
    "before","after","above","below","about","against","around","across","toward","towards","within","without","upon",
    "am","is","are","was","were","be","been","being",
    "do","does","did","doing","have","has","had","having",
    "can","could","may","might","must","shall","should","will","would",
    "not","very","too","also","just","only","even","still","then","there","here","when","where","why","how","as",
}

def is_content_word(w: str) -> bool:
    wl = w.lower()
    if not WORDLIKE.match(w):
        return False
    if len(wl) < 3:
        return False
    if wl in STOPWORDS:
        return False
    return True

def content_rank_window(prompt: str, tokenizer, model, start_rank: int = 150, end_rank: int = 160, max_wordlike_scan: int = 200000):
    """
    Build CONTENT-WORD ranking (semantic-ish):
    - sort all next tokens by probability
    - keep WORDLIKE tokens
    - keep only those passing is_content_word
    - assign ranks among content words only
    - return those in [start_rank, end_rank]
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]
    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)

    out = []
    content_rank = 0
    wordlike_seen = 0

    for tid in sorted_ids.tolist():
        w = tokenizer.decode([tid]).strip()

        if w and WORDLIKE.match(w):
            wordlike_seen += 1
            if is_content_word(w):
                content_rank += 1
                if start_rank <= content_rank <= end_rank:
                    out.append((content_rank, w, float(probs[tid].cpu())))
                if content_rank > end_rank:
                    break

            if wordlike_seen >= max_wordlike_scan and content_rank < end_rank:
                break

    return out, content_rank, wordlike_seen

# ---- Load model ----
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()

# ---- Run per line ----
lines, removed = remove_last_word_per_line(poem)
START_R, END_R = 150, 160

print("=== Prompts (missing last word) ===")
for i, l in enumerate(lines):
    print(f"{i}: {l}")

print(f"\n=== CONTENT-WORD ranks {START_R}–{END_R} per line ===")
for i, line in enumerate(lines):
    print("\n" + "="*80)
    if line.strip() == "":
        print(f"Line {i}: (blank)")
        continue

    window, total_content_found, total_wordlike_scanned = content_rank_window(
        line, tokenizer, model, start_rank=START_R, end_rank=END_R, max_wordlike_scan=200000
    )

    print(f"Line {i} prompt: {line!r}")
    if not window:
        print(f"  (No content-word candidates in ranks {START_R}–{END_R}. "
              f"Found {total_content_found} content words after scanning {total_wordlike_scanned} word-like tokens.)")
    else:
        for r, w, p in window:
            print(f"  content-rank {r:>3}: {w:<18} p≈{p:.6e}")


=== Prompts (missing last word) ===
0: One must have a mind of
1: To regard the frost and the
2: Of the pine-trees crusted with ;
3: And have been cold a long
4: To behold the junipers shagged with ,
5: The spruces rough in the distant
6: Of the January sun; and not to
7: Of any misery in the sound of the ,
8: In the sound of a few ,
9: Which is the sound of the
10: Full of the same
11: That is blowing in the same bare
12: For the listener, who listens in the ,
13: And, nothing himself,
14: Nothing that is not there and the nothing that .

=== CONTENT-WORD ranks 150–160 per line ===

Line 0 prompt: 'One must have a mind of'
  content-rank 150: clear              p≈2.116131e-04
  content-rank 151: proportion         p≈2.112066e-04
  content-rank 152: open               p≈2.061693e-04
  content-rank 153: personal           p≈2.041018e-04
  content-rank 154: communication      p≈2.028661e-04
  content-rank 155: prophecy           p≈2.015056e-04
  content-rank 156: honesty            p≈1.9

In [None]:
# =========================
# ranks 150–160 per line (GPT-2)
# Choose by CONTENT-RANK number (150..160)
# Rebuild line correctly (word BEFORE punctuation)
# Save to PX_poem_r150_160.txt
# =========================

!pip -q install transformers torch

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---- Paste poem here ----
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

MODEL_NAME = "openai-community/gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

# -----------------------------------------------------
# 1) Split each line into: before + last_word + trailing punctuation
#    Build prompts like your original: prompt = before + trailing
# -----------------------------------------------------
pat_remove = re.compile(r"^(.*?)(\b[\w']+\b)([^\w']*)$")

def split_last_word_per_line(poem: str):
    lines = poem.splitlines()
    parts = []
    for line in lines:
        if line.strip() == "":
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue
        m = pat_remove.match(line)
        if not m:
            parts.append({"original": line, "before": line, "last_word": "", "trailing": ""})
            continue
        before, last_word, trailing = m.group(1), m.group(2), m.group(3)
        parts.append({"original": line, "before": before, "last_word": last_word, "trailing": trailing})
    return parts

parts = split_last_word_per_line(poem)

prompts_lines = []
removed_words = []
for d in parts:
    before, trailing = d["before"], d["trailing"]
    prompt_line = (before.rstrip() + (" " if trailing and not before.rstrip().endswith((" ", "\t")) else "") + trailing).rstrip()
    prompts_lines.append(prompt_line)
    removed_words.append(d["last_word"])

print("=== Prompts (missing last word) ===")
for i, l in enumerate(prompts_lines, start=1):
    print(f"Line {i}: {l}")

print("\n=== Removed last words (reference) ===")
print(removed_words)

# -----------------------------------------------------
# 2) Word-like + stopword filter (semantic-ish)
# -----------------------------------------------------
WORDLIKE = re.compile(r"^[A-Za-z]+(?:[-'][A-Za-z]+)*$")

STOPWORDS = {
    # articles/determiners
    "a","an","the","this","that","these","those","some","any","each","every","either","neither",
    "no","many","much","few","several","such","what","which","whose",
    # pronouns
    "i","me","my","mine","myself","we","us","our","ours","ourselves",
    "you","your","yours","yourself","yourselves",
    "he","him","his","himself","she","her","hers","herself",
    "it","its","itself","they","them","their","theirs","themselves",
    "one","ones","someone","somebody","anyone","anybody","everyone","everybody","nothing","something",
    # conjunctions
    "and","or","but","nor","so","yet","for","although","though","because","since","unless","while","if","than",
    # extra common function words
    "of","to","in","on","at","by","with","from","into","onto","over","under","between","among","through","during",
    "before","after","above","below","about","against","around","across","toward","towards","within","without","upon",
    "am","is","are","was","were","be","been","being",
    "do","does","did","doing","have","has","had","having",
    "can","could","may","might","must","shall","should","will","would",
    "not","very","too","also","just","only","even","still","then","there","here","when","where","why","how","as",
}

def is_content_word(w: str) -> bool:
    wl = w.lower()
    if not WORDLIKE.match(w):
        return False
    if len(wl) < 3:
        return False
    if wl in STOPWORDS:
        return False
    return True

# -----------------------------------------------------
# 3) Content-rank window 150..160 (among content words only)
# -----------------------------------------------------
def content_rank_window(prompt: str, tokenizer, model, start_rank: int = 150, end_rank: int = 160, max_wordlike_scan: int = 200000):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]
    probs = torch.softmax(logits, dim=-1)
    sorted_ids = torch.argsort(probs, descending=True)

    out = []
    content_rank = 0
    wordlike_seen = 0

    for tid in sorted_ids.tolist():
        w = tokenizer.decode([tid]).strip()

        if w and WORDLIKE.match(w):
            wordlike_seen += 1
            if is_content_word(w):
                content_rank += 1
                if start_rank <= content_rank <= end_rank:
                    out.append((content_rank, w, float(probs[tid].cpu())))
                if content_rank > end_rank:
                    break

            if wordlike_seen >= max_wordlike_scan and content_rank < end_rank:
                break

    return out, content_rank, wordlike_seen

# -----------------------------------------------------
# 4) Rebuild correctly: before + chosen_word + trailing
# -----------------------------------------------------
def rebuild_from_parts(before: str, chosen_word: str, trailing: str):
    base = before.rstrip()
    if base == "":
        return f"{chosen_word}{trailing}"
    return f"{base} {chosen_word}{trailing}"

# -----------------------------------------------------
# 5) Load model
# -----------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()

# -----------------------------------------------------
# 6) Interactive: choose by CONTENT-RANK number 150..160
# -----------------------------------------------------
START_R, END_R = 150, 160
new_lines = []

print(f"\n=== Choose by CONTENT-RANK number ({START_R}–{END_R}) ===")

for i, prompt_line in enumerate(prompts_lines, start=1):
    print("\n" + "="*80)

    if prompt_line.strip() == "":
        print(f"Line {i}: (blank)")
        new_lines.append(prompt_line)
        continue

    window, total_content_found, total_wordlike_scanned = content_rank_window(
        prompt_line, tokenizer, model, start_rank=START_R, end_rank=END_R, max_wordlike_scan=200000
    )

    print(f"Line {i} prompt: {prompt_line!r}")

    if not window:
        print(f"  (No content-word candidates in ranks {START_R}–{END_R}. "
              f"Found {total_content_found} content words after scanning {total_wordlike_scanned} word-like tokens.)")
        # fallback: keep original line
        new_lines.append(parts[i - 1]["original"])
        continue

    # show all available ranks in 150..160
    rank_to_word = {}
    for r, w, p in window:
        rank_to_word[r] = (w, p)
        print(f"  content-rank {r:>3}: {w:<18} p≈{p:.6e}")

    valid_ranks = sorted(rank_to_word.keys())

    while True:
        raw = input(f"Type the CONTENT-RANK you want ({START_R}–{END_R}): ").strip()
        if raw.isdigit():
            chosen_rank = int(raw)
            if chosen_rank in rank_to_word:
                chosen_word = rank_to_word[chosen_rank][0]
                break
        print(f"Invalid. Choose one of these ranks: {valid_ranks}")

    before = parts[i - 1]["before"]
    trailing = parts[i - 1]["trailing"]
    new_lines.append(rebuild_from_parts(before, chosen_word, trailing))

new_poem = "\n".join(new_lines)

print("\n=== NEW POEM (CONTENT-RANK 150–160 WINDOW) ===")
print(new_poem)

out_path = "PX_poem_semantic_r150_160.txt"
with open(out_path, "w", encoding="utf-8") as f:
    f.write(new_poem)

print(f"\nSaved to: {out_path}")


=== Prompts (missing last word) ===
Line 1: One must have a mind of
Line 2: To regard the frost and the
Line 3: Of the pine-trees crusted with ;
Line 4: And have been cold a long
Line 5: To behold the junipers shagged with ,
Line 6: The spruces rough in the distant
Line 7: Of the January sun; and not to
Line 8: Of any misery in the sound of the ,
Line 9: In the sound of a few ,
Line 10: Which is the sound of the
Line 11: Full of the same
Line 12: That is blowing in the same bare
Line 13: For the listener, who listens in the ,
Line 14: And, nothing himself,
Line 15: Nothing that is not there and the nothing that .

=== Removed last words (reference) ===
['winter', 'boughs', 'snow', 'time', 'ice', 'glitter', 'think', 'wind', 'leaves', 'land', 'wind', 'place', 'snow', 'beholds', 'is']

=== Choose by CONTENT-RANK number (150–160) ===

Line 1 prompt: 'One must have a mind of'
  content-rank 150: clear              p≈2.116131e-04
  content-rank 151: proportion         p≈2.112066e-04
  conten

In [5]:
# =========================
# P+7 NOUN REPLACER — STRICT “COMPLETE NOUN WORDS”
# - candidate must start a new word (token begins with space)
# - candidate must be NOUN/PROPN in CONTEXT (spaCy)
# - candidate must match singular/plural in CONTEXT
# - candidate must be a common enough word (wordfreq)
# - prints ranks 1..7
# =========================

!pip -q install transformers torch spacy wordfreq
!python -m spacy download en_core_web_sm -q

import re
import torch
import spacy
from wordfreq import zipf_frequency
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---- Paste your poem/text here ----
text = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time"""

MODEL_NAME = "openai-community/gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()

WORDLIKE = re.compile(r"^[A-Za-zÀ-ÖØ-öø-ÿ]+(?:[-'][A-Za-zÀ-ÖØ-öø-ÿ]+)*$")
VOWEL = re.compile(r"[aeiouy]", re.IGNORECASE)

STOPWORDS = {
    "a","an","the","this","that","these","those",
    "and","or","but","nor","so","yet","for",
    "of","to","in","on","at","by","with","from","into","over","under",
    "is","are","was","were","be","been","being",
    "do","does","did","have","has","had",
    "can","could","may","might","must","shall","should","will","would",
    "not","very","also","just","only","even","then","there","here","as"
}

def next_token_probs(prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]
    return torch.softmax(logits, dim=-1)

def noun_number(tok) -> str:
    nums = tok.morph.get("Number")
    return nums[0] if nums else ""  # 'Sing'/'Plur'/''

def match_capitalization(src: str, cand: str) -> str:
    if src.isupper(): return cand.upper()
    if src.istitle(): return cand.capitalize()
    return cand.lower() if src.islower() else cand

def candidate_is_noun_in_context(context: str, piece: str):
    """
    Check POS + number of the LAST token when appending piece to context.
    """
    doc = nlp(context + piece)
    if not doc:
        return False, "", ""
    last = doc[-1]
    return (last.pos_ in {"NOUN","PROPN"}), noun_number(last), last.text

def is_common_enough(word: str, min_zipf: float = 3.0) -> bool:
    """
    zipf_frequency ~ 3.0 = reasonably common word
    Raise to 3.5/4.0 if you want to be stricter.
    """
    return zipf_frequency(word.lower(), "en") >= min_zipf

def top7_complete_noun_candidates(context: str, target_number: str, k: int = 7,
                                 min_zipf: float = 3.0, overscan: int = 200000):
    probs = next_token_probs(context)
    sorted_ids = torch.argsort(probs, descending=True)

    out = []
    seen = set()

    for tid in sorted_ids.tolist():
        piece = tokenizer.decode([tid])  # keep raw (may start with space)

        # Require NEW WORD boundary once we have context
        if context.strip() != "" and not piece.startswith(" "):
            continue

        cand = piece.strip()
        if not cand:
            continue

        # Word-shape filters
        if not WORDLIKE.match(cand):
            continue
        if len(cand) < 3:
            continue
        if not VOWEL.search(cand):         # kills many fragments like "ut", "cwm"-like chunks
            continue
        if cand.lower() in STOPWORDS:
            continue
        if cand.lower() in seen:
            continue

        # Must be a noun when placed in context
        ok, cand_num, last_text = candidate_is_noun_in_context(context, piece)
        if not ok:
            continue

        # Must match number if available
        if target_number in {"Sing","Plur"} and cand_num != target_number:
            continue

        # Must be a common enough English word
        if not is_common_enough(cand, min_zipf=min_zipf):
            continue

        seen.add(cand.lower())
        out.append((cand, float(probs[tid].cpu()), cand_num, zipf_frequency(cand.lower(), "en")))

        if len(out) == k:
            break

        if len(seen) > overscan:
            break

    return out

def p7_replace_nouns_strict(text: str, k: int = 7, min_zipf: float = 3.0):
    doc = nlp(text)
    out = []
    context = ""

    for tok in doc:
        src = tok.text
        ws = tok.whitespace_

        if tok.pos_ in {"NOUN","PROPN"} and src.strip():
            target_num = noun_number(tok)

            cands = top7_complete_noun_candidates(
                context,
                target_number=target_num,
                k=k,
                min_zipf=min_zipf
            )

            print("\n" + "="*85)
            print(f"NOUN FOUND: '{src}'   (target Number={target_num or 'Unknown'})")
            print("Top-7 COMPLETE NOUN candidates (rank 1→7):")
            for i, (w, p, num, z) in enumerate(cands, start=1):
                print(f"  {i}. {w:<18} p≈{p:.6f}   (Number={num or 'Unknown'}, zipf={z:.2f})")

            if len(cands) >= k:
                chosen = match_capitalization(src, cands[k-1][0])
                print(f"→ P+{k} chosen replacement: {chosen}")
            else:
                chosen = src
                print(f"→ Only found {len(cands)} valid candidates; keeping original.")

            out.append(chosen + ws)
            context += chosen + ws
        else:
            out.append(src + ws)
            context += src + ws

    return "".join(out)

# ---- Run ----
mutated = p7_replace_nouns_strict(text, k=7, min_zipf=3.0)

print("\n=== ORIGINAL ===")
print(text)
print("\n=== P+7 NOUN MUTATION (STRICT) ===")
print(mutated)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.1/183.1 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Using device: cuda

NOUN FOUND: 'mind'   (target Number=Sing)
Top-7 COMPLETE NOUN candidates (rank 1→7):
  1. Reply              p≈0.000014   (Number=Sin