# Fragment Extraction - Extract Emotionally Intense Moments

Extracts **10-40 word fragments** (1-4 complete sentences) from AO3 stories.

Uses NLTK sentence tokenizer to properly handle dialogue and quotes.

## Setup

In [2]:
import sys
sys.path.append('..')

import json
import re
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
import spacy 
from config import RAW_DATA_DIR, PROCESSED_DATA_DIR

print("✓ Imports loaded")

✓ Imports loaded


In [2]:
# pip install spacy && python -m spacy download en_core_web_sm

## Load Models

In [3]:
from transformers import pipeline
import nltk

# Load emotion classifier
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None
)

# Load NSFW classifier
nsfw_classifier = pipeline(
    "text-classification",
    model="michellejieli/NSFW_text_classifier"
)

# Download NLTK sentence tokenizer (handles quotes/dialogue properly)
try:
    nltk.data.find('tokenizers/punkt')
except:
    nltk.download('punkt', quiet=True)

# Spacy 
nlp = spacy.load("en_core_web_sm")

Device set to use mps:0
Device set to use mps:0


## Fragment Extractor

In [21]:
class FragmentExtractor:
    def __init__(self, min_words=10, max_words=40, emotion_threshold=0.8, batch_size=32):
        self.min_words = min_words
        self.max_words = max_words
        self.emotion_threshold = emotion_threshold
        self.batch_size = batch_size
        self.stats = {
            'stories_processed': 0,
            'total_fragments': 0,
            'high_emotion': 0,
            'has_narrative': 0,
            'final_fragments': 0
        }

    def extract_sentence_groups(self, text):
        """Extract 2-4 complete sentences using spaCy"""
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
        
        fragments = []
        
        # Try groups of 2, 3, and 4 consecutive sentences
        for group_size in [2, 3, 4]:
            for i in range(len(sentences) - group_size + 1):
                group = sentences[i:i + group_size]
                combined = ' '.join(group)
                
                word_count = len(combined.split())
                if self.min_words <= word_count <= self.max_words: 
                    fragments.append(combined)
        
        return fragments

    def clean_fragment(self, text):
        """Remove extra whitespace, normalize spacing, and confirm complete sentence structure."""
        text = text.strip()
        text = re.sub(r'\s+', ' ', text)
        if text and text[-1] not in '.!?':
            text += '.'
        return text

    def has_narrative_content(self, text):
        """
        Check if at least ONE sentence in the fragment has both person entity and verb.
        Uses spaCy's NER (Named Entity Recognition) and POS (Part-of-Speech) tagging.
        """
        doc = nlp(text)
        
        for sent in doc.sents:
            has_person = any(ent.label_ == "PERSON" for ent in sent.ents)
            has_verb = any(token.pos_ == "VERB" for token in sent)
            
            if has_person and has_verb:
                return True
        
        return False

    def get_emotion_scores_batch(self, texts):
        """Get emotion scores for a batch of texts"""
        try:
            results = emotion_classifier(texts, batch_size=self.batch_size)
            emotions = []
            for result in results:
                best = max(result, key=lambda x: x['score'])
                emotions.append((best['label'], best['score']))
            return emotions
        except:
            return [(None, 0.0)] * len(texts)

    def get_nsfw_labels_batch(self, texts):
        """Get NSFW labels for a batch of texts"""
        try:
            results = nsfw_classifier(texts, batch_size=self.batch_size)
            return [result['label'] == 'NSFW' and result['score'] > 0.9 for result in results]
        except:
            return [False] * len(texts)

    def extract_from_story(self, story):
        text = story['text']
        candidates = self.extract_sentence_groups(text)
        self.stats['total_fragments'] += len(candidates)

        # Step 1: Clean and deduplicate
        cleaned_fragments = []
        seen = set()
        for frag_text in candidates:
            clean_text = self.clean_fragment(frag_text)
            if clean_text not in seen:
                seen.add(clean_text)
                cleaned_fragments.append(clean_text)

        if not cleaned_fragments:
            return []

        # Step 2: Batch emotion classification
        emotion_results = self.get_emotion_scores_batch(cleaned_fragments)
        
        # Step 3: Filter by emotion threshold and neutral
        high_emotion_fragments = []
        high_emotion_data = []
        for frag_text, (emotion_label, emotion_score) in zip(cleaned_fragments, emotion_results):
            if emotion_score >= self.emotion_threshold and emotion_label != "neutral":
                high_emotion_fragments.append(frag_text)
                high_emotion_data.append((emotion_label, emotion_score))
                self.stats['high_emotion'] += 1

        if not high_emotion_fragments:
            return []

        # Step 4: Check narrative content (sequential - spaCy)
        narrative_fragments = []
        narrative_data = []
        for frag_text, emotion_data in zip(high_emotion_fragments, high_emotion_data):
            if self.has_narrative_content(frag_text):
                narrative_fragments.append(frag_text)
                narrative_data.append(emotion_data)
                self.stats['has_narrative'] += 1

        if not narrative_fragments:
            return []

        # Step 5: Batch NSFW classification
        nsfw_results = self.get_nsfw_labels_batch(narrative_fragments)

        # Step 6: Assemble final fragments
        fragments = []
        for idx, (frag_text, (emotion_label, emotion_score), nsfw) in enumerate(
            zip(narrative_fragments, narrative_data, nsfw_results)
        ):
            fragments.append({
                'id': f"{story.get('id')}_{idx}", 
                'text': frag_text,
                'word_count': len(frag_text.split()),
                'emotion': emotion_label,
                'emotion_score': round(emotion_score, 3),
                'nsfw': nsfw,
                'source_story_id': story['id'],
                'title': story.get('title', 'Unknown'),
                'tag': story.get('search_tag', ''),
                'tags': story.get('tags', []),
                'source': 'ao3',
                'kudos': story.get('kudos', 0),
                'url': story.get('url', '')
            })
            self.stats['final_fragments'] += 1

        return fragments

    def print_stats(self):
        print("\n" + "="*60)
        print("Extraction Statistics")
        print("="*60)
        print(f"Stories: {self.stats['stories_processed']}")
        print(f"Candidates: {self.stats['total_fragments']}")
        print(f"High emotion: {self.stats['high_emotion']} ({self.stats['high_emotion']/max(1,self.stats['total_fragments'])*100:.1f}%)")
        print(f"With narrative: {self.stats['has_narrative']} ({self.stats['has_narrative']/max(1,self.stats['high_emotion'])*100:.1f}%)")
        print(f"Final: {self.stats['final_fragments']}")
        print("="*60)

print("✓ FragmentExtractor loaded (batch processing enabled)")

✓ FragmentExtractor loaded (batch processing enabled)


## Load Stories

In [22]:
input_file = RAW_DATA_DIR / "ao3_stories.csv"
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} stories")
df.head()

Loaded 510 stories


Unnamed: 0,id,text,title,maturity_rating,nsfw,word_count,kudos,tags,search_tag,source,url
0,ao3_34500952,"art bycatherine7mk, poster bynikitajobson\n\n~...",Draco Malfoy and the Mortifying Ordeal of Bein...,Explicit,True,4941,93970,"['Healer Hermione Granger', 'Researcher Hermio...",Humor,ao3,https://archiveofourown.org/works/34500952?vie...
1,ao3_30153540,"Peter crouches on the edge of a dusty ruin, wa...",Dark Matter,Not Rated,False,3508,93061,"['Not Avengers: Endgame (Movie) Compliant', 'P...",Humor,ao3,https://archiveofourown.org/works/30153540?vie...
2,ao3_4701869,He'd been dreaming of it since the defeat of V...,Oh God Not Again!,General Audiences,False,3427,78258,"['Old Fic', 'Humor', 'Time Travel', 'like lite...",Humor,ao3,https://archiveofourown.org/works/4701869?view...
3,ao3_234222,\n\nHis cloak was soaked within minutes. Three...,Then Comes a Mist and a Weeping Rain,Explicit,True,21159,77801,"['Humor', 'Romance', 'Hogwarts Eighth Year', '...",Humor,ao3,https://archiveofourown.org/works/234222?view_...
4,ao3_33489424,(See the end of the chapter fornotes.)\n\nYou ...,tommyinnit's clinic for supervillains,Teen And Up Audiences,False,5205,69696,['Alternate Universe - Superheroes/Superpowers...,Humor,ao3,https://archiveofourown.org/works/33489424?vie...


## Test on a Few Stories

In [24]:
extractor = FragmentExtractor(min_words=10, max_words=40, emotion_threshold=0.8)
test_stories = df.iloc[[400]].to_dict('records')
test_fragments = []

for i, story in enumerate(test_stories, 1):
    print(f"\nStory {i}: {story['title']} {story['url']}")
    frags = extractor.extract_from_story(story)
    test_fragments.extend(frags)
    extractor.stats['stories_processed'] += 1
    print(f"Extracted {len(frags)} fragments")
    
    for j, frag in enumerate(frags, 1):
        print(f"  {j}. [{frag['emotion']}] ({frag['word_count']}w) ({frag['nsfw']})")
        print(f"     {frag['text']}")

extractor.print_stats()


Story 1: Slithering https://archiveofourown.org/works/7548181?view_adult=true
Extracted 167 fragments
  1. [disgust] (27w) (False)
     Lucius talked of nothing, and ate only a little more than that. His hair hung in lank clumps around his face, and he’d begun to stink noticeably.
  2. [disgust] (11w) (True)
     Mulciber snorted. “Rolled in some blood and pretended I was dead.
  3. [fear] (30w) (False)
     He stopped, panting. After a moment, Harry said, uncertainly, “What—what did you do with them?” “What do youthink?I healed them, I Obliviated them and sent them on their way home!”.
  4. [fear] (33w) (False)
     After a moment, Harry said, uncertainly, “What—what did you do with them?” “What do youthink?I healed them, I Obliviated them and sent them on their way home!” One of the other Aurors snorted.
  5. [surprise] (21w) (False)
     Coil swung his head round in surprise and then hissed a complicated reply that made Harry blink. “What did he say?”.
  6. [surprise] (10w) (False)

## Full Extraction

In [None]:
extractor = FragmentExtractor(min_words=10, max_words=40, emotion_threshold=0.8)
all_fragments = []

for story in tqdm(df.to_dict('records'), desc="Processing"):
    if pd.isna(story.get('text')):
        continue
    frags = extractor.extract_from_story(story)
    all_fragments.extend(frags)
    extractor.stats['stories_processed'] += 1

Processing:   0%|          | 0/510 [00:00<?, ?it/s]

## Save

In [None]:
fragments_to_save = test_fragments  # Change to all_fragments
output = Path("../data/processed/ao3_fragments.jsonl")
output.parent.mkdir(parents=True, exist_ok=True)

with open(output, 'w') as f:
    for frag in fragments_to_save:
        f.write(json.dumps(frag) + '\n')

print(f"Saved {len(fragments_to_save)} fragments")

## View Samples

In [None]:
import random
samples = random.sample(fragments_to_save, min(10, len(fragments_to_save)))

for i, frag in enumerate(samples, 1):
    print(f"\n{i}. [{frag['emotion'].upper()}] ({frag['word_count']}w)")
    print(f"   {frag['text']}")

## Emotion Distribution

In [None]:
emotions = {}
for frag in fragments_to_save:
    e = frag['emotion']
    emotions[e] = emotions.get(e, 0) + 1

for e, cnt in sorted(emotions.items(), key=lambda x: x[1], reverse=True):
    print(f"{e:15s}: {cnt:5d} ({cnt/len(fragments_to_save)*100:5.1f}%)")

## NSFW Distribution

In [None]:
nsfw_counts = {'NSFW': 0, 'SFW': 0}
for frag in fragments_to_save:
    if frag['nsfw']:
        nsfw_counts['NSFW'] += 1
    else:
        nsfw_counts['SFW'] += 1

for label, cnt in sorted(nsfw_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{label:15s}: {cnt:5d} ({cnt/len(fragments_to_save)*100:5.1f}%)")

## Explore the Data 

In [4]:
df = pd.read_csv(PROCESSED_DATA_DIR / "ao3_fragments.csv")

In [11]:
pd.set_option('display.max_colwidth', None)

df[['text', 'title']].sample(10)

Unnamed: 0,text,title
325169,“They were alive when I left.” “For how much longer?” Jin muttered ominously. Yoongi paused.,'Cause I'll Be In Love Maze
73451,"Fear. Briefly. Only a flash of it before Jun Wu’s gaze returns to its usual demeanor of eternal calm. “...Well,” the emperor mutters, not lifting his eyes from the weapon.",No Paths Are Bound
39403,"He didn’t mean to get frustrated so fast, why did he feel so angry? He blamed it on the mass sensory input he was still getting. “..right. Okay, cool.” Tim shrugged, smiling a bit.",Dumpster Diving for Treasure
154235,This one’s new. Wouldn’t mind fucking her though.” Harry replied and saw Louis’ face contorting in disgust. “She has small breasts.” Louis commented and Harry smiled because he’d gotten a reaction from him.,Baby Heaven's in your Eyes
29061,"Harry's anger swelled. ""I know very well it's not customary to ask.",Tales of the Potters
33086,She looked to Harry. “Goyle Manor?” Harry grimaced. “Shacklebolt is not happy.,"BLOODY, SLUTTY, AND PATHETIC"
278256,"Could still remember his horror when the rockfall had separated them. All those centuries I was dead, Arthur thought. I only ever heard one voice. His voice.","And like the cycle of the year, we begin again"
329580,"“If I get nervous, I’ll touch the pendant and call out for you.” “And I will be there in a moment’s notice,” Loki said. “Once you’re done, go home. I will meet you there.",Get On Your Knees And Pray To Me
38325,"“I’mso gladthat I’ve got you in here.” Peter dramatically sighed, bumping her with his elbow as they entered the gymnasium together. The teenage girl only grinned with a sly eye roll, then began to sign with exaggerated motions.",Dumpster Diving for Treasure
113699,"There are someveryunhappy looks, but the leader soldiers on. ""Yes, Young Master Wei, we know. The Lan do not like or respect us, and the Lan are very strict.",And Time Is But a Paper Moon


In [56]:
df.head()

Unnamed: 0,id,text,word_count,emotion,emotion_score,nsfw,source_story_id,title,tag,tags,source,kudos,url
0,ao3_21116591_0,They'd make it quick. --- Zuko didn't like tha...,23,disgust,0.814,True,ao3_21116591,Salvage,Awkwardness,"['Hakoda just wants to talk terms', 'Ozai just...",ao3,69973,https://archiveofourown.org/works/21116591?vie...
1,ao3_21116591_1,--- Zuko didn't like that the one guy kepthitt...,37,disgust,0.856,False,ao3_21116591,Salvage,Awkwardness,"['Hakoda just wants to talk terms', 'Ozai just...",ao3,69973,https://archiveofourown.org/works/21116591?vie...
2,ao3_21116591_2,On the other side of the sick bay—which implie...,30,disgust,0.913,False,ao3_21116591,Salvage,Awkwardness,"['Hakoda just wants to talk terms', 'Ozai just...",ao3,69973,https://archiveofourown.org/works/21116591?vie...
3,ao3_21116591_3,"""He's a bit out of it,"" Uncle said. ""Makes him...",21,sadness,0.926,False,ao3_21116591,Salvage,Awkwardness,"['Hakoda just wants to talk terms', 'Ozai just...",ao3,69973,https://archiveofourown.org/works/21116591?vie...
4,ao3_21116591_4,"I'll stop feeling sorry for him, then; they de...",18,sadness,0.938,True,ao3_21116591,Salvage,Awkwardness,"['Hakoda just wants to talk terms', 'Ozai just...",ao3,69973,https://archiveofourown.org/works/21116591?vie...
