In [None]:
# =============================================================================
# CELL 1: SETUP AND CONFIGURATION
# =============================================================================

import os
import re
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
from google.colab import drive, files
import zipfile

# Mount Google Drive (will prompt for authorization)
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("✓ Drive mounted successfully")

# Configuration
class Config:
    """Configuration settings for the analysis"""

    # File naming conventions
    VERSION_PREFIXES = ['draft-', 'refined-', 'edited-', 'final-']
    VERSION_ORDER = {prefix: i for i, prefix in enumerate(VERSION_PREFIXES)}

    # Analysis settings
    MIN_SENTENCE_LENGTH = 10  # Minimum characters for a sentence
    MAX_SENTENCE_LENGTH = 1000  # Maximum characters for a sentence

def setup_output_directories(base_path):
    """Create necessary output directories"""
    # Don't create nested folders - use the base path directly
    output_dir = base_path
    archive_dir = os.path.join(base_path, 'archive')

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(archive_dir, exist_ok=True)

    print(f"✓ Output directory ready: {output_dir}")
    print(f"✓ Archive directory ready: {archive_dir}")

    return output_dir, archive_dir

print("📋 Configuration loaded. Ready to process articles.")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Drive mounted successfully
📋 Configuration loaded. Ready to process articles.


In [None]:
# =============================================================================
# CELL 2: DATA INGESTION & VALIDATION (STEP 1)
# =============================================================================

class ArticleVersions:
    """Class to handle loading and validating article versions"""

    def __init__(self, article_name, input_path):
        self.article_name = article_name
        self.input_path = input_path
        self.versions = {}
        self.metadata = {
            'article_name': article_name,
            'input_path': input_path,
            'processing_timestamp': datetime.now().isoformat(),
            'versions_found': [],
            'validation_status': 'pending'
        }

    def load_versions(self):
        """Load all versions of an article from the specified path"""
        print(f"\n📁 Loading versions for article: {self.article_name}")
        print(f"📂 Input path: {self.input_path}")

        # Find all files matching the article name pattern
        for prefix in Config.VERSION_PREFIXES:
            filename = f"{prefix}{self.article_name}.md"
            filepath = os.path.join(self.input_path, filename)

            if os.path.exists(filepath):
                try:
                    with open(filepath, 'r', encoding='utf-8') as file:
                        content = file.read()
                        self.versions[prefix.rstrip('-')] = {
                            'filename': filename,
                            'filepath': filepath,
                            'content': content,
                            'loaded_at': datetime.now().isoformat(),
                            'file_size': len(content)
                        }
                        print(f"  ✓ Loaded: {filename} ({len(content)} characters)")

                except Exception as e:
                    print(f"  ✗ Error loading {filename}: {str(e)}")
            else:
                print(f"  - Not found: {filename}")

        self.metadata['versions_found'] = list(self.versions.keys())
        return self.versions

    def validate_version_sequence(self):
        """Validate that we have the minimum required versions"""
        found_versions = set(self.versions.keys())
        required_versions = ['draft', 'final']

        # Check for required versions
        missing_required = []
        for version in required_versions:
            if version not in found_versions:
                missing_required.append(version)

        # Validation results
        validation_results = {
            'has_draft': 'draft' in found_versions,
            'has_final': 'final' in found_versions,
            'missing_required': missing_required,
            'versions_found': list(found_versions),
            'is_valid': len(missing_required) == 0
        }

        # Update metadata
        self.metadata['validation_results'] = validation_results

        if validation_results['is_valid']:
            self.metadata['validation_status'] = 'passed'
            print(f"✓ Validation passed: Found {len(found_versions)} versions")
        else:
            self.metadata['validation_status'] = 'failed'
            print(f"✗ Validation failed: Missing required versions")
            print(f"  Missing: {', '.join(missing_required)}")

        return validation_results

    def get_summary(self):
        """Get a summary of loaded versions"""
        summary = {
            'article_name': self.article_name,
            'input_path': self.input_path,
            'versions_count': len(self.versions),
            'validation_status': self.metadata['validation_status'],
            'file_sizes': {}
        }

        for version, data in self.versions.items():
            summary['file_sizes'][version] = data['file_size']

        return summary

print("📖 ArticleVersions class loaded. Ready for data ingestion.")

📖 ArticleVersions class loaded. Ready for data ingestion.


In [None]:
# =============================================================================
# CELL 3: TEXT PREPROCESSING (STEP 2)
# =============================================================================

class TextPreprocessor:
    """Class to handle text preprocessing and segmentation"""

    def __init__(self):
        self.processed_versions = {}

    def clean_markdown(self, text):
        """Clean markdown formatting while preserving content structure"""
        # Remove markdown formatting but keep the text
        patterns = [
            (r'^\s*#{1,6}\s+', ''),  # Headers
            (r'\*\*(.*?)\*\*', r'\1'),  # Bold
            (r'\*(.*?)\*', r'\1'),  # Italic
            (r'`(.*?)`', r'\1'),  # Inline code
            (r'```.*?```', ''),  # Code blocks
            (r'!\[.*?\]\(.*?\)', ''),  # Images
            (r'\[([^\]]+)\]\([^\)]+\)', r'\1'),  # Links
            (r'^\s*[\*\-\+]\s+', ''),  # Bullet points
            (r'^\s*\d+\.\s+', ''),  # Numbered lists
            (r'\n{3,}', '\n\n'),  # Multiple newlines
        ]

        cleaned_text = text

        # Apply patterns that need multiline flag
        multiline_patterns = [
            (r'^\s*[\*\-\+]\s+', ''),  # Bullet points
            (r'^\s*\d+\.\s+', ''),  # Numbered lists
        ]

        for pattern, replacement in multiline_patterns:
            cleaned_text = re.sub(pattern, replacement, cleaned_text, flags=re.MULTILINE)

        # Apply regular patterns
        regular_patterns = [
            (r'^\s*#{1,6}\s+', ''),  # Headers
            (r'\*\*(.*?)\*\*', r'\1'),  # Bold
            (r'\*(.*?)\*', r'\1'),  # Italic
            (r'`(.*?)`', r'\1'),  # Inline code
            (r'```.*?```', ''),  # Code blocks
            (r'!\[.*?\]\(.*?\)', ''),  # Images
            (r'\[([^\]]+)\]\([^\)]+\)', r'\1'),  # Links
            (r'\n{3,}', '\n\n'),  # Multiple newlines
        ]

        for pattern, replacement in regular_patterns:
            cleaned_text = re.sub(pattern, replacement, cleaned_text)

        return cleaned_text.strip()

    def segment_into_sentences(self, text):
        """Segment text into sentences with basic filtering"""
        # Simple sentence segmentation (can be enhanced with spaCy later if needed)
        sentences = re.split(r'[.!?]+\s+', text)

        # Filter sentences
        filtered_sentences = []
        for sentence in sentences:
            sentence = sentence.strip()
            if (Config.MIN_SENTENCE_LENGTH <= len(sentence) <= Config.MAX_SENTENCE_LENGTH
                and sentence):
                filtered_sentences.append(sentence)

        return filtered_sentences

    def segment_into_paragraphs(self, text):
        """Segment text into paragraphs"""
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        return paragraphs

    def process_version(self, version_name, raw_content):
        """Process a single version of the article"""
        print(f"  Processing {version_name} version...")

        # Clean the markdown
        cleaned_content = self.clean_markdown(raw_content)

        # Segment into different units
        sentences = self.segment_into_sentences(cleaned_content)
        paragraphs = self.segment_into_paragraphs(cleaned_content)

        # Calculate basic statistics
        stats = {
            'character_count': len(cleaned_content),
            'word_count': len(cleaned_content.split()),
            'sentence_count': len(sentences),
            'paragraph_count': len(paragraphs),
            'avg_sentence_length': sum(len(s) for s in sentences) / len(sentences) if sentences else 0,
            'avg_paragraph_length': sum(len(p) for p in paragraphs) / len(paragraphs) if paragraphs else 0
        }

        processed_data = {
            'version_name': version_name,
            'raw_content': raw_content,
            'cleaned_content': cleaned_content,
            'sentences': sentences,
            'paragraphs': paragraphs,
            'statistics': stats,
            'processed_at': datetime.now().isoformat()
        }

        self.processed_versions[version_name] = processed_data

        print(f"    ✓ {stats['sentence_count']} sentences, {stats['paragraph_count']} paragraphs")
        print(f"    ✓ {stats['word_count']} words, {stats['character_count']} characters")

        return processed_data

    def process_all_versions(self, article_versions):
        """Process all versions of an article"""
        print(f"\n🔄 Preprocessing text for all versions...")

        for version_name, version_data in article_versions.versions.items():
            self.process_version(version_name, version_data['content'])

        return self.processed_versions

    def get_processing_summary(self):
        """Get a summary of processing results"""
        summary = {}
        for version_name, data in self.processed_versions.items():
            summary[version_name] = data['statistics']

        return summary

print("🔧 TextPreprocessor class loaded. Ready for text processing.")

🔧 TextPreprocessor class loaded. Ready for text processing.


In [None]:
# =============================================================================
# CELL 4: EXECUTION FUNCTIONS AND CHECKPOINT MANAGEMENT
# =============================================================================

def save_checkpoint_data(article_versions, preprocessor, output_path, checkpoint_name="steps_1_2"):
    """Save checkpoint data for review"""
    checkpoint_data = {
        'checkpoint_name': checkpoint_name,
        'timestamp': datetime.now().isoformat(),
        'article_metadata': article_versions.metadata,
        'processing_summary': preprocessor.get_processing_summary(),
        'validation_results': article_versions.metadata.get('validation_results', {}),
        'article_summary': article_versions.get_summary()
    }

    # Save to output directory
    checkpoint_file = f"{output_path}/{article_versions.article_name}_checkpoint_{checkpoint_name}.json"

    with open(checkpoint_file, 'w', encoding='utf-8') as f:
        json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Checkpoint saved: {checkpoint_file}")
    return checkpoint_data

def run_steps_1_2(article_name, input_path, base_output_path):
    """Run steps 1-2 for a given article"""
    print(f"🚀 Starting Steps 1-2 for article: {article_name}")
    print(f"📂 Input path: {input_path}")

    # Setup output directories
    output_dir, archive_dir = setup_output_directories(base_output_path)

    # Step 1: Data Ingestion & Validation
    article_versions = ArticleVersions(article_name, input_path)
    article_versions.load_versions()
    validation_results = article_versions.validate_version_sequence()

    if not validation_results['is_valid']:
        print("❌ Cannot proceed: Missing required versions (draft and final)")
        return None, None

    # Step 2: Text Preprocessing
    preprocessor = TextPreprocessor()
    preprocessor.process_all_versions(article_versions)

    # Save checkpoint
    checkpoint_data = save_checkpoint_data(article_versions, preprocessor, output_dir)

    print(f"\n✅ Steps 1-2 completed successfully!")
    print(f"📊 Processing Summary:")
    for version, stats in preprocessor.get_processing_summary().items():
        print(f"  {version}: {stats['word_count']} words, {stats['sentence_count']} sentences")

    return article_versions, preprocessor

# Interactive input functions
def get_user_inputs():
    """Get user inputs for processing"""
    print("📝 Please provide the following information:")

    article_name = input("Enter article name (without .md extension): ").strip()
    input_path = input("Enter full path to input folder containing markdown files: ").strip()
    base_output_path = input("Enter full path to base output folder: ").strip()

    print(f"\n📋 Configuration:")
    print(f"  Article name: {article_name}")
    print(f"  Input path: {input_path}")
    print(f"  Output path: {base_output_path}")

    confirm = input("\nProceed with these settings? (y/n): ").strip().lower()

    if confirm == 'y':
        return article_name, input_path, base_output_path
    else:
        print("❌ Cancelled. Run get_user_inputs() again to restart.")
        return None, None, None

def process_article_interactive():
    """Process an article with interactive inputs"""
    article_name, input_path, base_output_path = get_user_inputs()

    if article_name and input_path and base_output_path:
        return run_steps_1_2(article_name, input_path, base_output_path)
    else:
        return None, None

print("📋 Ready to process your article!")
print("Run: article_versions, preprocessor = process_article_interactive()")
print("\nMake sure your markdown files are named:")
print("- draft-your-article-name.md")
print("- refined-your-article-name.md")
print("- edited-your-article-name.md")
print("- final-your-article-name.md")

📋 Ready to process your article!
Run: article_versions, preprocessor = process_article_interactive()

Make sure your markdown files are named:
- draft-your-article-name.md
- refined-your-article-name.md
- edited-your-article-name.md
- final-your-article-name.md


In [None]:
# =============================================================================
# CELL 5: SAMPLE DATA CREATOR (FOR TESTING ONLY)
# =============================================================================

def create_sample_files_for_testing(output_path):
    """Create sample markdown files for testing (run this once to test)"""
    sample_content = {
        'draft-': """# Sample Article

This is a draft article about artificial intelligence and its impact on society. AI has revolutionized many industries.

The technology continues to evolve rapidly. Machine learning algorithms are becoming more sophisticated every day.

We must consider the ethical implications of AI development.""",

        'refined-': """# Sample Article

This is a refined article examining artificial intelligence and its transformative impact on modern society. AI has fundamentally revolutionized numerous industries across the globe.

The technology continues to evolve at an unprecedented pace. Advanced machine learning algorithms are becoming increasingly sophisticated with each passing day.

We must carefully consider the complex ethical implications of AI development and deployment.""",

        'edited-': """# Sample Article

This comprehensive article examines artificial intelligence and its transformative impact on modern society. AI has fundamentally revolutionized numerous industries worldwide, reshaping how we work and live.

The technology continues to evolve at an unprecedented pace, driven by breakthrough innovations. Advanced machine learning algorithms are becoming increasingly sophisticated, enabling new applications we never thought possible.

We must carefully consider the complex ethical implications of AI development and deployment, ensuring responsible innovation for the benefit of humanity.""",

        'final-': """# Sample Article

This comprehensive article examines artificial intelligence and its transformative impact on modern society. AI has fundamentally revolutionized numerous industries worldwide, reshaping how we work, communicate, and live.

The technology continues to evolve at an unprecedented pace, driven by breakthrough innovations in computing power and algorithmic design. Advanced machine learning algorithms are becoming increasingly sophisticated, enabling new applications we never thought possible just a decade ago.

We must carefully consider the complex ethical implications of AI development and deployment, ensuring responsible innovation that serves the benefit of all humanity while mitigating potential risks."""
    }

    article_name = "sample-article"
    os.makedirs(output_path, exist_ok=True)

    for prefix, content in sample_content.items():
        filename = f"{prefix}{article_name}.md"
        filepath = os.path.join(output_path, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"Created: {filename}")

    return article_name, output_path

print("\n🧪 Sample data creator available for testing if needed.")
print("To create test files, run:")
print("sample_name, sample_path = create_sample_files_for_testing('/your/test/path')")


🧪 Sample data creator available for testing if needed.
To create test files, run:
sample_name, sample_path = create_sample_files_for_testing('/your/test/path')


In [None]:
article_versions, preprocessor = process_article_interactive()

📝 Please provide the following information:
Enter article name (without .md extension): markup-languages
Enter full path to input folder containing markdown files: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-articles/ai-vs-human
Enter full path to base output folder: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-articles/ai-vs-human/output/output

📋 Configuration:
  Article name: markup-languages
  Input path: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-articles/ai-vs-human
  Output path: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-articles/ai-vs-human/output/output

Proceed with these settings? (y/n): y
🚀 Starting Steps 1-2 for article: markup-languages
📂 Input path: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-articles/ai-vs-human
✓ Output directory ready: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-articles/ai-vs-human/output/output
✓ Archive directory ready: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-

In [None]:
!pip install sentence-transformers scikit-learn -q

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import difflib
from collections import defaultdict
import json
from datetime import datetime

print("📦 Dependencies installed and imported successfully!")
print("🤖 Loading SentenceTransformer model (this may take a moment)...")

# Load the semantic similarity model
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Semantic model loaded successfully!")

📦 Dependencies installed and imported successfully!
🤖 Loading SentenceTransformer model (this may take a moment)...
✅ Semantic model loaded successfully!


In [None]:
# =============================================================================
# CELL 2: SIMILARITY ANALYSIS (STEP 3)
# =============================================================================

class SimilarityAnalyzer:
    """Class to handle lexical and semantic similarity analysis"""

    def __init__(self, semantic_model):
        self.semantic_model = semantic_model
        self.similarity_results = {}

    def calculate_lexical_similarity(self, text1, text2):
        """Calculate lexical similarity using multiple metrics"""
        # Jaccard similarity (word-level)
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        jaccard = len(words1.intersection(words2)) / len(words1.union(words2)) if words1.union(words2) else 0

        # Edit distance similarity (character-level)
        sequence_matcher = difflib.SequenceMatcher(None, text1, text2)
        edit_similarity = sequence_matcher.ratio()

        # TF-IDF cosine similarity
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        try:
            tfidf_matrix = vectorizer.fit_transform([text1, text2])
            tfidf_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        except:
            tfidf_similarity = 0.0

        return {
            'jaccard_similarity': jaccard,
            'edit_similarity': edit_similarity,
            'tfidf_similarity': tfidf_similarity,
            'lexical_average': (jaccard + edit_similarity + tfidf_similarity) / 3
        }

    def calculate_semantic_similarity(self, text1, text2):
        """Calculate semantic similarity using sentence embeddings"""
        # Get embeddings
        embeddings = self.semantic_model.encode([text1, text2])

        # Calculate cosine similarity
        similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

        return {
            'semantic_similarity': float(similarity),
            'embedding_dim': len(embeddings[0])
        }

    def calculate_sentence_level_similarities(self, sentences1, sentences2, version1, version2):
        """Calculate similarities at sentence level between two versions"""
        print(f"    Analyzing {len(sentences1)} vs {len(sentences2)} sentences...")

        sentence_similarities = []

        # Calculate all pairwise similarities
        for i, sent1 in enumerate(sentences1):
            best_match = {'index': -1, 'lexical': 0, 'semantic': 0, 'combined': 0}

            for j, sent2 in enumerate(sentences2):
                # Calculate similarities
                lexical = self.calculate_lexical_similarity(sent1, sent2)
                semantic = self.calculate_semantic_similarity(sent1, sent2)

                # Combined score (weighted average)
                combined = (lexical['lexical_average'] + semantic['semantic_similarity']) / 2

                if combined > best_match['combined']:
                    best_match = {
                        'index': j,
                        'lexical': lexical['lexical_average'],
                        'semantic': semantic['semantic_similarity'],
                        'combined': combined,
                        'target_sentence': sent2
                    }

            sentence_similarities.append({
                'source_index': i,
                'source_sentence': sent1,
                'best_match': best_match
            })

        return sentence_similarities

    def analyze_version_pair(self, version1_data, version2_data, version1_name, version2_name):
        """Analyze similarities between two versions"""
        print(f"  🔍 Analyzing {version1_name} → {version2_name}")

        # Full text similarity
        full_text_lexical = self.calculate_lexical_similarity(
            version1_data['cleaned_content'],
            version2_data['cleaned_content']
        )
        full_text_semantic = self.calculate_semantic_similarity(
            version1_data['cleaned_content'],
            version2_data['cleaned_content']
        )

        # Sentence-level analysis
        sentence_analysis = self.calculate_sentence_level_similarities(
            version1_data['sentences'],
            version2_data['sentences'],
            version1_name,
            version2_name
        )

        # Paragraph-level similarity
        para_lexical = self.calculate_lexical_similarity(
            ' '.join(version1_data['paragraphs']),
            ' '.join(version2_data['paragraphs'])
        )
        para_semantic = self.calculate_semantic_similarity(
            ' '.join(version1_data['paragraphs']),
            ' '.join(version2_data['paragraphs'])
        )

        # Aggregate sentence similarities
        sentence_similarities = [s['best_match']['combined'] for s in sentence_analysis]
        avg_sentence_similarity = np.mean(sentence_similarities) if sentence_similarities else 0

        return {
            'version_pair': f"{version1_name}_to_{version2_name}",
            'full_text': {
                'lexical': full_text_lexical,
                'semantic': full_text_semantic,
                'combined': (full_text_lexical['lexical_average'] + full_text_semantic['semantic_similarity']) / 2
            },
            'sentence_level': {
                'average_similarity': avg_sentence_similarity,
                'individual_similarities': sentence_similarities,
                'detailed_analysis': sentence_analysis
            },
            'paragraph_level': {
                'lexical': para_lexical,
                'semantic': para_semantic,
                'combined': (para_lexical['lexical_average'] + para_semantic['semantic_similarity']) / 2
            }
        }

    def analyze_all_versions(self, processed_versions):
        """Analyze similarities between all version pairs"""
        print(f"\n🔍 Step 3: Similarity Analysis")

        version_names = list(processed_versions.keys())
        version_order = ['draft', 'refined', 'edited', 'final']

        # Sort versions by expected order
        sorted_versions = []
        for expected in version_order:
            if expected in version_names:
                sorted_versions.append(expected)

        # Sequential analysis (draft→refined→edited→final)
        sequential_results = []
        for i in range(len(sorted_versions) - 1):
            current_version = sorted_versions[i]
            next_version = sorted_versions[i + 1]

            result = self.analyze_version_pair(
                processed_versions[current_version],
                processed_versions[next_version],
                current_version,
                next_version
            )
            sequential_results.append(result)

        # Draft to final comparison
        draft_to_final = None
        if 'draft' in version_names and 'final' in version_names:
            print(f"  🔍 Analyzing draft → final (overall change)")
            draft_to_final = self.analyze_version_pair(
                processed_versions['draft'],
                processed_versions['final'],
                'draft',
                'final'
            )

        self.similarity_results = {
            'sequential_analysis': sequential_results,
            'draft_to_final': draft_to_final,
            'analysis_timestamp': datetime.now().isoformat(),
            'versions_analyzed': sorted_versions
        }

        return self.similarity_results

print("🔍 SimilarityAnalyzer class loaded. Ready for similarity analysis.")

🔍 SimilarityAnalyzer class loaded. Ready for similarity analysis.


In [None]:
# =============================================================================
# CELL 3: ATTRIBUTION MAPPING (STEP 4)
# =============================================================================

class AttributionMapper:
    """Class to track content attribution across versions"""

    def __init__(self, similarity_threshold=0.3):
        self.similarity_threshold = similarity_threshold
        self.attribution_results = {}

    def trace_sentence_origins(self, processed_versions, similarity_results):
        """Trace each final sentence back to its earliest appearance"""
        print(f"\n📍 Step 4: Attribution Mapping")

        version_order = ['draft', 'refined', 'edited', 'final']
        available_versions = [v for v in version_order if v in processed_versions]

        if 'final' not in available_versions:
            print("❌ Cannot perform attribution - final version not found")
            return None

        final_sentences = processed_versions['final']['sentences']
        sentence_attributions = []

        print(f"  📝 Tracing {len(final_sentences)} final sentences...")

        for final_idx, final_sentence in enumerate(final_sentences):
            attribution = {
                'final_index': final_idx,
                'final_sentence': final_sentence,
                'origin_version': None,
                'origin_index': None,
                'similarity_scores': {},
                'modification_path': []
            }

            # Check each previous version (in reverse order to find earliest origin)
            for version in reversed(available_versions[:-1]):  # Exclude 'final'
                version_sentences = processed_versions[version]['sentences']

                best_match = {'index': -1, 'similarity': 0, 'sentence': ''}

                for sent_idx, version_sentence in enumerate(version_sentences):
                    # Calculate similarity
                    lexical = self._quick_lexical_similarity(final_sentence, version_sentence)
                    semantic = self._quick_semantic_similarity(final_sentence, version_sentence)
                    combined = (lexical + semantic) / 2

                    if combined > best_match['similarity']:
                        best_match = {
                            'index': sent_idx,
                            'similarity': combined,
                            'sentence': version_sentence
                        }

                attribution['similarity_scores'][version] = best_match['similarity']

                # If similarity is above threshold, this could be the origin
                if best_match['similarity'] >= self.similarity_threshold:
                    if attribution['origin_version'] is None:  # First match found (earliest version)
                        attribution['origin_version'] = version
                        attribution['origin_index'] = best_match['index']

                    attribution['modification_path'].append({
                        'version': version,
                        'similarity': best_match['similarity'],
                        'sentence': best_match['sentence']
                    })

            # If no origin found, it's new content
            if attribution['origin_version'] is None:
                attribution['origin_version'] = 'new_in_final'

            sentence_attributions.append(attribution)

        return sentence_attributions

    def _quick_lexical_similarity(self, text1, text2):
        """Quick lexical similarity calculation"""
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0
        return len(words1.intersection(words2)) / len(words1.union(words2))

    def _quick_semantic_similarity(self, text1, text2):
        """Quick semantic similarity calculation"""
        embeddings = semantic_model.encode([text1, text2])
        return float(cosine_similarity([embeddings[0]], [embeddings[1]])[0][0])

    def calculate_attribution_statistics(self, sentence_attributions):
        """Calculate overall attribution statistics"""
        total_sentences = len(sentence_attributions)

        # Count by origin version
        origin_counts = defaultdict(int)
        for attribution in sentence_attributions:
            origin_counts[attribution['origin_version']] += 1

        # Calculate percentages
        origin_percentages = {}
        for version, count in origin_counts.items():
            origin_percentages[version] = {
                'count': count,
                'percentage': (count / total_sentences) * 100
            }

        # Calculate modification statistics
        modification_stats = {
            'high_similarity': 0,  # >0.8
            'medium_similarity': 0,  # 0.5-0.8
            'low_similarity': 0,  # 0.3-0.5
            'new_content': 0  # <0.3 or new_in_final
        }

        for attribution in sentence_attributions:
            if attribution['origin_version'] == 'new_in_final':
                modification_stats['new_content'] += 1
            else:
                # Get highest similarity score
                max_similarity = max(attribution['similarity_scores'].values()) if attribution['similarity_scores'] else 0

                if max_similarity > 0.8:
                    modification_stats['high_similarity'] += 1
                elif max_similarity > 0.5:
                    modification_stats['medium_similarity'] += 1
                elif max_similarity > 0.3:
                    modification_stats['low_similarity'] += 1
                else:
                    modification_stats['new_content'] += 1

        # Convert to percentages
        modification_percentages = {}
        for category, count in modification_stats.items():
            modification_percentages[category] = {
                'count': count,
                'percentage': (count / total_sentences) * 100
            }

        return {
            'total_sentences': total_sentences,
            'origin_distribution': origin_percentages,
            'modification_distribution': modification_percentages
        }

    def analyze_attribution(self, processed_versions, similarity_results):
        """Perform complete attribution analysis"""
        sentence_attributions = self.trace_sentence_origins(processed_versions, similarity_results)

        if sentence_attributions is None:
            return None

        attribution_statistics = self.calculate_attribution_statistics(sentence_attributions)

        self.attribution_results = {
            'sentence_attributions': sentence_attributions,
            'statistics': attribution_statistics,
            'analysis_timestamp': datetime.now().isoformat(),
            'similarity_threshold': self.similarity_threshold
        }

        # Print summary
        print(f"\n📊 Attribution Summary:")
        print(f"  Total sentences in final: {attribution_statistics['total_sentences']}")

        print(f"\n  Origin Distribution:")
        for version, data in attribution_statistics['origin_distribution'].items():
            print(f"    {version}: {data['count']} sentences ({data['percentage']:.1f}%)")

        print(f"\n  Modification Levels:")
        for category, data in attribution_statistics['modification_distribution'].items():
            print(f"    {category}: {data['count']} sentences ({data['percentage']:.1f}%)")

        return self.attribution_results

print("📍 AttributionMapper class loaded. Ready for attribution analysis.")

📍 AttributionMapper class loaded. Ready for attribution analysis.


In [None]:
# =============================================================================
# CELL 4: COMBINED EXECUTION FUNCTION
# =============================================================================

def run_steps_3_4(article_versions, preprocessor, output_path):
    """Run steps 3-4: Similarity Analysis and Attribution Mapping"""
    print(f"🚀 Starting Steps 3-4 for article: {article_versions.article_name}")

    # Step 3: Similarity Analysis
    similarity_analyzer = SimilarityAnalyzer(semantic_model)
    similarity_results = similarity_analyzer.analyze_all_versions(preprocessor.processed_versions)

    # Step 4: Attribution Mapping
    attribution_mapper = AttributionMapper(similarity_threshold=0.3)
    attribution_results = attribution_mapper.analyze_attribution(
        preprocessor.processed_versions,
        similarity_results
    )

    # Combine results
    combined_results = {
        'article_name': article_versions.article_name,
        'analysis_timestamp': datetime.now().isoformat(),
        'article_metadata': article_versions.metadata,
        'processing_summary': preprocessor.get_processing_summary(),
        'similarity_analysis': similarity_results,
        'attribution_analysis': attribution_results
    }

    # Save comprehensive results
    results_file = f"{output_path}/{article_versions.article_name}_complete_analysis.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(combined_results, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Complete analysis saved: {results_file}")

    # Generate summary metrics for article footer
    footer_metrics = generate_footer_metrics(combined_results)

    # Save footer metrics separately
    footer_file = f"{output_path}/{article_versions.article_name}_footer_metrics.json"
    with open(footer_file, 'w', encoding='utf-8') as f:
        json.dump(footer_metrics, f, indent=2, ensure_ascii=False)

    print(f"📊 Footer metrics saved: {footer_file}")

    return combined_results, footer_metrics

def generate_footer_metrics(combined_results):
    """Generate clean metrics for article footer"""

    # Get processing stats
    processing = combined_results['processing_summary']

    # Get attribution stats
    if combined_results['attribution_analysis']:
        attribution = combined_results['attribution_analysis']['statistics']
        origin_dist = attribution['origin_distribution']
        modification_dist = attribution['modification_distribution']
    else:
        origin_dist = {}
        modification_dist = {}

    # Get similarity stats (draft to final)
    draft_to_final = combined_results['similarity_analysis']['draft_to_final']
    overall_similarity = draft_to_final['full_text']['combined'] if draft_to_final else 0

    footer_metrics = {
        'article_name': combined_results['article_name'],
        'word_progression': {
            'draft': processing.get('draft', {}).get('word_count', 0),
            'final': processing.get('final', {}).get('word_count', 0),
            'change_percentage': 0
        },
        'content_retention': {
            'overall_similarity': round(overall_similarity * 100, 1),
            'content_origins': {}
        },
        'modification_summary': {},
        'generated_at': datetime.now().isoformat()
    }

    # Calculate word change percentage
    if footer_metrics['word_progression']['draft'] > 0:
        draft_words = footer_metrics['word_progression']['draft']
        final_words = footer_metrics['word_progression']['final']
        change = ((final_words - draft_words) / draft_words) * 100
        footer_metrics['word_progression']['change_percentage'] = round(change, 1)

    # Simplify origin distribution for footer
    for version, data in origin_dist.items():
        if version != 'new_in_final':
            footer_metrics['content_retention']['content_origins'][version] = round(data['percentage'], 1)

    # Simplify modification distribution
    for category, data in modification_dist.items():
        clean_category = category.replace('_', ' ').title()
        footer_metrics['modification_summary'][clean_category] = round(data['percentage'], 1)

    return footer_metrics

print("🎯 Execution functions loaded. Ready to run complete analysis!")
print("\nTo run the complete analysis:")
print("combined_results, footer_metrics = run_steps_3_4(article_versions, preprocessor, 'your_output_path')")

🎯 Execution functions loaded. Ready to run complete analysis!

To run the complete analysis:
combined_results, footer_metrics = run_steps_3_4(article_versions, preprocessor, 'your_output_path')


In [None]:
# =============================================================================
# CELL 5: QUICK EXECUTION FOR EXISTING DATA
# =============================================================================

def run_complete_analysis_from_existing(article_versions, preprocessor):
    """Run steps 3-4 using the output path from existing data"""
    # Find where the Step 1-2 checkpoint was actually saved
    base_path = article_versions.input_path

    # Check for the nested output structure that was created in Steps 1-2
    nested_output_path = os.path.join(base_path, 'output', 'output')
    regular_output_path = os.path.join(base_path, 'output')

    # Use the path where the checkpoint file exists
    checkpoint_file = f"markup-languages_checkpoint_steps_1_2.json"

    if os.path.exists(os.path.join(nested_output_path, checkpoint_file)):
        output_path = nested_output_path
        print(f"📂 Using nested output path: {output_path}")
    elif os.path.exists(os.path.join(regular_output_path, checkpoint_file)):
        output_path = regular_output_path
        print(f"📂 Using regular output path: {output_path}")
    else:
        # Create regular output path as fallback
        output_path = regular_output_path
        os.makedirs(output_path, exist_ok=True)
        print(f"📂 Created output path: {output_path}")

    return run_steps_3_4(article_versions, preprocessor, output_path)

print("⚡ Quick execution function available:")
print("combined_results, footer_metrics = run_complete_analysis_from_existing(article_versions, preprocessor)")

⚡ Quick execution function available:
combined_results, footer_metrics = run_complete_analysis_from_existing(article_versions, preprocessor)


In [None]:
combined_results, footer_metrics = run_complete_analysis_from_existing(article_versions, preprocessor)

📂 Using nested output path: /content/drive/MyDrive/Google Drive/syntaxandempathy/30-articles/ai-vs-human/output/output
🚀 Starting Steps 3-4 for article: markup-languages

🔍 Step 3: Similarity Analysis
  🔍 Analyzing draft → refined
    Analyzing 47 vs 60 sentences...
  🔍 Analyzing refined → edited
    Analyzing 60 vs 45 sentences...
  🔍 Analyzing edited → final
    Analyzing 45 vs 42 sentences...
  🔍 Analyzing draft → final (overall change)
  🔍 Analyzing draft → final
    Analyzing 47 vs 42 sentences...

📍 Step 4: Attribution Mapping
  📝 Tracing 42 final sentences...

📊 Attribution Summary:
  Total sentences in final: 42

  Origin Distribution:
    edited: 36 sentences (85.7%)
    draft: 1 sentences (2.4%)
    new_in_final: 5 sentences (11.9%)

  Modification Levels:
    high_similarity: 3 sentences (7.1%)
    medium_similarity: 17 sentences (40.5%)
    low_similarity: 17 sentences (40.5%)
    new_content: 5 sentences (11.9%)

💾 Complete analysis saved: /content/drive/MyDrive/Google Dri

In [None]:
# =============================================================================
# CELL 1: TREND ANALYSIS SETUP
# =============================================================================

import os
import json
import re
import glob
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from collections import defaultdict, Counter

# Archive configuration
ARCHIVE_BASE_PATH = '/content/drive/MyDrive/Google Drive/syntaxandempathy/99-past/'

class TrendAnalyzer:
    """Class to analyze trends across multiple articles over time periods"""

    def __init__(self, archive_path=ARCHIVE_BASE_PATH):
        self.archive_path = archive_path
        self.found_articles = []
        self.period_data = {}
        self.trend_results = {}

    def parse_period(self, period):
        """Parse period string into date patterns"""
        period_patterns = []

        if len(period) == 4:  # Year: "2025"
            # All months in the year: 202501*, 202502*, etc.
            for month in range(1, 13):
                pattern = f"{period}{month:02d}*"
                period_patterns.append(pattern)

        elif len(period) == 7 and period[4] == '-':  # Month: "2025-01"
            year, month = period.split('-')
            pattern = f"{year}{month}*"  # e.g., "202501*"
            period_patterns.append(pattern)

        elif period.endswith(('Q1', 'Q2', 'Q3', 'Q4')):  # Quarter: "2025-Q1"
            year, quarter = period.split('-')
            quarter_months = {
                'Q1': ['01', '02', '03'],
                'Q2': ['04', '05', '06'],
                'Q3': ['07', '08', '09'],
                'Q4': ['10', '11', '12']
            }

            for month in quarter_months[quarter]:
                pattern = f"{year}{month}*"  # e.g., "202501*", "202502*", "202503*"
                period_patterns.append(pattern)
        else:
            raise ValueError(f"Invalid period format: {period}. Use 'YYYY', 'YYYY-MM', or 'YYYY-Q#'")

        return period_patterns

    def find_articles_for_period(self, period):
        """Find all articles matching the period"""
        print(f"🔍 Searching for articles in period: {period}")
        print(f"📂 Archive path: {self.archive_path}")

        if not os.path.exists(self.archive_path):
            print(f"❌ Archive path does not exist: {self.archive_path}")
            return []

        patterns = self.parse_period(period)
        found_articles = []

        # Get all directories in archive
        all_dirs = [d for d in os.listdir(self.archive_path)
                   if os.path.isdir(os.path.join(self.archive_path, d))]

        print(f"📁 Found {len(all_dirs)} directories in archive")

        # Match directories against patterns
        for pattern in patterns:
            # Convert shell pattern to regex: 202506* becomes ^202506.*
            pattern_regex = f"^{pattern.replace('*', '.*')}"
            regex = re.compile(pattern_regex)

            for dir_name in all_dirs:
                if regex.match(dir_name):
                    article_path = os.path.join(self.archive_path, dir_name)

                    # Look for analysis file
                    analysis_files = glob.glob(os.path.join(article_path, 'output', '*_complete_analysis.json'))

                    if analysis_files:
                        # Avoid duplicates
                        if not any(item['folder_name'] == dir_name for item in found_articles):
                            found_articles.append({
                                'folder_name': dir_name,
                                'article_path': article_path,
                                'analysis_file': analysis_files[0],
                                'date_prefix': dir_name[:8],  # Extract YYYYMMDD
                                'article_name': dir_name[9:]  # Extract name after date-
                            })
                            print(f"  ✓ Found: {dir_name} (matches {pattern})")
                    else:
                        print(f"  ⚠ No analysis file found in: {dir_name}")

        found_articles.sort(key=lambda x: x['date_prefix'])  # Sort by date
        self.found_articles = found_articles

        print(f"📊 Total articles found for {period}: {len(found_articles)}")
        return found_articles

    def load_article_data(self, article_info):
        """Load analysis data for a single article"""
        try:
            with open(article_info['analysis_file'], 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Add metadata
            data['folder_name'] = article_info['folder_name']
            data['publication_date'] = article_info['date_prefix']

            return data

        except Exception as e:
            print(f"❌ Error loading {article_info['analysis_file']}: {str(e)}")
            return None

    def aggregate_attribution_trends(self, articles_data):
        """Aggregate attribution data across articles"""
        attribution_trends = {
            'by_article': [],
            'averages': {},
            'totals': {},
            'trends_over_time': []
        }

        # Collect data for each article
        for article in articles_data:
            if not article or 'attribution_analysis' not in article:
                continue

            attr_stats = article['attribution_analysis']['statistics']
            origin_dist = attr_stats['origin_distribution']
            mod_dist = attr_stats['modification_distribution']

            article_data = {
                'article_name': article['article_name'],
                'publication_date': article['publication_date'],
                'total_sentences': attr_stats['total_sentences'],
                'origin_percentages': {},
                'modification_percentages': {}
            }

            # Extract origin percentages
            for version, data in origin_dist.items():
                article_data['origin_percentages'][version] = data['percentage']

            # Extract modification percentages
            for category, data in mod_dist.items():
                article_data['modification_percentages'][category] = data['percentage']

            attribution_trends['by_article'].append(article_data)

        # Calculate averages across all articles
        if attribution_trends['by_article']:
            all_origins = set()
            all_modifications = set()

            for article in attribution_trends['by_article']:
                all_origins.update(article['origin_percentages'].keys())
                all_modifications.update(article['modification_percentages'].keys())

            # Average origin percentages
            for origin in all_origins:
                values = [a['origin_percentages'].get(origin, 0) for a in attribution_trends['by_article']]
                attribution_trends['averages'][f'origin_{origin}'] = np.mean(values)

            # Average modification percentages
            for mod in all_modifications:
                values = [a['modification_percentages'].get(mod, 0) for a in attribution_trends['by_article']]
                attribution_trends['averages'][f'modification_{mod}'] = np.mean(values)

        return attribution_trends

    def aggregate_similarity_trends(self, articles_data):
        """Aggregate similarity data across articles"""
        similarity_trends = {
            'draft_to_final': [],
            'sequential_changes': [],
            'averages': {}
        }

        for article in articles_data:
            if not article or 'similarity_analysis' not in article:
                continue

            sim_analysis = article['similarity_analysis']

            # Draft to final similarity
            if sim_analysis['draft_to_final']:
                draft_final = sim_analysis['draft_to_final']['full_text']
                similarity_trends['draft_to_final'].append({
                    'article_name': article['article_name'],
                    'publication_date': article['publication_date'],
                    'lexical_similarity': draft_final['lexical']['lexical_average'],
                    'semantic_similarity': draft_final['semantic']['semantic_similarity'],
                    'combined_similarity': draft_final['combined']
                })

            # Sequential changes
            for seq in sim_analysis['sequential_analysis']:
                similarity_trends['sequential_changes'].append({
                    'article_name': article['article_name'],
                    'publication_date': article['publication_date'],
                    'version_pair': seq['version_pair'],
                    'combined_similarity': seq['full_text']['combined']
                })

        # Calculate averages
        if similarity_trends['draft_to_final']:
            df_sims = similarity_trends['draft_to_final']
            similarity_trends['averages']['draft_to_final'] = {
                'lexical': np.mean([s['lexical_similarity'] for s in df_sims]),
                'semantic': np.mean([s['semantic_similarity'] for s in df_sims]),
                'combined': np.mean([s['combined_similarity'] for s in df_sims])
            }

        return similarity_trends

    def aggregate_word_count_trends(self, articles_data):
        """Aggregate word count progression across articles"""
        word_trends = {
            'by_article': [],
            'averages': {},
            'progression_patterns': []
        }

        for article in articles_data:
            if not article or 'processing_summary' not in article:
                continue

            processing = article['processing_summary']

            article_words = {
                'article_name': article['article_name'],
                'publication_date': article['publication_date'],
                'word_counts': {},
                'progression': []
            }

            # Extract word counts for each version
            version_order = ['draft', 'refined', 'edited', 'final']
            for version in version_order:
                if version in processing:
                    count = processing[version]['word_count']
                    article_words['word_counts'][version] = count
                    article_words['progression'].append(count)

            word_trends['by_article'].append(article_words)

        # Calculate average progressions
        if word_trends['by_article']:
            version_order = ['draft', 'refined', 'edited', 'final']
            for version in version_order:
                counts = [a['word_counts'].get(version, 0) for a in word_trends['by_article'] if version in a['word_counts']]
                if counts:
                    word_trends['averages'][version] = np.mean(counts)

        return word_trends

    def analyze_trends(self, period):
        """Perform complete trend analysis for a period"""
        print(f"\n🚀 Starting trend analysis for period: {period}")

        # Find articles
        articles = self.find_articles_for_period(period)

        if not articles:
            print(f"❌ No articles found for period {period}")
            return None

        # Load article data
        print(f"\n📖 Loading analysis data for {len(articles)} articles...")
        articles_data = []

        for article_info in articles:
            data = self.load_article_data(article_info)
            if data:
                articles_data.append(data)
                print(f"  ✓ Loaded: {article_info['article_name']}")
            else:
                print(f"  ✗ Failed: {article_info['article_name']}")

        print(f"📊 Successfully loaded {len(articles_data)} articles")

        # Perform trend analysis
        print(f"\n🔍 Analyzing trends...")

        attribution_trends = self.aggregate_attribution_trends(articles_data)
        similarity_trends = self.aggregate_similarity_trends(articles_data)
        word_count_trends = self.aggregate_word_count_trends(articles_data)

        # Compile results
        self.trend_results = {
            'period': period,
            'analysis_timestamp': datetime.now().isoformat(),
            'articles_analyzed': len(articles_data),
            'article_list': [{'name': a['article_name'], 'date': a['publication_date']} for a in articles_data],
            'attribution_trends': attribution_trends,
            'similarity_trends': similarity_trends,
            'word_count_trends': word_count_trends,
            'summary': self.generate_trend_summary(attribution_trends, similarity_trends, word_count_trends)
        }

        return self.trend_results

    def generate_trend_summary(self, attribution_trends, similarity_trends, word_count_trends):
        """Generate a summary of key trends"""
        summary = {
            'key_metrics': {},
            'patterns': [],
            'insights': []
        }

        # Key attribution metrics
        if attribution_trends['averages']:
            avg_draft = attribution_trends['averages'].get('origin_draft', 0)
            avg_edited = attribution_trends['averages'].get('origin_edited', 0)
            avg_new = attribution_trends['averages'].get('origin_new_in_final', 0)

            summary['key_metrics']['avg_draft_retention'] = round(avg_draft, 1)
            summary['key_metrics']['avg_edited_dominance'] = round(avg_edited, 1)
            summary['key_metrics']['avg_new_content'] = round(avg_new, 1)

        # Key similarity metrics
        if similarity_trends['averages'].get('draft_to_final'):
            df_sim = similarity_trends['averages']['draft_to_final']
            summary['key_metrics']['avg_draft_final_similarity'] = round(df_sim['combined'] * 100, 1)

        # Word count patterns
        if word_count_trends['averages']:
            avg_draft = word_count_trends['averages'].get('draft', 0)
            avg_final = word_count_trends['averages'].get('final', 0)
            if avg_draft > 0:
                change_pct = ((avg_final - avg_draft) / avg_draft) * 100
                summary['key_metrics']['avg_word_change_pct'] = round(change_pct, 1)

        return summary

    def save_trend_analysis(self, output_path, filename_prefix="trend_analysis"):
        """Save trend analysis results"""
        if not self.trend_results:
            print("❌ No trend results to save")
            return None

        # Create output directory
        os.makedirs(output_path, exist_ok=True)

        # Save comprehensive results
        results_file = os.path.join(output_path, f"{filename_prefix}_{self.trend_results['period']}.json")

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(self.trend_results, f, indent=2, ensure_ascii=False)

        print(f"💾 Trend analysis saved: {results_file}")

        # Also save a summary report
        summary_file = os.path.join(output_path, f"{filename_prefix}_{self.trend_results['period']}_summary.json")

        summary_data = {
            'period': self.trend_results['period'],
            'articles_count': self.trend_results['articles_analyzed'],
            'key_metrics': self.trend_results['summary']['key_metrics'],
            'article_list': self.trend_results['article_list']
        }

        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary_data, f, indent=2, ensure_ascii=False)

        print(f"📋 Summary report saved: {summary_file}")

        return results_file, summary_file

print("📈 TrendAnalyzer class loaded. Ready for trend analysis!")

📈 TrendAnalyzer class loaded. Ready for trend analysis!


In [None]:
# =============================================================================
# CELL 2: EXECUTION FUNCTIONS
# =============================================================================

def analyze_period_trends(period, output_path=None):
    """Analyze trends for a specific period"""
    print(f"📈 Analyzing trends for period: {period}")

    # Create analyzer
    analyzer = TrendAnalyzer()

    # Run analysis
    results = analyzer.analyze_trends(period)

    if not results:
        return None

    # Print summary
    print(f"\n📊 TREND ANALYSIS SUMMARY for {period}")
    print(f"{'='*50}")

    summary = results['summary']['key_metrics']

    if 'avg_draft_retention' in summary:
        print(f"📝 Average content retention from draft: {summary['avg_draft_retention']}%")

    if 'avg_edited_dominance' in summary:
        print(f"✏️  Average content from editing phase: {summary['avg_edited_dominance']}%")

    if 'avg_new_content' in summary:
        print(f"🆕 Average new content in final: {summary['avg_new_content']}%")

    if 'avg_draft_final_similarity' in summary:
        print(f"🔄 Average draft-to-final similarity: {summary['avg_draft_final_similarity']}%")

    if 'avg_word_change_pct' in summary:
        change = summary['avg_word_change_pct']
        direction = "increase" if change > 0 else "decrease"
        print(f"📏 Average word count {direction}: {abs(change):.1f}%")

    print(f"\n📚 Articles analyzed: {results['articles_analyzed']}")
    for article in results['article_list']:
        print(f"  • {article['date']}: {article['name']}")

    # Save results if output path provided
    if output_path:
        analyzer.save_trend_analysis(output_path)

    return results, analyzer

def quick_trend_check(period):
    """Quick trend check without saving files"""
    results, analyzer = analyze_period_trends(period)
    return results

print("🎯 Execution functions loaded!")
print("\nUsage examples:")
print("# Quick check (no files saved)")
print("results = quick_trend_check('2025-Q1')")
print("\n# Full analysis with file output")
print("results, analyzer = analyze_period_trends('2025-01', '/your/output/path')")
print("\n# Supported period formats:")
print("  • '2025' (full year)")
print("  • '2025-01' (specific month)")
print("  • '2025-Q1' (quarter)")

🎯 Execution functions loaded!

Usage examples:
# Quick check (no files saved)
results = quick_trend_check('2025-Q1')

# Full analysis with file output
results, analyzer = analyze_period_trends('2025-01', '/your/output/path')

# Supported period formats:
  • '2025' (full year)
  • '2025-01' (specific month)
  • '2025-Q1' (quarter)


In [None]:
# =============================================================================
# CELL 3: TESTING AND VALIDATION
# =============================================================================

def test_archive_structure():
    """Test the archive structure and show what's available"""
    print(f"🔍 Testing archive structure...")
    print(f"📂 Archive path: {ARCHIVE_BASE_PATH}")

    if not os.path.exists(ARCHIVE_BASE_PATH):
        print(f"❌ Archive path does not exist!")
        return False

    # List all directories
    all_dirs = [d for d in os.listdir(ARCHIVE_BASE_PATH)
               if os.path.isdir(os.path.join(ARCHIVE_BASE_PATH, d))]

    print(f"📁 Found {len(all_dirs)} directories:")

    # Look for date pattern: YYYYMMDD-*
    date_pattern = re.compile(r'^\d{8}-.*')
    valid_dirs = []

    for dir_name in sorted(all_dirs):
        print(f"  🔍 Checking: '{dir_name}'")

        if date_pattern.match(dir_name):
            # Check for analysis file
            analysis_files = glob.glob(os.path.join(ARCHIVE_BASE_PATH, dir_name, 'output', '*_complete_analysis.json'))
            status = "✓" if analysis_files else "⚠"
            print(f"    {status} MATCHES date pattern (YYYYMMDD-name)")
            if analysis_files:
                valid_dirs.append(dir_name)
                print(f"      Found analysis file: {os.path.basename(analysis_files[0])}")
            else:
                print(f"      No analysis file in output/ folder")
        else:
            print(f"    ✗ Does NOT match YYYYMMDD-name pattern")

    print(f"\n📊 Summary:")
    print(f"  Total directories: {len(all_dirs)}")
    print(f"  Valid date format: {len([d for d in all_dirs if date_pattern.match(d)])}")
    print(f"  With analysis files: {len(valid_dirs)}")

    if valid_dirs:
        print(f"\n🗓️  Date range:")
        dates = [d[:8] for d in valid_dirs]
        print(f"  Earliest: {min(dates)}")
        print(f"  Latest: {max(dates)}")

        # Show available periods
        years = set(d[:4] for d in dates)
        print(f"\n📅 Available periods you can analyze:")
        for year in sorted(years):
            year_dates = [d for d in dates if d.startswith(year)]
            months = set(d[4:6] for d in year_dates)
            quarters = set(f"Q{(int(d[4:6])-1)//3 + 1}" for d in year_dates)
            print(f"  {year}: {len(year_dates)} articles")
            print(f"    Months: {', '.join(sorted(months))}")
            print(f"    Quarters: {', '.join(sorted(quarters))}")

    return len(valid_dirs) > 0

print("🧪 Testing functions loaded!")
print("Run: test_archive_structure() to validate your archive")

🧪 Testing functions loaded!
Run: test_archive_structure() to validate your archive


In [None]:
test_archive_structure()

🔍 Testing archive structure...
📂 Archive path: /content/drive/MyDrive/Google Drive/syntaxandempathy/99-past/
📁 Found 2 directories:
  🔍 Checking: '2024'
    ✗ Does NOT match YYYYMMDD-name pattern
  🔍 Checking: '20250614-ai-vs-human'
    ✓ MATCHES date pattern (YYYYMMDD-name)
      Found analysis file: markup-languages_complete_analysis.json

📊 Summary:
  Total directories: 2
  Valid date format: 1
  With analysis files: 1

🗓️  Date range:
  Earliest: 20250614
  Latest: 20250614

📅 Available periods you can analyze:
  2025: 1 articles
    Months: 06
    Quarters: Q2


True

In [None]:
# Test with the available periods
results = quick_trend_check('2025-06')

📈 Analyzing trends for period: 2025-06

🚀 Starting trend analysis for period: 2025-06
🔍 Searching for articles in period: 2025-06
📂 Archive path: /content/drive/MyDrive/Google Drive/syntaxandempathy/99-past/
📁 Found 2 directories in archive
  ✓ Found: 20250614-ai-vs-human (matches 202506*)
📊 Total articles found for 2025-06: 1

📖 Loading analysis data for 1 articles...
  ✓ Loaded: ai-vs-human
📊 Successfully loaded 1 articles

🔍 Analyzing trends...

📊 TREND ANALYSIS SUMMARY for 2025-06
📝 Average content retention from draft: 2.4%
✏️  Average content from editing phase: 85.7%
🆕 Average new content in final: 11.9%
🔄 Average draft-to-final similarity: 43.1%
📏 Average word count increase: 3.4%

📚 Articles analyzed: 1
  • 20250614: markup-languages
