In [1]:
# Metadata_Generation_Demo.ipynb

# %% [markdown]
# ## MetaMuse: Automated Metadata Generation Demo
# This notebook demonstrates the complete workflow of the MetaMuse system, including:
# 1. Sample PDF generation
# 2. Text extraction from documents
# 3. Semantic metadata generation
# 4. Result visualization


In [2]:

# %%
# Install required packages
!pip install -q fpdf pytesseract pdfminer.six python-docx spacy keybert transformers
!python -m spacy download en_core_web_md
!python -m nltk.downloader stopwords


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fr0stedflake/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:

# %%
import re
import json
import spacy
from keybert import KeyBERT
from transformers import pipeline
from datetime import datetime
import nltk
from fpdf import FPDF
import os
import io
import fitz
import docx
import pytesseract
import pdfplumber
from PIL import Image


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# %%
# Initialize NLP components
# Make sure the NLTK stopwords corpus is present. nltk.data.find raises
# LookupError when the resource is missing; only that case triggers a download,
# so unrelated failures (and KeyboardInterrupt) are no longer swallowed.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Shared NLP components used by the metadata functions below:
#   nlp        - spaCy pipeline (entity recognition, tokenisation)
#   kw_model   - KeyBERT keyphrase extractor
#   summarizer - Hugging Face summarization pipeline
nlp = spacy.load("en_core_web_md")
kw_model = KeyBERT()
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


Device set to use cpu


In [5]:

# %%
# Generate sample PDF
def create_sample_pdf(output_path="sample_document.pdf"):
    """Write a one-page demo PDF describing MetaMuse.

    The page has a centred bold header, a centred italic page-number footer
    and a short body text.

    Args:
        output_path: Where the PDF is written. Defaults to
            "sample_document.pdf", the file the rest of the notebook reads.

    Returns:
        None. The file is written to disk and a confirmation line is printed.
    """
    class PDF(FPDF):
        # FPDF calls header()/footer() itself for every page that is added.
        def header(self):
            self.set_font('Arial', 'B', 12)
            self.cell(0, 10, 'MetaMuse Sample Document', 0, 1, 'C')

        def footer(self):
            # Position the footer 15 units above the bottom edge.
            self.set_y(-15)
            self.set_font('Arial', 'I', 8)
            self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', '', 12)

    sample_text = '''
    MetaMuse is an advanced automated metadata generation system developed by Acme Corp. 
    It uses natural language processing to extract meaningful information from documents.
    
    Key features include:
    - Multi-format support (PDF, DOCX, TXT)
    - OCR capabilities for image-based content
    - Semantic analysis of document content
    - Structured metadata output in JSON format
    
    This document was generated on June 25, 2025 to demonstrate the system's capabilities.
    '''

    pdf.multi_cell(0, 10, sample_text)
    pdf.output(output_path)
    print(f"Sample PDF created: {output_path}")

create_sample_pdf()


Sample PDF created: sample_document.pdf


In [6]:

# %%
# Document extraction functions
def extract_pdf_text(file_path):
    """Extract text from a PDF, falling back to OCR when no text is found.

    The embedded text layer is read with pdfplumber first. If that yields only
    whitespace, or pdfplumber fails, every page is rendered to an image with
    PyMuPDF (`fitz`) and passed through Tesseract.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The page texts joined with newlines.
    """
    # Try text extraction first
    try:
        with pdfplumber.open(file_path) as pdf:
            # A page without extractable text may yield None; treat it as an
            # empty string so one blank page does not abort the whole join.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)
        if text.strip():
            return text
    except Exception:
        # Best effort: any parsing problem falls through to the OCR path.
        pass

    # OCR fallback: rasterise each page and run Tesseract on the image.
    zoom = 200 / 72  # PDF user space is 72 dpi; render at roughly 200 dpi for OCR
    ocr_pages = []
    with fitz.open(file_path) as pdf_doc:
        for page in pdf_doc:
            png_bytes = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom)).tobytes("png")
            with Image.open(io.BytesIO(png_bytes)) as page_image:
                ocr_pages.append(pytesseract.image_to_string(page_image))
    return "\n".join(ocr_pages)

def extract_docx_text(file_path):
    """Return the text of a .docx file, one paragraph per line."""
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join([paragraph.text for paragraph in paragraphs])

def extract_content(file_path):
    """Return the text content of a PDF, DOCX or TXT file.

    The format is chosen from the file extension, compared case-insensitively.

    Args:
        file_path: Path to the document (a string or path-like object).

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not .pdf, .docx or .txt.
    """
    # Lower-case once so "REPORT.PDF" is dispatched like "report.pdf";
    # str() also lets pathlib.Path objects through.
    lowered = str(file_path).lower()
    if lowered.endswith('.pdf'):
        return extract_pdf_text(file_path)
    if lowered.endswith('.docx'):
        return extract_docx_text(file_path)
    if lowered.endswith('.txt'):
        # The context manager closes the handle instead of leaving it to the GC.
        with open(file_path) as handle:
            return handle.read()
    raise ValueError("Unsupported file format")


In [7]:

# %%
# Metadata generation functions
# Topic label -> lowercase trigger keywords.
# map_topics() assigns a topic when any of its keywords occurs as a substring
# of a lower-cased keyphrase.
TOPIC_MAP = {
    "Technology": [
        "ai", "data", "cloud", "algorithm", "software", "digital", "blockchain",
    ],
    "Finance": [
        "loan", "investment", "stock", "bank", "equity", "credit", "capital",
    ],
    "Healthcare": [
        "health", "medical", "patient", "disease", "hospital", "treatment",
    ],
    "Education": [
        "school", "student", "learning", "university", "course", "teacher",
    ],
    "Environment": [
        "climate", "sustainability", "energy", "conservation", "pollution",
    ],
    "Government": [
        "policy", "regulation", "public", "government", "law", "administration",
    ],
}


In [8]:

def extract_title(text):
    """Guess a document title from its text.

    Strategy:
      1. Use the first non-blank line when it is 10-120 characters long and
         not written entirely in upper case.
      2. Otherwise use everything before the first '.', with whitespace
         collapsed, when that is 20-150 characters long.
      3. Otherwise return "Untitled Document".

    Args:
        text: Full document text.

    Returns:
        A single-line title string.
    """
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if lines:
        candidate = lines[0]
        if 10 <= len(candidate) <= 120 and not candidate.isupper():
            return candidate
    # The text before the first '.' may span several lines or begin with blank
    # lines; collapse all whitespace runs so the title is one clean line.
    first_sentence = " ".join(text.split('.')[0].split())
    if 20 <= len(first_sentence) <= 150:
        return first_sentence
    return "Untitled Document"


In [9]:

def extract_author(text):
    """Guess the document author.

    The first PERSON entity spaCy finds in the first 2000 characters wins.
    Failing that, the first 1000 characters are searched for a
    "by" / "author" / "written by" marker followed by two words, which are
    returned title-cased.

    Args:
        text: Full document text.

    Returns:
        The author name, or "Unknown Author" when nothing matches.
    """
    doc = nlp(text[:2000])
    persons = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    if persons:
        return persons[0]
    # \b anchors the marker at a word start so "by" inside words such as
    # "Standby" or "nearby" is not mistaken for an attribution.
    match = re.search(r"(?i)\b(?:by|author|written by)[: ]+\s*(\w+ \w+)", text[:1000])
    if match:
        return match.group(1).title()
    return "Unknown Author"


In [10]:

def extract_date(text):
    """Return the first DATE entity in the first 5000 characters of `text`.

    When spaCy reports no DATE entity, the current year (four digits, as a
    string) is returned instead.
    """
    for entity in nlp(text[:5000]).ents:
        if entity.label_ == "DATE":
            return entity.text
    return datetime.now().strftime("%Y")

def extract_keyphrases(text):
    """Return up to seven keyphrases (1-3 words each) found by KeyBERT.

    KeyBERT yields (phrase, score) pairs; only the phrases are returned, in
    the order KeyBERT produced them.
    """
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        top_n=7,
    )
    phrases = []
    for entry in scored_phrases:
        phrases.append(entry[0])
    return phrases

def map_topics(keyphrases):
    """Map keyphrases to the topic labels defined in TOPIC_MAP.

    A topic is selected when any of its keywords occurs as a substring of any
    lower-cased keyphrase.

    Args:
        keyphrases: Iterable of keyphrase strings.

    Returns:
        Matching topic labels in TOPIC_MAP order (so the result is the same
        on every run), or ["General"] when nothing matches.
    """
    # Lower-case once instead of once per keyword comparison; materialising
    # the list also makes one-shot iterables safe to scan for every topic.
    lowered = [phrase.lower() for phrase in keyphrases]
    topics = [
        topic
        for topic, keywords in TOPIC_MAP.items()
        if any(kw in phrase for phrase in lowered for kw in keywords)
    ]
    return topics or ["General"]


In [11]:

def analyze_sentiment(text):
    """Classify `text` as "Positive", "Negative" or "Neutral".

    Counts tokens whose spaCy `sentiment` value is above zero against those
    below zero; the larger count decides and a tie gives "Neutral".

    NOTE(review): the result depends on the loaded spaCy pipeline actually
    populating `token.sentiment` - confirm that en_core_web_md does so.
    """
    positive_count = 0
    negative_count = 0
    for token in nlp(text):
        score = token.sentiment
        if score > 0:
            positive_count += 1
        elif score < 0:
            negative_count += 1

    if positive_count > negative_count:
        return "Positive"
    if negative_count > positive_count:
        return "Negative"
    return "Neutral"

def analyze_readability(text):
    """Label `text` by its average number of words per sentence.

    Sentences are the non-blank pieces left after splitting on '.', '!' or
    '?'. Returns "Unknown" when there are none, "Technical" when the average
    exceeds 25 words per sentence, and "Standard" otherwise.
    """
    sentence_total = sum(1 for piece in re.split(r'[.!?]', text) if piece.strip())
    if sentence_total == 0:
        return "Unknown"

    words_per_sentence = len(text.split()) / sentence_total
    if words_per_sentence > 25:
        return "Technical"
    return "Standard"

def generate_summary(text):
    """Summarise `text` with the shared summarization pipeline.

    Texts with fewer than 100 whitespace-separated words are not summarised;
    a fixed notice is returned instead. Longer texts are cut to their first
    3000 characters before being passed to the model.
    """
    word_total = len(text.split())
    if word_total >= 100:
        outputs = summarizer(text[:3000], max_length=150, min_length=30)
        return outputs[0]['summary_text']
    return "Document too short for meaningful summary"


In [12]:

def generate_metadata(text):
    """Build the complete metadata dictionary for a document.

    Args:
        text: Full document text.

    Returns:
        A dict with the keys "title", "author", "date", "keyphrases",
        "topics", "summary", "sentiment", "readability" and "stats", where
        "stats" holds "word_count", "char_count" and "sentence_count".
    """
    # Keyphrase extraction runs a transformer model, so do it once and reuse
    # the result for both the "keyphrases" and the "topics" fields.
    keyphrases = extract_keyphrases(text)
    sentences = [s for s in re.split(r'[.!?]', text) if s.strip()]
    return {
        "title": extract_title(text),
        "author": extract_author(text),
        "date": extract_date(text),
        "keyphrases": keyphrases,
        "topics": map_topics(keyphrases),
        "summary": generate_summary(text),
        "sentiment": analyze_sentiment(text),
        "readability": analyze_readability(text),
        "stats": {
            "word_count": len(text.split()),
            "char_count": len(text),
            "sentence_count": len(sentences)
        }
    }


In [13]:

# %%
# Process sample document
# Read the demo PDF back in and show at most its first 500 characters.
print("Extracting content from sample_document.pdf...")
text_content = extract_content("sample_document.pdf")
print("\nExtracted Text Preview:", text_content[:500] + "...\n", sep="\n")

# %%
# Generate metadata
# Build the metadata dictionary from the extracted text.
print("Generating metadata...")
metadata = generate_metadata(text_content)


Extracting content from sample_document.pdf...

Extracted Text Preview:
MetaMuse Sample Document
MetaMuse is an advanced automated metadata generation system developed by Acme Corp.
It uses natural language processing to extract meaningful information from documents.
Key features include:
- Multi-format support (PDF, DOCX, TXT)
- OCR capabilities for image-based content
- Semantic analysis of document content
- Structured metadata output in JSON format
This document was generated on June 25, 2025 to demonstrate the system's capabilities.
Page 1...

Generating metadata...


In [14]:

# %%
# Display results
# Assemble the plain-text report line by line and emit it with a single print.
print("\n".join([
    "Generated Metadata:",
    f"Title: {metadata['title']}",
    f"Author: {metadata['author']}",
    f"Date: {metadata['date']}",
    f"Key Phrases: {', '.join(metadata['keyphrases'])}",
    f"Topics: {', '.join(metadata['topics'])}",
    f"Sentiment: {metadata['sentiment']}",
    f"Readability: {metadata['readability']}",
    f"\nSummary:\n{metadata['summary']}",
    "\nDocument Stats:",
    f"- Word Count: {metadata['stats']['word_count']}",
    f"- Character Count: {metadata['stats']['char_count']}",
    f"- Sentence Count: {metadata['stats']['sentence_count']}",
]))


Generated Metadata:
Title: MetaMuse Sample Document
Author: Acme Corp
Date: June 25, 2025
Key Phrases: sample document metamuse, document metamuse, document metamuse advanced, metamuse sample document, automated metadata generation, automated metadata, advanced automated metadata
Topics: Technology
Sentiment: Neutral
Readability: Standard

Summary:
Document too short for meaningful summary

Document Stats:
- Word Count: 69
- Character Count: 478
- Sentence Count: 4


In [15]:

# %%
# Save metadata to JSON
# Persist the metadata dictionary as JSON indented by two spaces.
with open("sample_metadata.json", "w") as f:
    f.write(json.dumps(metadata, indent=2))
print("\nMetadata saved to sample_metadata.json")

# %%
# Display JSON output
# Echo the same JSON document to the cell output.
print("\nJSON Metadata Output:", json.dumps(metadata, indent=2), sep="\n")



Metadata saved to sample_metadata.json

JSON Metadata Output:
{
  "title": "MetaMuse Sample Document",
  "author": "Acme Corp",
  "date": "June 25, 2025",
  "keyphrases": [
    "sample document metamuse",
    "document metamuse",
    "document metamuse advanced",
    "metamuse sample document",
    "automated metadata generation",
    "automated metadata",
    "advanced automated metadata"
  ],
  "topics": [
    "Technology"
  ],
  "summary": "Document too short for meaningful summary",
  "sentiment": "Neutral",
  "readability": "Standard",
  "stats": {
    "word_count": 69,
    "char_count": 478,
    "sentence_count": 4
  }
}
