In [74]:
import os
import re
import yaml
import json
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline


In [75]:
# Show the working directory; ROOT_PATH is derived from it further down.
print(f"cwd: {Path.cwd()}")

cwd: c:\Users\huber\netapp_team_1\DATA992-Summer-2025-Team1\metadata-concepts


In [76]:
# File suffixes (without the leading dot) used as the default for extract_texts.
EXTENSIONS = ("adoc", "yaml", "yml", "json")

# Dataset root: <parent of the working directory>/data/bluexp-dataset.
ROOT_PATH = Path.cwd().parent.joinpath("data", "bluexp-dataset")

print(ROOT_PATH)

c:\Users\huber\netapp_team_1\DATA992-Summer-2025-Team1\data\bluexp-dataset


In [77]:
# # Config
# ROOT_PATH = "/data/bluexp-dataset"
# EXTENSIONS = ("adoc", "yaml", "yml", "json")

In [78]:
# Candidate labels handed to the zero-shot classifier.
DOC_TYPE_LABELS = ["concept", "task", "reference", "overview", "tutorial"]
PERSONA_LABELS = [
    "API Developer",
    "Storage Administrator",
    "DevOps Engineer",
    "Systems Administrator",
    "IT Director",
]

# Hugging Face pipelines; device=-1 runs both models on the CPU.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=-1,
)
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=-1,
)

Device set to use cpu
Device set to use cpu


In [79]:
# Text Extraction
def extract_texts(repo_path, extensions=EXTENSIONS):
    """Recursively read every file under ``repo_path`` that has a wanted suffix.

    Args:
        repo_path: Directory to walk (``str`` or ``Path``).
        extensions: Suffixes to match, written without the leading dot.

    Returns:
        A list of ``(path_as_str, content, extension)`` tuples, grouped by
        extension in the order given and sorted by path within each group.
        Files that cannot be read are reported on stdout and skipped.
    """
    root = Path(repo_path)
    texts = []
    for ext in extensions:
        # rglob yields entries in filesystem order, which differs between
        # machines; sorting keeps the resulting catalog reproducible.
        for path in sorted(root.rglob(f"*.{ext}")):
            # A directory can match the glob too (e.g. "schemas.json/").
            if not path.is_file():
                continue
            try:
                # errors='ignore' drops undecodable bytes instead of raising.
                content = path.read_text(encoding='utf-8', errors='ignore')
            except OSError as e:
                print(f"⚠️ Failed to read {path}: {e}")
                continue
            texts.append((str(path), content, ext))
    return texts


In [80]:
# Metadata extraction helpers
def split_frontmatter(text):
    """Split a leading YAML front-matter block off ``text``.

    Front matter is recognized only when the first line is ``---`` and some
    later line is ``---`` as well (trailing whitespace allowed on both).
    Matching whole lines keeps a ``----`` delimiter, or a ``---`` inside a
    value, from being mistaken for a front-matter fence.

    Args:
        text: Full file content.

    Returns:
        ``(front, body)``. ``front`` is always a dict. ``body`` is whatever
        follows the closing ``---`` (starting with that line's own newline).
        When there is no front matter, or the block is not valid YAML, or it
        does not parse to a mapping, the result is ``({}, text)`` with
        ``text`` untouched.
    """
    # A UTF-8 byte-order mark would otherwise hide the opening delimiter.
    source = text[1:] if text.startswith('\ufeff') else text
    lines = source.splitlines(keepends=True)
    if not lines or lines[0].rstrip() != '---':
        return {}, text

    for pos in range(1, len(lines)):
        if lines[pos].rstrip() != '---':
            continue
        try:
            loaded = yaml.safe_load(''.join(lines[1:pos]))
        except yaml.YAMLError:
            # Unparseable header: treat the file as having no front matter.
            return {}, text
        if loaded is None:
            # An empty block between the fences is valid, just empty.
            loaded = {}
        if not isinstance(loaded, dict):
            # Scalars and lists are not metadata; callers rely on .get().
            return {}, text
        body = lines[pos][3:] + ''.join(lines[pos + 1:])
        return loaded, body

    # Opening fence without a closing one.
    return {}, text

# Only for .adoc bodies
def extract_title(body):
    """Return the text of the first ``= Title`` line in ``body``.

    Args:
        body: Document text with any front matter already removed.

    Returns:
        The title with surrounding whitespace stripped, or ``None`` when no
        line starts with ``"= "``.
    """
    for line in body.splitlines():
        if line.startswith('= '):
            # Drop exactly the "= " marker. lstrip('= ') strips a character
            # set and would also eat '=' characters that open the title.
            return line[2:].strip()
    return None

def extract_headings(body, level=2):
    """Return the text of every heading written with exactly ``level`` ``=`` signs.

    Args:
        body: Document text.
        level: Number of leading ``=`` characters to match.

    Returns:
        Heading texts in document order, without trailing whitespace.
    """
    # [ \t]+ instead of \s+: under MULTILINE, \s+ can run across a newline and
    # turn the line after a bare "==" into a bogus heading. \S requires the
    # heading to have actual text.
    pattern = rf"^={{{level}}}[ \t]+(\S.*?)[ \t]*$"
    return re.findall(pattern, body, flags=re.MULTILINE)

def extract_prerequisites(body):
    """Collect the non-empty lines of the first ``Prerequisites`` section.

    Capture starts after a heading line matching ``==+ Prerequisites`` and
    stops at the next heading of level two or deeper.

    Args:
        body: Document text.

    Returns:
        The section's lines with surrounding whitespace and ``-``/``*``
        markers stripped. Lines that are empty after stripping (blank lines,
        ``----`` or ``****`` delimiters) are omitted.
    """
    items = []
    in_section = False
    for line in body.splitlines():
        if re.match(r"^==+\s+Prerequisites", line):
            in_section = True
            continue
        if not in_section:
            continue
        if re.match(r"^==+\s+", line):
            break
        # Strip whitespace first so tab-indented bullets lose their marker
        # too, then test what is left: a delimiter-only line strips to "".
        item = line.strip().strip('-* ')
        if item:
            items.append(item)
    return items

def estimate_reading_time(body, wpm=200):
    """Estimate reading time in minutes, never reporting less than one.

    Args:
        body: Text to measure; words are runs of ``\\w`` characters.
        wpm: Assumed reading speed in words per minute.

    Returns:
        Word count floor-divided by ``wpm``, with a minimum of 1.
    """
    word_total = sum(1 for _ in re.finditer(r"\w+", body))
    whole_minutes = word_total // wpm
    return whole_minutes if whole_minutes > 1 else 1

def extract_api_endpoints(body):
    """Find ``METHOD /path`` mentions in ``body``.

    Only upper-case GET, POST, PUT, DELETE and PATCH are recognized, and the
    path must start with ``/``.

    Returns:
        One ``"METHOD /path"`` string per match, in order of appearance.
    """
    endpoint_re = re.compile(r"\b(GET|POST|PUT|DELETE|PATCH)\s+(/[\w_{}\-\[\]/]+)")
    endpoints = []
    for hit in endpoint_re.finditer(body):
        verb, route = hit.groups()
        endpoints.append(f"{verb} {route}")
    return endpoints
    

def extract_tags_tfidf(corpus, top_n=5):
    """Pick up to ``top_n`` highest-weighted TF-IDF terms for each document.

    Args:
        corpus: List of document strings.
        top_n: Maximum number of tags per document.

    Returns:
        A list parallel to ``corpus``. Each entry is a list of unigram or
        bigram strings, highest weight first. Documents without any word
        characters, or whose terms are all pruned, get an empty list.
    """
    idxs = [i for i, doc in enumerate(corpus) if re.search(r"\w+", doc)]
    tags = [[] for _ in corpus]
    if not idxs:
        return tags
    filtered = [corpus[i] for i in idxs]

    # With a single document, max_df=0.8 works out to fewer documents than
    # the default min_df=1 and scikit-learn refuses to fit.
    max_df = 0.8 if len(filtered) > 1 else 1.0
    vect = TfidfVectorizer(max_df=max_df, stop_words='english', ngram_range=(1, 2))
    try:
        X = vect.fit_transform(filtered)
    except ValueError:
        # Raised when no term survives stop-word / document-frequency pruning.
        return tags

    feats = vect.get_feature_names_out()
    for row, orig in enumerate(idxs):
        weights = X[row].toarray().ravel()
        ranked = weights.argsort()[::-1][:top_n]
        # Skip zero-weight features: they do not occur in this document and
        # would otherwise pad short documents with unrelated terms.
        tags[orig] = [str(feats[j]) for j in ranked if weights[j] > 0]
    return tags


In [81]:
#Summarizer
def summarize_purpose(text, max_chars=1000, max_length=60, min_length=10):
    """Summarize the opening of ``text`` with the module-level ``summarizer``.

    Args:
        text: Document body.
        max_chars: Number of leading characters handed to the model.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.

    Returns:
        The generated summary with surrounding whitespace stripped.

    Note:
        Blank or very short input is passed to the model unchanged; callers
        that want to skip such documents should check before calling.
    """
    snippet = text[:max_chars]
    out = summarizer(
        snippet,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Guards the model's input limit if max_chars is raised.
        truncation=True,
    )
    return out[0]['summary_text'].strip()

In [82]:
#Pipeline
texts_info = extract_texts(ROOT_PATH)

# Parse every file exactly once; the bodies feed both the TF-IDF corpus and
# the per-file metadata loop below.
parsed_docs = []
for path, content, ext in texts_info:
    front, body = {}, content
    if ext == 'adoc':
        try:
            front, body = split_frontmatter(content)
        except yaml.YAMLError as e:
            # One malformed header should not abort the whole (slow) run.
            print(f"⚠️ Bad front matter in {path}: {e}")
            front, body = {}, content
        if not isinstance(front, dict):
            # The lookups below use front.get(); ignore scalar/list headers.
            front = {}
    parsed_docs.append((path, ext, front, body))

# Build corpus for TF-IDF tagging
corpus = [body for _, _, _, body in parsed_docs]
tag_lists = extract_tags_tfidf(corpus)

all_metadata = []
for idx, (path, ext, front, body) in enumerate(parsed_docs):
    is_adoc = ext == 'adoc'
    file_name = os.path.basename(path)
    title = (extract_title(body) if is_adoc else None) or file_name

    # A blank body gives the summarizer nothing to condense, so skip the model.
    purpose = summarize_purpose(body) if body.strip() else ""

    # Front-matter values win; classify only when there is a purpose to score.
    doc_type = front.get('doc_type')
    persona = front.get('persona')
    if purpose:
        if not doc_type:
            doc_type = classifier(purpose, DOC_TYPE_LABELS)['labels'][0]
        if not persona:
            persona = classifier(purpose, PERSONA_LABELS)['labels'][0]

    metadata = {
        'source': path,
        'file_type': ext,
        'title': title,
        'purpose': purpose,
        'persona': persona,
        'doc_type': doc_type,
        'prerequisites': extract_prerequisites(body) if is_adoc else [],
        'difficulty': front.get('difficulty'),
        'key_tasks': extract_headings(body, level=2) if is_adoc else [],
        'api_endpoints': extract_api_endpoints(body) if is_adoc else [],
        'reading_time_min': estimate_reading_time(body),
        'version': front.get('version'),
        'last_updated': front.get('last_updated'),
        'tags': tag_lists[idx]
    }
    all_metadata.append(metadata)

Your max_length is set to 60, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 60, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Your max_length is set to 60, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Your max_length is set to 60, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max

In [83]:
#Export JSON catalog
with open('metadata_catalog.json', 'w', encoding='utf-8') as jf:
    # Front-matter values come from YAML, which can produce date/datetime
    # objects that json cannot encode; default=str writes those as text.
    json.dump(all_metadata, jf, ensure_ascii=False, indent=2, default=str)

# Prefix every suffix with its dot (previously only the first one had it).
ext_list = ", ".join(f".{ext}" for ext in EXTENSIONS)
print(f"Generated metadata for {len(all_metadata)} files ({ext_list}). "
      "Written to metadata_catalog.json.")

Generated metadata for 193 files (.adoc,yaml,yml,json). Written to metadata_catalog.json.
