# Data Preparation Notebook for Fine-tuning Llama 3.1-8B-Instruct


In [None]:
# Clone the portfolio repository into ./portfolio; a later cell walks that
# directory for .mdx files (IPython shell escape, only valid in a notebook).
!git clone https://github.com/zackproser/portfolio

In [None]:
# Import necessary libraries
import os
import re
import json
from markdown import markdown
from bs4 import BeautifulSoup
import subprocess

In [None]:
# Cell 2: Define helper functions (from clean-markdown-corpus.py)
def markdown_to_text(markdown_string):
    """Render markdown to HTML, then return only the text nodes of that HTML."""
    rendered_html = markdown(markdown_string)
    # Parsing the rendered HTML and asking for its text drops every tag.
    return BeautifulSoup(rendered_html, "html.parser").get_text()

def clean_text(text, max_line_length=200):
    """Normalize whitespace and break overly long lines into sentences.

    Each line is stripped of leading/trailing whitespace and blank lines are
    dropped. Any remaining line longer than ``max_line_length`` characters is
    split after sentence-ending punctuation (``.``, ``!`` or ``?``) followed
    by one or more spaces, so each sentence ends up on its own line.

    Args:
        text: The text to clean.
        max_line_length: Lines strictly longer than this many characters are
            split into sentences. Defaults to 200.

    Returns:
        The cleaned lines joined with ``"\\n"``.
    """
    # The lookbehind keeps the punctuation attached to the preceding sentence.
    sentence_boundary = re.compile(r'(?<=[.!?]) +')

    output_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if len(line) > max_line_length:
            output_lines.extend(sentence_boundary.split(line))
        else:
            output_lines.append(line)

    return "\n".join(output_lines)

# Cell 3: Process MDX files
def process_mdx_files(directory):
    """Convert every ``.mdx`` file under *directory* to cleaned plain text.

    The tree is walked recursively. Each ``.mdx`` file is read as UTF-8,
    passed through ``markdown_to_text`` and then ``clean_text``.

    Directories and file names are visited in sorted order so the returned
    list (and any dataset built from it) is reproducible across runs and
    filesystems; ``os.walk`` alone yields entries in arbitrary order.

    Args:
        directory: Root directory to search.

    Returns:
        A list with one cleaned text string per ``.mdx`` file found. Empty if
        the directory does not exist or contains no ``.mdx`` files.
    """
    processed_content = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place also controls the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.mdx'):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                raw_text = f.read()
            processed_content.append(clean_text(markdown_to_text(raw_text)))
    return processed_content

def _find_metadata_call_end(text, start):
    """Return the index of the ``)`` that closes a call whose ``(`` ends at *start*.

    Parentheses inside quoted strings are ignored and nested parentheses are
    balanced. A quote left open at the end of a line is treated as closed so a
    stray apostrophe cannot swallow the rest of the document. Returns ``None``
    if no balancing parenthesis is found.
    """
    depth = 1
    quote = None
    i = start
    while i < len(text):
        ch = text[i]
        if quote is not None:
            if ch == '\\':
                i += 2  # skip the escaped character
                continue
            if ch == quote or ch == '\n':
                quote = None
        elif ch in '"\'`':
            quote = ch
        elif ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return None


def extract_metadata(content):
    """Extract ``key: value`` pairs from an ``export const metadata = createMetadata(...)`` call.

    Values may be quoted (single, double or backtick quotes, on one line) or
    bare tokens. Quoted values may contain ``:``, ``,``, ``)`` and escaped
    quotes without being truncated or mistaken for the next key.

    Args:
        content: Text that may contain the metadata call.

    Returns:
        A dict mapping each key to its string value; empty if the call is not
        present or is never closed.
    """
    metadata = {}
    call = re.search(r'export const metadata = createMetadata\(', content)
    if call is None:
        return metadata

    start = call.end()
    end = _find_metadata_call_end(content, start)
    if end is None:
        # Unbalanced input: fall back to the first ")" after the call.
        end = content.find(')', start)
        if end == -1:
            return metadata
    metadata_str = content[start:end]

    pair_pattern = re.compile(
        r'(?P<key>\w+)\s*:\s*'
        # A quoted value: anything up to the matching, unescaped quote.
        r'(?:(?P<quote>["\'`])(?P<quoted>(?:\\.|(?!(?P=quote)).)*)(?P=quote)'
        # Otherwise a bare token up to the next comma, newline or brace.
        r'|(?P<bare>[^,\n{}]*))'
    )
    for match in pair_pattern.finditer(metadata_str):
        key = match.group('key')
        if match.group('quote') is not None:
            # Undo escaping of quotes/backslashes inside the literal.
            value = re.sub(r'\\(["\'`\\])', r'\1', match.group('quoted')).strip()
        else:
            # A bare token may still carry a stray quote from a value that
            # spans lines; strip it as the previous implementation did.
            value = match.group('bare').strip().strip('"').strip("'")
        metadata[key] = value
    return metadata

def clean_content(content):
    """Strip MDX scaffolding and markdown markers, leaving the article text.

    Removes, in order:
      * lines that start with an ``import`` statement,
      * the ``export const metadata ... })`` block,
      * lines that start with ``export default``,
      * the characters ``#``, ``*`` and backtick,
      * empty lines and lines that look like object properties (``word: ...``).

    Args:
        content: The text to clean.

    Returns:
        The remaining lines joined with newlines, with leading and trailing
        whitespace removed.
    """
    # Anchor to the start of a line so prose that merely contains "import"
    # (e.g. "important") or "export default" mid-sentence is not truncated.
    content = re.sub(r'^[ \t]*import\b.*(?:\n|$)', '', content, flags=re.MULTILINE)
    # Remove the metadata export block (may span several lines).
    content = re.sub(r'export const metadata.*?}\)', '', content, flags=re.DOTALL)
    content = re.sub(r'^[ \t]*export default\b.*(?:\n|$)', '', content, flags=re.MULTILINE)
    # Remove special characters and markdown syntax
    content = re.sub(r'[#*`]', '', content)
    # Drop empty lines and lines that look like object properties
    kept_lines = [
        line
        for line in content.split('\n')
        if line.strip() and not re.match(r'^\w+:\s', line)
    ]
    return '\n'.join(kept_lines).strip()

In [None]:
# Process MDX files in the portfolio directory.
# 'portfolio' is a relative path, so this expects the cloned repository to sit
# in the current working directory. The result is a list of cleaned plain-text
# strings, one per .mdx file found.
mdx_content = process_mdx_files('portfolio')

In [None]:
def create_alpaca_entry(content):
    """Create an Alpaca-format training record for one document.

    The instruction asks for an article named after the document's ``title``
    metadata, the input is empty, and the output is the cleaned document body.

    Args:
        content: Text of one document, including its metadata block if any.

    Returns:
        A dict with the keys ``"instruction"``, ``"input"`` and ``"output"``.
    """
    metadata = extract_metadata(content)
    cleaned_content = clean_content(content)

    # Fall back to the generic name when the title is missing OR empty; a
    # plain .get() default would let an empty title through and produce
    # 'Write an article about ""'.
    title = metadata.get('title', '').strip('"')
    article_name = title or 'an article'

    return {
        "instruction": f"Write an article about \"{article_name}\"",
        "input": "",
        "output": cleaned_content
    }

In [None]:
# Build one Alpaca-format record per processed document.
# NOTE(review): mdx_content entries have already been flattened by
# markdown_to_text/clean_text inside process_mdx_files — confirm the metadata
# block still survives that step in a form extract_metadata can read.
alpaca_data = list(map(create_alpaca_entry, mdx_content))

In [None]:
# Save Alpaca format data to a file (UTF-8, non-ASCII characters kept as-is)
with open('training_data.json', 'w', encoding='utf-8') as f:
    json.dump(alpaca_data, f, ensure_ascii=False, indent=2)

print(f"Processed {len(alpaca_data)} entries and saved to training_data.json")

# Display sample entries (for verification)
print("Sample entries:")
for entry in alpaca_data[:3]:  # Display first 3 entries
    # Use the same ensure_ascii setting as the saved file so the preview shows
    # exactly what was written instead of \uXXXX escapes.
    print(json.dumps(entry, ensure_ascii=False, indent=2))
    print("---")