# Day 1: Ingest and Index Your Data
- https://docs.google.com/document/d/1ssAeSDpxF-FG8fGsqvAQrTLalVk91f-p9O2nbXiio5U/edit?tab=t.0#heading=h.tjb2t5qu4ajr

### Frontmatter

`python-frontmatter` is a library for parsing frontmatter — a block of metadata (usually YAML) at the top of a Markdown file, commonly used by frameworks such as Jekyll, Hugo, and Next.js.

In [2]:
import frontmatter

# Parse the example document: the metadata block and the body are split apart
with open('example.md', 'r', encoding='utf-8') as md_file:
    post = frontmatter.load(md_file)

metadata = post.metadata

# Metadata fields declared in the frontmatter block
print("Title:", metadata['title'])
print("Tags:", metadata['tags'])

# The markdown body, with the frontmatter block stripped off
print("Content:", post.content)

Title: Getting Started with AI
Tags: ['ai', 'machine-learning', 'tutorial']
Content: # Getting Started with AI

This is the main content of the document written in **Markdown**.

You can include code blocks, links, and other formatting here.


### Working with Zip Archives

We load the zip archive into the memory of our Python process and extract all the data we need from it, without saving anything to disk.

So the plan:
- Use requests for downloading the zip archive from GitHub
- Open the archive using built-in zipfile and io modules
- Iterate over all .md and .mdx files in the repo
- Collect the results into a list



In [10]:
import io
import zipfile
import requests
import frontmatter
from pprint import pprint

In [7]:
# Download the zip file (the archive bytes stay in memory as resp.content)
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
# Without a timeout requests waits indefinitely if the server stops responding
resp = requests.get(url, timeout=30)
# Fail here on an HTTP error rather than later, when the body is opened as a zip
resp.raise_for_status()
print(resp.status_code)

200


In [8]:
# Process the zip file in memory without saving it to disk
repository_data = []

# Create a ZipFile object from the downloaded content; the context manager
# closes the archive even if parsing one of the files raises
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # Keep the path exactly as stored in the archive; lowercase is used
        # only for the case-insensitive extension check below
        filename = file_info.filename

        # Only process markdown files (.md and .mdx)
        if not filename.lower().endswith(('.md', '.mdx')):
            continue

        # Read and parse each file
        with zf.open(file_info) as f_in:
            content = f_in.read()

        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)


In [11]:
# Pretty-print one parsed record to inspect the keys produced by post.to_dict()
pprint(repository_data[1])

{'content': 'The next cohort starts January 13th, 2025. More info at '
            '[DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n'
            '\n'
            '- Register before the course starts using this '
            '[link](https://airtable.com/shr6oVXeQvSI5HuWD).\n'
            '- Join the [course Telegram channel with '
            'announcements](https://t.me/dezoomcamp).\n'
            "- Don’t forget to register in DataTalks.Club's Slack and join the "
            'channel.',
 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md',
 'id': '9e508f2212',
 'question': 'Course: When does the course start?',
 'sort_order': 1}


In [12]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as a zip archive from codeload.github.com and
    processed entirely in memory; nothing is written to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds passed to requests.get as its timeout

    Returns:
        List of dictionaries, one per .md/.mdx file, holding the result of
        post.to_dict() plus the file's path inside the archive under
        'filename'

    Raises:
        Exception: If the download does not return HTTP 200
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # Without a timeout requests would wait indefinitely on a stalled server
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even on an unexpected error
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Case-insensitive extension check; the stored name keeps its case
            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            # Best effort: one unparsable file is reported and skipped so it
            # does not abort the whole repository
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data


In [13]:
# Fetch both documentation sets with the shared loader
dtc_faq = read_repo_data(repo_owner='DataTalksClub', repo_name='faq')
evidently_docs = read_repo_data(repo_owner='evidentlyai', repo_name='docs')

# Report how many markdown documents each repository yielded
print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")

FAQ documents: 1219
Evidently documents: 95
