# Downloading and preparing data

### Download the full open-source Azure documentation by cloning its repository from GitHub

In [None]:
# Shallow clone (--depth 1) of the main branch only, into ./azure-docs: this fetches
# the current articles without the repository's commit history.
!git clone --depth 1 --branch main https://github.com/MicrosoftDocs/azure-docs.git 

### Concatenate all documentation Markdown files into a single large file

In [None]:
import glob
import os

# Destination of the concatenated corpus; the split step reads this exact path.
output_path = "data/azure_docs/azure-docs.md"

# Create the output directory up front. Without it every write raises an OSError,
# which the per-file handler would report as "Skipping" and nothing would be saved.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

processed_count = 0

# Open the output once in "w" mode so that re-running this cell rebuilds the corpus
# instead of appending a second copy to the previous run's file.
with open(output_path, "w", encoding="utf-8") as out:
    # glob returns matches in arbitrary order; sorting keeps the document order,
    # and therefore the later 90/10 split, reproducible between runs.
    for file in sorted(glob.glob("./azure-docs/articles/**/*.md", recursive=True)):
        try:
            print(f"Processing {file}")
            with open(file, "r", encoding="utf-8") as f:
                content = f.read()
        except (UnicodeDecodeError, OSError):
            # Best effort: articles that cannot be read as UTF-8 are left out.
            print(f"Skipping {file}")
            continue
        # Failures while writing the corpus itself are not swallowed: they mean
        # the output is incomplete and should stop the cell.
        out.write(content)
        processed_count += 1

print(f"Processed {processed_count} files")

### Split the file into training and validation sets

In [5]:
# Load the whole concatenated corpus into memory.
with open("data/azure_docs/azure-docs.md", "r", encoding="utf-8") as f:
    content = f.read()

# The first 90% of the characters become the training set and the remainder the
# validation set. The cut is made on a character index, so it can fall mid-line.
split_index = int(len(content) * 0.9)
training_content, validation_content = content[:split_index], content[split_index:]

# Persist each part to its own file, overwriting any earlier output.
for path, text in (
    ("data/azure_docs/azure_docs_training.md", training_content),
    ("data/azure_docs/azure_docs_validation.md", validation_content),
):
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

### Tokenize and save training and validation data

In [2]:
import tiktoken
import torch
import numpy as np

# Select GPT2 tokenization
enc = tiktoken.get_encoding("gpt2")


def _tokenize_to_bin(source_path, target_path, split_name):
    """Tokenize one text file and dump the token ids as raw uint16 values.

    Args:
        source_path: UTF-8 text file to encode.
        target_path: Destination of the flat binary dump (no header is written).
        split_name: Label used in the progress message, e.g. "Training".

    Returns:
        The token ids as a one-dimensional ``np.uint16`` array.
    """
    with open(source_path, "r", encoding="utf-8") as f:
        token_ids = enc.encode_ordinary(f.read())
    print(f"{split_name} size is {len(token_ids):,} tokens")
    # uint16 is only safe while every token id is below 65,536; the GPT-2
    # vocabulary (50,257 entries) satisfies that.
    token_array = np.array(token_ids, dtype=np.uint16)
    token_array.tofile(target_path)
    return token_array


# Get tokens for validation data
val_tokens = _tokenize_to_bin(
    "data/azure_docs/azure_docs_validation.md",
    "data/azure_docs/val.bin",
    "Validation",
)

# Get tokens for training data (previously this was also reported as "Validation")
train_tokens = _tokenize_to_bin(
    "data/azure_docs/azure_docs_training.md",
    "data/azure_docs/train.bin",
    "Training",
)


Validation size is 7,846,459 tokens
Validation size is 72,299,304 tokens
