# Downloading and preparing data

### Download full open source Azure documentation by cloning the repository from GitHub. 

In [None]:
!git clone --depth 1 --branch main https://github.com/MicrosoftDocs/azure-docs.git 

### Concat all documentation Markdown files into single big file

In [None]:
import glob

processed_count = 0

for file in glob.glob("./azure-docs/articles/**/*.md", recursive=True):
    try:
        print(f"Processing {file}")
        with open(file, "r", encoding="utf-8") as f:
            content = f.read()
        with open("azure-docs.md", "a", encoding="utf-8") as f:
            f.write(content)
        processed_count += 1
    except (UnicodeDecodeError, OSError):
        print(f"Skipping {file}")

print(f"Processed {processed_count} files")

### Split file into training and validation set

In [5]:
with open("azure-docs.md", "r", encoding="utf-8") as f:
    content = f.read()

# Split the content into training and validation sets
split_index = int(len(content) * 0.9)
training_content = content[:split_index]
validation_content = content[split_index:]

# Write the training and validation sets to separate files
with open("azure-docs-training.md", "w", encoding="utf-8") as f:
    f.write(training_content)

with open("azure-docs-validation.md", "w", encoding="utf-8") as f:
    f.write(validation_content)

### Tokenize and save training data

In [8]:
import tiktoken
import torch

# Select GPT2 tokenization
enc = tiktoken.get_encoding("gpt2")

# Get tokens for validation data
with open("azure-docs-validation.md", "r", encoding="utf-8") as f:
    validation = torch.tensor(enc.encode(f.read()))
torch.save(validation, "azure-docs-validation.pt")

# Get tokens for training data
with open("azure-docs-training.md", "r", encoding="utf-8") as f:
    training = torch.tensor(enc.encode(f.read()))
torch.save(training, "azure-docs-training.pt")

In [16]:
print(f"Validation set is {validation.shape[0]} tokens")
print(f"Training set is {training.shape[0]} tokens")

Validation set is 7846459 tokens
Training set is 72299304 tokens
