In [1]:
# Define a filter function so that, instead of downloading the whole dataset, we can use the streaming feature to filter it on the fly

def any_keyword_in_string(string, keywords):
    """Return True if at least one entry of `keywords` occurs in `string`.

    An empty `keywords` collection yields False.
    """
    return any(keyword in string for keyword in keywords)

In [3]:
# Let's test it on two examples
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import pandas as pd"
example_2 = "import numpy as np"
# One line matches a keyword, the other does not; both results are printed space-separated.
print(*(any_keyword_in_string(example, filters) for example in (example_1, example_2)))

True False


In [4]:
# We can use this to create a function that will stream the dataset and filter the elements we want

from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset

def filter_streaming_dataset(dataset, filters):
    """Iterate over `dataset` and keep the samples whose content matches a keyword.

    Args:
        dataset: Iterable of dict-like samples; each sample must have a
            "content" entry that is checked against `filters`.
        filters: Keywords forwarded to `any_keyword_in_string`.

    Returns:
        A `Dataset` built column-wise from the retained samples.
    """
    filtered_dict = defaultdict(list)
    total = 0
    kept = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            kept += 1
            # Accumulate column-wise so the result can be turned into a Dataset.
            for k, v in sample.items():
                filtered_dict[k].append(v)
    # Guard against an empty stream so the ratio never divides by zero.
    ratio = kept / total if total else 0.0
    print(f"{kept} out of {total} samples ({ratio:.2%}) were kept after filtering")
    return Dataset.from_dict(filtered_dict)

In [5]:
# Apply this function to the streaming dataset
# This cell will take a very long time to execute, so we will skip it for now. You can run it later if you want to train the model on the full dataset.
from datasets import load_dataset
split = "train"  # or "valid"
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

# streaming=True reads samples lazily instead of downloading the dataset up front.
data = load_dataset(
    f"transformersbook/codeparrot-{split}",
    split=split,
    streaming=True,
)
filtered_data = filter_streaming_dataset(data, filters)

Downloading readme:   0%|          | 0.00/583 [00:00<?, ?B/s]

486999it [07:29, 1572.59it/s]Got disconnected from remote data host. Retrying in 5sec [1/20]
486999it [07:40, 1572.59it/s]Failed to read file 'gzip://file-000000000004.json::https://huggingface.co/datasets/transformersbook/codeparrot-train/resolve/0933803eb0f5956b2da9d2d7b6805fa31b18a6c8/file-000000000004.json.gz' with error <class 'pyarrow.lib.ArrowInvalid'>: JSON parse error: Invalid value. in row 0
487815it [08:31, 953.89it/s] 


ArrowInvalid: JSON parse error: Invalid value. in row 0

In [6]:
# To speed up getting the dataset, we can reuse the filtered dataset from HF hub
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
# Full splits are used here; chain .shuffle().select(range(n)) on either one to subsample.
raw_datasets = DatasetDict({"train": ds_train, "validation": ds_valid})

Downloading and preparing dataset json/huggingface-course--codeparrot-ds-train to C:/Users/Raj/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-train-7e9fc5dfe436a81a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.25G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/Raj/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-train-7e9fc5dfe436a81a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.
Downloading and preparing dataset json/huggingface-course--codeparrot-ds-valid to C:/Users/Raj/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-valid-65557c3279496c87/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/Raj/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-valid-65557c3279496c87/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


Pretraining the language model will take a while, so first run the training loop on a small sample of the data to make sure that training completes successfully and the models are saved. Nothing is more frustrating than a training run failing at the last step because you forgot to create a folder or because there is a typo at the end of the training loop!

In [7]:
# Reload the filtered dataset from the HF hub (cached after the first download) and keep only a small random sample for a quick trial run
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
# Subsample so a trial run of the training loop finishes quickly. The fixed seed
# makes the selected rows identical on every Restart & Run All.
raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle(seed=42).select(range(50000)),
        "validation": ds_valid.shuffle(seed=42).select(range(500)),
    }
)

Found cached dataset json (C:/Users/Raj/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-train-7e9fc5dfe436a81a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
Found cached dataset json (C:/Users/Raj/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-valid-65557c3279496c87/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [8]:
# Display the DatasetDict summary: the available splits with their columns and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 500
    })
})

In [9]:
# let's look at an example
# Fetch the first training row once instead of re-reading it for every key.
first_example = raw_datasets["train"][0]
for key, value in first_example.items():
    # str() keeps the 200-character preview working even if a column is not text.
    print(f"{key.upper()}: {str(value)[:200]}")

REPO_NAME: TobiasHiort/TheGreatEscape
PATH: gui/utils.py
COPIES: 1
SIZE: 52862
CONTENT: #!/usr/bin/python3

import pygame
import sys
import os
import numpy
import math
import time
import subprocess
import doctest # read from txt, read docs
import random
import scipy.spatial as sp
import 
LICENSE: mit


In [None]:
# tokenize the dataset
from transformers import AutoTokenizer
context_length = 128 # keeping it small for now; gpt-2 can handle up to 1024, gpt-3 up to 2048
