In [1]:
%load_ext autoreload
%autoreload 2

import json
import logging
import os
import shutil
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path

import libcst as cst
from tqdm import tqdm

from typet5.data import GitRepo
from typet5.type_env import collect_annots_info, mypy_checker
from typet5.utils import proj_root, read_file, write_file

os.chdir(proj_root())

Note: This notebook preprocess the downloaded dataset into a format used by the training pipeline. This is useful if you only want to preprocess the data but not performing the training right away. Otherwise, you should directly run the `train_model.py` script and it will automatically preprocess or load the dataset for you according to the experiment configuration.

In [1]:
# Run form here: Analyzing TokenizedSrcSet.

%load_ext autoreload
%autoreload 2

import pickle

from typet5 import proj_root
from typet5.data import get_dataset_dir, get_tk_dataset_name, PreprocessArgs
import typet5.function_dataset as fd
from typet5.utils import Path, run_long_task, DefaultTokenizer, not_none
import subprocess

dataset_name = "ManyTypes4Py"
# repos_split_path = proj_root() /  "data/repos_split.pkl"
repos_dir = get_dataset_dir("ManyTypes4Py") / "repos"

recreate = False
func_only = True # whether to create functional data (for TypeT5) or chunk data (for CodeT5)
pre_args = PreprocessArgs()
data_reduction = 1

tk_src_name = get_tk_dataset_name(
    dataset_name, pre_args, func_only, data_reduction=data_reduction,
)
datasets_path = get_data_dir() / "SPOT-data" / tk_src_name

In [2]:
from typet5.data import create_tokenized_srcsets, load_tokenized_srcsets
if recreate or not datasets_path.exists():
    create_tokenized_srcsets(
        proj_root() / "data/repos_split.pkl",
        datasets_path,
        func_only=func_only,
        pre_args=pre_args,
        data_reduction=data_reduction,
    )
tk_dataset = load_tokenized_srcsets(
    datadir,
    tk_src_name,
)


Starting task: Generating TokenizedSrcSets: func_datasets-v5-PreprocessArgs(drop_env_types=False)


Generating dataset from repos: 100%|██████████| 573/573 [08:02<00:00,  1.19it/s]
Generating dataset from repos: 100%|██████████| 40/40 [02:12<00:00,  3.32s/it]
Generating dataset from repos: 100%|██████████| 50/50 [01:32<00:00,  1.85s/it]


Saved source datasets to: /mnt/data0/jiayi/SPOT-data/func_datasets-v5-PreprocessArgs(drop_env_types=False)
777M	/mnt/data0/jiayi/SPOT-data/func_datasets-v5-PreprocessArgs(drop_env_types=False)
Pushover: (Finished: 'Generating TokenizedSrcSets: func_datasets-v5-PreprocessArgs(drop_env_types=False)'.) Time taken: 736.9s


In [None]:
import plotly.express as px
from pandas import DataFrame

from typet5.utils import cumulative_counts

len_counts = [len(src.tokenized_code) for src in tk_dataset["train"].all_srcs]
xs, ys = cumulative_counts(len_counts)
px.line(
    DataFrame({"tokens_per_file": xs, "n_files": ys}), x="tokens_per_file", y="n_files"
)

In [3]:
print("dataset:", datasets_path)
tk_dataset["train"].print_stats()

dataset: /mnt/data0/jiayi/SPOT-data/func_datasets-v5-PreprocessArgs(drop_env_types=False)
num_repos: 572
num_files: 134977
num_lines: 5752662
num_labels: 293592
main_tokens_per_file:
   mean: 1578.6
   median: 717
   min: 17
   max: 97666
preamble_tokens_per_file:
   mean: 196.41
   median: 134
   min: 2
   max: 6729
target_tks_per_file:
   mean: 7.8197
   median: 5
   min: 2
   max: 725


In [3]:
print("dataset:", datasets_path)
tk_dataset["train"].print_stats()

dataset: /mnt/data0/jiayi/SPOT-data/func_datasets-v4-PreprocessArgs(drop_env_types=False)
num_repos: 572
num_files: 134977
num_lines: 16579520
num_labels: 293592
main_tokens_per_file:
   mean: 1585
   median: 717
   min: 17
   max: 94019
preamble_tokens_per_file:
   mean: 970.94
   median: 532
   min: 6
   max: 49334
target_tks_per_file:
   mean: 7.8197
   median: 5
   min: 2
   max: 725


In [8]:
from typet5.data import load_tokenized_srcsets

# tk_dataset = load_tokenized_srcsets(get_data_dir(), get_dataset_name(pre_args, func_only))

long_files=sorted(tk_dataset["train"].all_srcs, key=lambda s: len(s.tokenized_code),reverse=True)

In [12]:
print(long_files[8].preamble_code)

from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
class Item(BaseModel):
    ...
class Message(BaseModel):
    ...



In [7]:
print(str(tk_dataset["train"].all_srcs[2345]))

file:devonhollowood__adventofcode/2018.day10/Point.velocity
repo:devonhollowood__adventofcode
--------Preamble--------
import argparse
import re
import typing
import unittest
from functools import lru_cache
from dataclasses import dataclass
@dataclass(...)
class Vector:
    x: ...
    y: ...
@dataclass(...)
class Point:
    position: ...
    velocity: ...
    def step(self): ...
    velocity: ...
    position: ...
def bounding_circumference(points): ...
@lru_cache(...)
def run_until_compact(points): ...
@lru_cache(...)
def parse(puzzle): ...
def part1(puzzle): ...
def part2(puzzle): ...
def main(): ...
if __name__ == '__main__':
    main()
class ExampleTest(unittest.TestCase):
    example = ...
    expected = ...
    def test_part1(self): ...
    def test_part2(self): ...
    example: ...
    assertEqual: ...
    expected: ...

--------Main Code--------

# BEGIN

# 2018.day10
@dataclass(frozen=True)
class Point:
    velocity: <mask>

# END

# 2018.day10
@lru_cache()
def parse(puzzle: s

In [12]:
max(tk_dataset["train"].all_srcs, key=lambda s: len(s.tokenized_code)).print_code(500)

# fastapi.routing/APIRouter
def api_route(
    self,
    path,
    *,
    response_model = None,
    status_code = None,
    tags = None,
    dependencies = None,
    summary = None,
    description = None,
    response_description = "Successful Response",
    responses = None,
    deprecated = None,
    methods = None,
    operation_id = None,
    response_model_include = None,
    response_model_exclude = None,
    response_model_by_alias = True,
    response_model_exclude_unset = False,
    response_model_exclude_defaults = False,
    response_model_exclude_none = False,
    include_in_schema = True,
    response_class = Default(JSONResponse),
    name = None,
    callbacks = None,
    openapi_extra = None,
    generate_unique_id_function = Default(
        generate_unique_id
    ),
):
    def decorator(func):
        self.add_api_route(
            path,
            func,
            response_model=response_model,
            status_code=status_code,
            tags=tags,
        

In [4]:
tk_dataset["train"].print_stats()


num_repos: 572
num_files: 16281
num_lines: 2698669
num_labels: 295457
main_tokens_per_file:
   mean: 1311.6
   median: 608
   min: 8
   max: 55437
preamble_tokens_per_file:
   mean: 301.97
   median: 181
   min: 6
   max: 9784
target_tks_per_file:
   mean: 65.318
   median: 28
   min: 2
   max: 3798
n_files_too_wide: 444
too_wide_ratio: 0.014897
preprocess: PreprocessArgs(imports_in_preamble=True, stub_in_preamble=True, drop_comments=True)


In [8]:
def preamble_len(src):
    return len(src.preamble_code.split("\n"))


weird_src = max(tk_dataset["train"].all_srcs, key=preamble_len)


In [3]:
from typet5.data import load_tokenized_srcsets, get_dataroot

sdata_path = get_dataroot() / "TokenizedSrcSets" / "ManyTypes4Py-v5-PreprocessArgs()"

tk_dataset = load_tokenized_srcsets(sdata_path)

Loading TokenizedSrcSets:  /mnt/nas/jiayi/SPOT/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()
258M	/mnt/nas/jiayi/SPOT/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()


In [6]:
tk_dataset["test"].print_stats()

num_repos: 50
num_files: 949
num_lines: 139121
num_labels: 17740
main_tokens_per_file:
   mean: 1270.5
   median: 632
   min: 23
   max: 57953
preamble_tokens_per_file:
   mean: 103.8
   median: 67
   min: 2
   max: 1517
target_tks_per_file:
   mean: 72.285
   median: 32
   min: 2
   max: 1882
n_files_too_wide: 1
too_wide_ratio: 0.00062735
preprocess: PreprocessArgs(imports_in_preamble=True, stub_in_preamble=True, drop_comments=True, max_callees=80, max_callers=20, drop_env_types=True, add_override_usages=False)
