In [1]:
%load_ext autoreload
%autoreload 2

from typet5.utils import *

# Make the directory returned by `proj_root()` the working directory, so that
# relative paths used later (e.g. Path("mypy_temp")) resolve against it.
# NOTE(review): presumably this is the repository root -- confirm in typet5.utils.
os.chdir(proj_root())

In [2]:
from typet5.function_dataset import data_project_from_dir
from typet5.function_decoding import EvalResult


# Choose the benchmark whose held-out ("test") repositories are evaluated.
dataset_name = "ManyTypes4Py"
# dataset_name = "InferTypes4Py"

# Every sub-directory of <dataset>/repos/test is treated as one project.
dataset_root = get_dataset_dir(dataset_name)
repos_dir = dataset_root / "repos" / "test"
# NOTE(review): `iterdir()` yields entries in arbitrary order, and later cells
# pair `test_projects` positionally with the pickled predictions -- confirm the
# order here matches the order used when the predictions were produced.
test_repo_paths = [entry for entry in repos_dir.iterdir() if entry.is_dir()]

test_projects = pmap(
    data_project_from_dir,
    test_repo_paths,
    desc="Loading test projects",
)
# Fail fast if the dataset directory turned out to be empty.
assert len(test_projects) > 0


Loading test projects: 100%|██████████| 50/50 [00:26<00:00,  1.91it/s]


In [3]:
# Display name -> strategy tag used as the prefix of the pickled result file.
strategies = {
    "Independent": "non-incr",
    "Random": "random",
    "UserToUsee": "caller2callee",
    "UseeToUser": "callee2caller",
    "TwoPass": "double-traversal",
}

# Results of the main model, one pickle per decoding strategy.
eval_dir = get_eval_dir(
    dataset_name,
    "(implicit_imports, new) model-v7--TrainingConfig(drop_env_types=False, add_implicit_rel_imports=True)",
)
evals: dict[str, EvalResult] = {
    label: pickle_load(eval_dir / f"{tag}-EvalResult.pkl")
    for label, tag in strategies.items()
}

# Ablation name -> (eval directory name, result file inside that directory).
ablation_runs = {
    "No Preamble": (
        "(ablation) model-v7--TrainingConfig(imports_in_preamble=False, stub_in_preamble=False, drop_env_types=False, add_implicit_rel_imports=True)",
        "double-traversal-EvalResult.pkl",
    ),
    "No Usees": (
        "(ablation) model-v7--TrainingConfig(max_callees=0, drop_env_types=False, add_implicit_rel_imports=True, left_margin=512, preamble_size=511, right_margin=3072)",
        "double-traversal-EvalResult.pkl",
    ),
    "No Users": (
        "(ablation) model-v7--TrainingConfig(max_callers=0, drop_env_types=False, add_implicit_rel_imports=True, left_margin=3072, right_margin=512)",
        "double-traversal-EvalResult.pkl",
    ),
    "Nonincremental": (
        "(ablation) model-v7--TrainingConfig(add_implicit_rel_imports=True)",
        "non-incr-EvalResult.pkl",
    ),
    # "TypeT5": (
    #     "(implicit_imports, new) model-v7--TrainingConfig(drop_env_types=False, add_implicit_rel_imports=True)",
    #     "double-traversal-EvalResult.pkl",
    # ),
}

# Despite the name, each value is the full path of a pickled result file.
model_dirs = {
    label: get_eval_dir(dataset_name, config_name) / result_file
    for label, (config_name, result_file) in ablation_runs.items()
}

ablation_evals: dict[str, EvalResult] = {
    label: pickle_load(pkl_path) for label, pkl_path in model_dirs.items()
}
# Merge the ablation results into the same table as the strategy results.
evals.update(ablation_evals)


In [4]:
from typet5.type_env import AccuracyMetric
from typet5.model import ModelWrapper
from typet5.experiments.typet5 import accs_as_table_row

# The set of "common" type names is loaded from the trained model directory and
# handed to the default accuracy metrics.
common_names = ModelWrapper.load_common_type_names(
    get_model_dir() / "model-v7--TrainingConfig(drop_env_types=False)"
)
metrics = AccuracyMetric.default_metrics(common_type_names=common_names)

for name, evalr in evals.items():
    # Metric name -> accuracies of this run under that metric.
    accs = {}
    for metric in metrics:
        accs[metric.name] = evalr.error_analysis(None, metric).accuracies
    banner = "=" * 20
    print(banner, name, banner)
    accs_as_table_row(accs)


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
70.87 & 71.68 & 75.99 & 46.92 & 77.58
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
77.43 & 78.88 & 83.12 & 54.23 & 82.52
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
58.95 & 61.80 & 66.17 & 37.14 & 68.92
Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
70.68 & 71.66 & 75.68 & 47.89 & 77.65
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
77.70 & 79.64 & 83.74 & 55.15 & 83.36
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
57.95 & 60.70 & 64.58 & 38.14 & 67.63
Accuracies on all types:
header:  ['full.all

In [29]:
from typet5.experiments.utils import (
    apply_sigmap,
    apply_sigmap_and_typecheck,
    count_type_errors,
    count_project_type_errors,
)
import shutil

from typet5.type_env import AccuracyMetric

# Values passed to `count_project_type_errors` alongside each project.
# NOTE(review): the InferTypes4Py branch hard-codes three machine-specific
# entries (None plus two virtualenv `bin` folders), presumably one per
# InferTypes4Py project -- confirm, and make these configurable before sharing.
bin_paths = (
    [
        None,
        Path("/home/jiayi/Projects/typilus/.venv/bin"),
        Path("/home/jiayi/Projects/type4py/.venv/bin"),
    ]
    if dataset_name == "InferTypes4Py"
    else [None] * len(test_projects)
)
# The same scratch directory argument is supplied for every project.
mypy_dirs = [Path("mypy_temp")] * len(test_projects)

acc_metric = AccuracyMetric(set())
name2errors: dict[str, list] = {}  # run name -> flattened checker feedback
name2accs: dict = {}  # run name -> accuracy under `acc_metric`

# Baseline: check every project with the first run's signature maps after
# `drop_types()` has been applied to each signature.  The resulting count is
# subtracted from every run's count below.
first_eval = next(iter(evals.values()))
background_errors = pmap(
    count_project_type_errors,
    test_projects,
    [
        {k: v.drop_types() for k, v in p.final_sigmap.items()}
        for p in first_eval.predictions
    ],
    mypy_dirs,
    bin_paths,
    desc="Counting background type errors",
)
background_count = count_type_errors(seq_flatten(background_errors))
print("Background Errors:", background_count)

# `evalr` (not `eval`) so the builtin `eval` is not shadowed for later cells.
for name, evalr in evals.items():
    all_errors = pmap(
        count_project_type_errors,
        test_projects,
        [p.final_sigmap for p in evalr.predictions],
        mypy_dirs,
        bin_paths,
        desc=f"Counting type errors for {name}",
    )
    # Materialize the flattened feedback: it is counted here and stored for
    # later cells.
    errors = list(seq_flatten(all_errors))
    error_count = count_type_errors(errors)

    print(f"{name}:")

    # Report errors relative to the baseline measured above.
    print("\tErrors:", error_count - background_count)
    acc = evalr.error_analysis(None, acc_metric).accuracies["acc"]
    print("\tAccuracy:", acc)
    name2errors[name] = errors
    name2accs[name] = acc


Counting background type errors:  30%|███       | 15/50 [00:10<00:43,  1.25s/it]



Counting background type errors: 100%|██████████| 50/50 [00:33<00:00,  1.50it/s]


Background Brrors: 36


Counting type errors for Independent:  28%|██▊       | 14/50 [00:04<00:15,  2.25it/s]



Counting type errors for Independent: 100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Independent:
	Errors: 6369
	Accuracy: 71.68% (count=13.2k)


Counting type errors for Random:  28%|██▊       | 14/50 [00:04<00:16,  2.18it/s]



Counting type errors for Random: 100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


Random:
	Errors: 5740
	Accuracy: 71.66% (count=13.2k)


Counting type errors for UserToUsee:  28%|██▊       | 14/50 [00:18<00:13,  2.60it/s]



Counting type errors for UserToUsee: 100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


UserToUsee:
	Errors: 6891
	Accuracy: 70.67% (count=13.2k)


Counting type errors for UseeToUser:  32%|███▏      | 16/50 [00:19<00:11,  2.88it/s]



Counting type errors for UseeToUser: 100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


UseeToUser:
	Errors: 5819
	Accuracy: 72.65% (count=13.2k)


Counting type errors for TwoPass:  28%|██▊       | 14/50 [00:04<00:15,  2.28it/s]



Counting type errors for TwoPass: 100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


TwoPass:
	Errors: 4530
	Accuracy: 73.02% (count=13.2k)


Counting type errors for No Preamble:  28%|██▊       | 14/50 [00:04<00:18,  1.94it/s]



Counting type errors for No Preamble: 100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


No Preamble:
	Errors: 2976
	Accuracy: 64.20% (count=13.2k)


Counting type errors for No Usees:  28%|██▊       | 14/50 [00:04<00:16,  2.13it/s]



Counting type errors for No Usees: 100%|██████████| 50/50 [00:32<00:00,  1.56it/s]


No Usees:
	Errors: 6944
	Accuracy: 67.15% (count=13.2k)


Counting type errors for No Users:  28%|██▊       | 14/50 [00:04<00:17,  2.06it/s]



Counting type errors for No Users: 100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


No Users:
	Errors: 6521
	Accuracy: 71.20% (count=13.2k)


Counting type errors for Nonincremental:  28%|██▊       | 14/50 [00:04<00:15,  2.35it/s]



Counting type errors for Nonincremental: 100%|██████████| 50/50 [00:32<00:00,  1.56it/s]


Nonincremental:
	Errors: 5252
	Accuracy: 72.52% (count=13.2k)


In [33]:
from typet5.type_check import MypyFeedback


def count_undefined(errors: list[MypyFeedback]) -> int:
    """Return how many of ``errors`` carry the error code ``"name-defined"``."""
    undefined = [err for err in errors if err.error_code == "name-defined"]
    return len(undefined)


def make_row(errors):
    """Format the counts for ``errors`` as an ``&``-separated table row.

    The row reads ``total & type errors & undefined names``, where the total
    is the sum of the other two counts.
    """
    undefined_count = count_undefined(errors)
    type_error_count = count_type_errors(errors)
    total_count = undefined_count + type_error_count
    return f"{total_count} & {type_error_count} & {undefined_count}"


# One formatted row per evaluated run, keyed by run name.
error_rows = {label: make_row(errs) for label, errs in name2errors.items()}
pretty_print_dict(error_rows)


Independent: 6876 & 6405 & 471
Random: 6215 & 5776 & 439
UserToUsee: 7415 & 6927 & 488
UseeToUser: 6402 & 5855 & 547
TwoPass: 5087 & 4566 & 521
No Preamble: 6067 & 3012 & 3055
No Usees: 7332 & 6980 & 352
No Users: 7053 & 6557 & 496
Nonincremental: 5720 & 5288 & 432


non-incr: 1515
random: 1482
caller2callee: 1288
callee2caller: 1181
double-traversal: 1191

In [37]:
from typet5.static_analysis import ProjectPath

# print(decode_tokens(evals["No Preamble"].predictions[0].elem2inputs[ProjectPath.from_str("typet5.utils/proj_root")]["input_ids"]))
# Print the decoded model input recorded for one element of the first project
# in the "Random" run.  `evals` is keyed by the display names defined in
# `strategies` ("Random"), not by the strategy tags ("random"), so the
# lower-case key raised a KeyError on a fresh run.
# NOTE(review): the element path below must exist in that project's
# `elem2inputs`; it looks like it came from a run on the typet5 code base --
# confirm it is still valid for the currently selected dataset.
print(
    decode_tokens(
        evals["Random"]
        .predictions[0]
        .elem2inputs[ProjectPath.from_str("typet5.decode/sample_candidates")]["input_ids"]
    )
)


import multiprocessing
import torch
from datasets import Dataset
from.data import (
    ChunkedDataset,
    CtxArgs,
    SrcCheckResult,
    SrcChunkInfo,
    TokenizedSrcSet,
    TokenizedSrc,
    TypeCheckingEnv,
    chunk_from_src,
    code_to_check_from_preds,
    src_to_chunks_,
    type_check_src_in_project,
    feedbacks_to_tokenized_src,
)
from.type_check import (
    MypyChecker,
    MypyFeedback,
    MypyResult,
    PythonType,
    normalize_type,
)
from.utils import *
from copy import deepcopy
from.critic import CriticCollator, CriticModel
from.model import DatasetPredResult, DecodingArgs, ModelWrapper, dynamic_dataloader
class IncrSelector:
   ...
class SelectByOracle(IncrSelector):
   ...
class SelectByCounting(IncrSelector):
   ...
@dataclass
class SelectByCritic(IncrSelector):
   ...
@dataclass
class CriticAssesInfo:
   ...
# typet5.data
@dataclass
class ChunkedDataset:
    data: Dataset

# typet5.data
@dataclass
class TokenizedSrcSet:
    def to_chunks(
        self,
  