In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import libcst as cst

from spot.tokenized_src import TokenizedSrc, stub_from_module
from spot.utils import Path, decode_tokens

ex_code = '''
from .utils import *
from .data import remove_comments, remove_imports
import spot


def gen_stub(m: cst.Module, rm_comments=True, rm_imports=True) -> cst.Module:
    """Removes all comments and docstrings."""
    if rm_comments:
        m = remove_comments(m)
    if rm_imports:
        m, _ = remove_imports(m)
    m = m.visit(StubGenerator())
    m = remove_empty_lines(m)
    spot.fly.attach(m)
    return m


def remove_empty_lines(m: cst.Module) -> cst.Module:
    m = m.visit(EmptyLineRemove())
    return m


OMIT = cst.SimpleStatementSuite([cst.Expr(cst.Ellipsis())])


@dataclass
class ClassNamespace:
    all_elems: set[str] = field(default_factory=set)
    declared_elems: set[str] = field(default_factory=set)


class StubGenerator(cst.CSTTransformer):
    """Generate a stub module from a Python module."""

    def __init__(self):
        self.ns_stack = list[ClassNamespace]()
        self.ns_stack = 1

    def register_elem(self, name: str, declared: bool):
        if self.ns_stack:
            s = self.ns_stack[-1]
            s.all_elems.add(name)
            if declared:
                s.declared_elems.add(name)

    def visit_ClassDef(self, node: cst.ClassDef):
        self.ns_stack.append(ClassNamespace())

    def leave_ClassDef(self, node, updated: cst.ClassDef):
        s = self.ns_stack.pop()
        to_declare = s.all_elems.difference(s.declared_elems)
        if to_declare:
            more_stmts = [cst.parse_statement(f"{n}: ...") for n in to_declare]
            new_stmts = list(updated.body.body) + more_stmts
            updated = updated.with_changes(
                body=updated.body.with_changes(body=new_stmts)
            )
        return updated

    def leave_FunctionDef(self, node, updated: cst.FunctionDef):
        self.register_elem(updated.name.value, True)
        return updated.with_changes(body=OMIT, returns=None)

    def leave_Annotation(self, node, updated: cst.Annotation):
        return updated.with_changes(annotation=cst.Ellipsis())

    def leave_Param(self, node, updated: cst.Param):
        if updated.default is not None:
            updated = updated.with_changes(default=cst.Ellipsis())
        return updated.with_changes(annotation=None)

    def leave_AnnAssign(self, node, updated: cst.AnnAssign):
        if updated.value is not None:
            updated = updated.with_changes(value=cst.Ellipsis())
        return updated

    def leave_Assign(self, node, updated: cst.AnnAssign):
        return updated.with_changes(value=cst.Ellipsis())

    def leave_Attribute(self, node, updated: cst.Assign):
        match updated:
            case cst.Attribute(
                value=cst.Name(value="self"),
                attr=cst.Name(value=elem_name),
            ):
                self.register_elem(elem_name, False)
        return updated


class EmptyLineRemove(cst.CSTTransformer):
    def on_leave(self, node, updated):
        if hasattr(updated, "leading_lines") and updated.leading_lines:
            return updated.with_changes(leading_lines=[])
        return updated

'''

# Parse the sample module text in `ex_code` and print the code of the module returned
# by `stub_from_module`.
ex_m = cst.parse_module(ex_code)
print(stub_from_module(ex_m).code)



def gen_stub(m, rm_comments=..., rm_imports=...): ...
def remove_empty_lines(m): ...
OMIT = ...
@dataclass
class ClassNamespace:
    all_elems: ... = ...
    declared_elems: ... = ...
class StubGenerator(cst.CSTTransformer):
    def __init__(self): ...
    def register_elem(self, name, declared): ...
    def visit_ClassDef(self, node): ...
    def leave_ClassDef(self, node, updated): ...
    def leave_FunctionDef(self, node, updated): ...
    def leave_Annotation(self, node, updated): ...
    def leave_Param(self, node, updated): ...
    def leave_AnnAssign(self, node, updated): ...
    def leave_Assign(self, node, updated): ...
    def leave_Attribute(self, node, updated): ...
    ns_stack: ...
class EmptyLineRemove(cst.CSTTransformer):
    def on_leave(self, node, updated): ...




In [13]:
from spot import proj_root
from spot.static_analysis import ProjectPath, UsageAnalysis, PythonProject
from pprint import pprint


# For every caller that lives in `tests.test_static_analysis`, print the caller followed
# by one tab-indented line per callee; callees whose `is_certain` flag is falsy get a
# trailing "(maybe)" marker.
proj = PythonProject.from_root(proj_root())
for caller, callees in UsageAnalysis(proj).caller2callees.items():
    if caller.module != "tests.test_static_analysis":
        continue  # callers from other modules are not shown
    print(caller)
    for callee in callees:
        print("\t", callee.callee, "  (maybe)" if not callee.is_certain else "")


tests.test_static_analysis/test_import_normalization
	 spot.static_analysis/to_abs_import_path 
tests.test_static_analysis/test_namespace_resolution
	 spot.static_analysis/ModuleNamespace.from_modules 
	 spot.static_analysis/ModuleNamespace.resolve_path   (maybe)
tests.test_static_analysis/test_usage_analysis
	 spot.static_analysis/PythonProject.from_modules 
	 spot.static_analysis/PythonModule.from_cst 
	 spot.static_analysis/UsageAnalysis.__init__ 
	 spot.static_analysis/ProjectPath.from_str 
	 spot.utils/assert_eq 


In [9]:
%load_ext autoreload
%autoreload 2

from spot.tokenized_src import PreprocessArgs, proj_root
from spot.function_dataset import repo_to_tk_srcs, dataset_from_repos

srcs = repo_to_tk_srcs(proj_root(), PreprocessArgs())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from spot.data import SrcDataset, CtxArgs

# Build a `SrcDataset` from the project root and the sources held in `srcs`.
sdata = SrcDataset(proj_root(), srcs)
# NOTE(review): the positional values presumably map to
# (ctx_size, preamble_size, left_margin, right_margin) — confirm against `CtxArgs`.
ctx_args = CtxArgs(1024, 128, 256, 512)
# Chunk the dataset using `ctx_args`.
cdata = sdata.to_chunks(ctx_args)

chunk_srcs_per_file: 100%|██████████| 328/328 [00:02<00:00, 118.92it/s]
verify_labels: 100%|██████████| 333/333 [00:00<00:00, 73852.75it/s]


In [17]:
from spot.utils import DefaultTokenizer

# Encode a short snippet with the project's `DefaultTokenizer`.
# `return_tensors="pt"` presumably requests PyTorch tensors — TODO confirm.
DefaultTokenizer.encode("abs = def(x)", return_tensors="pt")

torch.Tensor

In [32]:
# Walk over the last five entries of `srcs`: print a header with the entry's `file`
# attribute, then call its `print_code` with `max_lines=100`.
for src in srcs[-5:]:
    print(f"======= file: {src.file} ========")
    src.print_code(max_lines=100)

# spot.train/TrainingConfig
def as_name(self):
    if len(self.modified_params()) > 0:
        return "-".join(
            f"{str(k)}={str(v)}" for k, v in self.modified_params().items()
        )
    else:
        return "default"
# spot.train/TrainingConfig
def get_model_name(self) -> <mask>:
    return "model-v3--" + self.as_name()

# spot.train/TrainingConfig
def train_ctx_args(self) -> <mask>:
    return CtxArgs(
        ctx_size=self.ctx_size,
        preamble_size=self.preamble_size,
        left_margin=self.left_margin,
        right_margin=self.right_margin,
        max_labels=self.train_max_labels,
        inline_prev_gold=self.inline_prev_gold,
    )
# spot.train/TrainingConfig
def dec_ctx_args(self):
    r = self.train_ctx_args()
    r.max_labels = self.dec_max_labels
    return r

# spot.train/TrainingConfig
def train_ctx_args(self):
    return CtxArgs(
        ctx_size=self.ctx_size,
        preamble_size=self.preamble_size,
        left_margin=self.left_margin,
        

In [12]:
from spot.static_analysis import cst, PythonModule, compute_module_usages, PythonProject

code1 = """
# root.file1

# global function
def gf(x):
    return x * x

# with inner function
def gf_with_inner(x):
    def inner(y):
        return y * y
    return inner(x)

# class
class C:
    def __init__(self, x):
        self.x = x
    
    def foo(self, y):
        return self.x + y

    @staticmethod
    def s_method(x):
        return x + 1
    
"""
code2 = """
# root.file2
from .file1 import gf
from root.file1 import gf_with_inner
import root.file1
import root.file1 as f1

def usage1(x):
    gf(x) + root.file1.C(5)
    foo(5)

def usage2(x):
    def inner():
        1 + gf_with_inner(x)
    return inner()

def usage_method1(x):
    x = f1.C(5)
    1 + x.foo(3)

def usage_method2(x):
    (1 + f1.C(5)).foo(3)

def usage_local():
    usage1(3)
    UsageClass(4)

@f1.C(1)
def usage_dec():
    pass

class UsageClass:
    def __init__(self, x):
        self.x = gf_with_inner(x)
        self.y = self.foo(5)

    def foo(self, y):
        return usage_local(f1.gf(y))

    @staticmethod
    def s_method(x):
        return x

class SubClass(UsageClass):
    def use(self):
        self.foo(5)
        f1.C.s_method(5)
"""

# Build a two-module project from the snippets `code1` and `code2`, registered under the
# module names "root.file1" and "root.file2".
project = PythonProject.from_modules(
    [
        PythonModule.from_cst(cst.parse_module(code1), "root.file1"),
        PythonModule.from_cst(cst.parse_module(code2), "root.file2"),
    ]
)

# Print the string form of each item yielded by `compute_module_usages` for root.file2.
for u in compute_module_usages(project.modules["root.file2"]):
    print(str(u))

(proj'root.file2/usage1', CodeRange(start=CodePosition(line=9, column=4), end=CodePosition(line=9, column=9)), QualifiedName(name='.file1.gf', source=<QualifiedNameSource.IMPORT: 1>))
(proj'root.file2/usage1', CodeRange(start=CodePosition(line=9, column=12), end=CodePosition(line=9, column=27)), QualifiedName(name='root.file1.C', source=<QualifiedNameSource.IMPORT: 1>))
(proj'root.file2/usage2', CodeRange(start=CodePosition(line=14, column=12), end=CodePosition(line=14, column=28)), QualifiedName(name='root.file1.gf_with_inner', source=<QualifiedNameSource.IMPORT: 1>))
(proj'root.file2/usage2', CodeRange(start=CodePosition(line=15, column=11), end=CodePosition(line=15, column=18)), QualifiedName(name='usage2.<locals>.inner', source=<QualifiedNameSource.LOCAL: 3>))
(proj'root.file2/usage_method1', CodeRange(start=CodePosition(line=18, column=8), end=CodePosition(line=18, column=15)), QualifiedName(name='root.file1.C', source=<QualifiedNameSource.IMPORT: 1>))
(proj'root.file2/usage_metho

In [60]:
from spot.static_analysis import UsageAnalysis

# NOTE(review): `project` is built in the cell that defines `code1`/`code2`, and
# `ProjectPath` is imported in a different cell — both must have run before this one.
analysis = UsageAnalysis(project)
# Display the entry of `caller2callees` for `SubClass.use` in root.file2.
analysis.caller2callees[ProjectPath("root.file2", "SubClass.use")]

Local name: gf_with_inner.<locals>.inner
Segs: ['gf_with_inner', '<locals>', 'inner']
Local name: usage2.<locals>.inner
Segs: ['usage2', '<locals>', 'inner']
Local name: usage_method1.<locals>.x.foo
Segs: ['usage_method1', '<locals>', 'x', 'foo']
Case 3
Local name: <method>.foo
Local name: usage1
Segs: ['usage1']
Case 1
Local name: UsageClass
Segs: ['UsageClass']
Case 2
Local name: UsageClass.__init__.<locals>.self.foo
Segs: ['UsageClass', 'foo']
Case 1
Local name: usage_local
Segs: ['usage_local']
Case 1
Local name: SubClass.use.<locals>.self.foo
Segs: ['SubClass', 'foo']
Case 3


[FunctionUsage(caller=proj'root.file2/SubClass.use', callee=proj'root.file1/C.foo', call_site=CodeRange(start=CodePosition(line=42, column=8), end=CodePosition(line=42, column=19)), is_certain=False),
 FunctionUsage(caller=proj'root.file2/SubClass.use', callee=proj'root.file2/UsageClass.foo', call_site=CodeRange(start=CodePosition(line=42, column=8), end=CodePosition(line=42, column=19)), is_certain=False)]

In [22]:
import libcst as cst

from spot.tokenized_src import TokenizedSrc, PreprocessArgs
from spot.utils import Path, decode_tokens

ex_code = '''# document comment 1
  # document comment 2
"""String document commnet"""
import os; import spot;
from sys import argv, exit
# after import
@wraps(function)
def catch_permission_denied(function):
    import some.inner.imports
    """
    Decorator to catch :class:`psycopg2.ProgrammingError` exceptions with the
    ``INSUFFICIENT_PRIVILEGE`` error code and rethrow them as
    :class:`~werkzeug.exceptions.Forbidden` exceptions instead.
    """
    @wraps(function)
    def decorated(x: str, y: int) -> str:
        try:
            # comment 1
            # comment 1 cont
            return function(*args, **kwargs)

        except InsufficientPrivilege as error:
            LOG.error("Forbidden: %s", error) # comment 2
            raise Forbidden()

    return decorated
'''
# Parse the sample text with `stub_in_preamble=True`, then print the decoded form of
# the resulting `tokenized_code`.
pre_args = PreprocessArgs(stub_in_preamble=True)
ex_src = TokenizedSrc.parse(ex_code, Path("test_file"), Path("test_repo"), pre_args)
print(decode_tokens(ex_src.tokenized_code))


@wraps(function)
def catch_permission_denied(function):
    import some.inner.imports
    @wraps(function)
    def decorated(x: <mask>, y: <mask>) -> <mask>:
        try:
            return function(*args, **kwargs)

        except InsufficientPrivilege as error:
            LOG.error("Forbidden: %s", error) 
            raise Forbidden()

    return decorated



In [16]:
from spot.data import src_to_chunks_, CtxArgs, PreprocessArgs
from ipywidgets import interactive

# Same parse as in the preceding cell. NOTE(review): this cell does not import
# `TokenizedSrc` or `Path` and does not define `ex_code`; it relies on the preceding cell
# having been executed.
pre_args = PreprocessArgs(stub_in_preamble=True)
ex_src = TokenizedSrc.parse(ex_code, Path("test_file"), Path("test_repo"), pre_args)


def print_code(
    preamble: int,
    left: int,
    right: int,
    ctx_size: int,
    max_labels: int,
    chunk_id: int,
    inline_prev: bool,
):
    """Chunk the notebook global `ex_src` and print one decoded chunk.

    Args:
        preamble, left, right, ctx_size: forwarded positionally as
            `CtxArgs(ctx_size, preamble, left, right, ...)`.
        max_labels: forwarded as `CtxArgs(max_labels=...)`.
        chunk_id: index of the chunk whose "input_ids" are decoded and printed.
        inline_prev: forwarded as `CtxArgs(inline_prev_gold=...)`.

    If `chunk_id` does not address an existing chunk, a short notice is printed
    instead of letting an `IndexError` surface in the widget output.
    """
    chunks = []
    args = CtxArgs(
        ctx_size,
        preamble,
        left,
        right,
        max_labels=max_labels,
        inline_prev_gold=inline_prev,
    )
    # `chunks` is passed in empty and indexed afterwards, so `src_to_chunks_` is expected
    # to fill it in place.
    src_to_chunks_(chunks, [], ex_src, (0, len(ex_src.types)), args)
    # Same index range that plain list indexing accepts (negative indices included).
    if not -len(chunks) <= chunk_id < len(chunks):
        print(f"chunk_id={chunk_id} is out of range: {len(chunks)} chunk(s) were produced")
        return
    print(decode_tokens(chunks[chunk_id]["input_ids"]))


# Interactive front-end for `print_code`: each `(min, max)` tuple supplies the bounds of
# an integer slider for that argument, and `inline_prev=True` supplies a boolean default.
interactive(
    print_code,
    preamble=(1, 100),
    left=(1, 200),
    right=(1, 100),
    ctx_size=(1, 500),
    max_labels=(1, 10),
    chunk_id=(0,1),
    inline_prev=True,
)


interactive(children=(IntSlider(value=50, description='preamble', min=1), IntSlider(value=100, description='le…

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pickle
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import *

import pandas as pd
import plotly.express as px

from spot.data import GitRepo, ModuleRemapUnpickler
from spot.type_env import (
    AnnotPath,
    MypyChecker,
    SelectAnnotations,
    TypeInfAction,
    TypeInfEnv,
    TypeInfState,
    collect_annotations,
    mypy_checker,
)
from spot.utils import cst, proj_root, read_file, seq_flatten, tqdm, write_file

# Make the project root the working directory.
os.chdir(proj_root())

# `os.environ[...]` fails with `KeyError: 'datadir'` when the variable is unset, instead
# of the opaque `TypeError` raised by `Path(None)`.
datadir = Path(os.environ["datadir"])
repos_dir = datadir / "SPOT-data/repos"

useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
# Map the module name `spot.data_prepare` to `spot.data` while unpickling (presumably the
# module was renamed after the pickle was written — confirm).
rename_module = lambda n: "spot.data" if n == "spot.data_prepare" else n
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with useful_repos_path.open("rb") as f:
    useful_repos: list[GitRepo] = ModuleRemapUnpickler(f, rename_module).load()

In [18]:
# loading pre-trained model and tokenizer
from spot.utils import get_data_dir

model_dir = "Salesforce/codet5-base"
# model_dir = datadir / "checkpoints/saved/SPOT-CodeT5-no_margin/"

import torch
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizer,
    T5ForConditionalGeneration,
)
from transformers.models.t5 import T5ForConditionalGeneration

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load tokenizer and model from `model_dir` and move the model to `device`.
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(model_dir)
model: T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained(
    model_dir
).to(device)
# Passed as `max_length` to `model.generate` inside `run_model`.
max_target_length = 128


In [9]:
from spot.data import mask_type_annots, output_ids_as_types, tokenize_masked

test_code = """
@dataclass
class GitRepo:
    author: str
    name: str
    url: str
    stars: int
    forks: int

    def authorname(self):
        return self.author + "__" + self.name

    def repo_dir(self, repos_dir: Path) -> Path:
        return repos_dir / "downloaded" / self.authorname()

    def download(self, repos_dir: Path, timeout=None) -> bool:
        pass
"""


def run_model(code: str, num_beams=16):
    """Mask the type annotations in `code` and have the model predict them.

    Relies on the notebook globals `model`, `tokenizer`, `device` and
    `max_target_length`.

    Args:
        code: Python source text to run the model on.
        num_beams: passed to `model.generate` as `num_beams`.

    Returns:
        A dict with the keys
        "loss" (loss of the forward pass on the tokenized input),
        "predicted_types" (generated ids converted via `output_ids_as_types`),
        "labels" (the first row of the tokenized labels, converted the same way),
        "generation" (the generated ids decoded by the tokenizer),
        "input_ids" (first row of the model input),
        "output_ids" (the generated ids), and
        "annots_info" (taken from the masking result).
    """
    masked = mask_type_annots((Path("no_source"), code))
    tks = tokenize_masked(masked, tokenizer, device)
    input_ids = tks["input_ids"]
    with torch.no_grad():
        # Call the module itself rather than `.forward()` so that registered hooks run.
        loss = model(**tks).loss
        dec = model.generate(
            input_ids,
            max_length=max_target_length,
            num_beams=num_beams,
            # do_sample=True,
        )[0]
    return {
        "loss": loss,
        "predicted_types": output_ids_as_types(dec, tokenizer),
        "labels": output_ids_as_types(tks["labels"][0], tokenizer),
        "generation": tokenizer.decode(dec),
        "input_ids": input_ids[0],
        "output_ids": dec,
        "annots_info": masked["annots_info"],
    }


# Run the model on `test_code` with 10 beams and display the resulting loss.
result = run_model(test_code, num_beams=10)
result["loss"]


tensor(2.9077, device='cuda:0')

In [24]:
from spot import PythonType
from spot.type_env import apply_annotations


def type_to_annot(ty: PythonType) -> cst.Annotation:
    """Build a `cst.Annotation` node by parsing `str(ty)` as an expression."""
    return cst.Annotation(cst.parse_expression(str(ty)))


def run_aug_model(src: Path, cwd: Path):
    """Predict annotations for `src`, write them into the file, and run the checker.

    Note: `src` is overwritten in place with the annotated code.

    Args:
        src: path of the Python file to annotate.
        cwd: passed, together with `src`, to `MypyChecker.check_project`.

    Returns:
        A dict with the keys "model_result" (output of `run_model`), "module"
        (the module with predicted annotations applied), "checker_feedback"
        (result of `MypyChecker.check_project`) and "pos_to_preds" (maps each
        `annot_range` to the predicted type as a string).
    """
    # Read the file once; the same text feeds both the model and the CST parse.
    code = read_file(src)
    result = run_model(code, num_beams=10)
    # Pair each annotation record with its predicted type once; both dicts below use it.
    preds = list(zip(result["annots_info"], result["predicted_types"]))
    pred_annots = {info.path: type_to_annot(ty) for info, ty in preds}
    m1 = apply_annotations(cst.parse_module(code), pred_annots)
    write_file(src, m1.code)
    checker_r = MypyChecker.check_project(src, cwd)
    pos_to_preds = {info.annot_range: str(ty) for info, ty in preds}
    return {
        "model_result": result,
        "module": m1,
        "checker_feedback": checker_r,
        "pos_to_preds": pos_to_preds,
    }


# NOTE(review): `inference_dir` is not defined in any cell visible here; it must already
# exist in the kernel. `run_aug_model` calls `write_file(src, ...)`, so this rewrites
# env_code_2.py in place.
aug_r = run_aug_model(inference_dir / "env_code_2.py", inference_dir)


In [33]:
from spot.utils import patch_code_with_extra

# Report the pieces of `aug_r`: predicted types, the raw decoded model output (special
# tokens kept), and the checker's textual output.
print("---- predicted types ----")
print(aug_r["model_result"]["predicted_types"])
print("---- model output ----")
print(tokenizer.decode(aug_r["model_result"]["output_ids"], skip_special_tokens=False))
print("---- checker_feedback ----")
print(aug_r["checker_feedback"].output_str)

# Combine the annotated code, the predictions per position, and the checker errors
# recorded for env_code_2.py into a new input text.
# NOTE(review): the "env_code_2.py" key repeats the file name passed to `run_aug_model`.
print("---- new input ----")
new_input = patch_code_with_extra(
    aug_r["module"].code,
    aug_r["pos_to_preds"],
    aug_r["checker_feedback"].error_dict["env_code_2.py"],
)
print(new_input)


---- model output ----
<pad><s><extra_id_0>int<extra_id_1>int<extra_id_2>int<extra_id_3>int<extra_id_4>int, y : int<extra_id_5>int<extra_id_6>Optional[int]<extra_id_7>int<extra_id_8>int<extra_id_9>Bar[int, int, int, float, float]</s>
---- checker_feedback ----
env_code_2.py:20:14: error: Incompatible types in assignment (expression has type "str", variable has type "int")  [assignment]
env_code_2.py:32:29: error: Argument 1 to "len" has incompatible type "int"; expected "Sized"  [arg-type]
env_code_2.py:35:6: error: "Bar" expects no type arguments, but 5 given  [type-arg]
Found 3 errors in 1 file (checked 1 source file)

---- new input ----
# Env example 2: some existing annotations

from typing import *


def fib(n: /* int */<extra_id_0>):
    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fib(n - 1) + fib(n - 2)


def foo(bar: /* int */<extra_id_1>):
    return fib(bar)


class Bar:
    z: /* int */<extra_id_2> = /* error: Incompatible types in

In [33]:
import pickle

from spot.utils import Path, run_long_task, DefaultTokenizer, not_none, CountedAcc
from spot import proj_root
from spot.function_dataset import guess_src_root

# NOTE(review): `os` is not imported in this cell; it relies on an earlier cell's import.
datadir = Path(not_none(os.getenv("datadir")))
repos_dir = datadir / "SPOT-data/repos/"

repos_split_path = proj_root() /  "data/repos_split.pkl"
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with repos_split_path.open("rb") as f:
    repos_split = pickle.load(f)

# For each repo in the "train" split, record whether the directory returned by
# `guess_src_root` is named "src".
root_is_src = list[bool]()
for repo in repos_split["train"]:
    rd = repo.repo_dir(repos_dir)
    root_is_src.append(guess_src_root(rd).name == "src")

# Display the number of True flags against the total number of repos.
CountedAcc(sum(root_is_src), len(root_is_src))

CountedAcc(16.23%, count=573)

In [50]:
# Counters are reset at the top of the cell, so re-running it does not double count.
src_in_root = 0
package_in_root = 0
setup_in_root = 0
n_proj = 0

weird_repos = []
setup_files = []

# Classify each "train" repo by the first matching entry directly under its directory:
#   1. an entry named "src",
#   2. an entry named like the last "__"-separated part of the repo directory name,
#   3. an entry named "setup.cfg" (its path is also kept in `setup_files`),
#   4. otherwise the repo is appended to `weird_repos`.
for repo in repos_split["train"]:
    rd: Path = repo.repo_dir(repos_dir)
    n_proj += 1
    files = list(rd.iterdir())
    if rd / "src" in files:
        src_in_root += 1
    # NOTE(review): `pname` is bound by the walrus but not read anywhere in this cell.
    elif rd / (pname := rd.name.split("__")[-1]) in files:
        package_in_root += 1
    elif rd / "setup.cfg" in files:
        setup_in_root += 1
        setup_files.append(rd / "setup.cfg")
    else:
        weird_repos.append(repo)

print("n_projects:", n_proj)
print("src_in_root:", src_in_root)
print("package_in_root:", package_in_root)
print("setup_in_root:", setup_in_root)
print("weird_repos:", len(weird_repos))

n_projects: 573
src_in_root: 93
package_in_root: 203
setup_in_root: 107
weird_repos: 170


In [48]:
# For the first ten repos in `weird_repos`, print the repo directory relative to
# `repos_dir`, followed by the names of its top-level entries.
for repo in weird_repos[:10]:
    rd: Path = repo.repo_dir(repos_dir)
    print("Repo:", rd.relative_to(repos_dir))
    for f in rd.iterdir():
        print(f.relative_to(rd))

Repo: downloaded/tiangolo__uvicorn-gunicorn-docker
scripts
.gitignore
mypy.ini
README.md
tests
.github
.mypy_cache
.git
docker-images
pyproject.toml
LICENSE
Repo: downloaded/uwbmrb__BMRBDep
.gitignore
install.sh
ADIT-NMR Testing.ods
README.md
FrontEnd
deploy.sh
BackEnd
nginx_configuration_example.conf
upgrade.sh
apache_configuration_example.conf
.mypy_cache
.git
wsgi.conf
installation.md
.editorconfig
Dockerfile
build_docker.sh
run_locally.sh
.dockerignore
Repo: downloaded/jfcherng__Sublime-VisualizeZeroWidthChars
messages
dependencies.json
docs
.flake8
boot.py
scripts
.gitignore
typings
mypy.ini
.python-version
messages.json
README.md
menus
plugin
.github
.mypy_cache
.git
pyproject.toml
VisualizeZeroWidthChars.sublime-settings
CHANGELOG.md
requirements.txt
.gitattributes
LICENSE
.editorconfig
Repo: downloaded/chaosdorf__mpd-mqtt-gateway
.gitignore
gateway.py
.github
Pipfile
.mypy_cache
server.py
.git
Dockerfile
Pipfile.lock
Repo: downloaded/Celeo__Preston
preston
.gitignore
README.md
