In [1]:
%load_ext autoreload
%autoreload 2

import json
import logging
import os
import shutil
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path

import libcst as cst
from tqdm import tqdm

from spot.data import GitRepo
from spot.type_env import collect_annotations, mypy_checker
from spot.utils import proj_root, read_file, write_file

os.chdir(proj_root())

In [3]:
# Parse the candidate-repo list stored as JSON and turn every entry into a GitRepo.
all_repos = [
    GitRepo.from_json(entry)
    for entry in json.loads(read_file("scripts/mypy-dependents-by-stars.json"))
]
# Uncomment to restrict the run to the first 10 repos (for testing):
# all_repos=all_repos[:10]

In [4]:
# download all candidate repos

def clear_downloaded_repos(repos_dir):
    """Recursively delete ``repos_dir`` and everything under it.

    Calling this when ``repos_dir`` does not exist is a no-op, so the cleanup
    can be re-run safely. Any other error (e.g. a permission problem) still
    propagates to the caller.

    Args:
        repos_dir: Path (``str`` or ``Path``) of the directory to remove.
    """
    try:
        shutil.rmtree(repos_dir)
    except FileNotFoundError:
        # Already gone (or never created): nothing to clear.
        pass

def download_repos(to_download: list[GitRepo], repos_dir, download_timeout=10.0, max_workers=10) -> list[GitRepo]:
    """Download the given repos concurrently and return the ones that succeeded.

    Each repo is fetched on a thread pool of ``max_workers`` threads by calling
    ``repo.download(repos_dir, timeout=download_timeout)``. When that call
    returns a truthy value, ``repo.read_last_update(repos_dir)`` is invoked and
    the repo is kept. Timeouts are dropped silently; any other exception is
    logged as a warning and the repo is dropped.

    Returns:
        The successfully downloaded repos, in the order their downloads completed.
    """

    def attempt(repo: GitRepo):
        # Yields the repo on success and None on every kind of failure.
        try:
            if not repo.download(repos_dir, timeout=download_timeout):
                return None
            repo.read_last_update(repos_dir)
            return repo
        except subprocess.TimeoutExpired:
            # Timeouts are expected for slow/large repos; skip without logging.
            return None
        except Exception as e:
            logging.warning(f"Failed to download {repo.name}. Exception: {e}")
            return None

    print("Downloading repos from Github...")
    t_start = time.time()
    outcomes = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(attempt, candidate) for candidate in to_download]
        # Consume futures as they finish so the progress bar advances in real time.
        for finished in tqdm(as_completed(pending), total=len(pending)):
            outcomes.append(finished.result())
    print(f"Downloading took {time.time() - t_start} seconds.")
    return [outcome for outcome in outcomes if outcome is not None]

# Resolve the data root from the environment. os.getenv returns None when the
# variable is unset, and Path(None) would fail with an opaque TypeError, so
# check explicitly and fail with an actionable message instead.
datadir_env = os.getenv("datadir")
if datadir_env is None:
    raise RuntimeError(
        "Environment variable 'datadir' is not set; it must point to the directory that holds SPOT-data."
    )
datadir = Path(datadir_env)
repos_dir = datadir / "SPOT-data/repos"
# NOTE(review): the mere existence of repos_dir is treated as "download finished".
# If a first run is interrupted, a re-run takes the else-branch and works with
# whatever is already under "downloaded" — delete repos_dir to force a fresh download.
if not repos_dir.exists():
    (repos_dir / "downloading").mkdir(parents=True)
    (repos_dir / "downloaded").mkdir(parents=True)
    downloaded_repos = download_repos(all_repos, repos_dir)
    # Whatever is still left under "downloading" is considered a failed download.
    print("Deleting failed repos...")
    shutil.rmtree(repos_dir / "downloading")
else:
    print("Repos already downloaded.")
    # Match on-disk directory names against each repo's authorname() to recover the list.
    downloaded_dirs = set(d.name for d in (repos_dir / "downloaded").iterdir())
    downloaded_repos = [r for r in all_repos if r.authorname() in downloaded_dirs]
    print("Reading last updates...")
    for r in tqdm(downloaded_repos):
        r.read_last_update(repos_dir)
print(f"Downloaded {len(downloaded_repos)}/{len(all_repos)} repos.")

# assert len(list((repos_dir / "downloaded").iterdir())) == len(downloaded_repos)

Repos already downloaded.
Reading last updates...


100%|██████████| 4890/4890 [00:27<00:00, 175.00it/s]

Downloaded 4890/5996 repos.





In [33]:
from datetime import datetime, timezone

# Stage 1: keep only repos whose last update is newer than the cut-off date.
date_threshold = datetime(2021, 4, 20)
new_repos = list(filter(lambda r: r.last_update > date_threshold, downloaded_repos))
print(f"{len(new_repos)} / {len(downloaded_repos)} repos are updated within a year.")

# Stage 2: keep only repos below the lines-of-code limit.
loc_limit = 50000
small_repos = []
for rep in tqdm(new_repos):
    try:
        loc = rep.count_lines_of_code(repos_dir)
        if not loc < loc_limit:
            continue
        small_repos.append(rep)
    except UnicodeDecodeError:
        # Undecodable source files: nothing we can do, skip the repo silently.
        continue
    except Exception as e:
        logging.warning(f"Failed to count lines of code for {rep.name}. Exception: {e}")

print(f"{len(small_repos)}/{len(new_repos)} repos are within the size limit ({loc_limit} LOC).")

1218 / 4890 repos are updated within a year.


100%|██████████| 1218/1218 [00:05<00:00, 243.41it/s]

1181/1218 repos are within the size limit.





In [34]:
# filter out repos with too few annotations

def count_repo_annots(rep, repos_root=None, min_annot_ratio=0.05):
    """Count a repo's type annotations and keep it only if it is annotated densely enough.

    Calls ``rep.count_annotations(...)`` and then compares
    ``rep.n_type_annots / rep.lines_of_code`` against ``min_annot_ratio``.

    Args:
        rep: The repo to inspect. Must provide ``count_annotations``, ``name``,
            ``n_type_annots`` and ``lines_of_code``.
        repos_root: Directory passed to ``rep.count_annotations``. Defaults to
            the notebook-level ``repos_dir`` when omitted.
        min_annot_ratio: Minimum annotations-per-line ratio (exclusive) a repo
            needs in order to be kept.

    Returns:
        ``rep`` when its ratio exceeds ``min_annot_ratio``; ``None`` when the
        ratio is too low, when the repo has no lines of code, or when counting
        raised (in which case a warning is logged).
    """
    try:
        if repos_root is None:
            repos_root = repos_dir
        rep.count_annotations(repos_root)
        if rep.lines_of_code <= 0:
            # An empty repo has no meaningful ratio; reject it quietly instead of
            # letting ZeroDivisionError be reported as a counting failure.
            return None
        if rep.n_type_annots / rep.lines_of_code > min_annot_ratio:
            return rep
        return None
    except Exception as e:
        # Best effort: an unparsable repo must not abort the whole batch.
        logging.warning(f"Failed to count annotations for {rep.name}. Exception: {e}")
        return None

# Fan the per-repo annotation counting out over worker processes and gather the
# results in completion order.
with ProcessPoolExecutor(max_workers=30) as executor:
    fs = [executor.submit(count_repo_annots, candidate) for candidate in small_repos]
    rs = [done.result() for done in tqdm(as_completed(fs), total=len(fs))]

# Drop rejected repos (None) as well as anything whose name mentions typeshed.
useful_repos: list[GitRepo] = [
    kept
    for kept in rs
    if not (kept is None or 'typeshed' in kept.name)
]

print(f"{len(useful_repos)}/{len(small_repos)} repos are parsable and have enough portions of type annotations.")

Incomplete input. Encountered 'linenum', but expected ';', or 'NEWLINE'.

                print linenum, l.rstrip()
                      ^
Incomplete input. Encountered '"Test Plan:"', but expected ';', or 'NEWLINE'.

        print "Test Plan:", self.test_name
              ^
Incomplete input. Unexpectedly encountered '='.

= nonsense =
^
Incomplete input. Encountered '"not yet working script for generating a communication layer for dedicated ECU out of can database"', but expected ';', or 'NEWLINE'.

        print "not yet working script for generating a communication layer for dedicated ECU out of can database"
              ^
Incomplete input. Unexpectedly encountered '%'.

{% if cookiecutter.use_sentry == 'y' -%}
 ^
Incomplete input. Encountered '"Done."', but expected ';', or 'NEWLINE'.

    print "Done."
          ^
Incomplete input. Unexpectedly encountered '%'.

{%- if cookiecutter.use_postgres == 'y' %}
 ^
Incomplete input. Unexpectedly encountered an indent.

        writer.

664/1181 repos have enough portions of type annotations.


In [35]:
# Some summary statistics

# print total number of manual annotations
n_total_annots = sum(rep.n_type_annots for rep in useful_repos)
print("Total number of manual annotations:", n_total_annots)

# print total number of type places
n_total_places = sum(rep.n_type_places for rep in useful_repos)
print("Total number of type places:", n_total_places)

# print total number of lines of code
n_total_lines = sum(rep.lines_of_code for rep in useful_repos)
print("Total number of lines of code:", n_total_lines)

# print average number of type annotations per line of code, restricted to
# projects with fewer than 1000 lines of code. Numerator and denominator must
# cover the same set of projects, otherwise the value is not a per-line average.
small_projects = [rep for rep in useful_repos if rep.lines_of_code < 1000]
n_small_lines = sum(rep.lines_of_code for rep in small_projects)
# Guard against an empty selection (or only empty projects) to avoid ZeroDivisionError.
n_avg_annots = (
    sum(rep.n_type_annots for rep in small_projects) / n_small_lines
    if n_small_lines
    else 0.0
)
print("Average annotations per line of code (projects under 1000 LOC):", n_avg_annots)

Total number of manual annotations: 343595
Total number of type places: 544497
Total number of lines of code: 3342911


[GitRepo(author='skorokithakis', name='catt', url='https://github.com/skorokithakis/catt', stars=1740, forks=762, lines_of_code=2036, last_update=datetime.datetime(2022, 4, 10, 1, 30, 43), n_type_annots=140, n_type_places=433),
 GitRepo(author='encode', name='databases', url='https://github.com/encode/databases', stars=769, forks=48, lines_of_code=3124, last_update=datetime.datetime(2022, 3, 6, 12, 25, 10), n_type_annots=323, n_type_places=498),
 GitRepo(author='Curt-Park', name='rainbow-is-all-you-need', url='https://github.com/Curt-Park/rainbow-is-all-you-need', stars=490, forks=110, lines_of_code=107, last_update=datetime.datetime(2022, 1, 13, 23, 4, 48), n_type_annots=26, n_type_places=30),
 GitRepo(author='jreese', name='aiomultiprocess', url='https://github.com/jreese/aiomultiprocess', stars=585, forks=45, lines_of_code=1140, last_update=datetime.datetime(2022, 2, 4, 21, 28, 7), n_type_annots=138, n_type_places=213),
 GitRepo(author='instaloader', name='instaloader', url='https:/

In [5]:
import pickle

# Persist the filtered repo list so later sessions can skip the filtering above.
useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
useful_repos_path.write_bytes(pickle.dumps(useful_repos))
print(f"Saved {len(useful_repos)} useful repos to {useful_repos_path}.")
# Read the file straight back and show the first entries as a round-trip check.
print(pickle.loads(useful_repos_path.read_bytes())[:3])

[GitRepo(author='typeddjango', name='pytest-mypy-plugins', url='https://github.com/typeddjango/pytest-mypy-plugins', stars=12, forks=0, lines_of_code=1039, last_update=datetime.datetime(2022, 4, 18, 23, 25, 40), n_type_annots=155, n_type_places=158), GitRepo(author='jfly', name='jfly.github.io', url='https://github.com/jfly/jfly.github.io', stars=0, forks=0, lines_of_code=650, last_update=datetime.datetime(2022, 4, 12, 8, 23, 39), n_type_annots=39, n_type_places=122), GitRepo(author='seattleflu', name='id3c', url='https://github.com/seattleflu/id3c', stars=2, forks=0, lines_of_code=8883, last_update=datetime.datetime(2022, 4, 21, 15, 38, 59), n_type_annots=675, n_type_places=1068)]


In [2]:
import pickle
from spot import proj_root

# Restore the repo list that an earlier cell pickled under scripts/.
useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
useful_repos = pickle.loads(useful_repos_path.read_bytes())

In [5]:
from spot.data import load_or_process_datasets
from spot.model import TokenizerSPOT

# os.getenv returns None when the variable is unset, and Path(None) would fail
# with an opaque TypeError; check explicitly and fail with an actionable message.
datadir_env = os.getenv("datadir")
if datadir_env is None:
    raise RuntimeError(
        "Environment variable 'datadir' is not set; it must point to the directory that holds SPOT-data."
    )
datadir = Path(datadir_env)
repos_dir = datadir / "SPOT-data/repos"
tokenizer = TokenizerSPOT.from_pretrained("Salesforce/codet5-base")

# With margins enabled, a quarter of the tokenizer's maximum length is passed as ctx_margin.
with_margin = True
margin_tag = "with_margin" if with_margin else "no_margin"

ctx_margin = tokenizer.model_max_length // 4 if with_margin else 0

# Set test_parsing to True to process only a handful of repos into a separate
# "test-"-prefixed output directory.
test_parsing = False
test_tag = "test-" if test_parsing else ""

# Choose the test / valid / train partition of useful_repos in one place.
if test_parsing:
    test_slice, valid_slice, train_slice = slice(0, 3), slice(3, 4), slice(4, 6)
else:
    test_slice, valid_slice, train_slice = slice(0, 50), slice(50, 90), slice(90, None)

ti_datasets, repos_split = load_or_process_datasets(
    datadir / f"SPOT-data/{test_tag}repos-processed-{margin_tag}",
    tokenizer,
    repos_dir,
    repos_test=useful_repos[test_slice],
    repos_valid=useful_repos[valid_slice],
    repos_train=useful_repos[train_slice],
    regenerate=True,
    ctx_margin=ctx_margin,
    max_workers=20,
)

Deleting old datasets at: /mnt/data0/jiayi/SPOT-data/repos-processed-with_margin
Processing dataset: train


parsing and masking sources:   0%|          | 0/29836 [00:00<?, ?it/s]

tokenizing sources:   0%|          | 0/29835 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/75298 [00:00<?, ?it/s]

Processing dataset: valid


parsing and masking sources:   0%|          | 0/1953 [00:00<?, ?it/s]

tokenizing sources:   0%|          | 0/1953 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/5422 [00:00<?, ?it/s]

Processing dataset: test


parsing and masking sources:   0%|          | 0/1594 [00:00<?, ?it/s]

tokenizing sources:   0%|          | 0/1594 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/3573 [00:00<?, ?it/s]