In [1]:
%load_ext autoreload
%autoreload 2

import json
import logging
import os
import shutil
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path

import libcst as cst
from tqdm import tqdm

from spot.data import GitRepo
from spot.type_env import collect_annots_info, mypy_checker
from spot.utils import proj_root, read_file, write_file

# Make the project root (as returned by proj_root()) the working directory.
os.chdir(proj_root())

In [17]:
import requests
# Probe the GitHub repository search endpoint and display the raw JSON record
# of the first (most-starred) Python repository it returns.
search_url = "https://api.github.com/search/repositories?q=language:python&sort=stars&order=desc&per_page=10"
search_response = requests.get(search_url)
search_response.json()["items"][0]


{'id': 63476337,
 'node_id': 'MDEwOlJlcG9zaXRvcnk2MzQ3NjMzNw==',
 'name': 'Python',
 'full_name': 'TheAlgorithms/Python',
 'private': False,
 'owner': {'login': 'TheAlgorithms',
  'id': 20487725,
  'node_id': 'MDEyOk9yZ2FuaXphdGlvbjIwNDg3NzI1',
  'avatar_url': 'https://avatars.githubusercontent.com/u/20487725?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/TheAlgorithms',
  'html_url': 'https://github.com/TheAlgorithms',
  'followers_url': 'https://api.github.com/users/TheAlgorithms/followers',
  'following_url': 'https://api.github.com/users/TheAlgorithms/following{/other_user}',
  'gists_url': 'https://api.github.com/users/TheAlgorithms/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/TheAlgorithms/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/TheAlgorithms/subscriptions',
  'organizations_url': 'https://api.github.com/users/TheAlgorithms/orgs',
  'repos_url': 'https://api.github.com/users/TheAlgorithms/repos',
  'even

In [31]:
import requests
import dateparser


def fetch_top_python_repos(n_pages: int, oldest_push: str, token=None, timeout=30.0):
    """Fetch the most-starred Python repositories from the GitHub search API.

    Args:
        n_pages: Maximum number of result pages to request (100 repos per page).
            NOTE(review): the GitHub search API documents a cap of 1000 results
            per query, so more than 10 pages is not expected to yield more repos.
        oldest_push: Only repositories pushed after this date are requested,
            e.g. "2021-04-20T00:00:01Z". It is sent as a `pushed:>` qualifier.
        token: Optional GitHub access token. When given, it is sent as a
            Bearer `Authorization` header (authenticated requests get a higher
            rate limit).
        timeout: Per-request timeout in seconds.

    Returns:
        A list of `GitRepo`, in the order returned by the API (stars,
        descending). Fetching stops at the first page that carries no "items"
        key (the response is printed) or that is empty; repos collected before
        that point are still returned.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            transport level (including exceeding `timeout`).
    """
    headers = {}
    if token:
        headers["Authorization"] = f"Bearer {token}"

    def request_page(page: int):
        # The date filter has to live inside the `q` search string as a
        # `pushed:>` qualifier; a separate `pushed_at>...` URL parameter is not
        # a search qualifier. `params=` also takes care of URL-encoding.
        return requests.get(
            "https://api.github.com/search/repositories",
            params={
                "q": f"language:python pushed:>{oldest_push}",
                "sort": "stars",
                "order": "desc",
                "per_page": 100,
                "page": page,
            },
            headers=headers,
            timeout=timeout,
        ).json()

    repos = list[GitRepo]()
    # Pages are requested one at a time so that a failure stops further requests.
    for page_no in tqdm(range(1, n_pages + 1), desc="process pages"):
        page = request_page(page_no)
        if "items" not in page:
            print("Fetching page failed:")
            print(page)
            break
        if not page["items"]:
            # No more results; later pages would be empty as well.
            break
        for item in page["items"]:
            r = GitRepo(
                author=item["owner"]["login"],
                name=item["name"],
                url=item["html_url"],
                description=item["description"],
                stars=item["stargazers_count"],
                forks=item["forks_count"],
                # Stored as a naive datetime (timezone info dropped).
                last_update=dateparser.parse(item["pushed_at"]).replace(tzinfo=None),
            )
            repos.append(r)
    return repos


In [32]:
# Request up to 50 pages of the most-starred Python repos pushed after the
# given timestamp; the cell output is the number of repos actually fetched.
top_repos = fetch_top_python_repos(n_pages=50, oldest_push="2021-04-20T00:00:01Z")
len(top_repos)


process pages: 10it [00:26,  2.61s/it]

Fetching page failed:
{'message': "API rate limit exceeded for 128.83.122.136. (But here's the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)", 'documentation_url': 'https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting'}





1000

In [24]:
# Display the first fetched repo record as a sanity check.
top_repos[0]


GitRepo(author='donnemartin', name='system-design-primer', url='https://github.com/donnemartin/system-design-primer', stars=182777, forks=33244, lines_of_code=None, last_update=datetime.datetime(2022, 5, 28, 11, 54, 39), n_type_annots=None, n_type_places=None)

In [3]:
# Parse the JSON list of candidate repos and turn every record into a GitRepo.
repo_records = json.loads(read_file("data/mypy-dependents-by-stars.json"))
all_repos = [GitRepo.from_json(record) for record in repo_records]
# all_repos = all_repos[:10]  # uncomment to test on a small subset


In [4]:
# download all candidate repos


def clear_downloaded_repos(repos_dir):
    """Delete `repos_dir` and everything below it.

    Calling this on a directory that does not exist (e.g. one that was already
    cleared) is a no-op instead of raising `FileNotFoundError`. Other errors
    raised by `shutil.rmtree` still propagate.

    Args:
        repos_dir: Path (str or `Path`) of the directory to remove.
    """
    repos_dir = Path(repos_dir)
    if repos_dir.exists():
        shutil.rmtree(repos_dir)


def download_repos(
    to_download: list[GitRepo], repos_dir, download_timeout=10.0, max_workers=10
) -> list[GitRepo]:
    """Download the given repos concurrently and return those that succeeded.

    Each repo is fetched via `repo.download(repos_dir, timeout=download_timeout)`
    on a thread pool with `max_workers` threads; after a successful download
    `repo.read_last_update(repos_dir)` is called. The returned list follows
    completion order, not the order of `to_download`.
    """

    def download_single(repo: GitRepo):
        # Returns the repo on success and None on any kind of failure.
        try:
            if not repo.download(repos_dir, timeout=download_timeout):
                return None
            repo.read_last_update(repos_dir)
            return repo
        except subprocess.TimeoutExpired:
            # Timeouts are dropped without logging.
            return None
        except Exception as err:
            logging.warning(f"Failed to download {repo.name}. Exception: {err}")
            return None

    print("Downloading repos from Github...")
    started_at = time.time()
    outcomes = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(download_single, repo) for repo in to_download]
        for finished in tqdm(as_completed(futures), total=len(futures)):
            outcomes.append(finished.result())
    print(f"Downloading took {time.time() - started_at} seconds.")
    return [repo for repo in outcomes if repo is not None]


# Resolve the data directory from the `datadir` environment variable.
if "datadir" not in os.environ:
    # Path(None) would fail with an opaque TypeError; fail with a clear message instead.
    raise RuntimeError(
        "Environment variable 'datadir' is not set; it must point to the data directory."
    )
datadir = Path(os.environ["datadir"])
repos_dir = datadir / "SPOT-data/repos"
if not repos_dir.exists():
    # First run: download everything. Repos still left under "downloading"
    # afterwards are treated as failed and removed.
    (repos_dir / "downloading").mkdir(parents=True)
    (repos_dir / "downloaded").mkdir(parents=True)
    downloaded_repos = download_repos(all_repos, repos_dir)
    print("Deleting failed repos...")
    shutil.rmtree(repos_dir / "downloading")
else:
    # Later runs: reuse what is on disk, matching directory names against
    # each repo's authorname().
    print("Repos already downloaded.")
    downloaded_dirs = {d.name for d in (repos_dir / "downloaded").iterdir()}
    downloaded_repos = [r for r in all_repos if r.authorname() in downloaded_dirs]
    print("Reading last updates...")
    for r in tqdm(downloaded_repos):
        r.read_last_update(repos_dir)
print(f"Downloaded {len(downloaded_repos)}/{len(all_repos)} repos.")

# assert len(list((repos_dir / "downloaded").iterdir())) == len(downloaded_repos)


Repos already downloaded.
Reading last updates...


100%|██████████| 4890/4890 [00:27<00:00, 175.00it/s]

Downloaded 4890/5996 repos.





In [33]:
from datetime import datetime, timezone

# Keep only the repos whose last update is newer than the cutoff date.
date_threshold = datetime(year=2021, month=4, day=20)
new_repos = list(filter(lambda r: r.last_update > date_threshold, downloaded_repos))
print(f"{len(new_repos)} / {len(downloaded_repos)} repos are updated within a year.")

# Of those, keep the repos that stay below the lines-of-code limit.
loc_limit = 50000
small_repos = []
for rep in tqdm(new_repos):
    try:
        if rep.count_lines_of_code(repos_dir) < loc_limit:
            small_repos.append(rep)
    except UnicodeDecodeError:
        # nothing we can do
        continue
    except Exception as e:
        logging.warning(f"Failed to count lines of code for {rep.name}. Exception: {e}")

print(
    f"{len(small_repos)}/{len(new_repos)} repos are within the size limit ({loc_limit} LOC)."
)


1218 / 4890 repos are updated within a year.


100%|██████████| 1218/1218 [00:05<00:00, 243.41it/s]

1181/1218 repos are within the size limit.





In [34]:
# filter away repos with too few annotations

def count_repo_annots(rep, min_annots_per_line=0.05):
    """Count the annotations of `rep` and keep it only if it is annotated densely enough.

    Calls `rep.count_annotations(repos_dir)` with the notebook-level
    `repos_dir`, then compares `rep.n_type_annots / rep.lines_of_code` with
    the threshold. (Presumably `count_annotations` fills in `n_type_annots`;
    confirm against `GitRepo`.)

    Args:
        rep: The repo to inspect; `lines_of_code` must already be set.
        min_annots_per_line: The annotations-per-line ratio has to be strictly
            greater than this value for the repo to be kept.

    Returns:
        `rep` if it passes the threshold, otherwise `None`. `None` is also
        returned when counting fails (a warning is logged) or when the repo
        has no lines of code.
    """
    try:
        rep.count_annotations(repos_dir)
        if not rep.lines_of_code:
            # Zero/unknown LOC: skip quietly rather than letting the division
            # error be reported as a counting failure.
            return None
        if rep.n_type_annots / rep.lines_of_code > min_annots_per_line:
            return rep
    except Exception as e:
        logging.warning(f"Failed to count annotations for {rep.name}. Exception: {e}")
    return None


# Count annotations for every size-filtered repo in parallel worker processes.
counted = []
with ProcessPoolExecutor(max_workers=30) as pool:
    pending = [pool.submit(count_repo_annots, rep) for rep in small_repos]
    for finished in tqdm(as_completed(pending), total=len(pending)):
        counted.append(finished.result())

# Drop the rejected repos (None results) and any repo whose name contains "typeshed".
useful_repos: list[GitRepo] = []
for candidate in counted:
    if candidate is None or "typeshed" in candidate.name:
        continue
    useful_repos.append(candidate)

print(
    f"{len(useful_repos)}/{len(small_repos)} repos are parsable and have enough portions of type annotations."
)


Incomplete input. Encountered 'linenum', but expected ';', or 'NEWLINE'.

                print linenum, l.rstrip()
                      ^
Incomplete input. Encountered '"Test Plan:"', but expected ';', or 'NEWLINE'.

        print "Test Plan:", self.test_name
              ^
Incomplete input. Unexpectedly encountered '='.

= nonsense =
^
Incomplete input. Encountered '"not yet working script for generating a communication layer for dedicated ECU out of can database"', but expected ';', or 'NEWLINE'.

        print "not yet working script for generating a communication layer for dedicated ECU out of can database"
              ^
Incomplete input. Unexpectedly encountered '%'.

{% if cookiecutter.use_sentry == 'y' -%}
 ^
Incomplete input. Encountered '"Done."', but expected ';', or 'NEWLINE'.

    print "Done."
          ^
Incomplete input. Unexpectedly encountered '%'.

{%- if cookiecutter.use_postgres == 'y' %}
 ^
Incomplete input. Unexpectedly encountered an indent.

        writer.

664/1181 repos have enough portions of type annotations.


In [35]:
# Some summary statistics

# print total number of manual annotations
n_total_annots = sum(rep.n_type_annots for rep in useful_repos)
print("Total number of manual annotations:", n_total_annots)

# print total number of type places
n_total_places = sum(rep.n_type_places for rep in useful_repos)
print("Total number of type places:", n_total_places)

# print total number of lines of code
n_total_lines = sum(rep.lines_of_code for rep in useful_repos)
print("Total number of lines of code:", n_total_lines)

# print average number of type annotations per line of code, restricted to
# projects with fewer than 1000 lines of code. Numerator and denominator are
# taken over the same set of projects so that the ratio is a real average.
repos_under_1k_loc = [rep for rep in useful_repos if rep.lines_of_code < 1000]
n_lines_under_1k = sum(rep.lines_of_code for rep in repos_under_1k_loc)
n_avg_annots = (
    sum(rep.n_type_annots for rep in repos_under_1k_loc) / n_lines_under_1k
    if n_lines_under_1k
    else 0.0  # no such projects (or no lines): avoid dividing by zero
)
print(
    "Average number of annotations per line of code (projects under 1000 LOC):",
    n_avg_annots,
)


Total number of manual annotations: 343595
Total number of type places: 544497
Total number of lines of code: 3342911


[GitRepo(author='skorokithakis', name='catt', url='https://github.com/skorokithakis/catt', stars=1740, forks=762, lines_of_code=2036, last_update=datetime.datetime(2022, 4, 10, 1, 30, 43), n_type_annots=140, n_type_places=433),
 GitRepo(author='encode', name='databases', url='https://github.com/encode/databases', stars=769, forks=48, lines_of_code=3124, last_update=datetime.datetime(2022, 3, 6, 12, 25, 10), n_type_annots=323, n_type_places=498),
 GitRepo(author='Curt-Park', name='rainbow-is-all-you-need', url='https://github.com/Curt-Park/rainbow-is-all-you-need', stars=490, forks=110, lines_of_code=107, last_update=datetime.datetime(2022, 1, 13, 23, 4, 48), n_type_annots=26, n_type_places=30),
 GitRepo(author='jreese', name='aiomultiprocess', url='https://github.com/jreese/aiomultiprocess', stars=585, forks=45, lines_of_code=1140, last_update=datetime.datetime(2022, 2, 4, 21, 28, 7), n_type_annots=138, n_type_places=213),
 GitRepo(author='instaloader', name='instaloader', url='https:/

In [5]:
import pickle

# Persist the selected repos with pickle, then read the file back and print
# the first three entries to confirm the round trip.
useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
with open(useful_repos_path, "wb") as out_file:
    pickle.dump(useful_repos, out_file)
print(f"Saved {len(useful_repos)} useful repos to {useful_repos_path}.")
with open(useful_repos_path, "rb") as in_file:
    reloaded_repos = pickle.load(in_file)
print(reloaded_repos[:3])


[GitRepo(author='typeddjango', name='pytest-mypy-plugins', url='https://github.com/typeddjango/pytest-mypy-plugins', stars=12, forks=0, lines_of_code=1039, last_update=datetime.datetime(2022, 4, 18, 23, 25, 40), n_type_annots=155, n_type_places=158), GitRepo(author='jfly', name='jfly.github.io', url='https://github.com/jfly/jfly.github.io', stars=0, forks=0, lines_of_code=650, last_update=datetime.datetime(2022, 4, 12, 8, 23, 39), n_type_annots=39, n_type_places=122), GitRepo(author='seattleflu', name='id3c', url='https://github.com/seattleflu/id3c', stars=2, forks=0, lines_of_code=8883, last_update=datetime.datetime(2022, 4, 21, 15, 38, 59), n_type_annots=675, n_type_places=1068)]


In [1]:
# Run from here: Analyzing src datasets.

%load_ext autoreload
%autoreload 2

import pickle

from spot import proj_root
from spot.data import TokenizedSrcSet, get_data_dir, get_datasets_name, PreprocessArgs
import spot.function_dataset as fd
from spot.utils import Path, run_long_task, DefaultTokenizer, not_none
import subprocess

# Inputs: the pickled repo split and the directory holding the repos.
repos_split_path = proj_root() / "data/repos_split.pkl"
repos_dir = get_data_dir() / "SPOT-data/repos/"

# Settings that select which dataset variant is built and loaded.
recreate = False  # when True, the dataset is regenerated even if it exists on disk
func_only = True
data_reduction = 1
pre_args = PreprocessArgs(drop_env_types=True, stub_in_preamble=True)

# The dataset directory name is derived from the settings above.
datasets_name = get_datasets_name(pre_args, func_only, data_reduction=data_reduction)
datasets_path = get_data_dir() / "SPOT-data" / datasets_name

In [2]:
from spot.data import create_tokenized_srcsets, load_tokenized_srcsets
# Build the tokenized source sets when they are missing on disk (or when
# `recreate` is set), then load them.
if recreate or not datasets_path.exists():
    create_tokenized_srcsets(
        repos_split_path,  # defined in the configuration cell
        datasets_path,
        func_only=func_only,
        pre_args=pre_args,
        data_reduction=data_reduction,
    )
# `datadir` is not defined when the notebook is run from the "Run from here"
# cell, so use get_data_dir(), which is also the root that datasets_path is
# built from.
tk_dataset = load_tokenized_srcsets(
    get_data_dir(),
    datasets_name,
)


Starting task: Generating TokenizedSrcSets: func_datasets-v5-PreprocessArgs(drop_env_types=False)


Generating dataset from repos: 100%|██████████| 573/573 [08:02<00:00,  1.19it/s]
Generating dataset from repos: 100%|██████████| 40/40 [02:12<00:00,  3.32s/it]
Generating dataset from repos: 100%|██████████| 50/50 [01:32<00:00,  1.85s/it]


Saved source datasets to: /mnt/data0/jiayi/SPOT-data/func_datasets-v5-PreprocessArgs(drop_env_types=False)
777M	/mnt/data0/jiayi/SPOT-data/func_datasets-v5-PreprocessArgs(drop_env_types=False)
Pushover: (Finished: 'Generating TokenizedSrcSets: func_datasets-v5-PreprocessArgs(drop_env_types=False)'.) Time taken: 736.9s


In [None]:
import plotly.express as px
from pandas import DataFrame

from spot.utils import cumulative_counts

# Plot the cumulative counts of tokenized-code length over the training sources.
train_srcs = tk_dataset["train"].all_srcs
len_counts = [len(src.tokenized_code) for src in train_srcs]
xs, ys = cumulative_counts(len_counts)
length_frame = DataFrame({"tokens_per_file": xs, "n_files": ys})
px.line(length_frame, x="tokens_per_file", y="n_files")

In [3]:
# Show which dataset directory is in use, followed by the training-set statistics.
print("dataset:", datasets_path)
tk_dataset["train"].print_stats()

dataset: /mnt/data0/jiayi/SPOT-data/func_datasets-v5-PreprocessArgs(drop_env_types=False)
num_repos: 572
num_files: 134977
num_lines: 5752662
num_labels: 293592
main_tokens_per_file:
   mean: 1578.6
   median: 717
   min: 17
   max: 97666
preamble_tokens_per_file:
   mean: 196.41
   median: 134
   min: 2
   max: 6729
target_tks_per_file:
   mean: 7.8197
   median: 5
   min: 2
   max: 725


In [3]:
# Same statistics cell as the previous one; the output kept below it comes
# from a run against a different dataset directory.
print("dataset:", datasets_path)
tk_dataset["train"].print_stats()

dataset: /mnt/data0/jiayi/SPOT-data/func_datasets-v4-PreprocessArgs(drop_env_types=False)
num_repos: 572
num_files: 134977
num_lines: 16579520
num_labels: 293592
main_tokens_per_file:
   mean: 1585
   median: 717
   min: 17
   max: 94019
preamble_tokens_per_file:
   mean: 970.94
   median: 532
   min: 6
   max: 49334
target_tks_per_file:
   mean: 7.8197
   median: 5
   min: 2
   max: 725


In [8]:
from spot.data import load_tokenized_srcsets

# tk_dataset = load_tokenized_srcsets(get_data_dir(), get_dataset_name(pre_args, func_only))

# Training sources ordered from the longest to the shortest tokenized code.
long_files = list(tk_dataset["train"].all_srcs)
long_files.sort(key=lambda src: len(src.tokenized_code), reverse=True)

In [12]:
# Print the preamble of the 9th-longest training source.
print(long_files[8].preamble_code)

from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
class Item(BaseModel):
    ...
class Message(BaseModel):
    ...



In [7]:
# Print the string form of one arbitrary training source (index 2345).
sample_src = tk_dataset["train"].all_srcs[2345]
print(sample_src)

file:devonhollowood__adventofcode/2018.day10/Point.velocity
repo:devonhollowood__adventofcode
--------Preamble--------
import argparse
import re
import typing
import unittest
from functools import lru_cache
from dataclasses import dataclass
@dataclass(...)
class Vector:
    x: ...
    y: ...
@dataclass(...)
class Point:
    position: ...
    velocity: ...
    def step(self): ...
    velocity: ...
    position: ...
def bounding_circumference(points): ...
@lru_cache(...)
def run_until_compact(points): ...
@lru_cache(...)
def parse(puzzle): ...
def part1(puzzle): ...
def part2(puzzle): ...
def main(): ...
if __name__ == '__main__':
    main()
class ExampleTest(unittest.TestCase):
    example = ...
    expected = ...
    def test_part1(self): ...
    def test_part2(self): ...
    example: ...
    assertEqual: ...
    expected: ...

--------Main Code--------

# BEGIN

# 2018.day10
@dataclass(frozen=True)
class Point:
    velocity: <mask>

# END

# 2018.day10
@lru_cache()
def parse(puzzle: s

In [12]:
# Find the training source with the most tokens and print its code via print_code(500).
longest_src = max(tk_dataset["train"].all_srcs, key=lambda src: len(src.tokenized_code))
longest_src.print_code(500)

# fastapi.routing/APIRouter
def api_route(
    self,
    path,
    *,
    response_model = None,
    status_code = None,
    tags = None,
    dependencies = None,
    summary = None,
    description = None,
    response_description = "Successful Response",
    responses = None,
    deprecated = None,
    methods = None,
    operation_id = None,
    response_model_include = None,
    response_model_exclude = None,
    response_model_by_alias = True,
    response_model_exclude_unset = False,
    response_model_exclude_defaults = False,
    response_model_exclude_none = False,
    include_in_schema = True,
    response_class = Default(JSONResponse),
    name = None,
    callbacks = None,
    openapi_extra = None,
    generate_unique_id_function = Default(
        generate_unique_id
    ),
):
    def decorator(func):
        self.add_api_route(
            path,
            func,
            response_model=response_model,
            status_code=status_code,
            tags=tags,
        

In [4]:
# Print the statistics of the training split.
tk_dataset["train"].print_stats()


num_repos: 572
num_files: 16281
num_lines: 2698669
num_labels: 295457
main_tokens_per_file:
   mean: 1311.6
   median: 608
   min: 8
   max: 55437
preamble_tokens_per_file:
   mean: 301.97
   median: 181
   min: 6
   max: 9784
target_tks_per_file:
   mean: 65.318
   median: 28
   min: 2
   max: 3798
n_files_too_wide: 444
too_wide_ratio: 0.014897
preprocess: PreprocessArgs(imports_in_preamble=True, stub_in_preamble=True, drop_comments=True)


In [8]:
def preamble_len(src):
    """Return the number of newline-separated lines in `src.preamble_code`.

    An empty preamble counts as one line, and a trailing newline adds an
    extra (empty) line.
    """
    preamble_lines = src.preamble_code.split("\n")
    return len(preamble_lines)


# Pick the training source whose preamble has the most lines.
weird_src = max(tk_dataset["train"].all_srcs, key=preamble_len)


In [3]:
from spot.data import load_tokenized_srcsets, get_dataroot

# Directory of the tokenized source sets to inspect, located under get_dataroot().
sdata_path = get_dataroot() / "TokenizedSrcSets" / "ManyTypes4Py-v5-PreprocessArgs()"

# Replaces any previously loaded `tk_dataset` with this dataset.
tk_dataset = load_tokenized_srcsets(sdata_path)

Loading TokenizedSrcSets:  /mnt/nas/jiayi/SPOT/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()
258M	/mnt/nas/jiayi/SPOT/TokenizedSrcSets/ManyTypes4Py-v5-PreprocessArgs()


In [6]:
# Print the statistics of the test split.
tk_dataset["test"].print_stats()

num_repos: 50
num_files: 949
num_lines: 139121
num_labels: 17740
main_tokens_per_file:
   mean: 1270.5
   median: 632
   min: 23
   max: 57953
preamble_tokens_per_file:
   mean: 103.8
   median: 67
   min: 2
   max: 1517
target_tks_per_file:
   mean: 72.285
   median: 32
   min: 2
   max: 1882
n_files_too_wide: 1
too_wide_ratio: 0.00062735
preprocess: PreprocessArgs(imports_in_preamble=True, stub_in_preamble=True, drop_comments=True, max_callees=80, max_callers=20, drop_env_types=True, add_override_usages=False)
