In [9]:
%load_ext autoreload
%autoreload 2

# pip install git2doc
from git2doc import loader
from pprint import pprint
import tiktoken


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to `tiktoken.get_encoding`
            (for example "cl100k_base").

    Returns:
        Length of the token sequence produced by the encoder.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)

    # Every special token except '<|endoftext|>' is kept on the disallowed
    # list; only that one marker is exempted from the encoder's check.
    # NOTE(review): per the tiktoken docs, text containing a disallowed special
    # token makes encode() raise ValueError -- confirm that is the intent here.
    blocked_specials = tokenizer.special_tokens_set - {'<|endoftext|>'}

    token_ids = tokenizer.encode(string, disallowed_special=blocked_specials)
    return len(token_ids)

In [2]:
# Retrieve a repository's code given the repository URL.

# An arbitrary public repo used as the example. Despite the variable name,
# the value is the full repository URL, not just the repo's name.
repo_name = "https://github.com/voynow/jamievoynow.com"

# Load the repo's files; the result is a list of Document objects.
repo_data = loader.pull_code_from_repo(repo_name)

# Alternatively, flatten those documents into one string of raw text.
raw_repo = loader.docs_to_str(repo_data)

# Preview only the first 1000 characters to keep the cell output short.
print(raw_repo[:1000])

app.py:

import json
import logging

import src.config as config
import src.services as services

from flask import Flask, render_template
from flask_caching import Cache
from flask_socketio import SocketIO, send


# Set up Flask app and socketio
app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*")

# Configure caching
cache = Cache(app, config={"CACHE_TYPE": "simple"})

# Configure logging
handler = logging.FileHandler("app.log")
handler.setLevel(logging.INFO)
logging.basicConfig(
    filename="app.log",
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s - %(filename)s:%(lineno)d - %(funcName)s",
    datefmt="%d-%b-%y %H:%M:%S",
)
app.logger.addHandler(logging.StreamHandler())

# Fetch projects from GitHub
PROJECTS = services.fetch_projects_info(app)


@app.route("/")
@cache.cached(timeout=50)
def home():
    project_objs = [value for _, value in PROJECTS.items()]
    return render_template(
        


In [3]:
# Fetch metadata for a batch of popular repositories.

# Query via the GitHub API: 10 Python repos from the last 10 days,
# ordered by star count, highest first.
top_repos = loader.get_top_repos(
    n_repos=10,
    last_n_days=10,
    language="python",
    sort="stars",
    order="desc",
)

# Pretty-print just the first entry; a single repo's metadata dict is large.
pprint(top_repos[0])

{'allow_forking': True,
 'archive_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/{archive_format}{/ref}',
 'archived': False,
 'assignees_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/assignees{/user}',
 'blobs_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/git/blobs{/sha}',
 'branches_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/branches{/branch}',
 'clone_url': 'https://github.com/baichuan-inc/baichuan-7B.git',
 'collaborators_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/collaborators{/collaborator}',
 'comments_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/comments{/number}',
 'commits_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/commits{/sha}',
 'compare_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/compare/{base}...{head}',
 'contents_url': 'https://api.github.com/repos/baichuan-inc/baichuan-7B/contents/{+path}',
 'contributors_url': 'https://api.github.com/repo

In [8]:
# End-to-end pipeline: the results of get_top_repos() are fed into
# pull_code_from_repo(), one repository at a time.
# NOTE(review): delete=True presumably removes each local checkout once it has
# been loaded -- confirm against the git2doc documentation.
github_data = loader.pipeline_fetch_and_load(
    n_repos=250,
    last_n_days=30,
    language="python",
    delete=True,
)

(0) Processing https://github.com/s0md3v/roop...
(1) Processing https://github.com/PromtEngineer/localGPT...
(2) Processing https://github.com/facebookresearch/audiocraft...
(3) Processing https://github.com/baichuan-inc/baichuan-7B...
(4) Processing https://github.com/kyegomez/tree-of-thoughts...
(5) Processing https://github.com/SysCV/sam-hq...
(6) Processing https://github.com/facebookresearch/ijepa...
(7) Processing https://github.com/princeton-vl/infinigen...
(8) Processing https://github.com/WankkoRee/eaio...
(9) Processing https://github.com/lyuchenyang/Macaw-LLM...
(10) Processing https://github.com/hiyouga/LLaMA-Efficient-Tuning...
(11) Processing https://github.com/princeton-nlp/MeZO...
(12) Processing https://github.com/wenge-research/YaYi...
(13) Processing https://github.com/aiwaves-cn/RecurrentGPT...
(14) Processing https://github.com/Liuhong99/Sophia...
(15) Processing https://github.com/deepmind/alphadev...
(16) Processing https://github.com/Victorwz/LongMem...
(17) Pro

In [22]:
from pympler import asizeof

# Total asizeof() footprint of the loaded data, measured one repo at a time
# over (at most) the first 250 keys of github_data and then summed.
sum(
    asizeof.asizeof(github_data[key])
    for key in list(github_data)[:250]
)

198858616

In [13]:
import pandas as pd
import time

# Column-oriented accumulator: each list receives exactly one entry per
# document, so the four lists map directly onto DataFrame columns.
df_dict = {
    'repo': [],
    'file_path': [],
    'file_type': [],
    'num_tokens': [],
}

for repo, data in github_data.items():
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    total_tokens = 0

    # Repos whose 'docs' entry is empty/falsy add no rows and print no line.
    if data['docs']:
        for doc in data['docs']:
            # Gather every value for this document before appending any of
            # them, so an exception part-way through (missing metadata key,
            # tokenizer error) cannot leave the columns with unequal lengths.
            file_path = doc.metadata['file_path']
            file_type = doc.metadata['file_type']
            num_tokens = num_tokens_from_string(doc.page_content, "cl100k_base")

            df_dict['repo'].append(repo)
            df_dict['file_path'].append(file_path)
            df_dict['file_type'].append(file_type)
            df_dict['num_tokens'].append(num_tokens)
            total_tokens += num_tokens

        # Per-repo summary line: repo, total tokens, seconds spent tokenizing.
        # '.3f' means three decimal places; a bare '3f' is only a minimum
        # field width of 3 with the default six decimals.
        elapsed = time.perf_counter() - start
        print(f"{repo: <100} {total_tokens: <15} {elapsed:.3f}")

# One row per document across all repos.
df = pd.DataFrame(df_dict)

https://github.com/s0md3v/roop                                                                       15023           0.013518
https://github.com/PromtEngineer/localGPT                                                            781973          0.583975
https://github.com/facebookresearch/audiocraft                                                       96308           0.077015
https://github.com/baichuan-inc/baichuan-7B                                                          25468           0.018793
https://github.com/kyegomez/tree-of-thoughts                                                         137144          0.097980
https://github.com/SysCV/sam-hq                                                                      34425           0.026086
https://github.com/facebookresearch/ijepa                                                            23869           0.023063
https://github.com/princeton-vl/infinigen                                                            1350156         1

In [23]:
# Render floats with three decimal places in DataFrame output.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Per file type: total tokens, number of documents, and mean tokens per
# document, limited to the 25 file types with the largest token totals.
(
    df.groupby('file_type')
    .agg({'num_tokens': ['sum', 'count', 'mean']})
    .sort_values(('num_tokens', 'sum'), ascending=False)
    .head(25)
)

Unnamed: 0_level_0,num_tokens,num_tokens,num_tokens
Unnamed: 0_level_1,sum,count,mean
file_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
.py,21427260,9732,2201.732
.corrected,5449593,16,340599.562
,2712339,921,2944.993
.js,2482685,75,33102.467
.tsv,2416224,8,302028.0
.md,1999805,734,2724.53
.pickle,1483343,2,741671.5
.ndjson,804701,46,17493.5
.lock,761007,13,58539.0
.cpp,750130,108,6945.648
