In [2]:
%load_ext autoreload
%autoreload 2

# pip install git2doc
from git2doc import loader
from pprint import pprint
import tiktoken

In [5]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens in a text string.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to use, e.g. "cl100k_base".

    Returns:
        The length of the token sequence the encoding produces for ``string``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Source files can contain the literal text of any special token
    # (e.g. "<|endoftext|>" or "<|fim_prefix|>"). An empty disallowed set makes
    # the encoder treat all of them as ordinary text instead of raising
    # ValueError, so counting never aborts on such a file.
    return len(encoding.encode(string, disallowed_special=()))

In [6]:
# Retrieve the code of a repository given its URL

# using a random repo as an example
repo_name = "https://github.com/voynow/jamievoynow.com"

# return a list of Document objects
repo_data = loader.pull_code_from_repo(repo_name)

# or return a string of all the raw text
raw_repo = loader.files_to_str(repo_data)

# Preview only the first 1000 characters of the combined text.
print(raw_repo[:1000])

app.py:

import json
import logging

import src.config as config
import src.services as services

from flask import Flask, render_template
from flask_caching import Cache
from flask_socketio import SocketIO, send


# Set up Flask app and socketio
app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*")

# Configure caching
cache = Cache(app, config={"CACHE_TYPE": "simple"})

# Configure logging
handler = logging.FileHandler("app.log")
handler.setLevel(logging.INFO)
logging.basicConfig(
    filename="app.log",
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s - %(filename)s:%(lineno)d - %(funcName)s",
    datefmt="%d-%b-%y %H:%M:%S",
)
app.logger.addHandler(logging.StreamHandler())

# Fetch projects from GitHub
PROJECTS = services.fetch_projects_info(app)


@app.route("/")
@cache.cached(timeout=50)
def home():
    project_objs = [value for _, value in PROJECTS.items()]
    return render_template(
        


In [75]:
# Request 5000 Python repos from the last 365 days.
# NOTE(review): the recorded output reports 6 intervals of 833 repos each
# (roughly 60-day windows), not 5 windows of 1000 repos -- confirm how the
# loader chooses its intervals.
# NOTE(review): the recorded run also hit HTTP 403 responses from the GitHub
# search API on some pages, so the result may be incomplete -- presumably
# rate limiting; verify.
top_repos = loader.get_top_repos(
    n_repos=5000,
    last_n_days=365,
    language="python",
)

Getting 833 repos x 6 intervals.
Querying 833 repos on 2022-06-23 -> 2022-08-22
Querying 833 repos on 2022-08-22 -> 2022-10-21
Querying 833 repos on 2022-10-21 -> 2022-12-20
HTTP Error: 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=language%3Apython+created%3A2022-10-21..2022-12-20&sort=sort&order=desc&per_page=100&page=5
Querying 833 repos on 2022-12-20 -> 2023-02-18
Querying 833 repos on 2023-02-18 -> 2023-04-19
HTTP Error: 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=language%3Apython+created%3A2023-02-18..2023-04-19&sort=sort&order=desc&per_page=100&page=5
HTTP Error: 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=language%3Apython+created%3A2023-02-18..2023-04-19&sort=sort&order=desc&per_page=100&page=5
HTTP Error: 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=language%3Apython+created%3A2023-02-18..2023-04-19&sort=sort&order=desc&per_page

In [77]:
# Pipeline for get_top_repos() -> pull_code_from_repo(): query 25 Python
# repos from the last 365 days and load each one.
# NOTE(review): `delete=True` presumably removes each local clone once it has
# been loaded -- confirm against git2doc.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# later cells presumably rely on a `github_data` produced by another run.
github_data = loader.pipeline_fetch_and_load(
    n_repos=25,
    last_n_days=365,
    language="python",
    delete=True,
)

Getting 25 repos x 1 intervals.
Querying 25 repos on 2022-06-23 -> 2023-06-23
(0) Processing https://github.com/Significant-Gravitas/Auto-GPT...
(1) Processing https://github.com/xtekky/gpt4free...
(2) Processing https://github.com/LAION-AI/Open-Assistant...
(3) Processing https://github.com/microsoft/TaskMatrix...
(4) Processing https://github.com/THUDM/ChatGLM-6B...
(5) Processing https://github.com/tatsu-lab/stanford_alpaca...
(6) Processing https://github.com/lm-sys/FastChat...
(7) Processing https://github.com/facebookresearch/llama...
(8) Processing https://github.com/karpathy/nanoGPT...
(9) Processing https://github.com/lllyasviel/ControlNet...
(10) Processing https://github.com/jerryjliu/llama_index...
(11) Processing https://github.com/yoheinakajima/babyagi...
(12) Processing https://github.com/zhayujie/chatgpt-on-wechat...
(13) Processing https://github.com/mouredev/Hello-Python...
(14) Processing https://github.com/kaixindelele/ChatPaper...
(15) Processing https://github.com

KeyboardInterrupt: 

In [None]:
from pympler import asizeof

# Combined size, as reported by asizeof, of the first 250 entries in github_data.
sum(asizeof.asizeof(github_data[name]) for name in list(github_data)[:250])

212235912

In [None]:
import pandas as pd
import time

# Column-oriented accumulator: one entry per file across all repositories.
df_dict = {
    'repo': [],
    'file_path': [],
    'file_type': [],
    'num_tokens': [],
}

for repo, data in github_data.items():
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() is wall-clock time whose resolution
    # is platform dependent (the recorded run shows durations of 0.000000).
    start = time.perf_counter()
    total_tokens = 0

    # Repos without any loaded files add no rows and print no summary line.
    if data['files']:
        for doc in data['files']:
            # Read and tokenize everything before appending, so an exception
            # cannot leave the four column lists with different lengths.
            file_path = doc['metadata']['file_path']
            file_type = doc['metadata']['file_type']
            num_tokens = num_tokens_from_string(doc['page_content'], "cl100k_base")

            df_dict['repo'].append(repo)
            df_dict['file_path'].append(file_path)
            df_dict['file_type'].append(file_type)
            df_dict['num_tokens'].append(num_tokens)
            total_tokens += num_tokens

        # Per-repo summary: repo, total tokens, elapsed seconds.
        # NOTE: the spec `3f` means minimum width 3 with the default six
        # decimals, not three decimals (`.3f`); kept as-is to preserve output.
        print(f"{repo: <100} {total_tokens: <15} {time.perf_counter() - start:3f}")

# One row per file: repo, file_path, file_type, num_tokens.
df = pd.DataFrame(df_dict)

https://github.com/automorphic-ai/aegis                                                              2469            0.246980
https://github.com/edzusans/Valorant-Instalock                                                       1355            0.000998
https://github.com/Felpesx/Twitter-Auto-Reply-Bot                                                    516             0.000000
https://github.com/techleadhd/chatgpt-retrieval                                                      499             0.001004
https://github.com/lcw99/evolve-instruct                                                             4563            0.002993
https://github.com/simonw/symbex                                                                     12770           0.010004
https://github.com/lucidrains/spear-tts-pytorch                                                      1507            0.000999
https://github.com/beyondguo/LLM-Tuning                                                              6031            0

In [None]:
# Render floats with three decimal places in DataFrame output.
pd.set_option('display.float_format', lambda value: f'{value:.3f}')

# Per file type: total tokens, number of files and mean tokens per file,
# ordered by total tokens (largest first) and limited to the top 25.
(
    df.groupby('file_type')
    .agg({'num_tokens': ['sum', 'count', 'mean']})
    .sort_values(('num_tokens', 'sum'), ascending=False)
    .head(25)
)

Unnamed: 0_level_0,num_tokens,num_tokens,num_tokens
Unnamed: 0_level_1,sum,count,mean
file_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
.py,26079274,14388,1812.571
.wav,3115219,55451,56.18
.po,2700403,1211,2229.895
.pem,1086131,14,77580.786
,986471,657,1501.478
.js,768689,100,7686.89
.md,443815,439,1010.968
.html,431153,261,1651.927
.lock,319364,7,45623.429
.cpp,73938,38,1945.737
