# Cloning and Processing repos from GitHub

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from tqdm import tqdm

pd.options.display.float_format = "{:.2f}".format

## Clone the repos from the filtered repos.

In [40]:
import pandas as pd
import git
from pathlib import Path
import os
from tqdm import tqdm
import json
from pydriller import Repository, ModificationType

## Read repo names.

In [41]:
filtered_df = pd.read_json("./metadata/filtered_ghs_results_05_mar_2023.jsonl", orient="records", lines=True)

In [42]:
import calendar
import time
import datetime
import logging
import jsonlines
import os
import shutil

logger = logging.getLogger("clone_repo")
fh = logging.FileHandler("./logs/clone_repo.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(fh)

langs = ['Python', 'TypeScript', 'JavaScript']
#langs = ['Java']

filtered_df = filtered_df[filtered_df['mainLanguage'].isin(langs)]

In [43]:
github_metadata_path = "./metadata/cloned_repo_metadata.jsonl"

names = filtered_df["name"].tolist()

mainLanguages = filtered_df["mainLanguage"].tolist()

if os.path.exists(github_metadata_path):
    with jsonlines.open(github_metadata_path, "r") as reader:
        processed_names = set(line["name"] for line in reader)
    logger.info(f"Already processed {len(processed_names)} repositories!")
    names = [name for name in names if name not in processed_names]
    mainLanguages = [lang for name, lang in zip(names, mainLanguages) if name not in processed_names]
    logger.info(f"{len(names)} repositories left!")
else:
    open(github_metadata_path, "w").close()

In [44]:
cur_chunk = []
chunksize = 20

output_path = '/Volumes/T7/data/repos/'

for lang in langs:
    Path(output_path + lang).mkdir(parents=True, exist_ok=True)

for id, repo_name in tqdm(enumerate(names), total=len(names)):
    try:
        lang = mainLanguages[id]
        url = f'https://github.com/{repo_name}.git'
        logger.info(f'Cloning repository at {url}.')
        local_path = os.path.join(output_path + lang, repo_name.replace("/", "@"))

        if os.path.exists(local_path):
            logger.info(f"{local_path} already exists. remove!!!")
            shutil.rmtree(local_path)

        r = git.Repo.clone_from(url, local_path)
        sha = r.rev_parse('HEAD').hexsha
        data = {
            "name": repo_name,
            "mainLanguage": lang,
            "url": url,
            "local_path": local_path,
            "sha": sha
        }
    except Exception as e:
        print(e)
        logger.warning(f"GitHub exception with repo {repo_name}, {repr(e)}")
        data = {"name": repo_name, "mainLanguage": lang, "url": url, "local_path": None, "sha": None}

    cur_chunk.append(data)
    if len(cur_chunk) > chunksize:
        with jsonlines.open(github_metadata_path, "a") as writer:
            writer.write_all(cur_chunk)
            cur_chunk = []

if cur_chunk:
    with jsonlines.open(github_metadata_path, "a") as writer:
        writer.write_all(cur_chunk)
        cur_chunk = []

  2%|▏         | 30/1500 [10:47<23:25:34, 57.37s/it] 

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/lzwme/chatgpt-sites.git /Volumes/T7/data/repos/TypeScript/lzwme@chatgpt-sites
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/lzwme@chatgpt-sites'...
fatal: could not read Username for 'https://github.com': Device not configured
'


  9%|▊         | 128/1500 [19:18<52:43,  2.31s/it]  

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/yesmore/inke.git /Volumes/T7/data/repos/TypeScript/yesmore@inke
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/yesmore@inke'...
fatal: could not read Username for 'https://github.com': Device not configured
'


 10%|█         | 152/1500 [21:05<6:55:45, 18.51s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/uiwjs/react-login-page.git /Volumes/T7/data/repos/TypeScript/uiwjs@react-login-page
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/uiwjs@react-login-page'...
POST git-upload-pack (gzip 1927 to 1030 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 12%|█▏        | 185/1500 [25:14<3:52:35, 10.61s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/recursivelyai/copilotkit.git /Volumes/T7/data/repos/TypeScript/recursivelyai@copilotkit
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/recursivelyai@copilotkit'...
POST git-upload-pack (gzip 11627 to 5873 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 13%|█▎        | 188/1500 [26:38<9:59:05, 27.40s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/rgthree/rgthree-comfy.git /Volumes/T7/data/repos/TypeScript/rgthree@rgthree-comfy
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/rgthree@rgthree-comfy'...
POST git-upload-pack (327 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 13%|█▎        | 190/1500 [26:55<6:36:26, 18.16s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/all-in-aigc/aiwallpaper.git /Volumes/T7/data/repos/TypeScript/all-in-aigc@aiwallpaper
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/all-in-aigc@aiwallpaper'...
POST git-upload-pack (277 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 15%|█▌        | 225/1500 [30:40<3:02:55,  8.61s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/douo/raycast-openai-translator.git /Volumes/T7/data/repos/TypeScript/douo@raycast-openai-translator
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/douo@raycast-openai-translator'...
POST git-upload-pack (277 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 16%|█▌        | 238/1500 [32:18<2:39:49,  7.60s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/google/aside.git /Volumes/T7/data/repos/TypeScript/google@aside
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/google@aside'...
POST git-upload-pack (gzip 1177 to 638 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 22%|██▏       | 324/1500 [38:05<9:29:54, 29.08s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/kumaaa-inc/shadow-panda.git /Volumes/T7/data/repos/TypeScript/kumaaa-inc@shadow-panda
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/kumaaa-inc@shadow-panda'...
POST git-upload-pack (gzip 1627 to 876 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 23%|██▎       | 341/1500 [40:23<6:01:04, 18.69s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/Enter-tainer/typst-preview.git /Volumes/T7/data/repos/TypeScript/Enter-tainer@typst-preview
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/Enter-tainer@typst-preview'...
POST git-upload-pack (gzip 2777 to 1451 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 23%|██▎       | 344/1500 [41:50<10:23:57, 32.39s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/geng-haoran/Simulately.git /Volumes/T7/data/repos/TypeScript/geng-haoran@Simulately
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/geng-haoran@Simulately'...
POST git-upload-pack (277 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 23%|██▎       | 352/1500 [43:09<6:16:52, 19.70s/it] 

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/coconut-xr/natuerlich.git /Volumes/T7/data/repos/TypeScript/coconut-xr@natuerlich
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/coconut-xr@natuerlich'...
POST git-upload-pack (327 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 24%|██▍       | 366/1500 [48:01<18:52:02, 59.90s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/mlg404/palworld-paldex-api.git /Volumes/T7/data/repos/TypeScript/mlg404@palworld-paldex-api
  stderr: 'Cloning into '/Volumes/T7/data/repos/TypeScript/mlg404@palworld-paldex-api'...
POST git-upload-pack (427 bytes)
error: RPC failed; curl 56 Recv failure: Connection reset by peer
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 34%|███▍      | 513/1500 [1:06:16<5:12:17, 18.98s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/chatchat-space/langchain-ChatGLM.git /Volumes/T7/data/repos/Python/chatchat-space@langchain-ChatGLM
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/chatchat-space@langchain-ChatGLM'...
POST git-upload-pack (gzip 1977 to 1053 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 35%|███▍      | 520/1500 [1:07:49<4:37:58, 17.02s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/cumulo-autumn/StreamDiffusion.git /Volumes/T7/data/repos/Python/cumulo-autumn@StreamDiffusion
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/cumulo-autumn@StreamDiffusion'...
POST git-upload-pack (677 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 37%|███▋      | 551/1500 [1:14:35<6:44:41, 25.59s/it] 

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/hiyouga/LLaMA-Efficient-Tuning.git /Volumes/T7/data/repos/Python/hiyouga@LLaMA-Efficient-Tuning
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/hiyouga@LLaMA-Efficient-Tuning'...
POST git-upload-pack (gzip 1277 to 703 bytes)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 37%|███▋      | 557/1500 [1:16:25<7:59:23, 30.50s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/ux-decoder/segment-everything-everywhere-all-at-once.git /Volumes/T7/data/repos/Python/ux-decoder@segment-everything-everywhere-all-at-once
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/ux-decoder@segment-everything-everywhere-all-at-once'...
POST git-upload-pack (327 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 41%|████      | 614/1500 [1:26:40<6:25:39, 26.12s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/PKU-YuanGroup/Video-LLaVA.git /Volumes/T7/data/repos/Python/PKU-YuanGroup@Video-LLaVA
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/PKU-YuanGroup@Video-LLaVA'...
POST git-upload-pack (227 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 41%|████      | 616/1500 [1:27:15<5:38:55, 23.00s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/yzfly/LangGPT.git /Volumes/T7/data/repos/Python/yzfly@LangGPT
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/yzfly@LangGPT'...
POST git-upload-pack (227 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 42%|████▏     | 633/1500 [1:31:35<9:01:13, 37.46s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/agiresearch/openagi.git /Volumes/T7/data/repos/Python/agiresearch@openagi
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/agiresearch@openagi'...
POST git-upload-pack (227 bytes)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 43%|████▎     | 640/1500 [1:34:45<10:44:57, 45.00s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/101dotxyz/GPTeam.git /Volumes/T7/data/repos/Python/101dotxyz@GPTeam
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/101dotxyz@GPTeam'...
POST git-upload-pack (gzip 2427 to 1271 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 44%|████▎     | 655/1500 [1:39:20<13:18:03, 56.67s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/barry-far/V2ray-Configs.git /Volumes/T7/data/repos/Python/barry-far@V2ray-Configs
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/barry-far@V2ray-Configs'...
POST git-upload-pack (227 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 44%|████▍     | 660/1500 [1:41:14<8:27:51, 36.28s/it] 

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/fudan-zvg/semantic-segment-anything.git /Volumes/T7/data/repos/Python/fudan-zvg@semantic-segment-anything
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/fudan-zvg@semantic-segment-anything'...
POST git-upload-pack (177 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 45%|████▌     | 681/1500 [1:45:11<3:22:19, 14.82s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/zwq2018/data-copilot.git /Volumes/T7/data/repos/Python/zwq2018@data-copilot
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/zwq2018@data-copilot'...
POST git-upload-pack (177 bytes)
Downloading assets/video1.GIF (30 MB)
Error downloading object: assets/video1.GIF (bbd30a8): Smudge error: Error downloading assets/video1.GIF (bbd30a8179ecb7f7173ae6857ebeb536a7f66a176f67596f1728be418cf9ea73): batch response: This repository is over its data quota. Account responsible for LFS bandwidth should purchase more data packs to restore access.

Errors logged to '/Volumes/T7/data/repos/Python/zwq2018@data-copilot/.git/lfs/logs/20240406T003819.012055.log'.
Use `git lfs logs last` to view the log.
error: external filter 'git-lfs filter-process' failed
fatal: assets/video1.GIF: smudge filter lfs failed
You can inspect what was checked out with 'git status'
and retry with 'git restore --source=HEAD :/'

'


 54%|█████▎    | 804/1500 [2:01:28<3:03:27, 15.81s/it] 

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/taichi-dev/taichi-nerfs.git /Volumes/T7/data/repos/Python/taichi-dev@taichi-nerfs
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/taichi-dev@taichi-nerfs'...
POST git-upload-pack (227 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 54%|█████▍    | 817/1500 [2:05:23<6:01:01, 31.72s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/TencentARC/MotionCtrl.git /Volumes/T7/data/repos/Python/TencentARC@MotionCtrl
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/TencentARC@MotionCtrl'...
POST git-upload-pack (227 bytes)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 56%|█████▋    | 846/1500 [2:12:27<3:18:39, 18.23s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/mattyamonaca/layerdivider.git /Volumes/T7/data/repos/Python/mattyamonaca@layerdivider
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/mattyamonaca@layerdivider'...
POST git-upload-pack (277 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 62%|██████▏   | 936/1500 [2:26:08<30:47,  3.28s/it]   

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/opencopilotdev/opencopilot.git /Volumes/T7/data/repos/Python/opencopilotdev@opencopilot
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/opencopilotdev@opencopilot'...
fatal: could not read Username for 'https://github.com': Device not configured
'


 66%|██████▌   | 989/1500 [2:33:25<1:26:15, 10.13s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/madaan/self-refine.git /Volumes/T7/data/repos/Python/madaan@self-refine
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/madaan@self-refine'...
POST git-upload-pack (227 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 66%|██████▋   | 995/1500 [2:37:33<9:59:33, 71.24s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/RaymondWang987/NVDS.git /Volumes/T7/data/repos/Python/RaymondWang987@NVDS
  stderr: 'Cloning into '/Volumes/T7/data/repos/Python/RaymondWang987@NVDS'...
POST git-upload-pack (327 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 69%|██████▉   | 1032/1500 [2:48:13<8:33:37, 65.85s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/badlogic/heissepreise.git /Volumes/T7/data/repos/JavaScript/badlogic@heissepreise
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/badlogic@heissepreise'...
POST git-upload-pack (227 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 70%|███████   | 1055/1500 [2:51:02<2:48:12, 22.68s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/underpig1/octos.git /Volumes/T7/data/repos/JavaScript/underpig1@octos
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/underpig1@octos'...
POST git-upload-pack (277 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 75%|███████▌  | 1126/1500 [3:02:24<59:31,  9.55s/it]  

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/truethari/reactfolio.git /Volumes/T7/data/repos/JavaScript/truethari@reactfolio
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/truethari@reactfolio'...
POST git-upload-pack (327 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 76%|███████▌  | 1137/1500 [3:03:06<45:54,  7.59s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/barqawiz/intellinode.git /Volumes/T7/data/repos/JavaScript/barqawiz@intellinode
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/barqawiz@intellinode'...
POST git-upload-pack (gzip 3177 to 1600 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 77%|███████▋  | 1153/1500 [3:05:48<1:52:47, 19.50s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/WenqiOfficial/StudyWithMiku.git /Volumes/T7/data/repos/JavaScript/WenqiOfficial@StudyWithMiku
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/WenqiOfficial@StudyWithMiku'...
POST git-upload-pack (227 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 78%|███████▊  | 1166/1500 [3:07:23<1:01:37, 11.07s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/quadjr/aframe-gaussian-splatting.git /Volumes/T7/data/repos/JavaScript/quadjr@aframe-gaussian-splatting
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/quadjr@aframe-gaussian-splatting'...
POST git-upload-pack (177 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 82%|████████▏ | 1230/1500 [3:18:51<1:30:38, 20.14s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/ssitvit/code-canvas.git /Volumes/T7/data/repos/JavaScript/ssitvit@code-canvas
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/ssitvit@code-canvas'...
POST git-upload-pack (277 bytes)
error: RPC failed; curl 56 Recv failure: Connection reset by peer
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 83%|████████▎ | 1252/1500 [3:22:51<1:37:25, 23.57s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/enhance-dev/enhance-music.git /Volumes/T7/data/repos/JavaScript/enhance-dev@enhance-music
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/enhance-dev@enhance-music'...
POST git-upload-pack (877 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 84%|████████▍ | 1258/1500 [3:24:18<1:30:16, 22.38s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/uba-gcoen/stichhub.git /Volumes/T7/data/repos/JavaScript/uba-gcoen@stichhub
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/uba-gcoen@stichhub'...
POST git-upload-pack (777 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 89%|████████▊ | 1329/1500 [3:32:47<51:33, 18.09s/it]  

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/mywebshortcuts/mywebshortcuts.git /Volumes/T7/data/repos/JavaScript/mywebshortcuts@mywebshortcuts
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/mywebshortcuts@mywebshortcuts'...
POST git-upload-pack (427 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 89%|████████▊ | 1330/1500 [3:32:47<36:20, 12.83s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/mopfasrfga/officehack.git /Volumes/T7/data/repos/JavaScript/mopfasrfga@officehack
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/mopfasrfga@officehack'...
fatal: could not read Username for 'https://github.com': Device not configured
'


 90%|█████████ | 1357/1500 [3:41:30<43:06, 18.09s/it]   

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/oslabs-beta/netpulse.git /Volumes/T7/data/repos/JavaScript/oslabs-beta@netpulse
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/oslabs-beta@netpulse'...
POST git-upload-pack (577 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 91%|█████████ | 1367/1500 [3:45:02<1:46:10, 47.90s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/star3am/hashiqube.git /Volumes/T7/data/repos/JavaScript/star3am@hashiqube
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/star3am@hashiqube'...
POST git-upload-pack (gzip 2977 to 1528 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 94%|█████████▍| 1413/1500 [3:51:14<41:59, 28.97s/it]  

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/bouzidanas/streamlit-code-editor.git /Volumes/T7/data/repos/JavaScript/bouzidanas@streamlit-code-editor
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/bouzidanas@streamlit-code-editor'...
POST git-upload-pack (577 bytes)
error: RPC failed; curl 56 Recv failure: Connection reset by peer
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 95%|█████████▍| 1419/1500 [3:52:14<24:29, 18.14s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/andrewwoan/PSU-VR.git /Volumes/T7/data/repos/JavaScript/andrewwoan@PSU-VR
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/andrewwoan@PSU-VR'...
POST git-upload-pack (327 bytes)
error: RPC failed; curl 18 Transferred a partial file
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 96%|█████████▌| 1440/1500 [3:54:21<19:47, 19.79s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/Kritika30032002/ReactCreations.git /Volumes/T7/data/repos/JavaScript/Kritika30032002@ReactCreations
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/Kritika30032002@ReactCreations'...
POST git-upload-pack (177 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 96%|█████████▋| 1445/1500 [3:55:48<20:59, 22.89s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/hellovivian/generative-disco.git /Volumes/T7/data/repos/JavaScript/hellovivian@generative-disco
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/hellovivian@generative-disco'...
POST git-upload-pack (477 bytes)
error: RPC failed; curl 56 Recv failure: Connection reset by peer
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 97%|█████████▋| 1456/1500 [3:57:21<04:31,  6.16s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/pythagora-io/api.git /Volumes/T7/data/repos/JavaScript/pythagora-io@api
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/pythagora-io@api'...
fatal: could not read Username for 'https://github.com': Device not configured
'


 98%|█████████▊| 1467/1500 [3:59:41<07:34, 13.78s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/wladradchenko/radio.wladradchenko.ru.git /Volumes/T7/data/repos/JavaScript/wladradchenko@radio.wladradchenko.ru
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/wladradchenko@radio.wladradchenko.ru'...
POST git-upload-pack (477 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


 99%|█████████▉| 1489/1500 [4:03:16<05:50, 31.90s/it]

Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/miguelmyers8/PlanetTechJS.git /Volumes/T7/data/repos/JavaScript/miguelmyers8@PlanetTechJS
  stderr: 'Cloning into '/Volumes/T7/data/repos/JavaScript/miguelmyers8@PlanetTechJS'...
POST git-upload-pack (677 bytes)
error: RPC failed; curl 92 HTTP/2 stream 3 was not closed cleanly: CANCEL (err 8)
fatal: the remote end hung up unexpectedly
fatal: early EOF
fatal: index-pack failed
'


100%|██████████| 1500/1500 [4:03:58<00:00,  9.76s/it]


Merge with searched Data.

In [45]:
github_metadata_path = "./metadata/cloned_repo_metadata.jsonl"

github_metadata = pd.read_json(github_metadata_path, orient="records", lines=True)
github_metadata

Unnamed: 0,name,mainLanguage,url,local_path,sha
0,brunodev85/winlator,Java,https://github.com/brunodev85/winlator.git,,
1,zongzibinbin/mallchat,Java,https://github.com/zongzibinbin/mallchat.git,/Volumes/T7/data/repos/Java/zongzibinbin@mallchat,f2563328d4a5311a2308079025da4e637437759b
2,alipay/fury,Java,https://github.com/alipay/fury.git,/Volumes/T7/data/repos/Java/alipay@fury,2ebdad055b159c06908835c093374e9b723f8bfb
3,aress31/burpgpt,Java,https://github.com/aress31/burpgpt.git,/Volumes/T7/data/repos/Java/aress31@burpgpt,79ea6b9fe66b351f108191e2a0732147091060c1
4,hncboy/ai-beehive,Java,https://github.com/hncboy/ai-beehive.git,/Volumes/T7/data/repos/Java/hncboy@ai-beehive,a87a815b14e82bc68c9e682b2438a728105e21a7
...,...,...,...,...,...
1995,xieerduos/electron-template,JavaScript,https://github.com/xieerduos/electron-template...,/Volumes/T7/data/repos/JavaScript/xieerduos@el...,aceea2f58cddb9847f03b67ee201ce5158a364dc
1996,microsoft/copilothackathon,JavaScript,https://github.com/microsoft/copilothackathon.git,/Volumes/T7/data/repos/JavaScript/microsoft@co...,9bcea90f505f47ff99335b145ef3ee22571d6303
1997,cztomsik/ggml-js,JavaScript,https://github.com/cztomsik/ggml-js.git,/Volumes/T7/data/repos/JavaScript/cztomsik@ggm...,e723d1cd7ed166d8a1e742e6633f2482a8c0c654
1998,5tiaowu/magichands,JavaScript,https://github.com/5tiaowu/magichands.git,/Volumes/T7/data/repos/JavaScript/5tiaowu@magi...,5ffd6307e51d62a8865803c6fee9a2d86ae6b33d


过滤出clone失败的项目，None error

In [46]:
github_metadata.loc[github_metadata.local_path.isnull()]

Unnamed: 0,name,mainLanguage,url,local_path,sha
0,brunodev85/winlator,Java,https://github.com/brunodev85/winlator.git,,
136,RedRoverSchool/JenkinsQA_07,Java,https://github.com/RedRoverSchool/JenkinsQA_07...,,
311,jurecapuder/AndroidWeatherApp,Java,https://github.com/jurecapuder/AndroidWeatherA...,,
368,Halqq/misericordia-client,Java,https://github.com/Halqq/misericordia-client.git,,
418,kiegroup/kogito-runtimes,Java,https://github.com/kiegroup/kogito-runtimes.git,,
460,CaiMuCheng/LeafIDE,Java,https://github.com/CaiMuCheng/LeafIDE.git,,
496,RIA-AED/RIABandwidthSaver,Java,https://github.com/RIA-AED/RIABandwidthSaver.git,,
529,lzwme/chatgpt-sites,TypeScript,https://github.com/lzwme/chatgpt-sites.git,,
627,yesmore/inke,TypeScript,https://github.com/yesmore/inke.git,,
651,uiwjs/react-login-page,TypeScript,https://github.com/uiwjs/react-login-page.git,,


In [47]:
filtered_df = pd.read_json("./metadata/filtered_ghs_results_05_mar_2023.jsonl", orient="records", lines=True)

filtered_df = filtered_df.merge(github_metadata, on="name", how="inner")

#ghs_df[['mainLanguage_y', 'mainLanguage_x']]
filtered_df = filtered_df.loc[~filtered_df.local_path.isna()]
filtered_df.groupby("mainLanguage_y").agg(num_repos=("name", "count"), num_commits=("commits", "mean"))

Unnamed: 0_level_0,num_repos,num_commits
mainLanguage_y,Unnamed: 1_level_1,Unnamed: 2_level_1
Java,493,250.39
JavaScript,480,202.88
Python,483,143.04
TypeScript,487,171.63


## Mining all *added method* commits in each repo.

In [48]:
logger = logging.getLogger("commit_miner")
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler("./logs/commit_miner.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(fh)

## 存储repo的commit信息

In [49]:
from utils import download_all_commits
from call_parser import langs

github_metadata_path = "./metadata/cloned_repo_metadata.jsonl"
github_metadata = pd.read_json(github_metadata_path, orient="records", lines=True)
github_metadata = github_metadata.loc[~github_metadata.local_path.isna()]

commit_meta_path = "./metadata/repo_commit_metadata.jsonl"

processed_names = set()
if os.path.exists(commit_meta_path):
    with jsonlines.open(commit_meta_path, "r") as reader:
        processed_names = set(line["name"] for line in reader)
    logger.info(f"Already processed {len(processed_names)} repositories!")
    print(f"Already processed {len(processed_names)} repositories!")
else:
    open(commit_meta_path, "w").close()

Already processed 350 repositories!


make dir to save the commit histories.

In [50]:
output_path = "../data/commits/"

for lang in ["Java", "Python", "JavaScript", "TypeScript"]:
    cur_output_path = output_path + lang + "/"
    Path(cur_output_path).mkdir(parents=True, exist_ok=True)        

In [51]:
langs

{'Java': <call_parser.java_call_parser.JavaCallParser at 0x11d6a0ee0>,
 'Python': <call_parser.py_call_parser.PyCallParser at 0x11a2f3cd0>,
 'JavaScript': <call_parser.js_call_parser.JSCallParser at 0x11a2f0430>,
 'TypeScript': <call_parser.ts_call_parser.TSCallParser at 0x12aa9eb00>}

### ！！！Only Keep repos which contains [20-500] methods.

In [52]:
cur_chunk = []
chunksize = 20

bar = tqdm(github_metadata.itertuples(), total=len(github_metadata) - len(processed_names))
for row in bar:
    name = getattr(row, 'name')
    lang = getattr(row, 'mainLanguage')
    local_path = getattr(row, 'local_path')
    url = getattr(row, 'url')
    sha = getattr(row, 'sha')

    if name in processed_names:
        logger.info(f"{name} has been processed, skip.")
        continue

    try:
        extension = langs[lang].extension
        cur_commit_files_path = output_path + lang + "/" + name.replace("/", "@") + "_commits/"
        Path(cur_commit_files_path).mkdir(parents=True, exist_ok=True)
        
        result = download_all_commits(local_path, extension, bar=bar, output_path=cur_commit_files_path)
        if result is None:
            continue

        cur_output_path = output_path + lang + "/" + name.replace("/", "@") + "_commits.jsonl"  
            
        data = {
            "name": name,
            "mainLanguage": lang,
            "url": url,
            "local_path": local_path,
            "sha": sha,
            "commit_path": cur_output_path,
            "commit_file_path": cur_commit_files_path,
            "num_methods": len(result)
        }
    except Exception as e:
        print(e)
        logger.warning(f"GitHub exception with repo {name}, {repr(e)}")
        result = None
        data = {"name": name, "mainLanguage": lang, "url": url, "local_path": local_path, "sha": sha, 
                "commit_path": None, "commit_file_path": None, "num_methods": None}

    if result is not None:
        with jsonlines.open(cur_output_path, "w") as writer:
            writer.write_all(result)
    
    cur_chunk.append(data)
    if len(cur_chunk) > chunksize:
        with jsonlines.open(commit_meta_path, "a") as writer:
            writer.write_all(cur_chunk)
            cur_chunk = []

if cur_chunk:
    with jsonlines.open(commit_meta_path, "a") as writer:
        writer.write_all(cur_chunk)
        cur_chunk = []

Deal with .js: process repo ArchGPT@insomnium at commit 3ccd, processed :  92%|█████████▏| 1470/1593 [3:06:40<25:46, 12.58s/it]                                     [skip] fail to process 'yarn-standalone.js' with RecursionError - maximum recursion depth exceeded
Deal with .js: process repo oslabs-beta@Ludwig at commit 31d6, processed : : 1738it [5:08:49,  4.12s/it]                                                [skip] fail to process 'extension.js' with RecursionError - maximum recursion depth exceeded
Deal with .js: process repo oslabs-beta@Ludwig at commit e8b9, processed : : 1738it [5:27:24,  4.12s/it][skip] fail to process 'extension.js' with RecursionError - maximum recursion depth exceeded
[skip] fail to process 'extension.js' with RecursionError - maximum recursion depth exceeded
Deal with .js: process repo oslabs-beta@Ludwig at commit 460a, processed : : 1738it [5:44:26,  4.12s/it][skip] fail to process 'extension.js' with RecursionError - maximum recursion depth exceeded
[skip

In [33]:
import pandas as pd

commit_meta_path = "./metadata/repo_commit_metadata.jsonl"
commit_meta = pd.read_json(commit_meta_path, orient="records", lines=True)

filtered_df = pd.read_json("./metadata/filtered_ghs_results_05_mar_2023.jsonl", orient="records", lines=True)

filtered_df = filtered_df.merge(commit_meta, on=["name", 'mainLanguage'], how="inner")

filtered_df = filtered_df.loc[(~filtered_df.local_path.isna()) & (filtered_df['mainLanguage'] == 'Java')]
#df.sort_values(by=['A', 'B'], ascending=[True, False])
filtered_df.sort_values(by=['stargazers'], ascending=True, ignore_index=True)
filtered_df[['name', 'commit_file_path']]

Unnamed: 0,name,commit_file_path
1004,zongzibinbin/mallchat,../data/commits/Java/zongzibinbin@mallchat_com...
1005,alipay/fury,../data/commits/Java/alipay@fury_commits/
1006,aress31/burpgpt,../data/commits/Java/aress31@burpgpt_commits/
1007,hncboy/ai-beehive,../data/commits/Java/hncboy@ai-beehive_commits/
1008,hncboy/chatgpt-web-java,../data/commits/Java/hncboy@chatgpt-web-java_c...
...,...,...
1349,PeytonPlayz595/0.30-WebGL,../data/commits/Java/PeytonPlayz595@0.30-WebGL...
1350,zyyzyykk/kkTerminal,../data/commits/Java/zyyzyykk@kkTerminal_commits/
1351,sivaprasadreddy/spring-boot-tutorials-blog-series,../data/commits/Java/sivaprasadreddy@spring-bo...
1352,RussianInvestments/invest-api-java-sdk,../data/commits/Java/RussianInvestments@invest...


In [14]:
commit_meta = pd.read_json(commit_meta_path, orient="records", lines=True)
commit_meta = commit_meta[(commit_meta['num_methods'] > 50) & (commit_meta['num_methods'] < 1000)]

commit_meta.groupby("mainLanguage").agg(num_repos=("name", "count"), num_methods=("num_methods", "mean"))

Unnamed: 0_level_0,num_repos,num_methods
mainLanguage,Unnamed: 1_level_1,Unnamed: 2_level_1
Java,323,841.018576
JavaScript,208,938.663462
Python,295,652.983051
TypeScript,284,432.580986
