# Cloning and Processing repos from GitHub

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from tqdm import tqdm

# Render every float shown by pandas with exactly two decimal places.
pd.set_option("display.float_format", "{:.2f}".format)

## Clone the repos from the filtered repos.

In [2]:
import pandas as pd
import git
from pathlib import Path
import os
from tqdm import tqdm
import json
from pydriller import Repository, ModificationType

## Read repo names.

In [29]:
# Load the filtered search results (JSON Lines: one record per line) and
# keep only the rows whose main language is Java.
filtered_df = (
    pd.read_json(
        "./metadata/filtered_ghs_results_05_mar_2023.jsonl",
        orient="records",
        lines=True,
    )
    .loc[lambda frame: frame["mainLanguage"] == "Java"]
)

In [33]:
import calendar
import time
import datetime
import logging
import jsonlines
import os
import shutil

# Logger used while cloning repositories; records go to ./logs/clone_repo.log.
logger = logging.getLogger("clone_repo")
# A logger without its own level falls back to its parent's level (WARNING on
# an unconfigured root logger), which would silently drop the logger.info()
# progress messages emitted below. Set the level explicitly.
logger.setLevel(logging.DEBUG)

clone_log_path = "./logs/clone_repo.log"
# FileHandler raises FileNotFoundError when the parent directory is missing.
os.makedirs(os.path.dirname(clone_log_path), exist_ok=True)

# Re-running this cell would otherwise attach a second handler for the same
# file and duplicate every log line; detach the one left by a previous run.
for stale_handler in list(logger.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and stale_handler.baseFilename == os.path.abspath(clone_log_path)):
        logger.removeHandler(stale_handler)
        stale_handler.close()

fh = logging.FileHandler(clone_log_path)
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(fh)

# Languages to process; the full candidate list is kept here for reference.
#langs = ['Java', 'Python', 'TypeScript', 'JavaScript']
langs = ['Java']

In [35]:
# Work out which repositories still have to be cloned.
# `names` and `mainLanguages` are parallel lists: mainLanguages[i] is the
# language of names[i], so every filter/slice must be applied to both.
github_metadata_path = "./metadata/cloned_repo_metadata.jsonl"

names = filtered_df["name"].tolist()
mainLanguages = filtered_df["mainLanguage"].tolist()

# Defined on both branches so later code can rely on it.
processed_names = set()

if os.path.exists(github_metadata_path):
    with jsonlines.open(github_metadata_path, "r") as reader:
        processed_names = {line["name"] for line in reader}
    logger.info(f"Already processed {len(processed_names)} repositories!")
    # Filter the (name, language) pairs in a single pass. Filtering `names`
    # first and then zipping it against the unfiltered `mainLanguages` would
    # pair each remaining name with the wrong language.
    remaining = [
        (name, lang)
        for name, lang in zip(names, mainLanguages)
        if name not in processed_names
    ]
    names = [name for name, _ in remaining]
    mainLanguages = [lang for _, lang in remaining]
    logger.info(f"{len(names)} repositories left!")
else:
    # Create an empty metadata file so later appends/reads find it.
    open(github_metadata_path, "w").close()

# Only a two-repository subset is processed here; slice both lists with the
# same bounds so they stay aligned.
names = names[2:4]
mainLanguages = mainLanguages[2:4]

names

['alipay/fury', 'aress31/burpgpt']

In [36]:
# cur_chunk = []
# chunksize = 20
#
# output_path = '../data/repos/'
#
# for lang in langs:
#     Path(output_path + lang).mkdir(parents=True, exist_ok=True)
#
# for id, repo_name in tqdm(enumerate(names), total=len(names)):
#     try:
#         lang = mainLanguages[id]
#         url = f'https://github.com/{repo_name}.git'
#         logger.info(f'Cloning repository at {url}.')
#         local_path = os.path.join(output_path + lang, repo_name.replace("/", "@"))
#
#         if os.path.exists(local_path):
#             logger.info(f"{local_path} already exists. remove!!!")
#             shutil.rmtree(local_path)
#
#         r = git.Repo.clone_from(url, local_path)
#         sha = r.rev_parse('HEAD').hexsha
#         data = {
#             "name": repo_name,
#             "mainLanguage": lang,
#             "url": url,
#             "local_path": local_path,
#             "sha": sha
#         }
#     except Exception as e:
#         print(e)
#         logger.warning(f"GitHub exception with repo {repo_name}, {repr(e)}")
#         data = {"name": repo_name, "mainLanguage": lang, "url": url, "local_path": None, "sha": None}
#
#     cur_chunk.append(data)
#     if len(cur_chunk) > chunksize:
#         with jsonlines.open(github_metadata_path, "a") as writer:
#             writer.write_all(cur_chunk)
#             cur_chunk = []
#
# if cur_chunk:
#     with jsonlines.open(github_metadata_path, "a") as writer:
#         writer.write_all(cur_chunk)
#         cur_chunk = []

100%|██████████| 2/2 [00:08<00:00,  4.06s/it]


Merge the cloned-repo metadata with the searched (GHS) data.

In [37]:
# Read back the metadata written for each cloned repository
# (JSON Lines: one record per line) and display it.
github_metadata_path = "./metadata/cloned_repo_metadata.jsonl"

github_metadata = pd.read_json(
    path_or_buf=github_metadata_path,
    lines=True,
    orient="records",
)
github_metadata

Unnamed: 0,name,mainLanguage,url,local_path,sha
0,alipay/fury,Java,https://github.com/alipay/fury.git,../data/repos/Java/alipay@fury,20a1a78b17a75a123a6f5b7094c06ff77defc0fe
1,aress31/burpgpt,Java,https://github.com/aress31/burpgpt.git,../data/repos/Java/aress31@burpgpt,79ea6b9fe66b351f108191e2a0732147091060c1


Filter out the repositories that failed to clone, i.e. those whose `local_path` is None.

In [38]:
# Rows with a missing local_path, shown for inspection.
github_metadata[github_metadata["local_path"].isna()]

Unnamed: 0,name,mainLanguage,url,local_path,sha


In [39]:
# Re-read the filtered search results, attach the clone metadata by repository
# name, and drop rows without a local clone path.
# Both frames carry a `mainLanguage` column, so the merge exposes them as
# `mainLanguage_x` (search results) and `mainLanguage_y` (clone metadata).
filtered_df = (
    pd.read_json(
        "./metadata/filtered_ghs_results_05_mar_2023.jsonl",
        orient="records",
        lines=True,
    )
    .merge(github_metadata, on="name", how="inner")
    .loc[lambda merged: merged["local_path"].notna()]
)

# Per-language summary: number of repositories and mean commit count.
filtered_df.groupby("mainLanguage_y").agg(
    num_repos=("name", "count"),
    num_commits=("commits", "mean"),
)

Unnamed: 0_level_0,num_repos,num_commits
mainLanguage_y,Unnamed: 1_level_1,Unnamed: 2_level_1
Java,2,326.5


## Mining all *added method* commits in each repo.

In [40]:
# Logger used while mining commits; records go to ./logs/commit_miner.log.
logger = logging.getLogger("commit_miner")
logger.setLevel(logging.DEBUG)

commit_log_path = "./logs/commit_miner.log"
# FileHandler raises FileNotFoundError when the parent directory is missing.
os.makedirs(os.path.dirname(commit_log_path), exist_ok=True)

# Re-running this cell would otherwise attach a second handler for the same
# file and duplicate every log line; detach the one left by a previous run.
for stale_handler in list(logger.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and stale_handler.baseFilename == os.path.abspath(commit_log_path)):
        logger.removeHandler(stale_handler)
        stale_handler.close()

fh = logging.FileHandler(commit_log_path)
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(fh)

## Store each repo's commit information

In [41]:
from utils import download_all_commits
from call_parser import langs

# Reload the clone metadata and keep only repositories that have a local path.
github_metadata_path = "./metadata/cloned_repo_metadata.jsonl"
github_metadata = pd.read_json(github_metadata_path, orient="records", lines=True)
github_metadata = github_metadata[github_metadata["local_path"].notna()]

commit_meta_path = "./metadata/repo_commit_metadata.jsonl"

# Names already present in the commit metadata file; empty on a first run.
processed_names = set()
if not os.path.exists(commit_meta_path):
    # First run: create an empty metadata file.
    with open(commit_meta_path, "w"):
        pass
else:
    with jsonlines.open(commit_meta_path, "r") as reader:
        processed_names = {record["name"] for record in reader}
    resume_message = f"Already processed {len(processed_names)} repositories!"
    logger.info(resume_message)
    print(resume_message)



Create the directories in which the commit histories are saved.

In [42]:
# Root directory for mined commits, with one sub-directory per language.
output_path = "../data/commits/"

for lang in ("Java", "Python", "JavaScript", "TypeScript"):
    cur_output_path = f"{output_path}{lang}/"
    # Create the directory (and any missing parents); no error if it exists.
    Path(cur_output_path).mkdir(parents=True, exist_ok=True)

In [43]:
# Display the language-name -> call-parser mapping imported from call_parser above.
langs

{'Java': <build.java_call_parser.JavaCallParser at 0x142ba1cc0>,
 'Python': <build.py_call_parser.PyCallParser at 0x142ba1cf0>,
 'JavaScript': <build.js_call_parser.JSCallParser at 0x142ba1f60>,
 'TypeScript': <build.ts_call_parser.TSCallParser at 0x142c37460>}

### !!! Only keep repos that contain between 20 and 500 methods.

In [44]:
# Mine the commits of every not-yet-processed repository.
# Per repository: call download_all_commits, write its result to
# <output_path>/<lang>/<owner>@<repo>_commits.jsonl, and append one metadata
# record to commit_meta_path. Metadata records are buffered in `cur_chunk`
# and written in batches of `chunksize`.
cur_chunk = []
chunksize = 20

# Split off the repositories that are already in the commit metadata file, so
# the progress bar iterates over (and counts) only the pending ones.
already_processed = github_metadata["name"].isin(processed_names)
for skipped_name in github_metadata.loc[already_processed, "name"]:
    logger.info(f"{skipped_name} has been processed, skip.")
pending_metadata = github_metadata.loc[~already_processed]

bar = tqdm(pending_metadata.itertuples(), total=len(pending_metadata))
try:
    for row in bar:
        name = row.name
        lang = row.mainLanguage
        local_path = row.local_path
        url = row.url
        sha = row.sha

        try:
            extension = langs[lang].extension
            cur_commit_files_path = output_path + lang + "/" + name.replace("/", "@") + "_commits/"
            Path(cur_commit_files_path).mkdir(parents=True, exist_ok=True)

            result = download_all_commits(local_path, extension, bar=bar, output_path=cur_commit_files_path)
            if result is None:
                # No result for this repository: nothing is written and no
                # metadata record is added.
                # NOTE(review): presumably the repo fell outside the accepted
                # method-count range -- confirm in download_all_commits.
                continue

            cur_output_path = output_path + lang + "/" + name.replace("/", "@") + "_commits.jsonl"

            data = {
                "name": name,
                "mainLanguage": lang,
                "url": url,
                "local_path": local_path,
                "sha": sha,
                "commit_path": cur_output_path,
                "commit_file_path": cur_commit_files_path,
                "num_methods": len(result)
            }
        except Exception as e:
            # Record the failure (all commit fields None) and keep going.
            print(e)
            logger.warning(f"GitHub exception with repo {name}, {repr(e)}")
            result = None
            data = {"name": name, "mainLanguage": lang, "url": url, "local_path": local_path, "sha": sha,
                    "commit_path": None, "commit_file_path": None, "num_methods": None}

        if result is not None:
            with jsonlines.open(cur_output_path, "w") as writer:
                writer.write_all(result)

        cur_chunk.append(data)
        # Flush once the buffer holds `chunksize` records.
        if len(cur_chunk) >= chunksize:
            with jsonlines.open(commit_meta_path, "a") as writer:
                writer.write_all(cur_chunk)
            cur_chunk = []
finally:
    # Always persist what is still buffered, including when the loop is
    # interrupted or a write above raises, so finished repositories are not
    # mined again on the next run.
    if cur_chunk:
        with jsonlines.open(commit_meta_path, "a") as writer:
            writer.write_all(cur_chunk)
        cur_chunk = []

Deal with .java: process repo aress31@burpgpt at commit 075d, processed : 100%|██████████| 2/2 [00:48<00:00, 24.05s/it]


In [7]:
from build_data import build_dataset

# Build the Java dataset from the mined data.
# NOTE(review): both paths are absolute and machine-specific; they have to be
# adjusted before running this cell on another machine.
dataset_prefix = '/home/tangze-U2315/project-method-miner/data/'
stack_graph_bin_dir = '/home/tangze-U2315/stack-graphs/target/debug/'

build_dataset(
    prefix_path=dataset_prefix,
    process_lang="Java",
    stack_graph_path=stack_graph_bin_dir,
)

Unnamed: 0,name,mainLanguage,url,local_path,sha,commit_path,commit_file_path,num_methods
0,alipay/fury,Java,https://github.com/alipay/fury.git,../data/repos/Java/alipay@fury,0437556ef4f3445869923f6b90806f565fda0e33,../data/commits/Java/alipay@fury_commits.jsonl,../data/commits/Java/alipay@fury_commits/,10533
1,baidu/bifromq,Java,https://github.com/baidu/bifromq.git,../data/repos/Java/baidu@bifromq,8c3b21dbdb3bc34cac963d714669f50d2ffa9c6e,../data/commits/Java/baidu@bifromq_commits.jsonl,../data/commits/Java/baidu@bifromq_commits/,10495
2,google/bindiff,Java,https://github.com/google/bindiff.git,../data/repos/Java/google@bindiff,0a4526289e5a267d9c8f0f76a1f5dd21d691e808,../data/commits/Java/google@bindiff_commits.jsonl,../data/commits/Java/google@bindiff_commits/,7090
3,LangStream/langstream,Java,https://github.com/LangStream/langstream.git,../data/repos/Java/LangStream@langstream,979303b777d59b2910c32372b6e71a60486116e7,../data/commits/Java/LangStream@langstream_com...,../data/commits/Java/LangStream@langstream_com...,5889
4,cubefs/compass,Java,https://github.com/cubefs/compass.git,../data/repos/Java/cubefs@compass,d4b53de98d66a4771660efd80cf298903aa7f48e,../data/commits/Java/cubefs@compass_commits.jsonl,../data/commits/Java/cubefs@compass_commits/,5252
5,sofastack/sofa-serverless,Java,https://github.com/sofastack/sofa-serverless.git,../data/repos/Java/sofastack@sofa-serverless,de5709580d6b9e9a5f4bde0e514806e44f949914,../data/commits/Java/sofastack@sofa-serverless...,../data/commits/Java/sofastack@sofa-serverless...,3221
6,aizuda/easy-retry,Java,https://github.com/aizuda/easy-retry.git,../data/repos/Java/aizuda@easy-retry,86ae31421e6add1c75324dbab9400fa31cd0f120,../data/commits/Java/aizuda@easy-retry_commits...,../data/commits/Java/aizuda@easy-retry_commits/,2802
7,knowly-ai/langtorch,Java,https://github.com/knowly-ai/langtorch.git,../data/repos/Java/knowly-ai@langtorch,9809e32eca257465d28c390b2cf140bbffca697a,../data/commits/Java/knowly-ai@langtorch_commi...,../data/commits/Java/knowly-ai@langtorch_commits/,1551
8,dromara/sms4j,Java,https://github.com/dromara/sms4j.git,../data/repos/Java/dromara@sms4j,893b35371e286485420c7ffe22fba075a3855a55,../data/commits/Java/dromara@sms4j_commits.jsonl,../data/commits/Java/dromara@sms4j_commits/,1442
9,zongzibinbin/mallchat,Java,https://github.com/zongzibinbin/mallchat.git,../data/repos/Java/zongzibinbin@mallchat,f2563328d4a5311a2308079025da4e637437759b,../data/commits/Java/zongzibinbin@mallchat_com...,../data/commits/Java/zongzibinbin@mallchat_com...,1260
