In [2]:
import main
import matplotlib.pyplot as plt
import pandas as pd
import pathlib
import secrets_manager

# Notebook-wide display settings.
# Never truncate long cell values (the repo file paths are long).
pd.options.display.max_colwidth = None
# Render floats with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

# Use matplotlib's bundled 'ggplot' style sheet for all figures.
plt.style.use('ggplot')

In [3]:
# Read the GitHub token from the secrets manager so it is never hard-coded
# in the notebook.
GITHUB_ACCESS_TOKEN = (
    secrets_manager
    .get_secrets()
    ['GITHUB_ACCESS_TOKEN']
)

In [38]:
# Crawl the trending Python repos (100 per page, last 30 days) and record one
# row per file returned: repo full name, repo size, file path and content length.
#
# The crawl took about an hour to run, so its result is cached on disk: the
# crawl only executes when metadata.csv does not exist yet. Delete
# metadata.csv to force a fresh crawl.
METADATA_CSV = pathlib.Path('metadata.csv')

if not METADATA_CSV.exists():
    data = []

    trending_repos = main.get_trending_repos(
        GITHUB_ACCESS_TOKEN,
        language="python",
        per_page=100,
        last_n_days=30
    )

    for repo in trending_repos:
        resp = main.get_repo_contents_recursive(
            GITHUB_ACCESS_TOKEN,
            repo["owner"]["login"],
            repo["name"]
        )
        # NOTE(review): the 'lines' column stores len(item['content']), i.e. the
        # length of the content object, which is not necessarily a line count.
        # The column name is kept because later cells select on 'lines'.
        data.extend(
            [repo['full_name'], repo['size'], item['path'], len(item['content'])]
            for item in resp
        )

    # Written only after the whole crawl succeeded, so a failed run never
    # leaves a partial cache file behind.
    pd.DataFrame(data, columns=['repo', 'size', 'path', 'lines']).to_csv(METADATA_CSV)

In [4]:
# Load the crawl results saved to metadata.csv; the first CSV column is the
# row index that to_csv wrote out.
df = pd.read_csv(
    filepath_or_buffer='metadata.csv',
    index_col=0,
)

In [5]:
# Show the 25 rows with the largest 'lines' value, biggest first.
df.sort_values("lines", ascending=False).iloc[:25]

Unnamed: 0,repo,size,path,lines
4569,agiresearch/OpenAGI,300803,github_models/Restormer/Motion_Deblurring/pretrained_models/motion_deblurring.pth.7,99261382
4566,agiresearch/OpenAGI,300803,github_models/Restormer/Motion_Deblurring/pretrained_models/motion_deblurring.pth.4,99261382
4563,agiresearch/OpenAGI,300803,github_models/Restormer/Motion_Deblurring/pretrained_models/motion_deblurring.pth.1,99261382
4564,agiresearch/OpenAGI,300803,github_models/Restormer/Motion_Deblurring/pretrained_models/motion_deblurring.pth.2,99261382
4565,agiresearch/OpenAGI,300803,github_models/Restormer/Motion_Deblurring/pretrained_models/motion_deblurring.pth.3,99261382
4568,agiresearch/OpenAGI,300803,github_models/Restormer/Motion_Deblurring/pretrained_models/motion_deblurring.pth.6,99261382
4567,agiresearch/OpenAGI,300803,github_models/Restormer/Motion_Deblurring/pretrained_models/motion_deblurring.pth.5,99261382
4517,agiresearch/OpenAGI,300803,github_models/Restormer/Defocus_Deblurring/pretrained_models/single_image_defocus_deblurring.pth.1,99252125
4539,agiresearch/OpenAGI,300803,github_models/Restormer/Denoising/pretrained_models/real_denoising.pth.1,99081850
6832,hiyouga/ChatGLM-Efficient-Tuning,73616,tests/comparison_data_v2.json,81424316


In [6]:
# Derive each file's suffix (final extension including the dot, or '' when
# the name has none) from its path and store it in a new 'suffix' column.
df['suffix'] = df['path'].map(lambda file_path: pathlib.Path(file_path).suffix)

In [8]:
# Per-suffix statistics over the 'lines' column: number of files and mean value.
df_agg = df.groupby('suffix').agg({'lines': ['count', 'mean']})

# Inclusive upper bound on the per-suffix mean.
# NOTE(review): hard-coded value; it equals the .js mean (54484.83) shown in
# the sorted summary output of this notebook. Confirm how it was derived.
mean_size_cutoff = 54484.83

# Step 1: keep suffixes whose mean is at or below the cutoff.
df_agg_size = df_agg.loc[df_agg[('lines', 'mean')] <= mean_size_cutoff]

# Step 2: of those, keep suffixes that occur on more than one file.
df_agg_count = df_agg_size.loc[df_agg_size[('lines', 'count')] > 1]

In [10]:
# Second crawl over a smaller sample (10 trending Python repos per page, last
# 30 days), passing the suffixes kept in df_agg_count to the content fetcher
# (presumably to restrict which files are returned; verify in main).
# Results are saved to metadata_filter.csv.
data = []

trending_repos = main.get_trending_repos(
    GITHUB_ACCESS_TOKEN, language="python", per_page=10, last_n_days=30
)

for repo in trending_repos:
    repo_files = main.get_repo_contents_recursive(
        GITHUB_ACCESS_TOKEN,
        repo["owner"]["login"],
        repo["name"],
        suffixes=df_agg_count.index.values,
    )
    # NOTE(review): the path is read from entry['item']['path'] here, while the
    # unfiltered crawl cell reads item['path']. Confirm which shape
    # get_repo_contents_recursive actually returns.
    data.extend(
        [repo['full_name'], repo['size'], entry['item']['path'], len(entry['content'])]
        for entry in repo_files
    )

pd.DataFrame(
    data,
    columns=['repo', 'size', 'path', 'lines'],
).to_csv('metadata_filter.csv')

In [11]:
# Sanity check: is Markdown among the kept suffixes, spelled with or without
# the leading dot?
any(candidate in df_agg_count.index.values for candidate in ("md", ".md"))

True

In [13]:
# Show the 25 kept suffixes with the highest mean 'lines' value, largest first.
df_agg_count.sort_values(('lines', 'mean'), ascending=False).iloc[:25]

Unnamed: 0_level_0,lines,lines
Unnamed: 0_level_1,count,mean
suffix,Unnamed: 1_level_2,Unnamed: 2_level_2
.js,63,54484.83
.ico,5,38795.4
.npz,146,30185.58
.woff2,7,24533.0
.cc,2,23559.0
.cu,33,15500.55
.ts,8,14533.62
.txt,295,11467.18
.qm,3,10909.33
.py,4311,10198.74
