In [22]:
import requests
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the process
# environment; the return value is not needed, hence the throwaway name.
_ = load_dotenv()

# GitHub credentials, passed as HTTP basic auth (user, token) to every API call
# in this notebook. os.environ[...] raises KeyError when the variable is unset.
GH_USER = os.environ["GH_USERNAME"]
GH_TOKEN = os.environ["GH_TOKEN"]

In [23]:
import pandas as pd

# Source list of model files; the rendered output shows a 'Project' column
# (owner/name/ with a trailing slash) and a 'Model Link - Github' column.
full_repo_model_df = pd.read_csv('UMLFiles_List_V2.0.csv')
full_repo_model_df

Unnamed: 0,Project,Model Link - Github
0,0-complexity/ovcdoc_public/,https://www.github.com/0-complexity/ovcdoc_pub...
1,0003088/libelektra-qt-gui-test/,https://www.github.com/0003088/libelektra-qt-g...
2,00s/deadman/,https://www.github.com/00s/deadman/tree/master...
3,01db0y/ImageUploader/,https://www.github.com/01db0y/ImageUploader/tr...
4,01db0y/ShareIt/,https://www.github.com/01db0y/ShareIt/tree/mas...
...,...,...
93602,yotomyoto/301_assignment1/,https://www.github.com/yotomyoto/301_assignmen...
93603,zeronero13/af6/,https://www.github.com/zeronero13/af6/tree/mas...
93604,zeronero13/happehardver/,https://www.github.com/zeronero13/happehardver...
93605,Akshit-/ClientServerCommunication/,https://www.github.com/Akshit-/ClientServerCom...


In [3]:
# Drop the per-file link column, then collapse repeated rows so each remaining
# row combination appears once (index renumbered from 0).
full_repo_df = (
    full_repo_model_df
    .drop(columns=['Model Link - Github'])
    .drop_duplicates(ignore_index=True)
)
full_repo_df

Unnamed: 0,Project
0,0-complexity/ovcdoc_public/
1,0003088/libelektra-qt-gui-test/
2,00s/deadman/
3,01db0y/ImageUploader/
4,01db0y/ShareIt/
...,...
24725,vectorxiang/vectorxiang.github.io/
24726,victorsndvg/FPL/
24727,wmde/FundraisingFrontend/
24728,yotomyoto/301_assignment1/


In [4]:
def get_star_count(repo_name):
    """Return the stargazer count of a GitHub repository.

    Args:
        repo_name: Repository path appended to ``https://api.github.com/repos/``
            (callers in this notebook pass ``owner/name`` without the trailing
            slash).

    Returns:
        The ``stargazers_count`` field of the JSON response, or ``None`` when
        the request fails (network error, timeout, or non-2xx status) or the
        field is absent.
    """
    try:
        r = requests.get('https://api.github.com/repos/{}'.format(repo_name), auth=(GH_USER, GH_TOKEN), timeout=60)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        # A response only exists for HTTP-level failures. When requests.get()
        # itself raises (timeout, connection error) there is no `r` to inspect,
        # so take the response from the exception instead of the local name.
        response = getattr(e, 'response', None)
        if response is not None:
            # .get() avoids a KeyError if the rate-limit headers are missing.
            print(response.headers.get('X-RateLimit-Remaining'))
            print(response.headers.get('X-RateLimit-Reset'))
        else:
            print(f'Error: {e}')
        return None

    return r.json().get('stargazers_count')

In [5]:
print(f"No. of stars: {get_star_count('zakipauzi/concept-domain-coverage/'[:-1])}")

No. of stars: 2


In [6]:
def repo_exists(repo_name):
    """Report whether the GitHub API answers 200 for the given repository.

    Args:
        repo_name: Repository path appended to ``https://api.github.com/repos/``.

    Returns:
        ``True`` only when the response status is exactly 200. Every other
        status, whatever its cause, yields ``False``, as does any request
        exception (which is also printed).
    """
    endpoint = f'https://api.github.com/repos/{repo_name}'
    try:
        response = requests.get(endpoint, auth=(GH_USER, GH_TOKEN), timeout=60)
    except requests.exceptions.RequestException as err:
        print(f'Error: {err}')
        return False
    return response.status_code == 200

In [7]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

tqdm.pandas()

def add_repo_exist(df):
    """Add an 'Exists' column to ``df`` by checking every project concurrently.

    Args:
        df: DataFrame with a 'Project' column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the 'Exists' column with one
        ``repo_exists`` result per row, in row order.
    """
    def fetch_exist(repo):
        # Drop the final character (the trailing '/' of the Project values
        # shown in this notebook) before building the API path.
        return repo_exists(repo[:-1])

    projects = df['Project']
    with ThreadPoolExecutor(max_workers=50) as pool:
        # pool.map yields results in input order; tqdm only reports progress.
        progress = tqdm(pool.map(fetch_exist, projects), total=len(projects))
        df['Exists'] = list(progress)
    return df

In [None]:
# Issues one GitHub API request per row (50 worker threads) and adds the
# 'Exists' column to full_repo_df in place.
full_repo_df = add_repo_exist(full_repo_df)

In [None]:
# Keep only the rows whose existence check came back True, then persist them.
full_repo_df = full_repo_df.loc[full_repo_df['Exists']]
full_repo_df.to_csv(path_or_buf='full_repo_df.csv', index=False)

In [10]:
full_repo_df.head()

Unnamed: 0,Project,Exists
1,0003088/libelektra-qt-gui-test/,True
2,00s/deadman/,True
3,01db0y/ImageUploader/,True
4,01db0y/ShareIt/,True
5,01org/vmf/,True


In [11]:
full_repo_df.tail()

Unnamed: 0,Project,Exists
23805,xeguh83/8Puzzle/,True
23807,xen2/SharpLang/,True
23808,Xennis/graphical_model_editor/,True
23809,xenodium/xenodium.github.io/,True
23810,XenofoR/PAQ/,True


### Get GH projects with UML files

In [8]:
# Reload the saved list of projects from disk; the 'Exists' column is dropped
# right after loading.
full_repo_df = pd.read_csv('full_repo_df.csv').drop(columns=['Exists'])
full_repo_df

Unnamed: 0,Project
0,0003088/libelektra-qt-gui-test/
1,00s/deadman/
2,01db0y/ImageUploader/
3,01db0y/ShareIt/
4,01org/vmf/
...,...
6868,xeguh83/8Puzzle/
6869,xen2/SharpLang/
6870,Xennis/graphical_model_editor/
6871,xenodium/xenodium.github.io/


### Check if repo is active

In [7]:
from concurrent.futures import ThreadPoolExecutor

def get_last_activity(repo_name):
    """Fetch the ``updated_at`` field GitHub reports for a repository.

    Args:
        repo_name: Repository path appended to ``https://api.github.com/repos/``.

    Returns:
        The ``updated_at`` value from the JSON response (``None`` if the field
        is absent), or ``None`` after printing the error when the request
        raises or returns an error status.
    """
    endpoint = f'https://api.github.com/repos/{repo_name}'
    try:
        response = requests.get(endpoint, auth=(GH_USER, GH_TOKEN), timeout=60)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f'Error: {err}')
        return None
    else:
        # Decoded outside the guarded section, so a malformed body still raises.
        payload = response.json()
    return payload.get('updated_at')

def add_last_activity(df):
    """Add a 'Last_Activity' column to ``df`` using concurrent API lookups.

    Args:
        df: DataFrame with a 'Project' column. It is modified in place.

    Returns:
        The same DataFrame object with one ``get_last_activity`` result per
        row, in row order.
    """
    def fetch_last_activity(repo):
        # The last character of each Project value is cut off before the lookup.
        return get_last_activity(repo[:-1])

    projects = df['Project']
    with ThreadPoolExecutor(max_workers=50) as pool:
        timestamps = list(tqdm(pool.map(fetch_last_activity, projects), total=len(projects)))
    df['Last_Activity'] = timestamps
    return df

In [6]:
# full_repo_active_df = add_last_activity(full_repo_df)
# full_repo_active_df

In [7]:
from datetime import datetime, timedelta
import pytz

# Filter for projects with last activity within the last year.
# The filtering steps are kept commented out; the result is instead read from
# the CSV file loaded below.
# one_year_ago = datetime.now(pytz.UTC) - timedelta(days=365)
# full_repo_active_df['Last_Activity'] = pd.to_datetime(full_repo_active_df['Last_Activity'])
# full_repo_active_df = full_repo_active_df[full_repo_active_df['Last_Activity'] >= one_year_ago]

full_repo_active_df = pd.read_csv('full_repo_active_df.csv')
full_repo_active_df

Unnamed: 0,Project,Last_Activity
0,0416354917/COMP6442/,2024-06-08 05:46:58+00:00
1,3electrons/AnalogWidgets/,2024-03-11 02:32:07+00:00
2,4dnucleome/cog-abm/,2024-08-09 17:24:23+00:00
3,Aapzu/super-duper-pentago/,2024-04-24 19:13:20+00:00
4,aarondunn/bugkick/,2024-01-10 13:58:30+00:00
...,...,...
529,wwivbbs/wwiv/,2024-11-02 15:11:04+00:00
530,wwj718/wwj718.github.io/,2024-11-14 06:45:00+00:00
531,wyon/note/,2024-08-23 06:52:12+00:00
532,xen2/SharpLang/,2024-10-18 19:10:08+00:00


In [8]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def get_contributor_count(repo_name, limit_hit):
    """Count the contributors GitHub lists for a repository.

    The contributors endpoint is paginated, so a single request only returns
    the first page. This walks every page (100 entries each) by following the
    ``next`` link of each response and sums the page sizes.

    Args:
        repo_name: Repository path appended to ``https://api.github.com/repos/``.
        limit_hit: Unused; kept so existing callers keep working.

    Returns:
        Total number of contributor entries across all pages, or ``None`` when
        any request fails (the error is printed, plus the response headers
        when a response exists).
    """
    url = f'https://api.github.com/repos/{repo_name}/contributors'
    params = {'per_page': 100}
    total = 0

    while url:
        try:
            r = requests.get(url, params=params, auth=(GH_USER, GH_TOKEN), timeout=60)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f'Error: {e}')
            # No response object exists when the request itself failed
            # (timeout / connection error), so read it off the exception.
            response = getattr(e, 'response', None)
            if response is not None:
                print(response.headers)
            return None

        # 204 No Content (empty repository) carries no JSON body to decode.
        if r.status_code != 204:
            total += len(r.json())

        # The 'next' URL already embeds the query string, so stop sending params.
        url = r.links.get('next', {}).get('url')
        params = None

    return total

def add_contributor_count(df):
    """Add a 'Contributors' column to ``df`` using concurrent API lookups.

    Args:
        df: DataFrame with a 'Project' column. It is modified in place.

    Returns:
        The same DataFrame object with one ``get_contributor_count`` result
        per row, in row order.
    """
    # Passed through to get_contributor_count on every call; never updated here.
    limit_hit = False

    def fetch_contributors(repo):
        trimmed = repo[:-1]  # cut the final character of the Project value
        return get_contributor_count(trimmed, limit_hit)

    tqdm.pandas()
    projects = df['Project']
    with ThreadPoolExecutor(max_workers=50) as pool:
        counts = tqdm(pool.map(fetch_contributors, projects), total=len(projects))
        df['Contributors'] = list(counts)
    return df

# repo_contrib_df_1 = add_contributor_count(full_repo_df)

In [9]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def add_stargazer_count(df):
    """Add a 'Stargazers' column to ``df`` using concurrent API lookups.

    Args:
        df: DataFrame with a 'Project' column. It is modified in place.

    Returns:
        The same DataFrame object with one ``get_star_count`` result per row,
        in row order.
    """
    def fetch_stargazers(repo):
        # The final character of the Project value is removed before the call.
        return get_star_count(repo[:-1])

    tqdm.pandas()
    projects = df['Project']
    with ThreadPoolExecutor(max_workers=50) as pool:
        star_counts = list(tqdm(pool.map(fetch_stargazers, projects), total=len(projects)))
    df['Stargazers'] = star_counts
    return df

In [20]:
# Load the per-project table from disk; the rendered output shows the columns
# Project, Last_Activity, Contributors and Stargazers.
repo_df = pd.read_csv('repo_master_df.csv')
repo_df

Unnamed: 0,Project,Last_Activity,Contributors,Stargazers
0,4dnucleome/cog-abm/,2024-08-09 17:24:23+00:00,3.0,6.0
1,Aapzu/super-duper-pentago/,2024-04-24 19:13:20+00:00,2.0,0.0
2,aarondunn/bugkick/,2024-01-10 13:58:30+00:00,6.0,118.0
3,abarbour/psd/,2023-12-22 22:40:16+00:00,2.0,9.0
4,abego/treelayout/,2024-09-12 04:35:38+00:00,2.0,92.0
...,...,...,...,...
529,wooknight/phpcallgraph/,2024-11-14 12:20:17+00:00,1.0,38.0
530,wp-plugins/richtexteditor/,2024-03-01 14:57:10+00:00,0.0,0.0
531,wristware/iso8583/,2023-11-27 20:23:59+00:00,1.0,3.0
532,wwj718/wwj718.github.io/,2024-11-14 06:45:00+00:00,1.0,12.0


In [21]:
# Attach every model link to its project row (left join on 'Project') and save.
models_df = pd.merge(repo_df, full_repo_model_df, on='Project', how='left')
models_df.to_csv('models_df.csv', index=False)
# Distinct link suffixes: the text after the last '.' of each model link.
models = list({link.split('.')[-1] for link in models_df['Model Link - Github'].tolist()})
models

['bmp', 'xmi', 'png', 'uml', 'jpg', 'jpeg', 'gif', 'svg']

In [5]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def get_graphic_files(repo_name, path=''):
    """Recursively list graphic/model files in a GitHub repository.

    Walks the repository contents API starting at ``path``, one request per
    directory, and collects the paths of files whose name ends with one of the
    recognised extensions (case-insensitive).

    Args:
        repo_name: Repository path appended to ``https://api.github.com/repos/``.
        path: Directory inside the repository to start from; '' is the root.

    Returns:
        List of matching file paths. When a request fails, ``['Failed']`` is
        returned for that directory; because sub-directory results are merged
        into the parent's list, a failure anywhere in the tree leaves the
        string 'Failed' in the final result.

    Side effects:
        On failure, overwrites ``rate_limit_log.txt`` with the rate-limit reset
        value from the response headers ('unknown' when unavailable).
    """
    # Leading dots so that only real extensions match (not e.g. 'catalogpng').
    graphic_extensions = ('.jpg', '.gif', '.jpeg', '.bmp', '.svg', '.xmi', '.uml', '.png', '.puml')

    try:
        r = requests.get(f'https://api.github.com/repos/{repo_name}/contents/{path}', auth=(GH_USER, GH_TOKEN), timeout=60)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        # There is no response to read headers from when the request itself
        # failed (timeout / connection error); take it from the exception.
        response = getattr(e, 'response', None)
        reset = response.headers.get('X-RateLimit-Reset') if response is not None else None
        with open('rate_limit_log.txt', 'w') as f:
            f.write("Time reset: " + (reset if reset is not None else 'unknown') + '\n')
        return ['Failed']

    files = r.json()
    if isinstance(files, dict):
        # A single object instead of a listing: treat it as a one-entry listing.
        files = [files]

    graphic_files = []
    for file in files:
        if file['type'] == 'file' and file['name'].lower().endswith(graphic_extensions):
            graphic_files.append(file['path'])
        elif file['type'] == 'dir':
            graphic_files.extend(get_graphic_files(repo_name, file['path']))
    return graphic_files

def add_graphic_files(df):
    """Add a 'Graphic_Files' column holding each project's graphic file paths.

    Args:
        df: DataFrame with a 'Project' column. It is modified in place.

    Returns:
        The same DataFrame object with one ``get_graphic_files`` result (a
        list) per row, in row order.
    """
    def fetch_graphic_files(repo):
        # The final character of the Project value is removed before the call.
        return get_graphic_files(repo[:-1])

    projects = df['Project']
    # A single worker: repositories are crawled one after another.
    with ThreadPoolExecutor(max_workers=1) as pool:
        listings = tqdm(pool.map(fetch_graphic_files, projects), total=len(projects))
        df['Graphic_Files'] = list(listings)
    return df

In [6]:
import time

def process_graphics(repo_df, ctr, sleep_seconds=1500):
    """Crawl graphic files for all repositories, retrying failures in rounds.

    Each round runs ``add_graphic_files`` on the current frame and writes it to
    ``repo_graphics_df_<ctr>.csv``. Rows whose 'Graphic_Files' list contains
    'Failed' form the next round's frame; the function sleeps between rounds
    and stops once a round finishes with no failed rows.

    Args:
        repo_df: DataFrame with a 'Project' column.
        ctr: Number used in the first output file name; incremented per round.
        sleep_seconds: Pause between rounds, in seconds (default 1500 = 25 min).

    Returns:
        None. Results are available only through the CSV files written; each
        file holds the rows processed in that round.
    """
    while True:
        print("Processing repo_graphics_df_{}".format(ctr))
        # add graphic files
        repo_df = add_graphic_files(repo_df)

        repo_df.to_csv("repo_graphics_df_{}.csv".format(ctr), index=False)
        print("Saved repo_graphics_df_{}.csv".format(ctr))

        failed_mask = repo_df['Graphic_Files'].apply(lambda x: 'Failed' in x)
        next_repo_df = repo_df[failed_mask]

        print("Processed {} repositories".format(len(repo_df) - len(next_repo_df)))

        # Check for completion BEFORE rotating: iloc[[0]] on an empty frame
        # raises IndexError, which previously made the success path crash.
        if next_repo_df.empty:
            print("All repositories processed!")
            break

        # Move the first failed repository to the end of the retry queue.
        next_repo_df = pd.concat([next_repo_df.iloc[1:], next_repo_df.iloc[[0]]]).reset_index(drop=True)

        repo_df = next_repo_df
        ctr += 1

        print("Sleeping for {:g} minutes, now it is {}".format(sleep_seconds / 60, time.ctime()))
        time.sleep(sleep_seconds)

In [9]:
import ast

repo_graphics_df = pd.read_csv('repo_graphics_df.csv')
# Each 'Graphic_Files' cell is read back as text; parse it into a Python literal.
repo_graphics_df['Graphic_Files'] = repo_graphics_df['Graphic_Files'].map(ast.literal_eval)
# Number of entries per project
repo_graphics_df['Graphic_Files_Length'] = repo_graphics_df['Graphic_Files'].map(len)
repo_graphics_df

Unnamed: 0,Project,Last_Activity,Contributors,Stargazers,Graphic_Files,Graphic_Files_Length
0,4dnucleome/cog-abm/,2024-08-09 17:24:23+00:00,3.0,6.0,"[AgentClassDiagram.png, SimulationClassDiagram...",4
1,Aapzu/super-duper-pentago/,2024-04-24 19:13:20+00:00,2.0,0.0,[documentation/javalabra/checkstyle/images/rss...,9
2,aarondunn/bugkick/,2024-01-10 13:58:30+00:00,6.0,118.0,"[BugKick.dev.js/bugkick/images/fb_1_azure.png,...",859
3,abarbour/psd/,2023-12-22 22:40:16+00:00,2.0,9.0,"[vignettes/yuml_d.png, vignettes/yuml_n.png]",2
4,abego/treelayout/,2024-09-12 04:35:38+00:00,2.0,92.0,[org.abego.treelayout.demo/src/main/java/org/a...,44
...,...,...,...,...,...,...
484,badqiu/rapid-framework/,2024-07-24 09:58:37+00:00,0.0,36.0,"[images/badqiu_gmail.com_black.jpg, images/bad...",883
485,SnowFox108/NexusCore3/,2024-06-19 10:24:00+00:00,2.0,0.0,[Documentations/NexusCore.Document/Concept/Arc...,1506
486,cacheonix/cacheonix-core/,2024-08-24 13:49:53+00:00,2.0,52.0,[3rdparty/apache-log4j-1.2.15/src/main/java/or...,337
487,apache/wicket/,2024-11-10 21:10:53+00:00,30.0,739.0,[archetypes/quickstart/src/main/resources/arch...,281


In [10]:
# Discard projects with no graphic files, renumber the rows, and overwrite the CSV.
repo_graphics_df = (
    repo_graphics_df
    .loc[repo_graphics_df['Graphic_Files_Length'].gt(0)]
    .reset_index(drop=True)
)
repo_graphics_df.to_csv(path_or_buf='repo_graphics_df.csv', index=False)
repo_graphics_df

Unnamed: 0,Project,Last_Activity,Contributors,Stargazers,Graphic_Files,Graphic_Files_Length
0,4dnucleome/cog-abm/,2024-08-09 17:24:23+00:00,3.0,6.0,"[AgentClassDiagram.png, SimulationClassDiagram...",4
1,Aapzu/super-duper-pentago/,2024-04-24 19:13:20+00:00,2.0,0.0,[documentation/javalabra/checkstyle/images/rss...,9
2,aarondunn/bugkick/,2024-01-10 13:58:30+00:00,6.0,118.0,"[BugKick.dev.js/bugkick/images/fb_1_azure.png,...",859
3,abarbour/psd/,2023-12-22 22:40:16+00:00,2.0,9.0,"[vignettes/yuml_d.png, vignettes/yuml_n.png]",2
4,abego/treelayout/,2024-09-12 04:35:38+00:00,2.0,92.0,[org.abego.treelayout.demo/src/main/java/org/a...,44
...,...,...,...,...,...,...
484,badqiu/rapid-framework/,2024-07-24 09:58:37+00:00,0.0,36.0,"[images/badqiu_gmail.com_black.jpg, images/bad...",883
485,SnowFox108/NexusCore3/,2024-06-19 10:24:00+00:00,2.0,0.0,[Documentations/NexusCore.Document/Concept/Arc...,1506
486,cacheonix/cacheonix-core/,2024-08-24 13:49:53+00:00,2.0,52.0,[3rdparty/apache-log4j-1.2.15/src/main/java/or...,337
487,apache/wicket/,2024-11-10 21:10:53+00:00,30.0,739.0,[archetypes/quickstart/src/main/resources/arch...,281


### Sampling

- Total: 489 projects
- Confidence level: 95%
- Margin of error: 5%
- Ideal sample size: 216
- Random seed: 42
- Files in the sample: ~61k (61,065 graphic files across the 216 sampled projects)

In [15]:
# Draw 216 projects without replacement; the fixed seed makes the draw repeatable.
sampled_repo_graphics_df = (
    repo_graphics_df
    .sample(n=216, random_state=42)
    .reset_index(drop=True)
)
sampled_repo_graphics_df.head()

Unnamed: 0,Project,Last_Activity,Contributors,Stargazers,Graphic_Files,Graphic_Files_Length
0,sprossiter/JSIT/,2024-08-26 23:12:56+00:00,1.0,2.0,[src/main/resources/docs/diagrams/alAddDepende...,6
1,baidu/broc/,2024-10-25 16:02:35+00:00,4.0,100.0,"[doc/dependent_relationship.jpg, doc/flowchart...",3
2,telldus/telldus/,2024-09-09 12:44:16+00:00,10.0,64.0,[telldus-gui/3rdparty/qtsingleapplication/doc/...,106
3,Splamy/TS3AudioBot/,2024-11-14 13:01:16+00:00,12.0,698.0,"[TS3AudioBot/Media/SleepingKitty.png, TS3Audio...",5
4,benmfaul/XRTB/,2024-05-17 07:27:22+00:00,1.0,121.0,"[src/Test.uml, src/com/xrtb/bidder/CampaignPro...",54


In [16]:
sampled_repo_graphics_df.tail()

Unnamed: 0,Project,Last_Activity,Contributors,Stargazers,Graphic_Files,Graphic_Files_Length
211,Vdragon/NTOU_CPP_Lab_Opensource_Project/,2024-03-19 13:36:53+00:00,0.0,3.0,"[Lab_6_1/Class_diagram/Lab 6-1.uml, Lab_6_2/Cl...",2
212,tarbrain/TBStateMachine/,2024-02-13 10:47:56+00:00,1.0,49.0,[Documentation/test_setup.png],1
213,apache/click/,2024-10-07 04:28:28+00:00,4.0,14.0,"[documentation/docs/velocity/images/logo.gif, ...",135
214,streambuf/recognition_numbers/,2024-04-16 11:26:55+00:00,1.0,12.0,"[screenshots/1.png, screenshots/2.png, screens...",4
215,BrOrlandi/SFECommerce/,2024-05-29 18:28:04+00:00,3.0,15.0,[uml_diagram.png],1


In [17]:
sampled_repo_graphics_df['Graphic_Files_Length'].sum()

61065

In [18]:
sampled_repo_graphics_df

Unnamed: 0,Project,Last_Activity,Contributors,Stargazers,Graphic_Files,Graphic_Files_Length
0,sprossiter/JSIT/,2024-08-26 23:12:56+00:00,1.0,2.0,[src/main/resources/docs/diagrams/alAddDepende...,6
1,baidu/broc/,2024-10-25 16:02:35+00:00,4.0,100.0,"[doc/dependent_relationship.jpg, doc/flowchart...",3
2,telldus/telldus/,2024-09-09 12:44:16+00:00,10.0,64.0,[telldus-gui/3rdparty/qtsingleapplication/doc/...,106
3,Splamy/TS3AudioBot/,2024-11-14 13:01:16+00:00,12.0,698.0,"[TS3AudioBot/Media/SleepingKitty.png, TS3Audio...",5
4,benmfaul/XRTB/,2024-05-17 07:27:22+00:00,1.0,121.0,"[src/Test.uml, src/com/xrtb/bidder/CampaignPro...",54
...,...,...,...,...,...,...
211,Vdragon/NTOU_CPP_Lab_Opensource_Project/,2024-03-19 13:36:53+00:00,0.0,3.0,"[Lab_6_1/Class_diagram/Lab 6-1.uml, Lab_6_2/Cl...",2
212,tarbrain/TBStateMachine/,2024-02-13 10:47:56+00:00,1.0,49.0,[Documentation/test_setup.png],1
213,apache/click/,2024-10-07 04:28:28+00:00,4.0,14.0,"[documentation/docs/velocity/images/logo.gif, ...",135
214,streambuf/recognition_numbers/,2024-04-16 11:26:55+00:00,1.0,12.0,"[screenshots/1.png, screenshots/2.png, screens...",4


In [19]:
# Persist the sampled projects (row index omitted from the file).
sampled_repo_graphics_df.to_csv('sampled_repo_graphics_df.csv', index=False)