In [3]:
import requests
from dotenv import load_dotenv
import os

_ = load_dotenv()

GH_USER = os.environ["GH_USERNAME"]
GH_TOKEN = os.environ["GH_TOKEN"]

In [34]:
import pandas as pd

full_repo_model_df = pd.read_csv('UMLFiles_List_V2.0.csv')
full_repo_model_df

Unnamed: 0,Project,Model Link - Github
0,0-complexity/ovcdoc_public/,https://www.github.com/0-complexity/ovcdoc_pub...
1,0003088/libelektra-qt-gui-test/,https://www.github.com/0003088/libelektra-qt-g...
2,00s/deadman/,https://www.github.com/00s/deadman/tree/master...
3,01db0y/ImageUploader/,https://www.github.com/01db0y/ImageUploader/tr...
4,01db0y/ShareIt/,https://www.github.com/01db0y/ShareIt/tree/mas...
...,...,...
93602,yotomyoto/301_assignment1/,https://www.github.com/yotomyoto/301_assignmen...
93603,zeronero13/af6/,https://www.github.com/zeronero13/af6/tree/mas...
93604,zeronero13/happehardver/,https://www.github.com/zeronero13/happehardver...
93605,Akshit-/ClientServerCommunication/,https://www.github.com/Akshit-/ClientServerCom...


In [35]:
full_repo_df = full_repo_model_df.drop(columns=['Model Link - Github']).drop_duplicates(ignore_index=True)
full_repo_df

Unnamed: 0,Project
0,0-complexity/ovcdoc_public/
1,0003088/libelektra-qt-gui-test/
2,00s/deadman/
3,01db0y/ImageUploader/
4,01db0y/ShareIt/
...,...
24725,vectorxiang/vectorxiang.github.io/
24726,victorsndvg/FPL/
24727,wmde/FundraisingFrontend/
24728,yotomyoto/301_assignment1/


In [37]:
def get_star_count(repo_name):
    try:
        r = requests.get('https://api.github.com/repos/{}'.format(repo_name), auth=(GH_USER, GH_TOKEN), timeout=60)
    except requests.exceptions.RequestException as e:
        if r.status_code != 404:
            print('Error: {}'.format(e))
        return None

    return r.json().get('stargazers_count', 0)

In [38]:
print(f"No. of stars: {get_star_count('zakipauzi/concept-domain-coverage/'[:-1])}")

No. of stars: 2


In [39]:
def repo_exists(repo_name):
    try:
        r = requests.get(f'https://api.github.com/repos/{repo_name}', auth=(GH_USER, GH_TOKEN), timeout=60)
        return r.status_code == 200
    except requests.exceptions.RequestException as e:
        print(f'Error: {e}')
        return False

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

tqdm.pandas()

def add_repo_exist(df):
    def fetch_exist(repo):
        return repo_exists(repo[:-1])

    with ThreadPoolExecutor(max_workers=50) as executor:
        df['Exists'] = list(tqdm(executor.map(fetch_exist, df['Project']), total=len(df['Project'])))
    return df

In [8]:
full_repo_df = add_repo_exist(full_repo_df)

100%|██████████| 24730/24730 [1:07:20<00:00,  6.12it/s]


In [None]:
full_repo_df = full_repo_df[full_repo_df['Exists']]
full_repo_df.to_csv('full_repo_df.csv', index=False)

In [10]:
full_repo_df.head()

Unnamed: 0,Project,Exists
1,0003088/libelektra-qt-gui-test/,True
2,00s/deadman/,True
3,01db0y/ImageUploader/,True
4,01db0y/ShareIt/,True
5,01org/vmf/,True


In [11]:
full_repo_df.tail()

Unnamed: 0,Project,Exists
23805,xeguh83/8Puzzle/,True
23807,xen2/SharpLang/,True
23808,Xennis/graphical_model_editor/,True
23809,xenodium/xenodium.github.io/,True
23810,XenofoR/PAQ/,True


### Get GH projects with UML files

In [1]:
import pandas as pd
full_repo_df = pd.read_csv('full_repo_df.csv').drop(columns=['Exists'])
full_repo_df

Unnamed: 0,Project
0,0003088/libelektra-qt-gui-test/
1,00s/deadman/
2,01db0y/ImageUploader/
3,01db0y/ShareIt/
4,01org/vmf/
...,...
6868,xeguh83/8Puzzle/
6869,xen2/SharpLang/
6870,Xennis/graphical_model_editor/
6871,xenodium/xenodium.github.io/


In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def get_contributor_count(repo_name, limit_hit):
    try:
        r = requests.get(f'https://api.github.com/repos/{repo_name}/contributors', auth=(GH_USER, GH_TOKEN), timeout=60)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        if not limit_hit:
            print(f'Error: {e}')
            print(r.headers)
            limit_hit = True
        return None

    return len(r.json())

def add_contributor_count(df):
    
    limit_hit = False

    def fetch_contributors(repo):
        return get_contributor_count(repo[:-1], limit_hit)

    tqdm.pandas()
    with ThreadPoolExecutor(max_workers=50) as executor:
        df['Contributors'] = list(tqdm(executor.map(fetch_contributors, df['Project']), total=len(df['Project'])))
    return df

repo_contrib_df = add_contributor_count(full_repo_df)

In [28]:
repo_contrib_df = pd.read_csv('repo_contrib_df.csv')
repo_contrib_df

Unnamed: 0,Project,Contributors
0,0003088/libelektra-qt-gui-test/,12.0
1,01org/vmf/,11.0
2,080419android/scheduler/,3.0
3,0877624/0877624-0896919-0894785/,3.0
4,09421/Track-A-Bus/,2.0
...,...,...
1099,ClintEsteMadera/arco-iris/,2.0
1100,cliqz-oss/keyvi/,7.0
1101,AOSPA/android_frameworks_base/,19.0
1102,Azure/azure-content/,14000.0


In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def get_graphic_files(repo_name, path=''):
    try:
        r = requests.get(f'https://api.github.com/repos/{repo_name}/contents/{path}', auth=(GH_USER, GH_TOKEN), timeout=60)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'Error: {e}')
        return ['Failed']

    files = r.json()
    graphic_files = []
    for file in files:
        if file['type'] == 'file' and file['name'].lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.svg', 'puml', '.uml')):
            graphic_files.append(file['path'])
        elif file['type'] == 'dir':
            graphic_files.extend(get_graphic_files(repo_name, file['path']))
    return graphic_files

def add_graphic_files(df):
    def fetch_graphic_files(repo):
        return get_graphic_files(repo[:-1])

    with ThreadPoolExecutor(max_workers=50) as executor:
        df['Graphic_Files'] = list(tqdm(executor.map(fetch_graphic_files, df['Project']), total=len(df['Project'])))
    return df

full_repo_df = add_graphic_files(full_repo_df)

  0%|          | 0/6873 [00:00<?, ?it/s]

Error: 404 Client Error: Not Found for url: https://api.github.com/repos/365day/365day.github.io/contents/blog/2014/05/09/xing-neng-ce-shi-jin-jie-zhi-nan-ji-chu-pian-%5B?%5D-(xi-tong-zi-yuan-de-jiang-jie-)
Error: 404 Client Error: Not Found for url: https://api.github.com/repos/131213web/ProjectTask/contents/tasks/task07/MusicShare/WebRoot/picture/%EF%BF%BD%EF%BF%BD%EF%BF%BD
Error: 404 Client Error: Not Found for url: https://api.github.com/repos/131213web/ProjectTask/contents/tasks/task07/MusicShare/WebRoot/picture/%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD
Error: 404 Client Error: Not Found for url: https://api.github.com/repos/131213web/ProjectTask/contents/tasks/task07/MusicShare/WebRoot/picture/%EF%BF%BD%EF%BF%BD%EF%BF%BD%D6%B5%EF%BF%BD%EF%BF%BD%EF%BF%BD


In [14]:
full_repo_df

Unnamed: 0,Project,Graphic_Files
0,0003088/libelektra-qt-gui-test/,[]
1,00s/deadman/,[]
2,01db0y/ImageUploader/,[]
3,01db0y/ShareIt/,[]
4,01org/vmf/,[]
...,...,...
6868,xeguh83/8Puzzle/,[Failed]
6869,xen2/SharpLang/,[Failed]
6870,Xennis/graphical_model_editor/,[Failed]
6871,xenodium/xenodium.github.io/,[Failed]


In [11]:
full_repo_df_failed = full_repo_df[full_repo_df['Graphic_Files'].apply(lambda x: x == ['Failed'])]
full_repo_df_succeeded = full_repo_df[full_repo_df['Graphic_Files'].apply(lambda x: x != ['Failed'])]
full_repo_df_failed

Unnamed: 0,Project,Graphic_Files
4604,StoragePerformanceAnalyzer/SPA/,[Failed]
4605,storbukas/shallowblue/,[Failed]
4606,Storken/BestWebapp4eva/,[Failed]
4607,storm20200/UniversityFlakySnakey/,[Failed]
4608,stormcool/pathfinder/,[Failed]
...,...,...
6868,xeguh83/8Puzzle/,[Failed]
6869,xen2/SharpLang/,[Failed]
6870,Xennis/graphical_model_editor/,[Failed]
6871,xenodium/xenodium.github.io/,[Failed]


In [12]:
full_repo_df_succeeded

Unnamed: 0,Project,Graphic_Files
0,0003088/libelektra-qt-gui-test/,[]
1,00s/deadman/,[]
2,01db0y/ImageUploader/,[]
3,01db0y/ShareIt/,[]
4,01org/vmf/,[]
...,...,...
4599,Stoddard-Austin/SandBox/,[]
4600,Stollie/HR-CommandPattern-HWOpdracht/,[ClassDiagram1.png]
4601,stombor-tgm/Rueckwaertssalto/,"[UML_Datenlogik.png, UML_neu.png]"
4602,stonerworx/mediaq-poi/,[]


In [4]:
# full_repo_df_succeeded.to_csv('full_repo_df_succeeded.csv', index=False)
# full_repo_df_failed.to_csv('full_repo_df_failed.csv', index=False)
full_repo_df_succeeded = pd.read_csv('full_repo_df_succeeded.csv')
full_repo_df_failed = pd.read_csv('full_repo_df_failed.csv')

In [7]:
full_repo_df_failed = add_graphic_files(full_repo_df_failed)

 30%|██▉       | 679/2269 [02:07<04:34,  5.80it/s]

Error: 404 Client Error: Not Found for url: https://api.github.com/repos/thomas-bornschlegel/AndroidRemoteSlideshow/contents
{'Date': 'Fri, 08 Nov 2024 14:51:36 GMT', 'Content-Type': 'application/json; charset=utf-8', 'github-authentication-token-expiration': '2025-01-06 12:59:52 +0000', 'X-GitHub-Media-Type': 'github.v3; format=json', 'x-accepted-github-permissions': 'contents=read', 'x-github-api-version-selected': '2022-11-28', 'X-RateLimit-Limit': '5000', 'X-RateLimit-Remaining': '1772', 'X-RateLimit-Reset': '1731079420', 'X-RateLimit-Used': '3228', 'X-RateLimit-Resource': 'core', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset', 'Access-Control-Allow-Origin': '*', 'Strict-Transport-Security': 'max-age=31536000; incl

100%|██████████| 2269/2269 [06:41<00:00,  5.66it/s]


In [None]:
import ast

# Append full_repo_df_failed to full_repo_df
repo_df = pd.concat([full_repo_df_succeeded, full_repo_df_failed], ignore_index=True)

# Ensure all Graphic_Files are lists
repo_df['Graphic_Files'] = repo_df['Graphic_Files'].apply(lambda x: x if isinstance(x, list) else ast.literal_eval(x))

# Remove rows where Graphic_files column is ['Failed'] or []
repo_df = repo_df[~repo_df['Graphic_Files'].isin([['Failed']])]
repo_df = repo_df[repo_df['Graphic_Files'].map(len) > 0]
repo_df = repo_df.reset_index(drop=True)

repo_df

Unnamed: 0,Project,Graphic_Files
0,1127050148/SitiNurpadilah_1127050148_IF5G_Tuga...,"[Activity Diagram Vending Machine.uml, State D..."
1,12010994/TAA-TP3/,[char.uml]
2,12520054/etn-contra-game/,"[2.png, Untitled-1.png, boss3_bullet.png, fire..."
3,188383/mydjango/,[class diagram.svg]
4,1987cr/Duckface/,[Class Diagram.png]
...,...,...
1264,xAanas/modele-latex/,[insat.jpg]
1265,xat/contao-rpc/,[class-diagram.png]
1266,xaviborja/MPWAR-EXAM/,[Diagrama-de-clases-del-dominio.png]
1267,Xeald86/blackJack/,"[Updated_BlackJack_class_diagram.png, old_Blac..."


In [None]:
repo_df.to_csv('repo_df.csv', index=False)

### Graphics from Github Repos

In [29]:
repo_df = pd.read_csv('repo_contrib_df.csv')
repo_df

Unnamed: 0,Project,Graphic_Files,Contributors
0,1127050148/SitiNurpadilah_1127050148_IF5G_Tuga...,"['Activity Diagram Vending Machine.uml', 'Stat...",1
1,12520054/etn-contra-game/,"['2.png', 'Untitled-1.png', 'boss3_bullet.png'...",1
2,188383/mydjango/,['class diagram.svg'],1
3,267497/projektNaZal/,"['DiagramCzynnosci.jpg', 'DiagramKlas.jpg', 'D...",1
4,365day/365day.github.io/,['favicon.png'],1
...,...,...,...
1161,xAanas/modele-latex/,['insat.jpg'],1
1162,xat/contao-rpc/,['class-diagram.png'],2
1163,xaviborja/MPWAR-EXAM/,['Diagrama-de-clases-del-dominio.png'],2
1164,Xeald86/blackJack/,"['Updated_BlackJack_class_diagram.png', 'old_B...",1


In [31]:
import os
import requests
from tqdm import tqdm

def sanitize_folder_name(folder_name):
    return "".join(c if c.isalnum() or c in (' ', '.', '_') else '_' for c in folder_name)

def download_file(url, dest_folder):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder, exist_ok=True)
    response = requests.get(url, stream=True)
    file_name = os.path.join(dest_folder, url.split('/')[-1])
    with open(file_name, 'wb') as file:
        for chunk in response.iter_content(1024):
            file.write(chunk)

def download_graphics(df):
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        repo_name = row['Project'][:-1]
        graphic_files = row['Graphic_Files']
        sanitized_repo_name = sanitize_folder_name(repo_name.replace('/', '_'))
        for file_path in graphic_files:
            file_url = f'https://raw.githubusercontent.com/{repo_name}/master/{file_path}'
            download_file(file_url, os.path.join('downloaded_graphics', sanitized_repo_name))

download_graphics(repo_df)

  0%|          | 0/1166 [00:05<?, ?it/s]


PermissionError: [Errno 13] Permission denied: 'downloaded_graphics\\1127050148_SitiNurpadilah_1127050148_IF5G_TugasMandiri_VendingMachine\\ '

In [28]:
from openai import OpenAI
client = OpenAI()

def get_uml_files(repo_name):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", 
                "content": "You are an AI assistant helping a user to determine if the Github repository contains UML diagrams. " \
                    "The repository name will be given to you in the format of 'username/repository_name/'. "\
                    "You need to check if the repository contains UML diagrams and provide the user with the list of UML diagrams found in the repository." \
                    "If the repository does not contain any UML diagrams, only reply 'None' without quotes."
                    "Where there is at least a UML diagram, you need to provide the user with the list of UML diagrams found in the repository in the form of list of strings of the filepaths." \
                    "Repository: {}".format(repo_name)}],
        temperature=0,
        max_tokens=2048,
        top_p=1,
        seed=0,
        frequency_penalty=0,
        presence_penalty=0
    )

    return response.choices[0].message.content

In [29]:
get_uml_files('zakipauzi/concept-domain-coverage/')

'None'

In [31]:
from tqdm import tqdm

tqdm.pandas()

full_repo_df['UML_Files'] = full_repo_df['Project'].progress_apply(get_uml_files)

100%|██████████| 598/598 [04:35<00:00,  2.17it/s]


In [33]:
full_repo_df[full_repo_df['UML_Files'].map(lambda x: x != 'None')]

Unnamed: 0,Project,Graphic_Files,UML_Files
42,abaumgarner/CSCD349-UML/,['349finaluml.png'],"['diagrams/class_diagram.uml', 'diagrams/seque..."
75,AdrianGherle/BlackJack-attempt_UML-diagram/,['BlackJack-attempt diagram.gif'],"['UML/ClassDiagram.png', 'UML/SequenceDiagram...."
92,aggarwalsomya/UMLParser/,['op.png'],"['diagrams/class_diagram.uml', 'diagrams/seque..."
125,AlekseiIvshin/04_UML/,"['ActiviryDiagram.png', 'Class Diagram.png', '...","['class_diagram.png', 'sequence_diagram.svg', ..."
275,Asiorek/uml/,"['classDiagram.png', 'objectDiagram.png', 'ogo...","['diagrams/class_diagram.uml', 'diagrams/seque..."


In [None]:


full_repo_df = full_repo_df.sort_values(by='Graphic_Files', key=lambda x: x.map(len), ascending=False).reset_index(drop=True)
full_repo_df

Unnamed: 0,Project,Graphic_Files
0,AgileVentures/MetPlus_tracker/,"['MetPlus-Agency-Admin-BPC-1.png', 'MetPlus-Ag..."
1,askiba/master-thesis/,"['agh.jpg', 'airDrag.png', 'architekture-simpl..."
2,audreyt/cn.ethercalc.net/,"['collab-borders.png', 'collab-conflict.png', ..."
3,AudioCommons/audiocommons.github.io/,"['android-chrome-144x144.png', 'android-chrome..."
4,Blaskon/GroupProject/,"['CrokePark.jpeg', 'CrokePark1.jpg', 'GPO1.jpg..."
...,...,...
593,rymdo/SuperEpicSpelOfDoom/,['cb.bmp']
594,aggarwalsomya/UMLParser/,['op.png']
595,abhinay100/fengoffice_app/,['s.gif']
596,cassini-mohsin/complaints/,['s.gif']


In [45]:
# identify if the graphics are UML diagrams

def is_uml_diagram(file_name):
    uml_keywords = ['uml', 'classdiagram', 'sequencediagram', 'statediagram', 'activitydiagram', 'componentdiagram', 'deploymentdiagram', 'usecasediagram']
    return any(keyword in file_name.lower() for keyword in uml_keywords)

def identify_uml_files(df):
    df['UML_Files'] = df['Graphic_Files'].apply(lambda files: [is_uml_diagram(file) for file in files])
    return df

full_repo_df = identify_uml_files(full_repo_df)

In [46]:
full_repo_df

Unnamed: 0,Project,Graphic_Files,UML_Files
0,AgileVentures/MetPlus_tracker/,"['MetPlus-Agency-Admin-BPC-1.png', 'MetPlus-Ag...","[False, False, False, False, False, False, Fal..."
1,askiba/master-thesis/,"['agh.jpg', 'airDrag.png', 'architekture-simpl...","[False, False, False, False, False, False, Fal..."
2,audreyt/cn.ethercalc.net/,"['collab-borders.png', 'collab-conflict.png', ...","[False, False, False, False, False, False, Fal..."
3,AudioCommons/audiocommons.github.io/,"['android-chrome-144x144.png', 'android-chrome...","[False, False, False, False, False, False, Fal..."
4,Blaskon/GroupProject/,"['CrokePark.jpeg', 'CrokePark1.jpg', 'GPO1.jpg...","[False, False, False, False, False, False, Fal..."
...,...,...,...
593,rymdo/SuperEpicSpelOfDoom/,['cb.bmp'],"[False, False, False, False, False, False, Fal..."
594,aggarwalsomya/UMLParser/,['op.png'],"[False, False, False, False, False, False, Fal..."
595,abhinay100/fengoffice_app/,['s.gif'],"[False, False, False, False, False, False, Fal..."
596,cassini-mohsin/complaints/,['s.gif'],"[False, False, False, False, False, False, Fal..."
