Your task is to build a dashboard to show the most impactful LLM-related open source software (OSS) projects.

In [1]:
import requests
import pandas as pd
import os


In [2]:
# Turn off SettingWithCopyWarning
pd.options.mode.chained_assignment = None

In [3]:
## EDA

In [4]:
df = pd.read_csv('../data/sample-data.csv')

In [135]:
df[df['category']=='Model repo'].head()

Unnamed: 0,repo,category,subcat,stars,star_1d,star_1d_pct,star_7d,star_7d_pct,forks,description,top_devs,contributors,created_at,updated_at,downloads*
15,xai-org/grok-1,Model repo,,46567,93,0.20%,1120,2.41%,7783,Grok open release,"ibab, syzymon, mane",6,2024-03-17,2024-03-19,0
36,openai/whisper,Model repo,,59133,47,0.08%,368,0.62%,6772,Robust Speech Recognition via Large-Scale Weak...,"jongwook, ryanheise, EliEron",65,2022-09-16,2022-09-26,0
40,state-spaces/mamba,Model repo,,8568,41,0.48%,357,4.17%,676,,"tridao, albertfgu, epicfilemcnulty",14,2023-12-01,2024-03-23,2595
46,suno-ai/bark,Model repo,Multimodal,31878,37,0.12%,267,0.84%,3749,🔊 Text-Prompted Generative Audio Model,"gkucsko, kmfreyberg, mcamac",16,2023-04-07,2023-07-19,0
55,hpcaitech/Open-Sora,Model repo,Multimodal,13869,32,0.23%,459,3.31%,1245,Open-Sora: Democratizing Efficient Video Produ...,"zhengzangw, xyupeng, FrankLeeeee",20,2024-02-20,2024-03-23,0


In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 968 entries, 0 to 967
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   repo          968 non-null    object        
 1   category      968 non-null    object        
 2   subcat        828 non-null    object        
 3   stars         968 non-null    int64         
 4   star_1d       968 non-null    int64         
 5   star_1d_pct   968 non-null    object        
 6   star_7d       968 non-null    int64         
 7   star_7d_pct   968 non-null    object        
 8   forks         968 non-null    int64         
 9   description   931 non-null    object        
 10  top_devs      968 non-null    object        
 11  contributors  968 non-null    object        
 12  created_at    968 non-null    datetime64[ns]
 13  updated_at    908 non-null    datetime64[ns]
 14  downloads*    968 non-null    int64         
dtypes: datetime64[ns](2), int64(5), object(8

In [131]:
df.category.unique()

array(['Tutorials', 'Applications', 'AI engineering', 'Model development',
       'Model repo', 'Infrastructure', 'Lists'], dtype=object)

In [7]:
df.isna().any()

repo            False
category        False
subcat           True
stars           False
star_1d         False
star_1d_pct     False
star_7d         False
star_7d_pct     False
forks           False
description      True
top_devs        False
contributors    False
created_at      False
updated_at      False
downloads*      False
dtype: bool

In [8]:
def check_nan(dataset: pd.DataFrame) -> pd.DataFrame: # Calculate the number of NaN values in each column
    nan_counts = dataset.isnull().sum()

    # Calculate the total number of values in each column
    total_counts = dataset.shape[0]

    # Calculate the percentage of NaN values in each column
    nan_percentage = round((nan_counts / total_counts) * 100, 1)

    # Combine the counts and percentages into a DataFrame for better readability
    nan_summary = pd.DataFrame({'NaN Count': nan_counts, 'NaN Percentage': nan_percentage})

    # Display the summary
    print("Summary of NaN Values in Each Column:")
    print(nan_summary)

In [9]:
check_nan(df)

Summary of NaN Values in Each Column:
              NaN Count  NaN Percentage
repo                  0             0.0
category              0             0.0
subcat              140            14.5
stars                 0             0.0
star_1d               0             0.0
star_1d_pct           0             0.0
star_7d               0             0.0
star_7d_pct           0             0.0
forks                 0             0.0
description          37             3.8
top_devs              0             0.0
contributors          0             0.0
created_at            0             0.0
updated_at            0             0.0
downloads*            0             0.0


In [10]:
# Convert the 'date_column' from object to datetime
def safe_convert(date_str):
    try:
        return pd.to_datetime(date_str)
    except pd.errors.OutOfBoundsDatetime:
        return pd.NaT

df['updated_at'] = df['updated_at'].apply(safe_convert)
df['created_at'] = pd.to_datetime(df['created_at']).apply(safe_convert)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 968 entries, 0 to 967
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   repo          968 non-null    object        
 1   category      968 non-null    object        
 2   subcat        828 non-null    object        
 3   stars         968 non-null    int64         
 4   star_1d       968 non-null    int64         
 5   star_1d_pct   968 non-null    object        
 6   star_7d       968 non-null    int64         
 7   star_7d_pct   968 non-null    object        
 8   forks         968 non-null    int64         
 9   description   931 non-null    object        
 10  top_devs      968 non-null    object        
 11  contributors  968 non-null    object        
 12  created_at    968 non-null    datetime64[ns]
 13  updated_at    908 non-null    datetime64[ns]
 14  downloads*    968 non-null    int64         
dtypes: datetime64[ns](2), int64(5), object(8

In [12]:
df.head()

Unnamed: 0,repo,category,subcat,stars,star_1d,star_1d_pct,star_7d,star_7d_pct,forks,description,top_devs,contributors,created_at,updated_at,downloads*
0,microsoft/generative-ai-for-beginners,Tutorials,,39458,1420,3.60%,7662,19.42%,21108,"18 Lessons, Get Started Building with Generati...","john0isaac, koreyspace, yoshioterada",54,2023-06-19,2024-03-26,0
1,princeton-nlp/SWE-agent,Applications,Coding,4479,1352,30.19%,0,0.00%,370,SWE-agent: Agent Computer Interfaces Enable So...,"carlosejimenez, john-b-yang, klieret",9,2024-04-02,2024-04-04,0
2,stitionai/devika,Applications,Coding,14384,402,2.79%,2332,16.21%,1739,Devika is an Agentic AI Software Engineer that...,"mufeedvh, ARajgor, nalaso",31,2024-03-21,2024-04-04,0
3,mshumer/gpt-author,Applications,Writing,1959,229,11.69%,305,15.57%,283,,mshumer,1,2023-06-20,NaT,0
4,janhq/jan,Applications,Bots,15933,223,1.40%,1998,12.54%,868,Jan is an open source alternative to ChatGPT t...,"henryh0x1, louis-jan, 0xSage",41,2023-08-17,2024-04-04,476374


In [13]:
# # Assuming `data` is your DataFrame
# top_projects = df.nlargest(10, 'stars')

# plt.figure(figsize=(10, 6))
# plt.barh(top_projects['repo'], top_projects['stars'], color='skyblue')
# plt.xlabel('Stars')
# plt.ylabel('Repository')
# plt.title('Top 10 LLM Projects by Stars')
# plt.gca().invert_yaxis()  # Invert y-axis to have the project with the most stars on top
# plt.show()

df.nlargest(10, 'stars')

Unnamed: 0,repo,category,subcat,stars,star_1d,star_1d_pct,star_7d,star_7d_pct,forks,description,top_devs,contributors,created_at,updated_at,downloads*
25,tensorflow/tensorflow,Model development,Modeling & training,181969,69,0.04%,379,0.21%,73798,An Open Source Machine Learning Framework for ...,"mihaimaruseac, ezhulenev, yongtang",100+,2015-11-07,2024-04-04,0
49,Significant-Gravitas/AutoGPT,AI engineering,Agent,160425,33,0.02%,350,0.22%,41575,AutoGPT is the vision of accessible AI for eve...,"Pwuts, waynehamadi, Torantulino",100+,2023-03-16,2024-03-25,3510
22,AUTOMATIC1111/stable-diffusion-webui,Applications,Image production,127663,71,0.06%,683,0.54%,24856,Stable Diffusion web UI,"AUTOMATIC1111, w-e-w, dfaker",100+,2022-08-22,2024-04-01,531852
47,huggingface/transformers,Model development,Modeling & training,123644,37,0.03%,412,0.33%,24543,🤗 Transformers: State-of-the-art Machine Learn...,"thomwolf, sgugger, LysandreJik",100+,2018-10-29,2023-05-18,1820
41,f/awesome-chatgpt-prompts,Lists,Prompt engineering,102623,40,0.04%,343,0.33%,13810,This repo includes ChatGPT prompt curation to ...,"f, iuzn, fengkiej",80,2022-12-05,2024-01-08,0
20,langchain-ai/langchain,AI engineering,AIE framework,81221,76,0.09%,656,0.81%,12374,🦜🔗 Build context-aware reasoning applications,"hwchase17, baskaryan, efriis",100+,2022-10-17,2024-04-04,4816
85,pytorch/pytorch,Model development,Modeling & training,77209,20,0.03%,202,0.26%,20886,Tensors and Dynamic neural networks in Python ...,"ezyang, malfet, zou3519",100+,2016-08-13,NaT,29369
39,ChatGPTNextWeb/ChatGPT-Next-Web,AI engineering,AI interface,66349,44,0.07%,498,0.75%,54128,A cross-platform ChatGPT/Gemini UI (Web / PWA ...,"Yidadaa, H0llyW00dzZ, fred-bf",100+,2023-03-10,2024-04-04,162109
118,CompVis/stable-diffusion,Model repo,,64934,15,0.02%,179,0.28%,9792,A latent text-to-image diffusion model,"rromb, pesser, patrickvonplaten",7,2022-08-10,2022-08-22,0
187,fighting41love/funNLP,Lists,,63238,9,0.01%,242,0.38%,14037,中英文敏感词、语言检测、中外手机/电话归属地/运营商查询、名字推断性别、手机号抽取、身份证抽...,"fighting41love, wainshine, imhuster",11,2018-08-21,2023-08-24,0


In [14]:
# GitHub API endpoint for repository search
search_url = "https://api.github.com/search/repositories"

# Search parameters
query = "stars:>=1000 llm in:topics,readme,describtion"  # Repositories with at least 500 stars
sort = "stars"  # Sort by the number of stars in descending order
order = "desc"
per_page = 100  # Number of results per page
params={"q": query, "sort": sort, "order": order, "per_page": per_page, }


GITHUB_API_KEY = os.environ.get('GITHUB_API_KEY')

# GitHub API token (optional, but recommended to avoid rate limiting)
headers = {
    "Authorization": f"token {GITHUB_API_KEY}"
}

# Send a GET request to the GitHub API
response = requests.get(search_url, params=params, headers=headers)


def get_next_page_link(response):
    """Extract the URL for the next page from the response's Link header."""
    link_header = response.headers.get('Link', None)
    if link_header:
        links = link_header.split(', ')
        next_link = [link.split('; ')[0].strip('<>') for link in links if 'rel="next"' in link]
        if next_link:
            return next_link[0]
    return None

def fetch_all_pages(url, params, headers):
    while url:
        response = requests.get(url, params=params, headers=headers)
        if response.status_code == 200:
            data = response.json()
            yield data["items"]  # Assuming the data is a list of items
            # Get the next page URL
            url = get_next_page_link(response)
            params = None  # Ensure subsequent requests don't duplicate the initial query parameters
        else:
            print(f"Failed to fetch data: {response.status_code}")
            break

In [15]:
response.headers

{'Server': 'GitHub.com', 'Date': 'Sun, 07 Apr 2024 17:37:31 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Cache-Control': 'no-cache', 'Vary': 'Accept, Authorization, Cookie, X-GitHub-OTP, Accept-Encoding, Accept, X-Requested-With', 'X-OAuth-Scopes': 'read:discussion, read:project, repo, user', 'X-Accepted-OAuth-Scopes': '', 'github-authentication-token-expiration': '2024-05-06 18:42:13 UTC', 'X-GitHub-Media-Type': 'github.v3; format=json', 'Link': '<https://api.github.com/search/repositories?q=stars%3A%3E%3D1000+llm+in%3Atopics%2Creadme%2Cdescribtion&sort=stars&order=desc&per_page=100&page=2>; rel="next", <https://api.github.com/search/repositories?q=stars%3A%3E%3D1000+llm+in%3Atopics%2Creadme%2Cdescribtion&sort=stars&order=desc&per_page=100&page=7>; rel="last"', 'x-github-api-version-selected': '2022-11-28', 'X-RateLimit-Limit': '30', 'X-RateLimit-Remaining': '29', 'X-RateLimit-Reset': '1712511511', 'X-RateLimit-Used': '1', 'X-RateLimit-Reso

In [16]:
repos = []
for items in fetch_all_pages(search_url, params, headers):
    repos.extend(items)

In [17]:
len(repos)

604

In [125]:
repos[0]

{'id': 552661142,
 'node_id': 'R_kgDOIPDwlg',
 'name': 'langchain',
 'full_name': 'langchain-ai/langchain',
 'private': False,
 'owner': {'login': 'langchain-ai',
  'id': 126733545,
  'node_id': 'O_kgDOB43M6Q',
  'avatar_url': 'https://avatars.githubusercontent.com/u/126733545?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/langchain-ai',
  'html_url': 'https://github.com/langchain-ai',
  'followers_url': 'https://api.github.com/users/langchain-ai/followers',
  'following_url': 'https://api.github.com/users/langchain-ai/following{/other_user}',
  'gists_url': 'https://api.github.com/users/langchain-ai/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/langchain-ai/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/langchain-ai/subscriptions',
  'organizations_url': 'https://api.github.com/users/langchain-ai/orgs',
  'repos_url': 'https://api.github.com/users/langchain-ai/repos',
  'events_url': 'https://api.github.com/users/lan

In [112]:
df2 = pd.DataFrame(repos)
selected_columns = ['name', 'full_name', 'topics', 'description', 'created_at', 'updated_at', 'stargazers_count', 'forks_count', 'license']
df2.columns

Index(['id', 'node_id', 'name', 'full_name', 'private', 'owner', 'html_url',
       'description', 'fork', 'url', 'forks_url', 'keys_url',
       'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url',
       'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url',
       'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url',
       'languages_url', 'stargazers_url', 'contributors_url',
       'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url',
       'comments_url', 'issue_comment_url', 'contents_url', 'compare_url',
       'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url',
       'milestones_url', 'notifications_url', 'labels_url', 'releases_url',
       'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url',
       'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size',
       'stargazers_count', 'watchers_count', 'language', 'has_issues',
       'has_projects', 'has_downloads', 'has_wiki', 'has

In [113]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604 entries, 0 to 603
Data columns (total 81 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           604 non-null    int64  
 1   node_id                      604 non-null    object 
 2   name                         604 non-null    object 
 3   full_name                    604 non-null    object 
 4   private                      604 non-null    bool   
 5   owner                        604 non-null    object 
 6   html_url                     604 non-null    object 
 7   description                  583 non-null    object 
 8   fork                         604 non-null    bool   
 9   url                          604 non-null    object 
 10  forks_url                    604 non-null    object 
 11  keys_url                     604 non-null    object 
 12  collaborators_url            604 non-null    object 
 13  teams_url           

In [114]:
# Convert the 'created_at' column to datetime
# Normalizing a datetime to midnight effectively removes the time component, 
# making the time part 00:00:00.
df2['created_at'] = pd.to_datetime(df2['created_at']).dt.date
df2['updated_at'] = pd.to_datetime(df2['updated_at']).dt.date


In [115]:
df2_filtered = df2[selected_columns]
df2_filtered.head()

Unnamed: 0,name,full_name,topics,description,created_at,updated_at,stargazers_count,forks_count,license
0,langchain,langchain-ai/langchain,[],🦜🔗 Build context-aware reasoning applications,2022-10-17,2024-04-07,81425,12434,"{'key': 'mit', 'name': 'MIT License', 'spdx_id..."
1,ChatGPT-Next-Web,ChatGPTNextWeb/ChatGPT-Next-Web,"[chatgpt, cross-platform, desktop, fe, gemini,...",A cross-platform ChatGPT/Gemini UI (Web / PWA ...,2023-03-10,2024-04-07,66548,54259,"{'key': 'mit', 'name': 'MIT License', 'spdx_id..."
2,awesome-machine-learning,josephmisiti/awesome-machine-learning,[],A curated list of awesome Machine Learning fra...,2014-07-15,2024-04-07,63304,14447,"{'key': 'other', 'name': 'Other', 'spdx_id': '..."
3,awesome-cpp,fffaraz/awesome-cpp,"[awesome, awesome-list, c, c-plus-plus, cpp, c...",A curated list of awesome C++ (or C) framework...,2014-07-17,2024-04-07,55101,7613,"{'key': 'mit', 'name': 'MIT License', 'spdx_id..."
4,llama.cpp,ggerganov/llama.cpp,"[ggml, llama]",LLM inference in C/C++,2023-03-10,2024-04-07,54581,7703,"{'key': 'mit', 'name': 'MIT License', 'spdx_id..."


In [116]:
df2_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604 entries, 0 to 603
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              604 non-null    object
 1   full_name         604 non-null    object
 2   topics            604 non-null    object
 3   description       583 non-null    object
 4   created_at        604 non-null    object
 5   updated_at        604 non-null    object
 6   stargazers_count  604 non-null    int64 
 7   forks_count       604 non-null    int64 
 8   license           517 non-null    object
dtypes: int64(2), object(7)
memory usage: 42.6+ KB


In [117]:
# Check for nan values
check_nan(df2_filtered)

Summary of NaN Values in Each Column:
                  NaN Count  NaN Percentage
name                      0             0.0
full_name                 0             0.0
topics                    0             0.0
description              21             3.5
created_at                0             0.0
updated_at                0             0.0
stargazers_count          0             0.0
forks_count               0             0.0
license                  87            14.4


In [118]:
df2_filtered['license'] = df2_filtered['license'].apply(lambda x: x['name'] if x is not None else None)
df2_filtered['topics'] = df2_filtered['topics'].apply(lambda x: 'no topic' if x == [] else ', '.join(x)) 

In [119]:
# Check if a specific words occurs in a scpecific column
def check_name(text) -> bool:
    if 'langchain' in text.lower():
        return True
    else:
        return False

In [120]:
df2_filtered[df2_filtered['name'].apply(check_name)].head()

Unnamed: 0,name,full_name,topics,description,created_at,updated_at,stargazers_count,forks_count,license
0,langchain,langchain-ai/langchain,no topic,🦜🔗 Build context-aware reasoning applications,2022-10-17,2024-04-07,81425,12434,MIT License
24,Langchain-Chatchat,chatchat-space/Langchain-Chatchat,"chatbot, chatchat, chatglm, chatglm-6b, chatgl...",Langchain-Chatchat（原Langchain-ChatGLM）基于 Langc...,2023-03-31,2024-04-07,26368,4633,Apache License 2.0
47,gpt4-pdf-chatbot-langchain,mayooear/gpt4-pdf-chatbot-langchain,"gpt4, langchain, nextjs, openai, pdf, typescript",GPT4 & LangChain Chatbot for large PDF docs,2023-03-17,2024-04-07,14466,2969,
70,langchainjs,langchain-ai/langchainjs,no topic,🦜🔗 Build context-aware reasoning applications 🦜🔗,2023-02-06,2024-04-07,10644,1749,MIT License
133,LangChain-Chinese-Getting-Started-Guide,liaokongVFX/LangChain-Chinese-Getting-Started-...,"aigc, chatgpt, langchain, openai, openai-api",LangChain 的中文入门教程,2023-04-07,2024-04-07,6648,533,


In [121]:
# Drop rows where the 'license' column has NaN
df2_clean = df2_filtered.dropna(subset=['license'])
df2_clean.head()

Unnamed: 0,name,full_name,topics,description,created_at,updated_at,stargazers_count,forks_count,license
0,langchain,langchain-ai/langchain,no topic,🦜🔗 Build context-aware reasoning applications,2022-10-17,2024-04-07,81425,12434,MIT License
1,ChatGPT-Next-Web,ChatGPTNextWeb/ChatGPT-Next-Web,"chatgpt, cross-platform, desktop, fe, gemini, ...",A cross-platform ChatGPT/Gemini UI (Web / PWA ...,2023-03-10,2024-04-07,66548,54259,MIT License
2,awesome-machine-learning,josephmisiti/awesome-machine-learning,no topic,A curated list of awesome Machine Learning fra...,2014-07-15,2024-04-07,63304,14447,Other
3,awesome-cpp,fffaraz/awesome-cpp,"awesome, awesome-list, c, c-plus-plus, cpp, cp...",A curated list of awesome C++ (or C) framework...,2014-07-17,2024-04-07,55101,7613,MIT License
4,llama.cpp,ggerganov/llama.cpp,"ggml, llama",LLM inference in C/C++,2023-03-10,2024-04-07,54581,7703,MIT License


In [122]:
df2_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 517 entries, 0 to 603
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              517 non-null    object
 1   full_name         517 non-null    object
 2   topics            517 non-null    object
 3   description       507 non-null    object
 4   created_at        517 non-null    object
 5   updated_at        517 non-null    object
 6   stargazers_count  517 non-null    int64 
 7   forks_count       517 non-null    int64 
 8   license           517 non-null    object
dtypes: int64(2), object(7)
memory usage: 40.4+ KB


In [123]:
# Save the DataFrame as a CSV file
df2_clean.to_csv('../data/github_repos_data.csv', index=False)