In [1]:
import csv
import os
import time

import requests

# GitHub API token, read from the environment so the secret never lives in the
# notebook. Any token that was previously committed in this file must be
# treated as compromised and revoked.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# Without a token the requests are sent unauthenticated (no Authorization header).
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Helper function to clean up company names
def clean_company_name(company):
    """Normalise a company string: trim whitespace, drop leading '@', upper-case.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not company:
        return company
    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

# Function to fetch users from the GitHub API
def fetch_users(city="Berlin", min_followers=200):
    """Collect full profiles of GitHub users in a city above a follower threshold.

    Pages through the GitHub user-search endpoint and, for every hit, requests
    the URL given in the hit's ``url`` field to obtain the complete profile.

    Args:
        city: Value for the ``location:`` search qualifier. It is wrapped in
            double quotes when it contains whitespace so multi-word locations
            stay a single search term.
        min_followers: Only users with strictly more followers than this
            value are matched (``followers:>N``).

    Returns:
        A list of dicts, one per user, with the keys ``login``, ``name``,
        ``company`` (passed through ``clean_company_name``), ``location``,
        ``email``, ``hireable``, ``bio``, ``public_repos``, ``followers``,
        ``following`` and ``created_at``.

    Raises:
        requests.HTTPError: If GitHub answers with an error status (for
            example when rate limited), instead of silently returning a
            truncated result.
    """
    per_page = 100
    timeout_seconds = 30
    profile_fields = (
        'login', 'name', 'company', 'location', 'email', 'hireable',
        'bio', 'public_repos', 'followers', 'following', 'created_at',
    )

    location = f'"{city}"' if any(ch.isspace() for ch in str(city)) else city
    query = f"location:{location} followers:>{min_followers}"

    users = []
    page = 1

    while True:
        # Let requests build and percent-encode the query string.
        response = requests.get(
            "https://api.github.com/search/users",
            headers=HEADERS,
            params={"q": query, "page": page, "per_page": per_page},
            timeout=timeout_seconds,
        )
        # The search API only serves a bounded number of results; asking for a
        # page past that window is answered with 422, which here just means
        # "no more pages". A 422 on the first page is a real query error.
        if response.status_code == 422 and page > 1:
            break
        response.raise_for_status()

        items = response.json().get('items') or []
        if not items:
            break

        for user in items:
            # The search hit is only a summary; fetch the full profile.
            user_response = requests.get(
                user['url'], headers=HEADERS, timeout=timeout_seconds
            )
            user_response.raise_for_status()
            user_data = user_response.json()

            record = {field: user_data.get(field) for field in profile_fields}
            record['company'] = clean_company_name(record['company'])
            users.append(record)

        # A short page is the last page; skip the extra empty request.
        if len(items) < per_page:
            break

        page += 1
        time.sleep(1)  # Avoid hitting API rate limits

    return users

# Function to fetch repositories for a user
def fetch_repositories(user_login):
    """Fetch every repository listed for a user, following pagination.

    Args:
        user_login: GitHub login whose ``/users/{login}/repos`` listing is read.

    Returns:
        A list of dicts with the keys ``login``, ``full_name``, ``created_at``,
        ``stargazers_count``, ``watchers_count``, ``language``,
        ``has_projects``, ``has_wiki`` and ``license_name`` (the licence's
        ``key`` field, or ``None`` when the repository has no licence).

    Raises:
        requests.HTTPError: If GitHub answers with an error status. Without
            this check an error payload (a dict) would be iterated as if it
            were the repository list and fail with an unrelated TypeError.
    """
    per_page = 100
    repositories = []
    page = 1

    while True:
        response = requests.get(
            f"https://api.github.com/users/{user_login}/repos",
            headers=HEADERS,
            params={"per_page": per_page, "page": page},
            timeout=30,
        )
        response.raise_for_status()
        repo_data = response.json()

        # Break if no more repositories
        if not repo_data:
            break

        for repo in repo_data:
            license_info = repo.get('license')
            repositories.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': license_info['key'] if license_info else None,
            })

        # A page that is not full is the last page.
        if len(repo_data) < per_page:
            break

        page += 1  # Move to the next page
        time.sleep(1)  # Avoid hitting API rate limits

    return repositories

# Save users to CSV
def save_users_to_csv(users, filename="users.csv"):
    """Write user records to a UTF-8 CSV file with a header row.

    Args:
        users: Sequence of dicts. The keys of the first record define the
            columns and their order.
        filename: Destination path; an existing file is overwritten.

    An empty ``users`` sequence leaves an empty file behind instead of raising
    ``IndexError`` while looking up the first record.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not users:
            # No record to derive the header from; nothing to write.
            return
        writer = csv.DictWriter(file, fieldnames=list(users[0].keys()))
        writer.writeheader()
        writer.writerows(users)

# Save repositories to CSV
def save_repositories_to_csv(repositories, filename="repositories.csv"):
    """Write repository records to a UTF-8 CSV file with a header row.

    Args:
        repositories: Sequence of dicts. The keys of the first record define
            the columns and their order.
        filename: Destination path; an existing file is overwritten.

    An empty ``repositories`` sequence leaves an empty file behind instead of
    raising ``IndexError`` while looking up the first record.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not repositories:
            # No record to derive the header from; nothing to write.
            return
        writer = csv.DictWriter(file, fieldnames=list(repositories[0].keys()))
        writer.writeheader()
        writer.writerows(repositories)

def main():
    """Scrape users and their repositories, writing each set to its CSV file.

    Users are fetched and saved first; repositories are then fetched one user
    at a time, with a progress line printed per user, and saved together.
    """
    print("Fetching users...")
    users = fetch_users()
    save_users_to_csv(users)
    print(f"Saved {len(users)} users to users.csv")

    print("Fetching repositories...")
    all_repositories = []
    for user in users:
        login = user["login"]
        user_repos = fetch_repositories(login)
        all_repositories += user_repos
        print(f"Fetched {len(user_repos)} repositories for user {user['login']}")

    save_repositories_to_csv(all_repositories)
    print(f"Saved {len(all_repositories)} repositories to repositories.csv")

# Run the full scrape when this code is executed as the main module (which is
# also the case when the cell is run in a notebook kernel), not on import.
if __name__ == "__main__":
    main()

Fetching users...
Saved 602 users to users.csv
Fetching repositories...
Fetched 73 repositories for user tiangolo
Fetched 215 repositories for user schacon
Fetched 151 repositories for user rwieruch
Fetched 149 repositories for user shuding
Fetched 79 repositories for user android10
Fetched 54 repositories for user marijnh
Fetched 7 repositories for user mxmnk
Fetched 110 repositories for user nikic
Fetched 22 repositories for user greenrobot
Fetched 32 repositories for user sebastianruder
Fetched 48 repositories for user vakila
Fetched 115 repositories for user tonsky
Fetched 285 repositories for user felixge
Fetched 129 repositories for user alexeygrigorev
Fetched 106 repositories for user hanxiao
Fetched 19 repositories for user ines
Fetched 73 repositories for user apaszke
Fetched 175 repositories for user lewagon
Fetched 105 repositories for user codebytere
Fetched 152 repositories for user prisma
Fetched 51 repositories for user armancodv
Fetched 56 repositories for user peterbou

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the user table written by the scraping step and preview the first rows.
users = pd.read_csv("users.csv")
users.head(5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,tiangolo,Sebastián Ramírez,,"Berlin, Germany",tiangolo@gmail.com,True,"Creator of FastAPI, Typer, SQLModel, Asyncer, ...",73,26452,3,2012-01-12T22:37:04Z
1,schacon,Scott Chacon,GITBUTLERAPP,"Berlin, Germany",schacon@gmail.com,,,215,13758,26,2008-01-27T17:19:28Z
2,rwieruch,Robin Wieruch,,Berlin/Remote,,True,React & Next.js • JavaScript & TypeScript • Fr...,151,8618,30,2012-10-03T15:11:48Z
3,shuding,Shu Ding,VERCEL,Berlin,g@shud.in,,Be curious. Read widely. Try new things. — aar...,149,6758,345,2013-02-23T07:46:30Z
4,android10,Fernando Cejas,PEPPR-IO,"Berlin, Germany",android10@fernandocejas.com,True,Quantum Engineering at @Qruise-ai. Former Dire...,79,6716,85,2012-01-20T21:35:31Z


In [4]:
# Missing hireable values are treated as "not hireable". Converting through the
# nullable 'boolean' dtype first avoids filling an object-dtype column directly,
# which is what makes pandas emit a downcasting warning for fillna(False).
users['hireable'] = users['hireable'].astype('boolean').fillna(False).astype(bool)

  users['hireable'] = users['hireable'].fillna(False).astype(bool)


In [5]:
# Five users with the most followers, as a comma-separated list of logins.
top5 = users.sort_values('followers', ascending=False).head(5)
print(','.join(top5['login']))

tiangolo,schacon,rwieruch,shuding,android10


In [6]:
# Parse the created_at strings into datetimes so they can be sorted and compared.
users['created_at'] = users['created_at'].pipe(pd.to_datetime)

In [7]:
# Five earliest-created accounts, as a comma-separated list of logins.
top_earliest = users.sort_values('created_at', ascending=True).head(5)
print(','.join(top_earliest['login']))

schacon,adamwiggins,myobie,lstoll,znarf


In [8]:
# Load the repository table written by the scraping step and preview the first rows.
repos = pd.read_csv("repositories.csv")
repos.head(5)

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,tiangolo,tiangolo/a2wsgi,2024-01-07T20:24:07Z,10,10,,True,True,apache-2.0
1,tiangolo,tiangolo/alembic,2020-05-22T09:50:31Z,5,5,,True,True,mit
2,tiangolo,tiangolo/anaconda_cluster_install,2015-03-11T14:58:44Z,5,5,Shell,True,True,
3,tiangolo,tiangolo/angular-docker-multi-stage-example,2017-10-02T18:43:28Z,15,15,,True,True,
4,tiangolo,tiangolo/annotated-types,2023-08-27T14:32:00Z,8,8,,True,False,mit


In [9]:
# Three most frequent licence keys (value_counts sorts by count, descending).
repos['license_name'].value_counts().iloc[:3]

Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
mit,16155
apache-2.0,6526
other,4621


In [10]:
# Most frequent company among the users (missing companies are not counted).
users['company'].value_counts().iloc[:1]

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
MICROSOFT,8


In [11]:
# Most frequent repository language (repositories without a language are not counted).
repos['language'].value_counts().iloc[:1]

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,10447


In [12]:
# Users whose account creation timestamp is later than 2020-01-01.
created_after_cutoff = users['created_at'] > '2020-01-01'
users_after_2020 = users.loc[created_after_cutoff]
users_after_2020.head(5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
37,typst,Typst,,Berlin,hello@typst.app,False,Compose papers faster: Focus on your text and ...,25,2115,0,2020-06-29 15:08:38+00:00
81,cs-MohamedAyman,Ayman M.,,"Berlin, Berlin, Germany",,True,Machine Learning Mentor and Advisor | Research...,13,1075,0,2020-01-22 13:06:45+00:00
107,zaraco,Zahra Teymouri,VECTRONIC AEROSPACE,"Berlin, Germany",zahrateymouri90@gmail.com,False,Software Developer,30,877,909,2020-03-08 12:07:10+00:00
238,chrisgrieser,Chris Grieser,TECHNICAL UNIVERSITY OF BERLIN,"Berlin, Germany",,False,Researcher in sociology & SWE,80,465,26,2020-10-22 10:50:35+00:00
281,slint-ui,Slint,,Berlin,info@slint.dev,False,"Slint - Declarative GUI for Rust, C++, and Jav...",36,392,0,2020-05-03 15:35:02+00:00


In [13]:
# Repositories owned by the recently created accounts, and their top languages.
recent_logins = users_after_2020['login']
repos_2020 = repos[repos['login'].isin(recent_logins)]
repos_2020['language'].value_counts().head(5)

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
Python,95
JavaScript,89
HTML,30
TypeScript,29
Rust,28


In [14]:
# Mean star count per language, then the language with the highest mean.
stars_by_language = repos.groupby('language')['stargazers_count']
avg_stars = stars_by_language.mean()
top_lang, top_stars = avg_stars.idxmax(), avg_stars.max()
print(top_lang, top_stars)

Fluent 12950.0


In [15]:
# leader_strength = followers / (1 + following); the +1 keeps the divisor non-zero
# for users who follow nobody.
users['leader_strength'] = users['followers'].div(users['following'].add(1))
top5_lead = users.sort_values('leader_strength', ascending=False).head(5)
print(','.join(top5_lead['login']))

tiangolo,marijnh,vakila,alexeygrigorev,lewagon


In [16]:
# Correlation between follower count and number of public repositories
# (Series.corr with its default method).
followers_col = users['followers']
public_repos_col = users['public_repos']
correlation = followers_col.corr(public_repos_col)
correlation

0.017166162259139948

In [17]:
import csv

# Re-read the raw CSV and fit followers as a degree-1 polynomial of public_repos;
# the printed number is the slope (followers per additional public repository).
with open('users.csv', 'r', encoding='utf-8') as file:
    count_pairs = [
        (int(row['followers']), int(row['public_repos']))
        for row in csv.DictReader(file)
    ]

followers = [follower_count for follower_count, repo_count in count_pairs]
public_repos = [repo_count for follower_count, repo_count in count_pairs]

# A line needs at least two points.
if len(followers) <= 1 or len(public_repos) <= 1:
    print("Error")
else:
    slope, intercept = np.polyfit(public_repos, followers, 1)
    print(f"{slope:.3f}")

0.290


In [18]:
# Correlation between a repository having projects enabled and having a wiki.
#
# If a flag column was read as text rather than bool, convert it first. The
# match is case-insensitive because the scraping step writes Python booleans,
# which appear in the CSV as `True`/`False`; lower-case-only keys would turn
# every such value into NaN.
bool_text = {'true': True, 'false': False}
for flag_col in ('has_projects', 'has_wiki'):
    if repos[flag_col].dtype == 'object':
        repos[flag_col] = (
            repos[flag_col].astype(str).str.strip().str.lower().map(bool_text)
        )

# Cast to float so the correlation is also defined for a converted (object) column;
# unrecognised values become NaN and are excluded by corr.
correlation = repos['has_projects'].astype(float).corr(repos['has_wiki'].astype(float))

print(round(correlation, 3))

0.404


In [19]:
# Mean number of accounts followed: hireable users minus non-hireable users.
is_hireable = users['hireable'] == True
is_not_hireable = users['hireable'] == False
hireable_avg_following = users.loc[is_hireable, 'following'].mean()
non_hireable_avg_following = users.loc[is_not_hireable, 'following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

47.746189808321645

In [20]:
from sklearn.linear_model import LinearRegression

# Regress followers on bio length (in characters), using only users whose bio
# is present and non-empty; the displayed value is the fitted slope.
has_bio = users['bio'].notna() & (users['bio'] != '')
users_with_bio = users[has_bio].copy()
users_with_bio['bio_len'] = users_with_bio['bio'].str.len()

# Single-feature design matrix with one column.
X = users_with_bio[['bio_len']].to_numpy()
y = users_with_bio['followers']

lr2 = LinearRegression().fit(X, y)
lr2.coef_[0]

4.175071173073752

In [21]:
import csv
from collections import Counter
from datetime import datetime

# Count, per login, the repositories whose created_at timestamp falls on a
# Saturday or Sunday (weekday 5 or 6). The last character of the timestamp is
# dropped before parsing, so the weekday is taken from the timestamp as written.
# Rows without a created_at value are skipped.
with open('repositories.csv', 'r', encoding='utf-8') as file:
    weekend_repo_counts = Counter(
        row['login']
        for row in csv.DictReader(file)
        if row.get('created_at', '')
        and datetime.fromisoformat(row.get('created_at', '')[:-1]).weekday() >= 5
    )

top_users = weekend_repo_counts.most_common(5)
top_logins = [login for login, _weekend_total in top_users]

print(','.join(top_logins))

derhuerst,janpio,saschanaz,blueyed,jamesmunns


In [22]:
# Share of users with an email address: hireable users minus non-hireable users.
has_email = users['email'].notna()
fraction_hierable = has_email[users['hireable'] == True].mean()
fraction_non_hierable = has_email[users['hireable'] == False].mean()
diff = fraction_hierable - fraction_non_hierable
diff

-0.01091631603553056

In [23]:
# Most common surname, taking the last whitespace-separated word of each name.
# All surnames tied for the highest count are printed in alphabetical order.
new_users = users.dropna(subset=['name']).copy()
new_users['surname'] = new_users['name'].str.split().str.get(-1).str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = sorted(surname_counts[surname_counts == max_count].index)
print(','.join(common_surnames))

Schneider
