In [None]:
import os
import time

import pandas as pd
import requests

In [None]:
# Read the GitHub personal access token from the GITHUB_TOKEN environment
# variable so the real secret never has to be pasted into (and saved with) the
# notebook. Falls back to the placeholder string when the variable is unset.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', 'token')

# Headers sent with every GitHub API request made by the fetch_* functions.
HEADERS = {
    'Authorization': f'token {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}

In [None]:
def fetch_users():
    """Search GitHub for users located in Toronto with more than 100 followers.

    Pages through the user-search endpoint 100 results at a time. A
    rate-limited response is waited out and the same page is retried; any
    other non-200 response is printed and ends the crawl.

    Returns:
        list: The ``items`` entries of every page fetched (possibly partial
        if an error ended the crawl early).
    """
    per_page = 100
    max_results = 1000  # the search API only serves the first 1,000 matches
    users = []
    page = 1

    while True:
        response = requests.get(
            f'https://api.github.com/search/users?q=location:Toronto+followers:>100&per_page={per_page}&page={page}',
            headers=HEADERS
        )

        if response.status_code in (403, 429):
            # Only wait and retry when the response really is a rate limit;
            # any other 403 would otherwise be retried forever.
            retry_after = response.headers.get('Retry-After')
            reset_time = response.headers.get('X-RateLimit-Reset')
            exhausted = response.headers.get('X-RateLimit-Remaining') == '0'
            if retry_after or exhausted or 'rate limit' in response.text.lower():
                print("Rate limit exceeded. Waiting for reset...")
                if retry_after and retry_after.isdigit():
                    wait_time = int(retry_after) + 5  # Add buffer
                elif exhausted and reset_time:
                    wait_time = max(int(reset_time) - int(time.time()), 0) + 5  # Add buffer
                else:
                    wait_time = 60 + 5  # no usable header: back off for a minute
                time.sleep(wait_time)
                continue

        if response.status_code != 200:
            print(f"Error fetching users: {response.json()}")
            break

        data = response.json()
        if not data['items']:
            break  # No more users found

        users.extend(data['items'])
        if len(users) >= max_results:
            break  # asking for a page past the cap only yields an error response
        page += 1

    return users

In [None]:
def fetch_user_details(user_url):
    """Fetch the full profile JSON for a single user.

    Args:
        user_url: API URL of the user, requested as-is with HEADERS.

    Returns:
        The decoded JSON body on HTTP 200, or None after printing the error
        body for any other non-rate-limit status. Rate-limited responses are
        waited out and the request is retried.
    """
    # Loop rather than recurse so repeated rate-limit waits cannot grow the stack.
    while True:
        response = requests.get(user_url, headers=HEADERS)

        if response.status_code in (403, 429):
            # Only wait and retry when the response really is a rate limit;
            # any other 403 would otherwise be retried forever.
            retry_after = response.headers.get('Retry-After')
            reset_time = response.headers.get('X-RateLimit-Reset')
            exhausted = response.headers.get('X-RateLimit-Remaining') == '0'
            if retry_after or exhausted or 'rate limit' in response.text.lower():
                print("Rate limit exceeded. Waiting for reset...")
                if retry_after and retry_after.isdigit():
                    wait_time = int(retry_after) + 5  # Add buffer
                elif exhausted and reset_time:
                    wait_time = max(int(reset_time) - int(time.time()), 0) + 5  # Add buffer
                else:
                    wait_time = 60 + 5  # no usable header: back off for a minute
                time.sleep(wait_time)
                continue  # Retry after waiting

        if response.status_code != 200:
            print(f"Error fetching user details: {response.json()}")
            return None

        return response.json()

In [None]:
def fetch_user_repositories(username):
    """Fetch every repository returned by the /users/{username}/repos endpoint.

    Pages through the listing 100 repositories at a time. A rate-limited
    response is waited out and the same page is retried; any other non-200
    response is printed and ends the listing.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        list: The repository dicts from every page fetched (possibly partial
        if an error ended the listing early).
    """
    per_page = 100
    repos = []
    page = 1

    while True:
        response = requests.get(
            f'https://api.github.com/users/{username}/repos?per_page={per_page}&page={page}',
            headers=HEADERS
        )

        if response.status_code in (403, 429):
            # Only wait and retry when the response really is a rate limit;
            # any other 403 would otherwise be retried forever.
            retry_after = response.headers.get('Retry-After')
            reset_time = response.headers.get('X-RateLimit-Reset')
            exhausted = response.headers.get('X-RateLimit-Remaining') == '0'
            if retry_after or exhausted or 'rate limit' in response.text.lower():
                print("Rate limit exceeded. Waiting for reset...")
                if retry_after and retry_after.isdigit():
                    wait_time = int(retry_after) + 5  # Add buffer
                elif exhausted and reset_time:
                    wait_time = max(int(reset_time) - int(time.time()), 0) + 5  # Add buffer
                else:
                    wait_time = 60 + 5  # no usable header: back off for a minute
                time.sleep(wait_time)
                continue

        if response.status_code != 200:
            print(f"Error fetching repositories for {username}: {response.json()}")
            break

        data = response.json()
        if not data:
            break  # No more repositories found

        repos.extend(data)
        if len(data) < per_page:
            break  # a short page is the last one; skip the extra empty request
        page += 1

    return repos

In [None]:
# Run the user search once; the following cell iterates over this list to
# fetch each user's profile details and repositories.
users = fetch_users()

In [None]:
def _flag_to_text(flag):
    """Map True/False to 'true'/'false'; anything else (e.g. None) to ''."""
    if flag is True:
        return 'true'
    if flag is False:
        return 'false'
    return ''


# One dict per user / per repository; turned into DataFrames when saved.
users_data = []
repositories_data = []

for u in users:
    user = fetch_user_details(u['url'])
    if not user:
        continue  # no details were returned for this account; skip it

    username = user['login']

    # Clean the company word by word: trim, drop leading '@', upper-case.
    company = user['company']
    if company:
        words = [part.strip().lstrip('@').upper() for part in company.split()]
        company_str_cleaned = ' '.join(words)
    else:
        company_str_cleaned = ''

    users_data.append({
        'login': username,
        'name': user['name'] or '',
        'company': company_str_cleaned,
        'location': user['location'] or '',
        'email': user['email'] or '',
        'hireable': _flag_to_text(user['hireable']),
        'bio': user['bio'] or '',
        'public_repos': user['public_repos'],
        'followers': user['followers'],
        'following': user['following'],
        'created_at': user['created_at']
    })

    for repo in fetch_user_repositories(username):
        license_info = repo['license']
        repositories_data.append({
            'login': username,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'] or '',
            'has_projects': _flag_to_text(repo['has_projects']),
            'has_wiki': _flag_to_text(repo['has_wiki']),
            'license_name': license_info['name'] if license_info else ''
        })

In [None]:
# Save to CSV: one file per collected record list, without the index column.
for csv_name, records in (('users.csv', users_data),
                          ('repositories.csv', repositories_data)):
    pd.DataFrame(records).to_csv(csv_name, index=False)

print("Data fetched and saved to users.csv and repositories.csv")

Data fetched and saved to users.csv and repositories.csv


In [None]:
# Load the users table from the same relative path the save cell wrote to,
# instead of a Colab-only absolute path.
uzr = pd.read_csv('users.csv')

In [None]:
# Load the repositories table from the same relative path the save cell wrote
# to, instead of a Colab-only absolute path.
rep = pd.read_csv('repositories.csv')

In [None]:
#Q1
# Logins of the five users with the most followers, highest first.
uzr.sort_values('followers', ascending=False).head(5)['login'].values

array(['aneagoie', 'ZhangMYihua', 'susanli2016', 'thedaviddias',
       'ange-yaghi'], dtype=object)

In [None]:
#Q2
# Logins of the five smallest created_at values.
# NOTE(review): created_at is compared as read from the CSV; this is the
# earliest-registered order only if the values sort chronologically as text.
uzr.sort_values('created_at').head(5)['login'].values

array(['jamesmacaulay', 'michaelklishin', 'myles', 'nwjsmith', 'vito'],
      dtype=object)

In [None]:
#Q3
# value_counts() is sorted by descending frequency, so the first three index
# labels are the three most common license names.
rep['license_name'].value_counts().index[:3]

Index(['MIT License', 'Other', 'Apache License 2.0'], dtype='object', name='license_name')

In [None]:
#Q4
# Most frequent (cleaned) company value among the users.
uzr['company'].value_counts().index[:1]

Index(['UNIVERSITY OF TORONTO', 'SHOPIFY', 'NVIDIA', 'GOOGLE',
       'YORK UNIVERSITY', 'MOZILLA', 'GETSENTRY', 'WEALTHSIMPLE', 'MICROSOFT',
       'GITHUB',
       ...
       'BINAXITY', 'LOUIS LAZARIS', 'MAPLE', 'KAGGLE', 'G2I', 'THEFRONTSIDE',
       'GITLABHQ', 'NURENYX', 'DATADOG', 'COUNTLY'],
      dtype='object', name='company', length=308)

In [None]:
#Q5
# Most frequent repository language.
rep['language'].value_counts().index[:1]

Index(['JavaScript'], dtype='object', name='language')

In [None]:
#Q6
# "Joined after 2020" means created on or after 2021-01-01. The previous
# `> '31-12-2020'` cut-off also admitted accounts created during 31 Dec 2020
# and relied on day-first parsing of an ambiguous date string.
datethr = '2021-01-01'
created_at_dt = pd.to_datetime(uzr.created_at)
uzrs_after_2020 = uzr.loc[created_at_dt >= datethr]

# value_counts() is sorted by descending frequency, so position 1 is the
# second most common language among those users' repositories.
rep[rep.login.isin(uzrs_after_2020.login.values)].language.value_counts().iloc[1:2].keys()

Index(['TypeScript'], dtype='object', name='language')

In [None]:
#Q7
# Language whose repositories have the highest mean star count.
mean_stars_by_language = rep.groupby('language')['stargazers_count'].mean()
mean_stars_by_language.sort_values(ascending=False).index[:1]

Index(['Cython'], dtype='object', name='language')

In [None]:
#Q8
# leader_strength = followers / (1 + following); the +1 avoids dividing by
# zero for users who follow nobody. The column is stored on uzr itself.
uzr['leader_strength'] = uzr['followers'].div(uzr['following'].add(1))
uzr.sort_values('leader_strength', ascending=False).head(5)['login'].values

array(['aneagoie', 'nayuki', 'GrapheneOS', 'hlissner', 'rspivak'],
      dtype=object)

In [None]:
#Q9
# Correlation between follower count and number of public repositories
# (Series.corr uses Pearson unless told otherwise).
uzr.followers.corr(uzr.public_repos)

0.05503721505396733

In [None]:
#Q10
from sklearn.linear_model import LinearRegression

# Regress followers on public_repos; the single fitted coefficient is the
# slope, i.e. the change in followers per additional public repository.
X = uzr[['public_repos']]
y = uzr['followers']

model = LinearRegression().fit(X, y)

print("Slope: ", model.coef_[0])

Slope:  0.25084946597223734


In [None]:
#Q11
# Cast both flag columns to 0/1 integers, then correlate them.
repo_flags = rep[['has_projects', 'has_wiki']].astype(int)
repo_flags['has_projects'].corr(repo_flags['has_wiki'])

0.3532076787341683

In [None]:
#Q12
# Mean `following` of hireable users minus that of everyone else
# (rows where hireable is False or missing both count as "everyone else").
hireable_mask = uzr['hireable'] == True
uzr.loc[hireable_mask, 'following'].mean() - uzr.loc[~hireable_mask, 'following'].mean()

-12.608149030947175

In [None]:
#Q13
from sklearn.linear_model import LinearRegression

# Work on a copy so the bio word-count column added for this question does
# not end up on uzr.
new_uzr = uzr.copy()

In [None]:
def _word_count(text):
    """Count whitespace-separated words in text; missing values count as 0."""
    if pd.notna(text):
        return len(str(text).split())
    return 0


new_uzr['bio_wc'] = new_uzr['bio'].apply(_word_count)

In [None]:
# Keep only users whose bio contains at least one word.
df_filtered = new_uzr.loc[new_uzr['bio_wc'].gt(0)]

In [None]:
# Regress followers on bio word count; the coefficient is the change in
# followers per additional word in the bio.
X = df_filtered[['bio_wc']]
y = df_filtered['followers']
model = LinearRegression().fit(X, y)
print("Slope: ", model.coef_[0])

Slope:  8.532693974150503


In [None]:
#Q14
rep_created_dt = pd.to_datetime(rep.created_at)
# dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
# NOTE(review): the weekday is taken in whatever timezone the parsed
# timestamps carry — confirm that is the intended one.
weekend_created = rep.loc[rep_created_dt.dt.dayofweek >= 5]
weekend_created['login'].value_counts().index[:5]

Index(['n1ckfg', 'jsoref', 'QuinntyneBrown', 'invokethreatguy', 'andyw8'], dtype='object', name='login')

In [None]:
#Q15
# Share of users with a non-missing email, hireable users minus the rest.
hireable_mask = uzr['hireable'] == True
has_email = uzr['email'].notna()
fraction_hireable = has_email[hireable_mask].mean()
fraction_non_hireable = has_email[~hireable_mask].mean()
fraction = fraction_hireable - fraction_non_hireable
print(fraction)

0.13038254141919353


In [None]:
#Q16
# Surname = last whitespace-separated word of each non-missing, trimmed name.
df = uzr.copy()
name = df['name'].dropna().str.strip()
name_words = name.str.split()
surname = name_words.str.get(-1)
surname.value_counts()

Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
Ahmed,4
Kumar,3
Li,3
Wu,3
Brown,3
...,...
Santos,1
Teneycke,1
Mankovski,1
Peiris,1
