In [None]:
import os
import time

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

# GitHub API token. It is read from the GITHUB_TOKEN environment variable so
# the secret never has to be stored in the notebook itself; the placeholder is
# only a fallback and must be replaced (or the variable set) before running.
api_token = os.environ.get(
    'GITHUB_TOKEN',
    'Personal GitHub API will not allow if pasted here. Please replace with your own token.',
)

# GitHub API URL and Headers
base_url = 'https://api.github.com'
headers = {
    'Authorization': f'token {api_token}',
    'Accept': 'application/vnd.github.v3+json'  # Ask for the v3 JSON media type explicitly
}

# Build the shared HTTP session. When the retry adapter can be configured,
# HTTPS requests are retried up to 3 times (with backoff) on 429 and 5xx
# responses; if that setup fails for any reason, a plain session is used.
try:
    retry_strategy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1,
        total=3,
    )
    session = requests.Session()
    retrying_adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", retrying_adapter)
except Exception as setup_error:
    # Best-effort: report the problem and carry on without retries.
    print(f"Error setting up retry strategy: {setup_error}")
    print("Continuing without retry mechanism...")
    session = requests.Session()

# Function to handle GitHub API rate limiting
def handle_rate_limiting(response):
    """Sleep until the rate-limit window resets if ``response`` was rate limited.

    Args:
        response: Response-like object exposing ``status_code`` and a
            ``headers`` mapping.

    Returns:
        True if the response has status 403 or 429 with
        ``X-RateLimit-Remaining`` equal to 0; in that case the function has
        already slept and the caller should re-issue the request.
        False for every other response.
    """
    if response.status_code not in (403, 429):
        return False

    remaining = response.headers.get("X-RateLimit-Remaining")
    if remaining is None or int(remaining) != 0:
        return False

    # A missing reset header is treated as "retry immediately" rather than
    # raising KeyError.
    reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
    # Clamp at zero: the reset moment may already be in the past, and
    # time.sleep() raises ValueError for negative durations.
    sleep_time = max(reset_time - int(time.time()), 0)
    print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.")
    time.sleep(sleep_time)
    return True

# Collect users with location 'Shanghai' and followers >= 200
# Pages of 100 search results are requested until an empty page comes back.
users_data = []
page = 1
while True:
    users_url = f"{base_url}/search/users?q=location:Shanghai+followers:>=200&page={page}&per_page=100"
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = session.get(users_url, headers=headers, timeout=30)
        # Rate-limit handling must come before raise_for_status(): a
        # rate-limited reply is a 403, which raise_for_status() would turn
        # into an exception before the sleep-and-retry path could run.
        if handle_rate_limiting(response):
            continue
        response.raise_for_status()
        data = response.json()
        if 'items' not in data or not data['items']:
            break
        users_data.extend(data['items'])
        # Report the page that was just fetched, then advance.
        print(f"Fetched page {page} of users")
        page += 1
    except requests.exceptions.RequestException as e:
        print(f"Error fetching users page {page}: {e}")
        if page > 1:  # If we have some data, continue with processing
            break
        else:
            raise  # If we have no data at all, raise the error

# Extract detailed user information
# One profile request is made per search hit; each profile is flattened into
# a dict that becomes one row of users.csv.
users = []
for i, user in enumerate(users_data, 1):
    try:
        user_detail_url = user['url']
        # Re-issue the request after a rate-limit sleep. A bare `continue`
        # here would advance the for-loop and silently drop this user.
        while True:
            user_response = session.get(user_detail_url, headers=headers, timeout=30)
            if not handle_rate_limiting(user_response):
                break
        user_response.raise_for_status()
        user_info = user_response.json()

        # Clean up company name as per requirements:
        # trim whitespace, drop leading '@' characters, upper-case.
        company = user_info.get('company', '')
        if company:
            company = company.strip().lstrip('@').upper()

        users.append({
            'login': user_info['login'],
            'name': user_info.get('name'),
            'company': company,
            'location': user_info.get('location'),
            'email': user_info.get('email'),
            # Stored as the strings 'true'/'false'; a missing or null value
            # is recorded as 'false'.
            'hireable': 'true' if user_info.get('hireable') else 'false',
            'bio': user_info.get('bio'),
            'public_repos': user_info.get('public_repos'),
            'followers': user_info.get('followers'),
            'following': user_info.get('following'),
            'created_at': user_info.get('created_at')
        })
        print(f"Processed user {i} of {len(users_data)}: {user_info['login']}")
    except requests.exceptions.RequestException as e:
        # Best-effort: log the failure and move on to the next user.
        print(f"Error processing user {user.get('login', 'unknown')}: {e}")
        continue

# Convert user data to DataFrame and save as CSV
# The column list is pinned so the CSV always has the same header, in the same
# order, even when no users were collected (an empty list would otherwise
# produce a file with no header row).
user_columns = [
    'login', 'name', 'company', 'location', 'email', 'hireable', 'bio',
    'public_repos', 'followers', 'following', 'created_at',
]
users_df = pd.DataFrame(users, columns=user_columns)
users_df.to_csv('users.csv', index=False)

# Fetch repositories for each user, limited to 500 most recently pushed repos per user
repos = []
for i, user in enumerate(users, 1):
    page = 1
    user_repos = []
    print(f"Fetching repositories for user {i} of {len(users)}: {user['login']}")

    # The cap is checked before each request so that no extra page is fetched
    # once 500 repositories have been collected for this user.
    while len(user_repos) < 500:
        try:
            repos_url = f"{base_url}/users/{user['login']}/repos?sort=pushed&direction=desc&page={page}&per_page=100"
            repos_response = session.get(repos_url, headers=headers, timeout=30)
            # Rate-limit handling must come before raise_for_status(): a
            # rate-limited reply is a 403, which would otherwise be raised as
            # an error and end pagination for this user.
            if handle_rate_limiting(repos_response):
                continue
            repos_response.raise_for_status()
            repos_data = repos_response.json()
            if not repos_data:
                break

            # Take only as many repos from this page as still fit under the cap.
            remaining_slots = 500 - len(user_repos)
            for repo in repos_data[:remaining_slots]:
                user_repos.append({
                    'login': user['login'],
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo['language'],
                    'has_projects': 'true' if repo['has_projects'] else 'false',
                    'has_wiki': 'true' if repo['has_wiki'] else 'false',
                    # Repositories without a license are recorded as ''.
                    'license_name': repo['license']['name'] if repo['license'] else ''
                })
            page += 1
        except requests.exceptions.RequestException as e:
            # Keep whatever was collected for this user and move on.
            print(f"Error fetching repositories for user {user['login']} page {page}: {e}")
            break

    repos.extend(user_repos)
    print(f"Fetched {len(user_repos)} repositories for {user['login']}")

# Convert repository data to DataFrame and save as CSV
# The column list is pinned so the CSV always has the same header, in the same
# order, even when no repositories were collected.
repo_columns = [
    'login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count',
    'language', 'has_projects', 'has_wiki', 'license_name',
]
repos_df = pd.DataFrame(repos, columns=repo_columns)
repos_df.to_csv('repositories.csv', index=False)

# Final summary of what was collected.
print("\nData scraping completed successfully!")
print(f"Total users scraped: {len(users)}")
print(f"Total repositories scraped: {len(repos)}")

Fetched page 2 of users
Fetched page 3 of users
Fetched page 4 of users
Fetched page 5 of users
Fetched page 6 of users
Fetched page 7 of users
Fetched page 8 of users
Fetched page 9 of users
Processed user 1 of 744: peng-zhihui
Processed user 2 of 744: ruanyf
Processed user 3 of 744: phodal
Processed user 4 of 744: liyupi
Processed user 5 of 744: stormzhang
Processed user 6 of 744: Ovilia
Processed user 7 of 744: astaxie
Processed user 8 of 744: bailicangdu
Processed user 9 of 744: PanJiaChen
Processed user 10 of 744: YunaiV
Processed user 11 of 744: sofish
Processed user 12 of 744: dyc87112
Processed user 13 of 744: skyzh
Processed user 14 of 744: nihui
Processed user 15 of 744: oldratlee
Processed user 16 of 744: xufei
Processed user 17 of 744: amusi
Processed user 18 of 744: TommyZihao
Processed user 19 of 744: xuxueli
Processed user 20 of 744: wx-chevalier
Processed user 21 of 744: krahets
Processed user 22 of 744: teddysun
Processed user 23 of 744: hellokaton
Processed user 24 of