In [2]:
from github import Github
import pandas as pd
import os
import requests

tokens = [
    ##put some tokens here##
]
current_token_index = 0

def get_next_token():
    global current_token_index
    token = tokens[current_token_index]
    current_token_index = (current_token_index + 1) % len(tokens)
    return token

def switch_github_client():
    token = get_next_token()
    print(f"Switching to the next token: {token[:10]}...")
    return Github(token)

g = switch_github_client()

filtered_data = pd.read_csv("data/github.csv")

output_dir = "rust_repos_rs"
os.makedirs(output_dir, exist_ok=True)
def fetch_files_from_repo(repo_full_name):
    global g
    try:
        repo = g.get_repo(repo_full_name)
        contents = safe_get_contents(repo, "")
        process_directory(repo, contents, repo_full_name)
    except Exception as e:
        print(f"Error fetching repository {repo_full_name}: {e}")
        if "403" in str(e):
            print("Rate limit exceeded or forbidden. Switching token...")
            g = switch_github_client()
            print(f"Switched to the next token. Retrying {repo_full_name}...")
            fetch_files_from_repo(repo_full_name)

def safe_get_contents(repo, path):
    global g
    while True:
        try:
            contents = repo.get_contents(path)
            return contents
        except Exception as e:
            if "403" in str(e):
                print(f"Rate limit exceeded when accessing {path}. Switching token...")
                g = switch_github_client()
                repo = g.get_repo(repo.full_name)
            else:
                raise e

def process_directory(repo, contents, repo_name, folder=""):
    for item in contents:
        try:
            if item.type == "dir":
                # Process subdirectory recursively
                sub_contents = safe_get_contents(repo, item.path)
                process_directory(repo, sub_contents, repo_name, folder=os.path.join(folder, item.name))
            elif item.type == "file" and item.name.endswith(".rs"):
                # Download .rs file
                download_file(item.download_url, repo_name, os.path.join(folder, item.name))
        except Exception as e:
            print(f"Error processing item {item.path} in {repo_name}: {e}")
            if "403" in str(e):
                print(f"Rate limit exceeded or forbidden while accessing {item.path}. Switching token...")
                g = switch_github_client()
                print(f"Switched to the next token. Retrying {item.path}...")
                sub_contents = safe_get_contents(repo, item.path)
                process_directory(repo, sub_contents, repo_name, folder)

def download_file(url, repo_name, relative_path):
    try:
        headers = {"Authorization": f"token {get_next_token()}"}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Ensure subdirectory structure is preserved
            save_path = os.path.join(output_dir, repo_name, relative_path)
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            with open(save_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {save_path}")
        else:
            print(f"Failed to download {relative_path}: {response.status_code}")
    except Exception as e:
        print(f"Error downloading file {relative_path}: {e}")

Switching to the next token: github_pat...


In [3]:
# Fetch .rs files from all repositories
for repo_name in filtered_data[270:300]['name']:
    print(f"Fetching .rs files from {repo_name}...")
    fetch_files_from_repo(repo_name)

Fetching .rs files from Rodderik/Genocide-Kernel...
Fetching .rs files from MIPS/packages-apps-Launcher2...
Downloaded: rust_repos_rs/MIPS/packages-apps-Launcher2/res/raw/allapps.rs
Fetching .rs files from courtc/kwaak3-cc...
Fetching .rs files from mrshoe/sol...
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/camera.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/light.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/main.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/material.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/parser.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/ray.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/scene.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/sceneobject.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/sphere.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/triangle.rs
Downloaded: rust_repos_rs/mrshoe/sol/rust/src/vector.rs
Fetching .rs files from UbitUmarov/Aurora-libomv...
Fetching .rs files from mif-kpa/mifOS...
Downloaded: rust_repos_rs

Request GET /repos/buglabs/oe-buglabs/contents/recipes/glibc/glibc-2.9/etc failed with 403: Forbidden
Setting next backoff to 1708.227079s


Fetching .rs files from NoLifeDev/OldNoLifeStoryTwo...
Fetching .rs files from epabst/codehaus-mojo...


Request GET /repos/epabst/codehaus-mojo/contents/mojo/ounce-maven-plugin/src/site/apt/examples failed with 403: Forbidden
Setting next backoff to 1714.728598s
Request GET /repos/epabst/codehaus-mojo/contents/sandbox/shade-maven-plugin/src/test failed with 403: Forbidden
Setting next backoff to 1755.452686s


Fetching .rs files from xudifsd/lab...
Downloaded: rust_repos_rs/xudifsd/lab/Rustlab/src/main.rs
Downloaded: rust_repos_rs/xudifsd/lab/Rustlab/src/top.rs
Downloaded: rust_repos_rs/xudifsd/lab/Rustlab/src/tree.rs
Fetching .rs files from Ariba/dotfiles...
Error fetching repository Ariba/dotfiles: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository", "status": "404"}
Fetching .rs files from hialin/hialin...
Error fetching repository hialin/hialin: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository", "status": "404"}
Fetching .rs files from yyli/overo-oe...
Fetching .rs files from astrellon/SimpleRPG...
Fetching .rs files from pogin503/dot-emacs...
Downloaded: rust_repos_rs/pogin503/dot-emacs/etc/autoinsert/template.rs
Fetching .rs files from sinis/MakeRT...


In [86]:
input_dir = "rust_repos_rs"
output_file = "combined_rust_files.txt"

def combine_files(input_directory, output_filepath):
    with open(output_filepath, "w", encoding="utf-8") as outfile:
        for root, _, files in os.walk(input_directory):
            for file in files:
                if file.endswith(".rs"):
                    file_path = os.path.join(root, file)
                    try:
                        with open(file_path, "r", encoding="utf-8") as infile:
                            content = infile.read()
                            outfile.write(content + "\n")
                            print(f"Added: {file_path}")
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")

print(f"Combining files from {input_dir} into {output_file}...")
combine_files(input_dir, output_file)
print("Combination complete.")

Combining files from rust_repos_rs into combined_rust_files.txt...
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/pratt/src/lib.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/pratt/src/parser.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/071-simplify-path.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/077-combi.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/287-find-duplicate.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/334-increasing-triplet.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/014-longest-common-prefix.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/053-max-subarr.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/022-generate-paren.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/leetcode/300-longest-incr-subsequence.rs
Added: rust_repos_rs/Fleurer/fleurer.github.com/exercises/lee