In [1]:
import concurrent.futures
import os
import queue
import re
import subprocess
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [5]:
# Define individual regex patterns for each metric
epoch_pattern_tf = r'epoch\s*[\[\(]?\s*(\d+)\s*/\s*\d+[\]\)]?'  
loss_pattern_tf = r' loss:\s*(\d+\.\d+)'
acc_pattern_tf = r' acc(?:uracy)?:\s*(\d+\.\d+)'
val_loss_pattern_tf = r' val_loss:\s*(\d+\.\d+)'
val_acc_pattern_tf = r' val_acc(?:uracy)?:\s*(\d+\.\d+)'

iter_pattern_pytorch =  r'iter\s*\[(\d+)\s*/\s*\d+\]'
epoch_pattern_pytorch = r'epoch\s*[\[\(]?\s*(\d+)\s*/\s*\d+[\]\)]?' 
loss_pattern_pytorch = r' loss:\s*(\d+\.\d+)'
acc_pattern_pytorch = r' top1:\s*(\d+\.\d+)'
val_loss_pattern_pytorch = r' val_loss:\s*(\d+\.\d+)'
val_acc_pattern_pytorch = r' val_top1:\s*(\d+\.\d+)'


def _percent_to_fraction_or_none(matches, expected_len):
    """Return ``matches`` as floats divided by 100, or ``expected_len`` Nones on a count mismatch."""
    if len(matches) != expected_len:
        return [None] * expected_len
    return [round(float(m) / 100, 4) for m in matches]


def regex_metrics_py(text, name):
    """Extract PyTorch-style training metrics from ``text`` and save them as CSV.

    Two files are written, each only when at least one row was found:

    * ``<name>-epochs.csv`` with one row per epoch (``Epoch``, ``Val_Accuracy``,
      ``Val_Loss``, ``Accuracy``, ``Loss``). The training accuracy/loss of an
      epoch is the last per-iteration sample that falls into that epoch,
      assuming the samples are spread evenly over the epochs.
    * ``<name>-iterations.csv`` with one row per logged iteration
      (``Iteration``, ``Accuracy``, ``Loss``).

    A metric whose number of matches does not line up with the number of
    epochs (validation metrics) or iterations (training metrics) is replaced
    by a column of ``None``.

    Args:
        text: Raw log / notebook text; matching is case-insensitive.
        name: Path prefix for the CSV files.

    Returns:
        None.
    """
    text = text.lower()

    iteration_matches = re.findall(iter_pattern_pytorch, text)
    epoch_numbers = [int(i) for i in re.findall(epoch_pattern_pytorch, text)]
    loss_matches = re.findall(loss_pattern_pytorch, text)
    acc_matches = re.findall(acc_pattern_pytorch, text)
    val_loss_matches = re.findall(val_loss_pattern_pytorch, text)
    val_acc_matches = re.findall(val_acc_pattern_pytorch, text)

    # An epoch marker is usually printed many times, so the epoch axis is
    # rebuilt as 1..max. `default=0` keeps text without any epoch marker from
    # raising ValueError (it simply yields no epoch rows).
    epoch_matches = list(range(1, max(epoch_numbers, default=0) + 1))

    # NOTE(review): every metric, including the losses, is divided by 100.
    # That looks intended for percentage accuracies - confirm it is also
    # wanted for loss values.
    val_loss_matches = _percent_to_fraction_or_none(val_loss_matches, len(epoch_matches))
    val_acc_matches = _percent_to_fraction_or_none(val_acc_matches, len(epoch_matches))
    loss_matches = _percent_to_fraction_or_none(loss_matches, len(iteration_matches))
    acc_matches = _percent_to_fraction_or_none(acc_matches, len(iteration_matches))

    # Number of per-iteration samples per epoch (0 when there are fewer
    # samples than epochs, or no epochs at all).
    step_size = len(loss_matches) // len(epoch_matches) if epoch_matches else 0

    last_loss = []
    last_acc = []
    for ix in range(len(epoch_matches)):
        if step_size > 0:
            # Index of the LAST sample that belongs to epoch ix + 1.
            last_ix = (ix + 1) * step_size - 1
            last_acc.append(acc_matches[last_ix])
            last_loss.append(loss_matches[last_ix])
        else:
            last_acc.append(None)
            last_loss.append(None)

    structured_data1 = list(zip(epoch_matches, val_acc_matches, val_loss_matches, last_acc, last_loss))
    df1 = pd.DataFrame(structured_data1, columns=['Epoch', 'Val_Accuracy', 'Val_Loss', 'Accuracy', 'Loss'])
    if len(df1) > 0:
        print(f"found {len(df1)} metrics for {name + 'epochs'}")
        df1.to_csv(name + "-epochs.csv", index=False)

    structured_data2 = list(zip(iteration_matches, acc_matches, loss_matches))
    df2 = pd.DataFrame(structured_data2, columns=['Iteration', 'Accuracy', 'Loss'])
    if len(df2) > 0:
        print(f"found {len(df2)} metrics for {name + 'iterations'}")
        df2.to_csv(name + "-iterations.csv", index=False)


def regex_metrics_tf(text, name):
    """Extract Keras/TensorFlow-style per-epoch metrics from ``text``.

    The text is lower-cased and searched for epoch markers and for loss,
    accuracy, validation-loss and validation-accuracy values. A metric whose
    number of matches differs from the number of epoch markers becomes a
    column of ``None``.

    Args:
        text: Raw log / notebook text.
        name: Path prefix; the table is written to ``<name>.csv``.

    Returns:
        The DataFrame that was written (columns ``Epoch``, ``Accuracy``,
        ``Loss``, ``Val_Accuracy``, ``Val_Loss``), or ``None`` when no epoch
        marker was found (nothing is written in that case).
    """
    lowered = text.lower()
    epoch_strings = re.findall(epoch_pattern_tf, lowered)
    n_epochs = len(epoch_strings)

    def _column(pattern):
        # One float per epoch, or all-None when the counts do not line up.
        found = re.findall(pattern, lowered)
        if len(found) == n_epochs:
            return [float(value) for value in found]
        return [None] * n_epochs

    losses = _column(loss_pattern_tf)
    accuracies = _column(acc_pattern_tf)
    val_losses = _column(val_loss_pattern_tf)
    val_accuracies = _column(val_acc_pattern_tf)
    epochs = [int(value) for value in epoch_strings]

    rows = list(zip(epochs, accuracies, losses, val_accuracies, val_losses))
    table = pd.DataFrame(rows, columns=['Epoch', 'Accuracy', 'Loss', 'Val_Accuracy', 'Val_Loss'])

    if len(table) == 0:
        return

    print(f"found {len(table)} metrics for {name}")
    table.to_csv(name + ".csv", index=False)
    return table

#regex_metrics_py(text, "test-test")  

## GitHub

In [6]:
# GitHub personal access token for the REST API. It is read from the
# GITHUB_TOKEN environment variable so the secret never lives in the notebook;
# export it before starting Jupyter.
# NOTE: any token that was ever saved in this cell must be considered leaked
# and should be revoked in the GitHub settings.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
if not GITHUB_TOKEN:
    print("warning: GITHUB_TOKEN is not set; GitHub API requests will be unauthenticated")

# Request headers shared by all GitHub API calls (empty without a token).
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}

In [None]:
def _blob_url_to_raw(html_url):
    """Convert a github.com ``/blob/`` page URL into its raw.githubusercontent.com URL."""
    raw = html_url.replace("https://github.com/", "https://raw.githubusercontent.com/", 1)
    # Drop only the "/blob/" segment that follows "owner/repo", so file paths
    # that themselves contain "/blob" (e.g. ".../blob_utils.py") stay intact.
    return re.sub(r'^(https://raw\.githubusercontent\.com/[^/]+/[^/]+)/blob/', r'\1/', raw)


def fetch_paginated_results(q, headers, per_page=100, max_rate_limit_waits=5):
    """Run a GitHub code search and return raw-file URLs for every hit.

    Pages are requested one after another until a page comes back empty.
    When the API answers 403/429 the function waits until the code-search
    rate limit resets and then retries the SAME page, so results collected so
    far are not thrown away.

    Args:
        q: Search query, passed verbatim (URL-encoding is done by requests).
        headers: HTTP headers for the API calls (authorization).
        per_page: Results per page.
        max_rate_limit_waits: How many consecutive rate-limit waits are
            tolerated for one page before giving up.

    Returns:
        List of ``https://raw.githubusercontent.com/...`` URLs.

    Raises:
        ValueError: The API returned an unexpected status, or the rate limit
            did not clear after ``max_rate_limit_waits`` waits. The message is
            the response body.
    """
    page = 1
    rate_limit_waits = 0
    all_items = []

    while True:
        response = requests.get(
            'https://api.github.com/search/code',
            params={'q': q, 'page': page, 'per_page': per_page},
            headers=headers,
            timeout=30,
        )

        if response.status_code in (403, 429) and rate_limit_waits < max_rate_limit_waits:
            # Rate limited: sleep until the code-search window resets.
            r = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=30)
            reset = r.json()["resources"]["code_search"]["reset"]
            # Clamp at zero (the reset may already have passed; time.sleep
            # rejects negative values) and add a second of slack.
            waiting_time = max(reset - time.time(), 0) + 1
            print("waiting time:", waiting_time, "seconds")
            time.sleep(waiting_time)
            rate_limit_waits += 1
            continue

        if response.status_code == 422 and page > 1:
            # The search API only serves the first 1000 results; asking for a
            # page beyond that window fails validation. Keep what we have.
            break

        if response.status_code != 200:
            raise ValueError(response.text)

        rate_limit_waits = 0
        items = response.json().get('items') or []
        if not items:
            break  # no more results
        all_items.extend(items)
        page += 1

    return [_blob_url_to_raw(item["html_url"]) for item in all_items]


def scrape_github(q, max_retries=3):
    """Search GitHub code for ``q`` and return the raw-file URLs of the hits.

    ``fetch_paginated_results`` signals an API failure with ``ValueError``;
    the search is then attempted again, at most ``max_retries`` times in
    total, instead of recursing without bound. Any other exception (e.g. a
    network timeout) is printed and ends the attempt immediately.

    Args:
        q: GitHub code-search query.
        max_retries: Maximum number of attempts.

    Returns:
        A list of URLs; an empty list when the search could not be completed,
        so the result can always be passed to ``list.extend``.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return fetch_paginated_results(q, HEADERS)
        except ValueError as e:
            print(f"attempt {attempt}/{max_retries} failed for query {q!r}: {e}")
        except Exception as e:  # to account for maybe timeout
            print(e)
            return []
    return []



# Raw-file URLs collected over all search queries.
all_results = []

# One query per loss value, so that each query stays small enough to page
# through: "loss: 1.0001" ... for PyTorch, "loss: 0.0001" ... for TensorFlow.
queries_py = ['"Epoch [1/" AND "pytorch" AND "loss: ' + str(1 + round(0.0001 * i, 4)) + '"' for i in range(1, 10000)]
queries_tf = ['"Epoch 1/" AND "tensorflow" AND "loss: ' + str(round(0.0001 * i, 4)) + '"' for i in range(1, 10000)]

for query in tqdm(queries_tf):
    urls = scrape_github(query)
    # scrape_github yields None when a search fails; skip it instead of
    # crashing on extend(None) and losing the whole run.
    all_results.extend(urls or [])


In [None]:
# Show when the code-search rate limit resets and how long that is from now.
r = requests.get("https://api.github.com/rate_limit", headers=HEADERS)
code_search_limit = r.json()["resources"]["code_search"]
t = code_search_limit["reset"]
now = time.time()  # sampled once so both printed values agree
print("reset:", t, "\nnow:  ", now)
print("difference", t - now)
print(code_search_limit)

In [None]:
# Download the first few collected files and extract their training metrics.
# The URLs were gathered with the TensorFlow queries, hence regex_metrics_tf.
for raw_url in all_results[:10]:
    r = requests.get(raw_url, timeout=30)
    if r.status_code != 200:
        print(f"skipping {raw_url}: HTTP {r.status_code}")
        continue
    soup = bs(r.text, 'html.parser')

    # CSV prefix "owner--repo", taken from the first two path segments.
    # NOTE(review): two files from the same repository map to the same name,
    # so the later CSV overwrites the earlier one - confirm this is acceptable.
    name = raw_url.split("https://raw.githubusercontent.com/")[1]
    name = "--".join(name.split("/")[:2])
    regex_metrics_tf(soup.text, name)


# Selenium GitHub Scraper

In [2]:
# Browser used by the sequential scraping loop.
chrome_options = Options()
#chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# GitHub login used by login(). Taken from the environment so the password
# does not have to be typed into (and saved with) the notebook; the previous
# hardcoded values remain the defaults.
un = os.environ.get("GITHUB_USERNAME", "vincent-stadler")
pw = os.environ.get("GITHUB_PASSWORD", "")

# One search URL per (loss value, result page): loss 0.0001 ... 0.9999 and
# result pages 1-5 of the GitHub web code search.
query_urls = [
    'https://github.com/search?q=%22Epoch+1%2F%22+AND+%22tensorflow%22+AND+%22loss%3A+{}%22&type=code&p={}'.format(str(round(0.0001 * i, 4)), str(j+1))
    for i in range(1, 10000)
    for j in range(5)
]

In [24]:
def login(d):
    """Sign WebDriver ``d`` into GitHub using the module-level ``un`` / ``pw``.

    Opens the login page, fills in both fields, submits with RETURN and
    blocks (up to 10 s per wait) until the browser has left the login URL.
    """
    login_url = "https://github.com/login"
    d.get(login_url)
    waiter = WebDriverWait(d, 10)

    # The username box doubles as the "page has loaded" signal.
    user_box = waiter.until(EC.presence_of_element_located((By.ID, "login_field")))
    user_box.send_keys(un)

    pass_box = d.find_element(By.ID, "password")
    pass_box.send_keys(pw)
    pass_box.send_keys(Keys.RETURN)  # submit the form

    waiter.until(EC.url_changes(login_url))

def find_links(soup):
    """Collect raw-file URLs from a parsed GitHub code-search results page.

    Every ``div`` with class ``bmcJak`` is treated as one search hit and its
    ``a`` element with class ``gwACeB`` as the link to the file. Hits without
    such a link are reported and skipped.

    Args:
        soup: Parsed page offering BeautifulSoup's ``find_all`` / ``find``.

    Returns:
        List of ``https://raw.githubusercontent.com/...`` URLs, in page order.
    """
    raw_links = []
    for item in soup.find_all("div", "bmcJak"):
        anchor = item.find("a", "gwACeB")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            print("search result without a file link, skipping")
            continue
        # Drop only the "/blob/" segment that follows "/owner/repo"; a blanket
        # replace of "/blob" would also mangle paths such as ".../blob_utils.py".
        raw_path = re.sub(r'^(/[^/]+/[^/]+)/blob/', r'\1/', href)
        raw_links.append("https://raw.githubusercontent.com" + raw_path)
    return raw_links

def fetch_links(d, url):
    """Load ``url`` in WebDriver ``d`` and return the raw-file links found on it.

    Waits up to 10 seconds for an element with class ``Box-sc-g0xbh4-0`` to be
    present before the page source is parsed and handed to ``find_links``.
    """
    d.get(url)
    page_ready = EC.presence_of_element_located((By.CLASS_NAME, 'Box-sc-g0xbh4-0'))
    WebDriverWait(d, 10).until(page_ready)
    return find_links(bs(d.page_source, 'html.parser'))

In [4]:
# Start ten Chrome sessions and sign each of them into GitHub.
active_drivers = []
for _session_index in range(10):
    session = webdriver.Chrome()
    login(session)
    # Only sessions whose login completed are added to the pool.
    active_drivers.append(session)

In [None]:
# Scrape the remaining search pages in parallel, one page per idle browser.
if "links" not in globals():
    links = []  # first run in this kernel; later runs keep accumulating

# Query URLs before this offset are skipped - presumably they were already
# scraped in earlier, interrupted runs.
RESUME_FROM = 213 + 333 + 639
pending_urls = query_urls[RESUME_FROM:]

# WebDriver instances are not thread-safe, so a worker checks a driver out of
# this queue for the duration of one page and hands it back afterwards. No
# driver is ever used by two threads at once.
idle_drivers = queue.Queue()
for logged_in_driver in active_drivers:
    idle_drivers.put(logged_in_driver)


def _fetch_links_with_idle_driver(url):
    """Borrow an idle driver, scrape ``url`` with it and always return the driver."""
    borrowed = idle_drivers.get()
    try:
        return fetch_links(borrowed, url)
    finally:
        idle_drivers.put(borrowed)


with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_drivers)) as executor:
    futures = {executor.submit(_fetch_links_with_idle_driver, url): url for url in pending_urls}

    # The progress bar follows completed pages, not submitted ones.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        try:
            links.extend(future.result())
        except Exception as e:
            # One page timing out must not discard the rest of the run.
            print(f"failed to scrape {futures[future]}: {e}")

print("Len links:", len(links))

In [31]:
# Sequential variant of the scrape, using the single `driver` session.
if "links" not in globals():
    links = []  # first run in this kernel; later runs keep accumulating

# `url` is deliberately left as a notebook-level name: after an interruption
# it tells which query URL the loop had reached.
for url in tqdm(query_urls[213 + 333 + 639:]):  #213 + 333 + 639
    links.extend(fetch_links(driver, url))
    

  1%|▉                                                                          | 639/49449 [34:35<44:01:44,  3.25s/it]


KeyboardInterrupt: 

In [32]:
# Number of raw-file links collected so far.
len(links)

13040

In [34]:
# Position in query_urls of the value currently held by `url` (the loop
# variable of the scraping loop); presumably used as the offset to resume from.
query_urls.index(url)

1185

In [33]:
import pickle

# Persist the collected links so that a later session can continue from them.
LINKS_PICKLE = "links1.pickle"
with open(LINKS_PICKLE, "wb") as pickle_file:
    pickle.dump(links, pickle_file)