# Get repo urls from all dataset

### Create list of github repo urls from all raw dataset

In [20]:
import pandas as pd

# Load all datasets
df1 = pd.read_csv('raw_dataset/OSO/dataset.csv')
df2 = pd.read_csv('raw_dataset/pond/dataset.csv')
df3 = pd.read_csv('raw_dataset/pond/test.csv')
df4 = pd.read_csv('raw_dataset/hf/dataset.csv')
df5 = pd.read_csv('raw_dataset/hf/test.csv')

# Combine all dataframes into one
df = pd.concat([df1, df2, df3, df4,df5], ignore_index=True)

# Extract unique URLs from 'project_a' and 'project_b' columns
unique_urls_a = df['project_a'].dropna().unique()
unique_urls_b = df['project_b'].dropna().unique()

# Combine and find unique URLs across both columns
unique_urls = set(unique_urls_a).union(set(unique_urls_b))

# Convert to a list if needed
unique_urls_list = list(unique_urls)

# Print the unique GitHub URLs
print(unique_urls_list)

# Optionally, save to a new CSV file
pd.DataFrame(unique_urls_list, columns=['repo_url']).to_csv('all_github_urls.csv', index=False)


['https://github.com/sindresorhus/p-cancelable', 'https://github.com/vuejs/vue', 'https://github.com/prysmaticlabs/protoc-gen-go-cast', 'https://github.com/tklauser/go-sysconf', 'https://github.com/vitest-dev/vitest', 'https://github.com/libp2p/go-libp2p', 'https://github.com/ethereum/remix-plugin', 'https://github.com/level/mem', 'https://github.com/humanwhocodes/object-schema', 'https://github.com/google/btree', 'https://github.com/facebook/fbjs', 'https://github.com/pycqa/flake8', 'https://github.com/chainsafe/lodestar', 'https://github.com/streetsidesoftware/cspell-dicts', 'https://github.com/walletconnect/walletconnect-monorepo', 'https://github.com/nomicfoundation/hardhat', 'https://github.com/walletconnect/walletconnect-utils', 'https://github.com/erigontech/speedtest', 'https://github.com/yarnpkg/yarn', 'https://github.com/servo/unicode-bidi', 'https://github.com/ljharb/call-bind', 'https://github.com/kaelzhang/node-ignore', 'https://github.com/mmcloughlin/addchain', 'https://g

In [21]:
len(unique_urls_list)

166

### Get the metrics using github request API
Be careful setting the timesleep to avoid hitting rate limits 

In [22]:
import pandas as pd  
import requests  
import time  
from datetime import datetime  
import math  
  
 
unique_urls = pd.read_csv('all_github_urls.csv')['repo_url'].tolist()

# Your GitHub token  
GITHUB_TOKEN = 'your github token here'  
  
# Function to get metrics from GitHub  
def get_github_metrics(repo_url):  
    # Extract the owner and repo name from the URL  
    parts = repo_url.split('/')  
    owner = parts[-2]  
    repo = parts[-1]  
      
    # GitHub API URL  
    api_url = f'https://api.github.com/repos/{owner}/{repo}'  
      
    # Set up headers with authentication  
    headers = {  
        'Authorization': f'token {GITHUB_TOKEN}',  
        'Accept': 'application/vnd.github.v3+json'  
    }  
      
    # Make the request to the GitHub API  
    response = requests.get(api_url, headers=headers)  
      
    if response.status_code == 200:  
        data = response.json()  
          
        # Extract basic metrics  
        metrics = {  
            'url': repo_url,  
            'is_private': data.get('private', False),  
            'has_homepage': bool(data.get('homepage', '')),  
            'size': data.get('size', 0),  
            'stars': data.get('stargazers_count', 0),  
            'watchers': data.get('watchers_count', 0),  
            'has_projects': data.get('has_projects', False),  
            'has_pages': data.get('has_pages', False),  
            'has_wiki': data.get('has_wiki', False),  
            'has_discussions': data.get('has_discussions', False),  
            'forks': data.get('forks_count', 0),  
            'is_archived': data.get('archived', False),  
            'is_disabled': data.get('disabled', False),  
            'open_issues': data.get('open_issues_count', 0),  
            'subscribers_count': data.get('subscribers_count', 0),  
            'created_at': data.get('created_at', ''),  
            'updated_at': data.get('updated_at', '')  
        }  
          
        # Calculate boolean features  
        metrics['is_private_b'] = int(metrics['is_private'])  
        metrics['has_homepage_b'] = int(metrics['has_homepage'])  
        metrics['size_b'] = int(metrics['size'] > 0)  
        metrics['stars_b'] = int(metrics['stars'] > 0)  
        metrics['watchers_b'] = int(metrics['watchers'] > 0)  
        metrics['has_projects_b'] = int(metrics['has_projects'])  
        metrics['has_pages_b'] = int(metrics['has_pages'])  
        metrics['has_wiki_b'] = int(metrics['has_wiki'])  
        metrics['has_discussions_b'] = int(metrics['has_discussions'])  
        metrics['forks_b'] = int(metrics['forks'] > 0)  
        metrics['is_archived_b'] = int(metrics['is_archived'])  
        metrics['is_disabled_b'] = int(metrics['is_disabled'])  
        metrics['open_issues_b'] = int(metrics['open_issues'] > 0)  
        metrics['subscribers_count_b'] = int(metrics['subscribers_count'] > 0)  
          
        # Calculate ratios  
        total_count = metrics['stars'] + metrics['watchers'] + metrics['forks'] + metrics['size']  
        if total_count > 0:  
            metrics['stars_ratio'] = metrics['stars'] / total_count  
            metrics['watchers_ratio'] = metrics['watchers'] / total_count  
            metrics['forks_ratio'] = metrics['forks'] / total_count  
            metrics['size_ratio'] = metrics['size'] / total_count  
        else:  
            metrics['stars_ratio'] = 0  
            metrics['watchers_ratio'] = 0  
            metrics['forks_ratio'] = 0  
            metrics['size_ratio'] = 0  
          
        # Calculate commit count  
        commits_url = f'https://api.github.com/repos/{owner}/{repo}/commits'  
        commits_response = requests.get(commits_url, headers=headers)  
        if commits_response.status_code == 200:  
            commits_data = commits_response.json()  
            commit_count = len(commits_data)  
            # Estimate total commit count by checking the last page  
            last_page = commits_response.links.get('last', {}).get('url')  
            if last_page:  
                last_page_response = requests.get(last_page, headers=headers)  
                if last_page_response.status_code == 200:  
                    last_page_data = last_page_response.json()  
                    commit_count += (len(last_page_data) - 1) * 30  # Assuming 30 commits per page  
            metrics['commit_count'] = commit_count  
        else:  
            print(f"Failed to fetch commits for {repo_url}: {commits_response.status_code}")  
            metrics['commit_count'] = 0  
          
        # Calculate additional features based on commit count  
        created_at = datetime.strptime(metrics['created_at'], '%Y-%m-%dT%H:%M:%SZ')  
        updated_at = datetime.strptime(metrics['updated_at'], '%Y-%m-%dT%H:%M:%SZ')  
        now = datetime.utcnow()  
        age_days = (now - created_at).days  
        days_since_update = (now - updated_at).days  
          
        metrics['commit_count_b'] = int(metrics['commit_count'] > 0)  
        metrics['commit_count_ratio'] = metrics['commit_count'] / (total_count + metrics['commit_count']) if total_count + metrics['commit_count'] > 0 else 0  
          
        # Calculate decay (assuming a simple decay over time)  
        metrics['commit_decay'] = metrics['commit_count'] / (age_days + 1)  # +1 to avoid division by zero  
        metrics['commit_decay_b'] = int(metrics['commit_decay'] > 0)  
          
        metrics['age_days'] = age_days  
        metrics['days_since_update'] = days_since_update  
          
        metrics['age_days_b'] = int(metrics['age_days'] > 0)  
        metrics['days_since_update_b'] = int(metrics['days_since_update'] > 0)  
          
        # Calculate logarithmic values  
        metrics['log_stars'] = metrics['stars'] if metrics['stars'] == 0 else math.log(metrics['stars'])  
        metrics['log_watchers'] = metrics['watchers'] if metrics['watchers'] == 0 else math.log(metrics['watchers'])  
        metrics['log_forks'] = metrics['forks'] if metrics['forks'] == 0 else math.log(metrics['forks'])  
        metrics['log_commit_count'] = metrics['commit_count'] if metrics['commit_count'] == 0 else math.log(metrics['commit_count'])  
          
        # Calculate binary logarithmic values  
        metrics['log_stars_b'] = int(metrics['log_stars'] > 0)  
        metrics['log_watchers_b'] = int(metrics['log_watchers'] > 0)  
        metrics['log_forks_b'] = int(metrics['log_forks'] > 0)  
        metrics['log_commit_count_b'] = int(metrics['log_commit_count'] > 0)  
          
        return metrics  
    else:  
        print(f"Failed to fetch data for {repo_url}: {response.status_code}")  
        return None  
  
# List to hold all metrics  
all_metrics = []  
  
# Counter for processed repositories  
counter = 0  
  
# Loop through each unique URL and get metrics  
for url in unique_urls:  
    metrics = get_github_metrics(url)  
    if metrics:  
        all_metrics.append(metrics)  
        counter += 1  # Increment the counter after successful fetch  
        print(f"Processed {counter}/{len(unique_urls)}: {url}")  # Print progress  
      
    # Sleep to avoid hitting the rate limit  
    time.sleep(1)  # Adjust the sleep time as necessary  
  
# Convert the metrics to a DataFrame  
metrics_df = pd.DataFrame(all_metrics)  
  
# Save the metrics to a CSV file  
metrics_df.to_csv('all_github_metrics.csv', index=False)  
  
print("Metrics scraping completed.")  


Processed 1/166: https://github.com/sindresorhus/p-cancelable
Processed 2/166: https://github.com/vuejs/vue


KeyboardInterrupt: 

In [23]:
all_metrics

[{'url': 'https://github.com/sindresorhus/p-cancelable',
  'is_private': False,
  'has_homepage': False,
  'size': 59,
  'stars': 440,
  'watchers': 440,
  'has_projects': False,
  'has_pages': False,
  'has_wiki': False,
  'has_discussions': False,
  'forks': 22,
  'is_archived': False,
  'is_disabled': False,
  'open_issues': 3,
  'subscribers_count': 8,
  'created_at': '2016-11-28T10:27:44Z',
  'updated_at': '2025-01-28T21:08:50Z',
  'is_private_b': 0,
  'has_homepage_b': 0,
  'size_b': 1,
  'stars_b': 1,
  'watchers_b': 1,
  'has_projects_b': 0,
  'has_pages_b': 0,
  'has_wiki_b': 0,
  'has_discussions_b': 0,
  'forks_b': 1,
  'is_archived_b': 0,
  'is_disabled_b': 0,
  'open_issues_b': 1,
  'subscribers_count_b': 1,
  'stars_ratio': 0.4578563995837669,
  'watchers_ratio': 0.4578563995837669,
  'forks_ratio': 0.022892819979188347,
  'size_ratio': 0.061394380853277836,
  'commit_count': 690,
  'commit_count_b': 1,
  'commit_count_ratio': 0.4179285281647486,
  'commit_decay': 0.23053