# This notebook scrapes the (open and closed) issue and pull-request counts, and the 'used by' data.
I also mirror the dataframe, so the number of pairs is doubled.

In [None]:
import os
import time
from datetime import datetime

import pandas as pd
import requests
   
# Function to fetch GitHub metrics  
def get_github_metrics(repo_url, headers):
    """Collect issue, pull-request and commit counts for one GitHub repository.

    Args:
        repo_url: Repository URL whose last two path segments are
            ``<owner>/<repo>``. A trailing slash or ``.git`` suffix is
            tolerated.
        headers: HTTP headers (including authorization) sent with every
            request.

    Returns:
        dict with the keys ``url`` (the unmodified ``repo_url``, so it can be
        used as a join key), ``open_issues_count``, ``closed_issues_count``,
        ``open_pulls_count``, ``closed_pulls_count`` and ``commit_count``.
        A count whose request fails is reported as 0 after a message has been
        printed.

    Raises:
        IndexError: if ``repo_url`` does not contain an owner and a repo part.
        requests.RequestException: on network failure or timeout.
    """
    # Normalise the URL before splitting: "…/owner/repo/" would otherwise
    # yield an empty repo name and "…/owner/repo.git" a wrong one.
    parts = repo_url.rstrip('/').split('/')
    owner = parts[-2]
    repo = parts[-1]
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]

    # (result key, label used in the failure message, search qualifiers)
    searches = (
        ('open_issues_count', 'open issues', 'type:issue+state:open'),
        ('closed_issues_count', 'closed issues', 'type:issue+state:closed'),
        ('open_pulls_count', 'open pull requests', 'type:pr+state:open'),
        ('closed_pulls_count', 'closed pull requests', 'type:pr+state:closed'),
    )

    metrics = {'url': repo_url}
    for key, label, qualifiers in searches:
        search_url = (
            f'https://api.github.com/search/issues?q=repo:{owner}/{repo}+{qualifiers}'
        )
        metrics[key] = _fetch_search_total(search_url, headers, label, repo_url)
    metrics['commit_count'] = _fetch_commit_count(owner, repo, repo_url, headers)

    return metrics


def _github_get(url, headers, params=None, timeout=30):
    """GET ``url``; if the primary rate limit is exhausted, wait and retry once."""
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    rate_limited = (
        response.status_code in (403, 429)
        and response.headers.get('X-RateLimit-Remaining') == '0'
    )
    if not rate_limited:
        return response
    try:
        reset_epoch = float(response.headers.get('X-RateLimit-Reset', ''))
    except ValueError:
        # No usable reset time: hand the failure back to the caller.
        return response
    # Clamp at zero so a reset time that has already passed cannot produce a
    # negative sleep; keep a 5 second buffer.
    wait_time = max(0.0, reset_epoch - time.time()) + 5
    print(f"Rate limit reached. Waiting for {wait_time} seconds until reset.")
    time.sleep(wait_time)
    return requests.get(url, headers=headers, params=params, timeout=timeout)


def _fetch_search_total(url, headers, label, repo_url):
    """Return ``total_count`` of a search query, or 0 (with a message) on failure."""
    response = _github_get(url, headers)
    if response.status_code != 200:
        print(f"Failed to fetch {label} for {repo_url}: {response.status_code}")
        return 0
    return response.json().get('total_count', 0)


def _fetch_commit_count(owner, repo, repo_url, headers):
    """Return the number of commits the commits endpoint lists, or 0 on failure."""
    from urllib.parse import parse_qs, urlparse

    commits_url = f'https://api.github.com/repos/{owner}/{repo}/commits'
    # With one commit per page, the page number of the "last" link equals the
    # total number of commits, so a single request gives the exact count.
    response = _github_get(commits_url, headers, params={'per_page': 1})
    if response.status_code != 200:
        print(f"Failed to fetch commits for {repo_url}: {response.status_code}")
        return 0

    last_page_url = response.links.get('last', {}).get('url')
    if last_page_url:
        page_values = parse_qs(urlparse(last_page_url).query).get('page')
        if page_values and page_values[0].isdigit():
            return int(page_values[0])
    # No "last" link: everything fit on this single page (0 or 1 commits).
    return len(response.json())
  
# Function to handle rate limiting  
def handle_rate_limiting(headers):
    """Sleep until the GitHub rate limit resets if a quota is exhausted.

    Queries the ``/rate_limit`` endpoint and inspects both the core bucket
    and the search bucket, because the search endpoints are counted
    separately from the other REST calls. If either bucket has no requests
    remaining, blocks until the latest of the affected reset times plus a
    5 second buffer.

    Args:
        headers: HTTP headers (including authorization) for the request.

    Returns:
        None. Prints a message when it waits or when the rate-limit status
        cannot be fetched.
    """
    rate_limit_url = 'https://api.github.com/rate_limit'
    response = requests.get(rate_limit_url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch rate limit information: {response.status_code}")
        return

    rate_limit_data = response.json()
    resources = rate_limit_data.get('resources', {})
    buckets = (
        # 'rate' is the older alias of the core bucket; use it as a fallback.
        resources.get('core') or rate_limit_data.get('rate'),
        resources.get('search'),
    )
    exhausted_resets = [
        bucket['reset']
        for bucket in buckets
        if bucket and bucket.get('remaining') == 0
    ]
    if not exhausted_resets:
        return

    # Reset times are epoch seconds. Clamp at zero so a reset that has already
    # passed (or local clock skew) cannot produce a negative sleep, which
    # time.sleep rejects with ValueError.
    wait_time = max(0.0, max(exhausted_resets) - time.time()) + 5  # 5 seconds buffer
    print(f"Rate limit reached. Waiting for {wait_time} seconds until reset.")
    time.sleep(wait_time)


In [None]:
# Function to mirror the DataFrame  
def mirror_dataframe(df):
    """Return a copy of ``df`` with the two sides of every pair swapped.

    ``project_a`` and ``project_b`` are always exchanged. ``weight_a`` and
    ``weight_b`` are exchanged only when both columns exist, so frames
    without weight columns can be mirrored as well. All other columns and
    the input frame itself are left untouched.

    Args:
        df: DataFrame with at least ``project_a`` and ``project_b`` columns.

    Returns:
        A new DataFrame with the same columns and index as ``df``.

    Raises:
        KeyError: if ``project_a`` or ``project_b`` is missing.
    """
    mirrored_df = df.copy()

    column_pairs = [('project_a', 'project_b')]
    if 'weight_a' in df.columns and 'weight_b' in df.columns:
        column_pairs.append(('weight_a', 'weight_b'))

    # Always read from the untouched input so the first assignment of a pair
    # cannot clobber the source of the second one.
    for left, right in column_pairs:
        mirrored_df[left] = df[right]
        mirrored_df[right] = df[left]
    return mirrored_df
 
# Read the five raw pair datasets (two test splits, three training sets)
hf_test, pond_test, oso_train, hf_train, pond_train = [
    pd.read_csv(csv_path)
    for csv_path in (
        'raw_dataset/hf/test.csv',
        'raw_dataset/pond/test.csv',
        'raw_dataset/OSO/dataset.csv',
        'raw_dataset/hf/dataset.csv',
        'raw_dataset/pond/dataset.csv',
    )
]

# Read the stored per-repository metrics, keeping one row per URL so the
# table can serve as the right-hand side of a join
metrics_path = 'metrics_with_summary.csv'
metrics_df = pd.read_csv(metrics_path).drop_duplicates(subset='url')

# Two views of the metrics table: every column except the 'url' join key is
# tagged with the side of the pair it will describe
metrics_a = metrics_df.rename(
    columns={col: f"{col}_project_a" for col in metrics_df.columns if col != 'url'}
)
metrics_b = metrics_df.rename(
    columns={col: f"{col}_project_b" for col in metrics_df.columns if col != 'url'}
)
  
# Function to enrich the dataset with metrics  
def enrich_dataset(df, metrics_a, metrics_b):
    """Attach per-repository metrics to both sides of every pair in ``df``.

    Performs two left joins: ``project_a`` against ``metrics_a['url']`` and
    then ``project_b`` against ``metrics_b['url']``. The helper ``url``
    column brought in by each join is removed again, so the result holds the
    columns of ``df`` followed by the metric columns of both tables.

    Args:
        df: DataFrame with ``project_a`` and ``project_b`` columns.
        metrics_a: metrics table keyed by ``url`` for the ``project_a`` side.
        metrics_b: metrics table keyed by ``url`` for the ``project_b`` side.

    Returns:
        A new DataFrame; rows without a matching URL get missing values in
        the corresponding metric columns.
    """
    enriched = df
    for pair_column, side_metrics in (('project_a', metrics_a), ('project_b', metrics_b)):
        joined = enriched.merge(
            side_metrics, how='left', left_on=pair_column, right_on='url'
        )
        # The join key from the metrics table duplicates pair_column.
        enriched = joined.drop(columns='url')
    return enriched
  
# GitHub token: read from the GITHUB_TOKEN environment variable when it is
# set, so the secret does not have to be written into the notebook. The
# fallback is a placeholder that must be replaced with a real token.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', 'github token')

# Headers for GitHub API requests
headers = {
    'Authorization': f'token {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json',
}
  
# Process each dataset
datasets = {
    'hf_test': hf_test,
    'pond_test': pond_test,
    'aug_train': oso_train,  # OSO data
    'hf_train': hf_train,
    'pond_train': pond_train,
}

enriched_datasets = {}

# Metrics fetched so far, keyed by repository URL. The same repositories can
# appear in several datasets; caching avoids repeating the API calls for them.
fetched_metrics = {}

for name, df in datasets.items():
    # Create the mirrored DataFrame
    mirrored_df = mirror_dataframe(df)

    # Concatenate the original and mirrored DataFrames
    combined_df = pd.concat([df, mirrored_df], ignore_index=True)

    # Unique project URLs in order of first appearance; missing values are
    # skipped because they cannot be resolved to a repository.
    unique_urls = (
        pd.concat([combined_df['project_a'], combined_df['project_b']])
        .dropna()
        .unique()
        .tolist()
    )

    # Fetch metrics for each URL that has not been fetched yet
    all_metrics = []
    for url in unique_urls:
        if url not in fetched_metrics:
            fetched_metrics[url] = get_github_metrics(url, headers)
            handle_rate_limiting(headers)  # Check and handle rate limiting
        metrics = fetched_metrics[url]
        if metrics:
            all_metrics.append(metrics)

    # Convert the metrics to a DataFrame; keep a 'url' column even when there
    # is nothing to report so the de-duplication and joins below still work.
    if all_metrics:
        metrics_df = pd.DataFrame(all_metrics)
    else:
        metrics_df = pd.DataFrame(columns=['url'])

    # Ensure URLs in metrics are unique and can be used for joining
    metrics_df = metrics_df.drop_duplicates(subset='url')

    # Rename columns in metrics to add suffixes for project_a and project_b
    metrics_a = metrics_df.rename(columns=lambda col: f"{col}_project_a" if col != 'url' else 'url')
    metrics_b = metrics_df.rename(columns=lambda col: f"{col}_project_b" if col != 'url' else 'url')

    # Enrich the combined DataFrame with metrics
    enriched_df = enrich_dataset(combined_df, metrics_a, metrics_b)

    # Save the enriched DataFrame to a CSV file
    enriched_df.to_csv(f'enriched_{name}.csv', index=False)

    # Store the enriched DataFrame for verification
    enriched_datasets[name] = enriched_df

    print(f"Processed and saved enriched DataFrame for {name} to enriched_{name}.csv")

# Optionally, print the head of each enriched DataFrame
for name, enriched_df in enriched_datasets.items():
    print(f"Enriched DataFrame for {name} head:")
    print(enriched_df.head())
    print("\n")
