## Mirror + merge metrics and summary with train data

In [None]:
import pandas as pd

# Function to mirror the DataFrame
def mirror_dataframe(df):
    """Return a copy of ``df`` with its ``a`` and ``b`` sides exchanged.

    ``project_a``/``project_b`` trade values, and so do
    ``weight_a``/``weight_b``. All other columns are copied unchanged and the
    input frame itself is not modified.

    Raises:
        KeyError: if any of the four swapped columns is missing from ``df``.
    """
    swapped = df.copy()
    # Always read from the untouched input so that the second assignment of a
    # pair never sees the column that was just overwritten.
    for left, right in (('project_a', 'project_b'), ('weight_a', 'weight_b')):
        swapped[left] = df[right]
        swapped[right] = df[left]
    return swapped

# Define paths
import os  # only needed to create the output directory below

raw_dataset_path = 'raw_dataset/'
metrics_path = 'metrics_with_summary.csv'
enriched_dataset_path = 'enriched_dataset_with_summary'

# Load GitHub metrics
metrics = pd.read_csv(metrics_path)

# Keep a single metrics row per URL so the left joins below cannot multiply
# dataset rows (drop_duplicates keeps the first occurrence by default).
metrics = metrics.drop_duplicates(subset='url')

# Suffix every metrics column except the join key, once per side, so the two
# joins produce distinct `<col>_project_a` / `<col>_project_b` columns.
metrics_a = metrics.rename(columns=lambda col: f"{col}_project_a" if col != 'url' else col)
metrics_b = metrics.rename(columns=lambda col: f"{col}_project_b" if col != 'url' else col)

# Datasets to process: output name -> CSV path relative to raw_dataset_path
datasets = {
    'oso_train': 'OSO/dataset.csv',
    'hf_train': 'hf/dataset.csv',
    'pond_train': 'pond/dataset.csv'
}

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first save.
os.makedirs(enriched_dataset_path, exist_ok=True)

# Process each dataset
for dataset_name, file_path in datasets.items():
    # Load dataset
    dataset = pd.read_csv(raw_dataset_path + file_path)

    # Mirror the dataset (a/b sides exchanged) and stack it under the original.
    # NOTE(review): mirrored rows keep every other column of their source row
    # (the printed head shows an `id` column), so those values occur twice in
    # the combined frame - confirm downstream code expects that.
    mirrored_dataset = mirror_dataframe(dataset)
    combined_dataset = pd.concat([dataset, mirrored_dataset], ignore_index=True)

    # Left-join the metrics for project_a; rows whose URL has no metrics entry
    # are kept with NaN metrics. The join key 'url' duplicates 'project_a',
    # so it is dropped again right away.
    combined_dataset = combined_dataset.merge(
        metrics_a, how='left', left_on='project_a', right_on='url'
    ).drop(columns='url')

    # Same join for project_b
    combined_dataset = combined_dataset.merge(
        metrics_b, how='left', left_on='project_b', right_on='url'
    ).drop(columns='url')

    # Display enriched dataset head
    print(f"Enriched {dataset_name} dataset head:")
    print(combined_dataset.head())

    # Save the enriched dataset
    enriched_file_path = f'{enriched_dataset_path}/{dataset_name}_enriched_with_summary.csv'
    combined_dataset.to_csv(enriched_file_path, index=False)
    print(f"Enriched dataset saved to {enriched_file_path}")


Enriched oso_train dataset head:
   id                                          project_a  \
0   1  https://github.com/prettier-solidity/solidity-...   
1   2  https://github.com/prysmaticlabs/protoc-gen-go...   
2   3  https://github.com/prysmaticlabs/protoc-gen-go...   
3   4  https://github.com/prysmaticlabs/protoc-gen-go...   
4   5             https://github.com/ethers-io/ethers.js   

                                           project_b  weight_a  weight_b  \
0             https://github.com/ethers-io/ethers.js  0.018868  0.981132   
1             https://github.com/ethers-io/ethers.js  0.940556  0.059444   
2  https://github.com/walletconnect/walletconnect...  0.959448  0.040552   
3                 https://github.com/sigp/lighthouse  0.955172  0.044828   
4  https://github.com/walletconnect/walletconnect...  0.599248  0.400752   

   total_amount_usd   funder  quarter  is_private_project_a  \
0               636  clrfund  2023-07                 False   
1             59014  gi

## Merge metrics and summary with test data

In [1]:
import pandas as pd

# Function to mirror the DataFrame
def mirror_dataframe(df):
    """Return a copy of ``df`` with its ``a`` and ``b`` sides exchanged.

    ``project_a`` and ``project_b`` always trade values. When the frame also
    has both ``weight_a`` and ``weight_b`` columns they are swapped as well,
    so each weight stays attached to its project. Frames without weight
    columns are handled exactly as before. The input frame is not modified.

    Raises:
        KeyError: if ``project_a`` or ``project_b`` is missing from ``df``.
    """
    mirrored_df = df.copy()
    # The right-hand side reads from the untouched input, so the swap cannot
    # see a half-updated column.
    mirrored_df['project_a'], mirrored_df['project_b'] = df['project_b'], df['project_a']
    # Swapping only the projects on a frame that carries weights would pair
    # every weight with the wrong project.
    if 'weight_a' in df.columns and 'weight_b' in df.columns:
        mirrored_df['weight_a'], mirrored_df['weight_b'] = df['weight_b'], df['weight_a']
    return mirrored_df

# Define paths
import os  # only needed to create the output directory below

raw_dataset_path = 'raw_dataset/'
metrics_path = 'metrics_with_summary.csv'
enriched_dataset_path = 'enriched_dataset_with_summary'

# Load GitHub metrics
metrics = pd.read_csv(metrics_path)

# Keep a single metrics row per URL so the left joins below cannot multiply
# dataset rows (drop_duplicates keeps the first occurrence by default).
metrics = metrics.drop_duplicates(subset='url')

# Suffix every metrics column except the join key, once per side, so the two
# joins produce distinct `<col>_project_a` / `<col>_project_b` columns.
metrics_a = metrics.rename(columns=lambda col: f"{col}_project_a" if col != 'url' else col)
metrics_b = metrics.rename(columns=lambda col: f"{col}_project_b" if col != 'url' else col)

# Datasets to process: output name -> CSV path relative to raw_dataset_path
datasets = {
    'hf_test': 'hf/test.csv',
    'pond_test': 'pond/test.csv'
}

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first save.
os.makedirs(enriched_dataset_path, exist_ok=True)

# Process each dataset (no mirroring is applied in this cell)
for dataset_name, file_path in datasets.items():
    # Load dataset
    dataset = pd.read_csv(raw_dataset_path + file_path)

    # Left-join the metrics for project_a; rows whose URL has no metrics entry
    # are kept with NaN metrics. The join key 'url' duplicates 'project_a',
    # so it is dropped again right away.
    dataset = dataset.merge(
        metrics_a, how='left', left_on='project_a', right_on='url'
    ).drop(columns='url')

    # Same join for project_b
    dataset = dataset.merge(
        metrics_b, how='left', left_on='project_b', right_on='url'
    ).drop(columns='url')

    # Display enriched dataset head
    print(f"Enriched {dataset_name} dataset head:")
    print(dataset.head())

    # Save the enriched dataset
    enriched_file_path = f'{enriched_dataset_path}/{dataset_name}_enriched_with_summary.csv'
    dataset.to_csv(enriched_file_path, index=False)
    print(f"Enriched dataset saved to {enriched_file_path}")


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Enriched hf_test dataset head:
   id                                          project_a  \
0   1  https://github.com/prettier-solidity/prettier-...   
1   8  https://github.com/prettier-solidity/prettier-...   
2  13  https://github.com/prettier-solidity/prettier-...   
3  15  https://github.com/prettier-solidity/prettier-...   
4  18             https://github.com/prysmaticlabs/prysm   

                                project_b  is_private_project_a  \
0  https://github.com/ethers-io/ethers.js                 False   
1       https://github.com/bluealloy/revm                 False   
2     https://github.com/paradigmxyz/reth                 False   
3         https://github.com/ipfs/js-ipfs                 False   
4       https://github.com/consensys/teku                 False   

   has_homepage_project_a  size_project_a  stars_project_a  \
0                    True            5016              732   
1                    True            5016              732   
2                  