In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder  
import xgboost as xgb  
from sklearn.metrics import mean_squared_error, r2_score  
from sklearn.feature_extraction.text import TfidfVectorizer




### Load training and test data (from pond and huggingface) 

In [17]:
# Function to extract repository name from URL  
def extract_repo_name(url):  
    try:  
        return url.split('/')[-2] + '/' + url.split('/')[-1]  
    except:  
        return None  
  

# Load the test dataset  
test_data_path = r'enriched_dataset_with_summary/hf_test_enriched_with_summary.csv'  
test_data = pd.read_csv(test_data_path)  

# Load additional training data
additional_data_path = r'enriched_dataset_with_summary/pond_train_enriched_with_summary.csv'      
train_add = pd.read_csv(additional_data_path)      
  
# Load target training data     
train_data_path = r'enriched_dataset_with_summary/hf_train_enriched_with_summary.csv'      
train_hf = pd.read_csv(train_data_path)  


### Minority class resampling
I resampling data for minority projects to balance the data, 

In [18]:
import pandas as pd
from sklearn.utils import resample

# Calculate the target number of rows for train_hf
target_rows = 2 * len(train_add)

# Check the distribution of the 'project_a' column
project_a_counts = train_hf['project_a'].value_counts()
print("Original Distribution of project_a:")
print(project_a_counts)

# Identify the maximum count
max_count = project_a_counts.max()

# Calculate the total number of rows needed for each class to reach the target
total_rows_needed = target_rows - len(train_hf)

# Calculate the required oversampling factor for each class
oversampling_factors = {}
for project, count in project_a_counts.items():
    if count < max_count:
        oversampling_factors[project] = (total_rows_needed / len(project_a_counts)) / count
    else:
        oversampling_factors[project] = 1

# Create a list to hold the oversampled dataframes
oversampled_dfs = []

# Iterate over each unique value in the 'project_a' column
for project, count in project_a_counts.items():
    # Resample the dataframe for the current project
    df_project = train_hf[train_hf['project_a'] == project]
    n_samples = int(count * oversampling_factors[project])
    if n_samples < count:
        n_samples = count  # Ensure at least the original count
    df_project_oversampled = resample(df_project, 
                                      replace=True,     # Sample with replacement
                                      n_samples=n_samples,    # To match the required factor
                                      random_state=42) # Random state for reproducibility
    oversampled_dfs.append(df_project_oversampled)

# Concatenate all the oversampled dataframes
train_hf_oversampled = pd.concat(oversampled_dfs)

# Verify the new distribution
new_project_a_counts = train_hf_oversampled['project_a'].value_counts()
print("\nBalanced Distribution of project_a:")
print(new_project_a_counts)

# Verify the total number of rows
print(f"\nTotal number of rows in oversampled train_hf: {len(train_hf_oversampled)}")
print(f"Target number of rows: {target_rows}")

# Combine the oversampled dataset_hf with train_add  
dataset_aug = pd.concat([train_add, train_hf_oversampled], ignore_index=True)      
dataset_aug.reset_index(drop=True, inplace=True)  


Original Distribution of project_a:
project_a
https://github.com/motdotla/dotenv           66
https://github.com/sindresorhus/type-fest    66
https://github.com/zloirock/core-js          65
https://github.com/postcss/postcss           64
https://github.com/immerjs/immer             64
                                             ..
https://github.com/quic-go/quic-go            9
https://github.com/ethereum/solc-js           8
https://github.com/erigontech/erigon          8
https://github.com/grandinetech/grandine      8
https://github.com/ethereum/web3.py           1
Name: count, Length: 117, dtype: int64

Balanced Distribution of project_a:
project_a
https://github.com/formatjs/formatjs                     675
https://github.com/wooorm/markdown-table                 675
https://github.com/eth-infinitism/account-abstraction    675
https://github.com/xtuc/webassemblyjs                    675
https://github.com/vyperlang/vyper                       675
                                   

In [6]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)        # Set the display width


In [19]:

# Extract repository names for both datasets  
dataset_aug['repo_name_a'] = dataset_aug['project_a'].apply(extract_repo_name)  
dataset_aug['repo_name_b'] = dataset_aug['project_b'].apply(extract_repo_name)  
test_data['repo_name_a'] = test_data['project_a'].apply(extract_repo_name)  
test_data['repo_name_b'] = test_data['project_b'].apply(extract_repo_name)  


### Use logarithmic function to handle skewed data
Numerical metrics data in github repos can be highly skewed. e.g. some repos can have 0 used_by count,
but the other have 2M+ used by count.

In [20]:
# Apply logarithmic transformations for the new columns
for dataset in [dataset_aug, test_data]:
    for project in ['a', 'b']:
        for column in ['open_issues_count', 'closed_issues_count', 'open_prs_count', 'closed_prs_count', 'used_by', 'age_days',
                       'size', 'subscribers_count'
                       
        ]:
            col_name = f'{column}_project_{project}'
            log_col_name = f'log_{column}_project_{project}'
            if col_name in dataset.columns and log_col_name not in dataset.columns:
                dataset[log_col_name] = np.log1p(dataset[col_name])


### Use TF-IDF to help the models read Readme summary and its importance

In [21]:

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10)  # You can adjust max_features as needed

# Fit and transform the summary column for project_a in the training dataset
tfidf_a = tfidf_vectorizer.fit_transform(dataset_aug['summary_project_a'])
tfidf_a_df = pd.DataFrame(tfidf_a.toarray(), columns=[f'tfidf_a_{i}' for i in range(tfidf_a.shape[1])])
dataset_aug = pd.concat([dataset_aug, tfidf_a_df], axis=1)

# Transform the summary column for project_a in the test dataset
tfidf_a_test = tfidf_vectorizer.transform(test_data['summary_project_a'])
tfidf_a_test_df = pd.DataFrame(tfidf_a_test.toarray(), columns=[f'tfidf_a_{i}' for i in range(tfidf_a_test.shape[1])])
test_data = pd.concat([test_data, tfidf_a_test_df], axis=1)

# Fit and transform the summary column for project_b in the training dataset
tfidf_b = tfidf_vectorizer.fit_transform(dataset_aug['summary_project_b'])
tfidf_b_df = pd.DataFrame(tfidf_b.toarray(), columns=[f'tfidf_b_{i}' for i in range(tfidf_b.shape[1])])
dataset_aug = pd.concat([dataset_aug, tfidf_b_df], axis=1)

# Transform the summary column for project_b in the test dataset
tfidf_b_test = tfidf_vectorizer.transform(test_data['summary_project_b'])
tfidf_b_test_df = pd.DataFrame(tfidf_b_test.toarray(), columns=[f'tfidf_b_{i}' for i in range(tfidf_b_test.shape[1])])
test_data = pd.concat([test_data, tfidf_b_test_df], axis=1)


### Feature selection

In [22]:

# Select features and target variable  
features = [  
    'total_amount_usd',
    # Feature A  
    'is_private_project_a',
    'has_homepage_project_a',  
    'log_size_project_a', 
    'stars_project_a', 
    'watchers_project_a',  
    'has_projects_project_a', 
    'has_pages_project_a', 
    'has_wiki_project_a',  
    'has_discussions_project_a', 
    'forks_project_a', 
    'is_archived_project_a',  
    'is_disabled_project_a', 
    'open_issues_project_a', 
    'subscribers_count_project_a',  
    'age_days_project_a', 
    'days_since_update_project_a', 
    'stars_ratio_project_a',  
    'watchers_ratio_project_a', 
    'forks_ratio_project_a', 
    'size_ratio_project_a',  
    'log_stars_project_a',  
    'log_watchers_project_a', 
    'log_forks_project_a', 
    'log_commit_count_project_a',  
    # 'log_stars_b_project_a', 
    # 'log_watchers_b_project_a', 
    # 'log_forks_b_project_a',  
    # 'log_commit_count_b_project_a',
    # Feature B
    'is_private_project_b', 
    'has_homepage_project_b',  
    'log_size_project_b', 
    'stars_project_b', 
    'watchers_project_b',  
    'has_projects_project_b', 
    'has_pages_project_b', 
    'has_wiki_project_b',  
    'has_discussions_project_b', 
    'forks_project_b', 
    'is_archived_project_b',  
    'is_disabled_project_b', 
    'open_issues_project_b', 
    'subscribers_count_project_b',  
    'age_days_project_b', 
    'days_since_update_project_b', 
    'stars_ratio_project_b',  
    'watchers_ratio_project_b', 
    'forks_ratio_project_b', 
    'size_ratio_project_b',  
    'log_stars_project_b',  
    'log_watchers_project_b', 
    'log_forks_project_b', 
    'log_commit_count_project_b',  
    # 'log_stars_b_project_b', 
    # 'log_watchers_b_project_b', 
    # 'log_forks_b_project_b',  
    # 'log_commit_count_b_project_b',  
    'repo_name_b','repo_name_a',
    # Additional Logarithmic Features
    'log_open_issues_count_project_a', 'log_closed_issues_count_project_a', 'log_open_prs_count_project_a', 'log_closed_prs_count_project_a', 'log_used_by_project_a', 'log_age_days_project_a',
    'log_open_issues_count_project_b', 'log_closed_issues_count_project_b', 'log_open_prs_count_project_b', 'log_closed_prs_count_project_b', 'log_used_by_project_b', 'log_age_days_project_b'
]  

# Get the list of new TF-IDF feature names
tfidf_feature_names_a = [f'tfidf_a_{i}' for i in range(tfidf_a.shape[1])]
tfidf_feature_names_b = [f'tfidf_b_{i}' for i in range(tfidf_b.shape[1])]

# Update features list with new TF-IDF features
features.extend(tfidf_feature_names_a + tfidf_feature_names_b)
  
log_features = [  
    'log_subscribers_count_project_a','log_subscribers_count_project_b',
    'log_size_project_a', 'log_stars_project_a', 'log_watchers_project_a', 'log_forks_project_a', 'log_commit_count_project_a',  
    'log_stars_b_project_a', 'log_watchers_b_project_a', 'log_forks_b_project_a', 'log_commit_count_b_project_a',  
    'log_size_project_b', 'log_stars_project_b', 'log_watchers_project_b', 'log_forks_project_b', 'log_commit_count_project_b',  
    'log_stars_b_project_b', 'log_watchers_b_project_b', 'log_forks_b_project_b', 'log_commit_count_b_project_b',
    'log_open_issues_count_project_a', 'log_closed_issues_count_project_a', 'log_open_prs_count_project_a', 'log_closed_prs_count_project_a', 'log_used_by_project_a', 'log_age_days_project_a',
    'log_open_issues_count_project_b', 'log_closed_issues_count_project_b', 'log_open_prs_count_project_b', 'log_closed_prs_count_project_b', 'log_used_by_project_b', 'log_age_days_project_b'
]  

plain_features = [  
    'size_project_a', 'stars_project_a', 'watchers_project_a', 'forks_project_a', 'commit_count_project_a',  
    'stars_b_project_a', 'watchers_b_project_a', 'forks_b_project_a', 'commit_count_b_project_a',  
    'size_project_b', 'stars_project_b', 'watchers_project_b', 'forks_project_b', 'commit_count_project_b',  
    'stars_b_project_b', 'watchers_b_project_b', 'forks_b_project_b', 'commit_count_b_project_b',
    'subscribers_count_project_b', 'subscribers_count_project_a',
    'open_issues_count_project_a', 'closed_issues_count_project_a', 'open_prs_count_project_a', 'closed_prs_count_project_a', 'used_by_project_a', 'age_days_project_a',
    'open_issues_count_project_b', 'closed_issues_count_project_b', 'open_prs_count_project_b', 'closed_prs_count_project_b', 'used_by_project_b', 'age_days_project_b'
]  

# Remove plain features if log features are present  
features = [col for col in features if col not in plain_features or col in log_features]  

# Target variable  
target = 'weight_a'  


#### Categorical variable encoding
Quarter should be handled properly as its a time-based data. But for simplicity, i use quarter column as categorical variable.

In [23]:

# Encode categorical variables  
label_encoders = {}  
for col in ['repo_name_a', 'repo_name_b','funder','quarter']:  
    if col in dataset_aug.columns:  
        le = LabelEncoder()  
        dataset_aug[col] = le.fit_transform(dataset_aug[col])  
        label_encoders[col] = le  
  
    if col in test_data.columns:  
        # Use transform with error handling for unseen labels  
        try:  
            test_data[col] = le.transform(test_data[col])  
        except ValueError:  
            # Assign a default value for unseen labels  
            test_data[col] = -1  

# Convert boolean columns to integer  
boolean_cols = [  
    'is_private_project_a', 'has_homepage_project_a', 'has_projects_project_a',  
    'has_pages_project_a', 'has_wiki_project_a', 'has_discussions_project_a',  
    'is_archived_project_a', 'is_disabled_project_a',  
    'is_private_project_b', 'has_homepage_project_b', 'has_projects_project_b',  
    'has_pages_project_b', 'has_wiki_project_b', 'has_discussions_project_b',  
    'is_archived_project_b', 'is_disabled_project_b'  
]  

for col in boolean_cols:  
    if col in dataset_aug.columns:  
        dataset_aug[col] = dataset_aug[col].astype(int)  
    if col in test_data.columns:  
        test_data[col] = test_data[col].astype(int)  

# Filter out features that are not present in the dataset  
features = [col for col in features if col in dataset_aug.columns and col in test_data.columns]  

# Select features and target  
X = dataset_aug[features]  
y = dataset_aug[target]  

# Split the data into training and testing sets  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  


### Initialize XGBoost models
Ive running down a hyperparameter grid search to find the best parameters, and the result is used here

In [24]:

# Initialize the XGBoost regressor  
xgb_regressor = xgb.XGBRegressor(  
    objective='reg:squarederror',  
    n_estimators=750,  
    learning_rate=0.05,  
    max_depth=8,  
    subsample=1,  
    colsample_bytree=0.1,
    random_state=42  
)  

# Train the model  
xgb_regressor.fit(X_train, y_train)  

# Make predictions  
y_pred = xgb_regressor.predict(X_test)  

# Ensure predictions are within the range [0, 1]  
y_pred = y_pred.clip(min=0, max=1)  

# Calculate evaluation metrics  
mse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)  

print(f"Mean Squared Error: {mse}")  
print(f"R^2 Score: {r2}")  

test_features = [col for col in features if col in test_data.columns]  

# Select features for test data  
X_test_final = test_data[test_features]  

# Make predictions on the test dataset  
y_pred_test = xgb_regressor.predict(X_test_final)  

# Ensure predictions are within the range [0, 1]  
y_pred_test = y_pred_test.clip(min=0, max=1)  

# Calculate weight_b as 1 - weight_a  
y_pred_test_b = 1 - y_pred_test  


Mean Squared Error: 0.012649668472400157
R^2 Score: 0.8997269884330764


### Correct transitivity in prediction (Optional)

In [11]:
predictions_df = pd.DataFrame({  
    'id': test_data['id'],  
    'weight_a': y_pred_test,  
    'weight_b': y_pred_test_b  
})  
  
# Function to correct transitivity violations  
def correct_transitivity(predictions):  
    # Sort by weight_a  
    predictions = predictions.sort_values(by='weight_a').reset_index(drop=True)  
      
    # Iterate through the DataFrame and correct violations  
    for i in range(1, len(predictions)):  
        if predictions.iloc[i]['weight_a'] < predictions.iloc[i-1]['weight_a']:  
            predictions.iloc[i]['weight_a'] = predictions.iloc[i-1]['weight_a']  
            predictions.iloc[i]['weight_b'] = 1 - predictions.iloc[i]['weight_a']  # Adjust weight_b accordingly  
  
    return predictions  
  
# Apply the transitivity correction  
corrected_predictions_df = correct_transitivity(predictions_df) 
submission_df= corrected_predictions_df[['id','weight_a']] 
corrected_predictions_df = predictions_df
  
# Save the corrected predictions to a CSV file  
save_path = 'dump'  
filename = 'hf10-pondtrain_x20_750_005_8_1_wA_wB_transitivity.csv'  
corrected_predictions_df.to_csv(save_path + filename, index=False)  
  
print("Corrected predictions saved to", save_path + filename)  

Corrected predictions saved to dumphf10-pondtrain_x20_750_005_8_1_wA_wB_transitivity.csv


### native prediction

In [27]:
predictions_df = pd.DataFrame({  
    'id': test_data['id'],  
    'weight_a': y_pred_test,  
    'weight_b': y_pred_test_b  
})  
 
# Save native predictions to a CSV file  
save_path = 'dump/'  
filename = 'predictions_df.csv'  
predictions_df.to_csv(save_path + filename, index=False)  
  
print("wA wB predictions saved to", save_path + filename)  

wA wB predictions saved to dump/predictions_df.csv


In [28]:
print(predictions_df.head(40))

     id  weight_a  weight_b
0     1  0.067631  0.932369
1     8  0.444064  0.555936
2    13  0.324559  0.675441
3    15  0.358210  0.641790
4    18  0.976285  0.023715
5    23  0.877279  0.122721
6    26  0.422457  0.577543
7    27  0.434655  0.565345
8    30  0.914351  0.085649
9    31  0.710026  0.289974
10   32  0.724098  0.275902
11   33  0.608229  0.391771
12   34  0.665071  0.334929
13   44  0.820792  0.179208
14   45  0.622694  0.377306
15   46  0.801497  0.198503
16   49  0.714008  0.285992
17   52  0.336273  0.663727
18   53  0.527621  0.472379
19   57  0.701846  0.298154
20   59  0.419123  0.580877
21   60  0.492780  0.507220
22   64  0.183109  0.816891
23   68  0.003921  0.996080
24   70  0.082931  0.917069
25   71  0.086752  0.913248
26   74  0.003668  0.996332
27   77  0.089354  0.910646
28   81  0.123605  0.876395
29   88  0.000000  1.000000
30   89  0.122838  0.877162
31   94  0.024976  0.975024
32   97  0.233977  0.766023
33  103  0.069730  0.930270
34  105  0.931194  0

In [33]:
 # file_path = r'D:\0000Pond\funding comp\huggingface-funding-comp\x10_1000_005_wA_wB_transitivity.csv'  # Update with the cleaned file path  
df = pd.read_csv(save_path + filename)      

# Normalize the weights to ensure weight_a + weight_b = 1  
df['total_weight'] = df['weight_a'] + df['weight_b']  
df['weight_a'] = df['weight_a'] / df['total_weight']  
df['weight_b'] = df['weight_b'] / df['total_weight']  

# Round the weights to 11 decimal points  
df['weight_a'] = df['weight_a'].round(11)  
df['weight_b'] = df['weight_b'].round(11)  

# Rename the 'weight_a' column to 'pred'
df.rename(columns={'weight_a': 'pred'}, inplace=True)

# Drop the total_weight and weight_b columns as they're no longer needed  
df.drop(columns=['total_weight', 'weight_b'], inplace=True)  

# Sort the DataFrame by 'id'  
df.sort_values(by='id', inplace=True)  

# Save the modified DataFrame to a new CSV file  
output_path = r'submission/XGB_BART_metrics_and_summary_result.csv'  
df.to_csv(output_path, index=False)  

print("Processed CSV saved to:", output_path)  


Processed CSV saved to: submission/XGB_BART_metrics_and_summary_result.csv


In [14]:
df

Unnamed: 0,id,pred
0,1,0.067631
1,8,0.444064
2,13,0.324559
3,15,0.358210
4,18,0.976285
...,...,...
1018,3400,0.380968
1019,3403,0.554686
1020,3405,0.411174
1021,3406,0.417776


### Check Transitivity Violations
This code is from FaezehShakouri https://github.com/FaezehShakouri/deepfunding/blob/main/model/src/transivity_check.py

In [32]:
import pandas as pd
import itertools

# Load predictions and original data
predictions_df = pd.read_csv(output_path)
test_data = pd.read_csv("raw_dataset/hf/test.csv")

# Create a dictionary mapping id to prediction
id_to_pred = dict(zip(predictions_df['id'], predictions_df['pred']))

# Get pairs from original data
pairs = []
for _, row in test_data.iterrows():
    if pd.notna(row['project_a']) and pd.notna(row['project_b']):
        pairs.append((row['id'], row['project_a'], row['project_b']))

# Build graph of connected pairs
connected_pairs = set()
for id1, proj_a, proj_b in pairs:
    connected_pairs.add((id1, proj_a))
    connected_pairs.add((id1, proj_b))

# Function to check transitivity for a triplet
def check_transitivity(id1, id2, id3):
    pred1 = id_to_pred[id1]
    pred2 = id_to_pred[id2]
    pred3 = id_to_pred[id3]
    
    # Check if pred1 < pred2 and pred2 < pred3
    if pred1 < pred2 and pred2 < pred3:
        # If true, pred1 should be < pred3
        if not pred1 < pred3:
            return False
    return True

# Find all inconsistencies among connected triplets
inconsistencies = []
for id1, id2, id3 in itertools.combinations(id_to_pred.keys(), 3):
    # Check if these IDs form connected pairs in original data
    if ((id1, id2) in connected_pairs and 
        (id2, id3) in connected_pairs and
        (id1, id3) in connected_pairs):
        if not check_transitivity(id1, id2, id3):
            inconsistencies.append((id1, id2, id3))

# Print results
print(f"Found {len(inconsistencies)} transitivity violations")
print("\nDetailed violations:")
for id1, id2, id3 in inconsistencies:
    print(f"\nViolation between IDs {id1}, {id2}, {id3}:")
    print(f"Pred({id1}) = {id_to_pred[id1]:.4f}")
    print(f"Pred({id2}) = {id_to_pred[id2]:.4f}")
    print(f"Pred({id3}) = {id_to_pred[id3]:.4f}")

Found 0 transitivity violations

Detailed violations:
