In [3]:
import pandas as pd  
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder  
import xgboost as xgb  
from sklearn.metrics import mean_squared_error, r2_score  
import numpy as np  
  


In [13]:
# Function to extract repository name from URL  
def extract_repo_name(url):  
    try:  
        return url.split('/')[-2] + '/' + url.split('/')[-1]  
    except:  
        return None  
  
# Load the training dataset  
training_data_path = r'D:\0000Pond\funding comp\enriched_dataset\enriched_aug_dataset.csv'  
dataset_aug = pd.read_csv(training_data_path)  
  
# Load the test dataset  
test_data_path = r'D:\0000Pond\funding comp\enriched_dataset\enriched_test.csv'  
test_data = pd.read_csv(test_data_path)  
  


In [14]:
# Extract repository names for both datasets  
dataset_aug['repo_name_a'] = dataset_aug['project_a'].apply(extract_repo_name)  
dataset_aug['repo_name_b'] = dataset_aug['project_b'].apply(extract_repo_name)  
test_data['repo_name_a'] = test_data['project_a'].apply(extract_repo_name)  
test_data['repo_name_b'] = test_data['project_b'].apply(extract_repo_name)  
  
# Check for NaN values in the training and test datasets  
print("NaN values in training dataset:")  
print(dataset_aug.isnull().sum())  
print("\nNaN values in test dataset:")  
print(test_data.isnull().sum())  
  


NaN values in training dataset:
id                              0
project_a                       0
project_b                       0
weight_a                        0
weight_b                        0
                               ..
log_watchers_b_project_b        0
log_forks_b_project_b           0
log_commit_count_b_project_b    0
repo_name_a                     0
repo_name_b                     0
Length: 112, dtype: int64

NaN values in test dataset:
id                              0
project_a                       0
project_b                       0
total_amount_usd                0
funder                          0
                               ..
log_watchers_b_project_b        0
log_forks_b_project_b           0
log_commit_count_b_project_b    0
repo_name_a                     0
repo_name_b                     0
Length: 110, dtype: int64


In [15]:
# Get unique 'funder' values from the test dataset  
test_funders = test_data['funder'].unique()  
  
# Filter the training dataset to only include rows where 'funder' is in the test dataset  
dataset_aug = dataset_aug[dataset_aug['funder'].isin(test_funders)]  
  
# Calculate logarithmic features for 'size' if they don't exist  
if 'log_size_project_a' not in dataset_aug.columns:  
    dataset_aug['log_size_project_a'] = np.log1p(dataset_aug['size_project_a'])  
if 'log_size_project_b' not in dataset_aug.columns:  
    dataset_aug['log_size_project_b'] = np.log1p(dataset_aug['size_project_b'])  
if 'subscribers_count_project_a' not in dataset_aug.columns:  
    dataset_aug['log_subscribers_count_project_a'] = np.log1p(dataset_aug['subscribers_count_project_a'])  
if 'subscribers_count_project_b' not in dataset_aug.columns:  
    dataset_aug['log_subscribers_count_project_b'] = np.log1p(dataset_aug['subscribers_count_project_b'])
  
if 'log_size_project_a' not in test_data.columns:  
    test_data['log_size_project_a'] = np.log1p(test_data['size_project_a'])  
if 'log_size_project_b' not in test_data.columns:  
    test_data['log_size_project_b'] = np.log1p(test_data['size_project_b'])  
if 'log_subscribers_count_project_a' not in test_data.columns:  
    test_data['log_subscribers_count_project_a'] = np.log1p(test_data['subscribers_count_project_a'])  
if 'log_subscribers_count_project_b' not in test_data.columns:  
    test_data['log_subscribers_count_project_b'] = np.log1p(test_data['subscribers_count_project_b']) 


# Select features and target variable  
features = [  
    'total_amount_usd',  
    'is_private_project_a', 'has_homepage_project_a',  
    'log_size_project_a', 'stars_project_a', 'watchers_project_a',  
    'has_projects_project_a', 'has_pages_project_a', 'has_wiki_project_a',  
    'has_discussions_project_a', 'forks_project_a', 'is_archived_project_a',  
    'is_disabled_project_a', 'open_issues_project_a', 'subscribers_count_project_a',  
    'age_days_project_a', 'days_since_update_project_a', 'stars_ratio_project_a',  
    'watchers_ratio_project_a', 'forks_ratio_project_a', 'size_ratio_project_a',  
    'log_stars_project_a',  
    'log_watchers_project_a', 'log_forks_project_a', 'log_commit_count_project_a',  
    'log_stars_b_project_a', 'log_watchers_b_project_a', 'log_forks_b_project_a',  
    'log_commit_count_b_project_a', 'is_private_project_b', 'has_homepage_project_b',  
    'log_size_project_b', 'stars_project_b', 'watchers_project_b',  
    'has_projects_project_b', 'has_pages_project_b', 'has_wiki_project_b',  
    'has_discussions_project_b', 'forks_project_b', 'is_archived_project_b',  
    'is_disabled_project_b', 'open_issues_project_b', 'subscribers_count_project_b',  
    'age_days_project_b', 'days_since_update_project_b', 'stars_ratio_project_b',  
    'watchers_ratio_project_b', 'forks_ratio_project_b', 'size_ratio_project_b',  
    'log_stars_project_b',  
    'log_watchers_project_b', 'log_forks_project_b', 'log_commit_count_project_b',  
    'log_stars_b_project_b', 'log_watchers_b_project_b', 'log_forks_b_project_b',  
    'log_commit_count_b_project_b',  
    'repo_name_a', 'repo_name_b'  
]  
  
# Remove plain count features if logarithmic features are present  
log_features = [  
    'log_subscribers_count_project_a','log_subscribers_count_project_b',
    'log_size_project_a', 'log_stars_project_a', 'log_watchers_project_a', 'log_forks_project_a', 'log_commit_count_project_a',  
    'log_stars_b_project_a', 'log_watchers_b_project_a', 'log_forks_b_project_a', 'log_commit_count_b_project_a',  
    'log_size_project_b', 'log_stars_project_b', 'log_watchers_project_b', 'log_forks_project_b', 'log_commit_count_project_b',  
    'log_stars_b_project_b', 'log_watchers_b_project_b', 'log_forks_b_project_b', 'log_commit_count_b_project_b'  
]  
  
plain_features = [  
    'size_project_a', 'stars_project_a', 'watchers_project_a', 'forks_project_a', 'commit_count_project_a',  
    'stars_b_project_a', 'watchers_b_project_a', 'forks_b_project_a', 'commit_count_b_project_a',  
    'size_project_b', 'stars_project_b', 'watchers_project_b', 'forks_project_b', 'commit_count_project_b',  
    'stars_b_project_b', 'watchers_b_project_b', 'forks_b_project_b', 'commit_count_b_project_b',
    'subscribers_count_project_b', 'subscribers_count_project_a'

]  
  
# Remove plain features if log features are present  
features = [col for col in features if col not in plain_features or col in log_features]  
  
# Target variable  
target = 'weight_a'  
  
# Encode categorical variables  
label_encoders = {}  
for col in ['funder', 'quarter', 'repo_name_a', 'repo_name_b']:  
    if col in dataset_aug.columns:  
        le = LabelEncoder()  
        dataset_aug[col] = le.fit_transform(dataset_aug[col])  
        label_encoders[col] = le  
  
    if col in test_data.columns:  
        # Use transform with error handling for unseen labels  
        try:  
            test_data[col] = le.transform(test_data[col])  
        except ValueError:  
            # Assign a default value for unseen labels  
            test_data[col] = -1  
  
# Convert boolean columns to integer  
boolean_cols = [  
    'is_private_project_a', 'has_homepage_project_a', 'has_projects_project_a',  
    'has_pages_project_a', 'has_wiki_project_a', 'has_discussions_project_a',  
    'is_archived_project_a', 'is_disabled_project_a',  
    'is_private_project_b', 'has_homepage_project_b', 'has_projects_project_b',  
    'has_pages_project_b', 'has_wiki_project_b', 'has_discussions_project_b',  
    'is_archived_project_b', 'is_disabled_project_b'  
]  
  
for col in boolean_cols:  
    if col in dataset_aug.columns:  
        dataset_aug[col] = dataset_aug[col].astype(int)  
    if col in test_data.columns:  
        test_data[col] = test_data[col].astype(int)  
  
# Filter out features that are not present in the dataset  
features = [col for col in features if col in dataset_aug.columns and col in test_data.columns]  

# Select features and target  
X = dataset_aug[features]  
y = dataset_aug[target]  
  
# Split the data into training and testing sets  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  
  


In [16]:


# Initialize the XGBoost regressor  
xgb_regressor = xgb.XGBRegressor(  
    objective='reg:squarederror',  
    n_estimators=100,  
    learning_rate=0.1,  
    max_depth=8,  
    subsample=0.8,  
    colsample_bytree=0.2,  
    random_state=42  
)  
  
# Train the model  
xgb_regressor.fit(X_train, y_train)  
  
# Make predictions  
y_pred = xgb_regressor.predict(X_test)  
  
# Ensure predictions are within the range [0, 1]  
y_pred = y_pred.clip(min=0, max=1)  
  
# Calculate evaluation metrics  
mse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)  
  
print(f"Mean Squared Error: {mse}")  
print(f"R^2 Score: {r2}")  
  
# Ensure all necessary features are present in the test dataset  
# If a feature is not present, you can either add it with a default value or remove it from the feature list  
# Here, we will remove features that are not present in the test dataset  
test_features = [col for col in features if col in test_data.columns]  
  
# Select features for test data  
X_test_final = test_data[test_features]  
  
# Make predictions on the test dataset  
y_pred_test = xgb_regressor.predict(X_test_final)  
  
# Ensure predictions are within the range [0, 1]  
y_pred_test = y_pred_test.clip(min=0, max=1)  
  
# Calculate weight_b as 1 - weight_a  
y_pred_test_b = 1 - y_pred_test  
  
# Create a DataFrame for the predictions  
predictions_df = pd.DataFrame({  
    'id': test_data['id'],  
    'pred': y_pred_test 
})  
  
# Save the predictions to a CSV file  
save_path = 'D:/0000Pond/funding comp/submission_csv/'
filename = 'pond_xgb_enr_aug_funderfilter_logfeat_n100_col02.csv'
predictions_df.to_csv(save_path+filename, index=False)  
  
print("Predictions saved to",save_path+filename)  


Mean Squared Error: 0.03022538089943096
R^2 Score: 0.8121208213357709
Predictions saved to D:/0000Pond/funding comp/submission_csv/pond_xgb_enr_aug_funderfilter_logfeat_n100_col02.csv


In [17]:
print(predictions_df.head(40))

       id      pred
0   20884  0.160022
1   20885  0.484619
2   20886  0.579444
3   20887  0.943566
4   20888  0.386261
5   20889  0.432100
6   20890  0.430499
7   20891  0.550144
8   20892  0.296190
9   20893  0.732867
10  20894  0.664330
11  20895  0.339684
12  20896  0.554359
13  20897  0.547065
14  20898  0.720543
15  20899  0.694483
16  20900  0.801367
17  20901  0.621893
18  20902  0.536369
19  20903  0.904986
20  20904  0.778745
21  20905  0.135827
22  20906  0.207806
23  20907  0.417224
24  20908  0.524013
25  20909  0.573763
26  20910  0.274732
27  20911  0.455478
28  20912  0.702933
29  20913  0.463720
30  20914  0.435110
31  20915  0.601142
32  20916  0.821220
33  20917  0.920835
34  20918  0.754515
35  20919  0.438765
36  20920  0.597449
37  20921  0.383764
38  20922  0.620282
39  20923  0.783024
