## Data loading

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import json

import regex as re
from python_proj.utils.util import safe_save_fig, subtract_dict, Counter
import python_proj.utils.exp_utils as exp_utils

file_name = "dataset_90_days_started_11_07_23_preprocessed"
base_path = exp_utils.BASE_PATH
data_path = f'{base_path}/final_data/{file_name}.csv'
figure_base_path = f"{base_path}/figures/modelling/{file_name}/"

# Meta stuff.
pr_merged_key = 'PullRequestIsMerged'
ftc_key = 'SubmitterIsFirstTimeContributor'
seed_counter = Counter()


# Loads regular dataframe.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
if 'Unnamed: 1' in df.columns:
    df = df.drop(['Unnamed: 1', "Project Name.1"], axis=1)
print(df.columns)

# Loads first-time contributor data frame.
ftc_df = df[df[ftc_key]]
print(f'{len(df)=}')
print(f'{len(ftc_df)=}')

# Creates sub-datasets.
metadata_fields = ['ID', 'Project Name',
                   'Submitter ID', 'PR Number', 'Closed At']
dependent_fields = [pr_merged_key]
independent_fields = [field for field in df.columns
                      if (field not in metadata_fields
                          and field not in dependent_fields
                          and field != ftc_key)]

df[independent_fields].describe()


Index(['Project Name', 'ID', 'Submitter ID', 'PR Number', 'Closed At',
       'PullRequestIsMerged', 'ControlIntegratedBySameUser',
       'ControlPullRequestHasComments', 'ControlHasHashTagInDescription',
       'IntraProjectSubmitterPullRequestSuccessRate',
       'EcosystemExperienceSubmitterPullRequestSuccessRate',
       'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'SubmitterIsFirstTimeContributor',
       'ControlPullRequestHasCommentByExternalUser',
       'ln(1 + ControlPullRequestLifeTimeInMinutes)',
       'ln(1 + ControlNumberOfCommitsInPullRequest)',
       'ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator)',
       'ln(1 + IntraProjectSubmitterPullRequestSubmissionCount)',
       'ln(1 + IntraProjectSubmitterPullRequestCommentCount)',
       'ln(1 + SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator)',
       'ln(1 + SharedExperiencePullReq

Unnamed: 0,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator),ln(1 + IntraProjectSubmitterPullRequestSubmissionCount),ln(1 + IntraProjectSubmitterPullRequestCommentCount),ln(1 + SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator),...,ln(1 + SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter),ln(1 + SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter),ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + EcosystemExperienceSubmitterIssueCommentCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality)
count,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,...,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0
mean,0.5030928,0.4031557,0.09875442,0.05957661,0.419413,0.1184107,0.2986152,0.1690142,0.1422607,0.05350922,...,0.01635594,0.0394005,0.08727819,0.1451409,0.01149311,0.01972716,0.009570043,0.0175902,0.08877349,0.0743104
std,0.4710835,0.4523914,0.2910103,0.2291538,0.2256904,0.07150637,0.1869018,0.1843826,0.1856786,0.1295702,...,0.07410553,0.1069538,0.1245486,0.181499,0.05408212,0.08011372,0.05264199,0.08248895,0.1049746,0.09861278
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.2364295,0.07525668,0.1658837,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.6842105,0.0,0.0,0.0,0.4431563,0.07525668,0.2973435,0.09315576,0.0,0.0,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.0,0.0,0.05422832,0.03548693
75%,1.0,0.9393939,0.0,0.0,0.5859729,0.1505134,0.4383811,0.3094567,0.2738132,0.0,...,0.0,0.0,0.1416667,0.2575606,0.0,0.0,0.0,0.0,0.1365664,0.1118328
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [2]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

print(f'{len(train)=}, {len(test)=}')

train_predictors = train[independent_fields]
train_dependent = train[pr_merged_key]

test_predictors = test[independent_fields]
test_dependent = test[pr_merged_key]


len(train)=979694, len(test)=244924


## Model Creation

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, roc_auc_score


def calculate_metrics(predicted_labels, true_labels, sample_weights=None):
    return {
        "accuracy": accuracy_score(true_labels, predicted_labels, sample_weight=sample_weights),
        "precision": precision_score(true_labels, predicted_labels, sample_weight=sample_weights),
        "recall": recall_score(true_labels, predicted_labels, sample_weight=sample_weights),
        "f1": f1_score(true_labels, predicted_labels, sample_weight=sample_weights)
    }


def create_model(
    train_predictors: pd.DataFrame,
    train_labels: pd.Series,
    test_predictors: pd.DataFrame,
    test_labels: pd.Series
):
    # Model creation and predictions.
    random_state = seed_counter.get_next()
    print(f'{random_state=}')
    rf = RandomForestRegressor(
        n_estimators=100,
        random_state=random_state)

    rf.fit(train_predictors, train_labels)

    predictions = rf.predict(test_predictors)
    predictions = [pred >= 0.5 for pred in predictions]

    # F1 scores.
    conf = confusion_matrix(test_labels, predictions)

    metrics = calculate_metrics(predictions, test_labels)

    print(f'{metrics=}\n')

    # Other metrics.
    print(f'Confusion matrix:\n{conf}\n')
    print("Classification report:")
    print(classification_report(test_labels, predictions))

    return rf, metrics


In [4]:
import regex as re
from python_proj.utils.util import subtract_dict

# Creates full model.
full_rf_model, full_rf_metrics = create_model(
    train_predictors,
    train_dependent,
    test_predictors,
    test_dependent
)

print("Performance metrics full model:")
print(json.dumps(full_rf_metrics, indent=4))


random_state=43
metrics={'accuracy': 0.8720337737420588, 'precision': 0.8968192397207138, 'recall': 0.9464803913440013, 'f1': 0.9209808439937676}

Confusion matrix:
[[ 30934  21014]
 [ 10328 182648]]

Classification report:
              precision    recall  f1-score   support

       False       0.75      0.60      0.66     51948
        True       0.90      0.95      0.92    192976

    accuracy                           0.87    244924
   macro avg       0.82      0.77      0.79    244924
weighted avg       0.87      0.87      0.87    244924

Performance metrics full model:
{
    "accuracy": 0.8720337737420588,
    "precision": 0.8968192397207138,
    "recall": 0.9464803913440013,
    "f1": 0.9209808439937676
}


In [5]:

# Creates control model.
control_fields = [field for field in independent_fields
                  if re.match(r'.*Control.*', field)]
print(control_fields)
control_rf_model, control_rf_metrics = create_model(
    train[control_fields],
    train_dependent,
    test[control_fields],
    test_dependent
)

print(json.dumps(control_rf_metrics, indent=4))
print("Comparison between control model and full model (negatives indicate control is better):")
diff = subtract_dict(full_rf_metrics, control_rf_metrics)
print(json.dumps(diff, indent=4))


['ControlIntegratedBySameUser', 'ControlPullRequestHasComments', 'ControlHasHashTagInDescription', 'ControlPullRequestHasCommentByExternalUser', 'ln(1 + ControlPullRequestLifeTimeInMinutes)', 'ln(1 + ControlNumberOfCommitsInPullRequest)', 'ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator)']
random_state=44
metrics={'accuracy': 0.7990437850108605, 'precision': 0.8508065106517972, 'recall': 0.9033558577232402, 'f1': 0.8762940752153335}

Confusion matrix:
[[ 21379  30569]
 [ 18650 174326]]

Classification report:
              precision    recall  f1-score   support

       False       0.53      0.41      0.46     51948
        True       0.85      0.90      0.88    192976

    accuracy                           0.80    244924
   macro avg       0.69      0.66      0.67    244924
weighted avg       0.78      0.80      0.79    244924

{
    "accuracy": 0.7990437850108605,
    "precision": 0.8508065106517972,
    "recall": 0.9033558577232402,
    "f1": 0.8762940752153335
}
Compar

In [6]:
# Creates measured information model.
non_control_fields = [field for field in independent_fields
                      if field not in control_fields]
non_control_rf_model, non_control_rf_metrics = create_model(
    train[non_control_fields],
    train_dependent,
    test[non_control_fields],
    test_dependent
)

print(json.dumps(non_control_rf_metrics, indent=4))
print("Comparison between non-control model and full model (negatives indicate non-control is better):")
diff = subtract_dict(full_rf_metrics, non_control_rf_metrics)
print(json.dumps(diff, indent=4))


random_state=45
metrics={'accuracy': 0.7909882249187503, 'precision': 0.8083762527839644, 'recall': 0.9630005803830528, 'f1': 0.8789398009762004}

Confusion matrix:
[[  7896  44052]
 [  7140 185836]]

Classification report:
              precision    recall  f1-score   support

       False       0.53      0.15      0.24     51948
        True       0.81      0.96      0.88    192976

    accuracy                           0.79    244924
   macro avg       0.67      0.56      0.56    244924
weighted avg       0.75      0.79      0.74    244924

{
    "accuracy": 0.7909882249187503,
    "precision": 0.8083762527839644,
    "recall": 0.9630005803830528,
    "f1": 0.8789398009762004
}
Comparison between non-control model and full model (negatives indicate non-control is better):
{
    "accuracy": 0.0810455488233085,
    "precision": 0.08844298693674935,
    "recall": -0.016520189039051503,
    "f1": 0.04204104301756717
}


### Non-control model without intra-project factors

In [7]:
# Creates measured information model.
non_control_fields_wo_intra = [field for field in non_control_fields
                               if not re.match(r'.*Intra.*', field)]
non_control_wo_intra_rf_model, non_control_wo_intra_rf_metrics = create_model(
    train[non_control_fields_wo_intra],
    train_dependent,
    test[non_control_fields_wo_intra],
    test_dependent
)

print(json.dumps(non_control_rf_metrics, indent=4))
print("Comparison between non-control model (without intra-project factors) and full model (negatives indicate non-control is better):")
diff = subtract_dict(full_rf_metrics, non_control_rf_metrics)
print(json.dumps(diff, indent=4))


random_state=46
metrics={'accuracy': 0.7808299717463376, 'precision': 0.7983703754476755, 'recall': 0.9657263079346654, 'f1': 0.8741099989681148}

Confusion matrix:
[[  4882  47066]
 [  6614 186362]]

Classification report:
              precision    recall  f1-score   support

       False       0.42      0.09      0.15     51948
        True       0.80      0.97      0.87    192976

    accuracy                           0.78    244924
   macro avg       0.61      0.53      0.51    244924
weighted avg       0.72      0.78      0.72    244924

{
    "accuracy": 0.7909882249187503,
    "precision": 0.8083762527839644,
    "recall": 0.9630005803830528,
    "f1": 0.8789398009762004
}
Comparison between non-control model (without intra-project factors) and full model (negatives indicate non-control is better):
{
    "accuracy": 0.0810455488233085,
    "precision": 0.08844298693674935,
    "recall": -0.016520189039051503,
    "f1": 0.04204104301756717
}


## Ablation Study

In [8]:
from typing import Iterator


def ablation_study(
    train_predictors: pd.DataFrame,
    train_labels: pd.Series,
    test_predictors: pd.DataFrame,
    test_labels: pd.Series,
    evaluated_fields: Iterator[str],
):
    f1_differences_per_feature = {}

    for excluded_feature in evaluated_fields:
        print(f"Creating model without: {excluded_feature}.")
        _train_predictors = train_predictors.drop(excluded_feature, axis=1)
        _test_predictors = test_predictors.drop(excluded_feature, axis=1)

        _, excl_rf_metrics = create_model(
            _train_predictors, train_labels, _test_predictors, test_labels
        )

        diff = subtract_dict(full_rf_metrics, excl_rf_metrics)

        f1_differences_per_feature[excluded_feature] = {
            'Weighted metrics': excl_rf_metrics,
            'Weighted metrics difference': diff,
        }

    return f1_differences_per_feature


f1_diffs_per_feature = ablation_study(
    train_predictors, train_dependent,
    test_predictors, test_dependent,
    independent_fields
)

print("Performance metrics ablation study vs. the full model (negatives suggest the ablation model is better):")
print(f1_diffs_per_feature)


Creating model without: ControlIntegratedBySameUser.
random_state=47
metrics={'accuracy': 0.8485979324198527, 'precision': 0.8795701123901907, 'recall': 0.9359972224525329, 'f1': 0.9069068013616782}

Confusion matrix:
[[ 27217  24731]
 [ 12351 180625]]

Classification report:
              precision    recall  f1-score   support

       False       0.69      0.52      0.59     51948
        True       0.88      0.94      0.91    192976

    accuracy                           0.85    244924
   macro avg       0.78      0.73      0.75    244924
weighted avg       0.84      0.85      0.84    244924

Creating model without: ControlPullRequestHasComments.
random_state=48
metrics={'accuracy': 0.8679222942627101, 'precision': 0.8921500173337304, 'recall': 0.9468275847773816, 'f1': 0.9186759482023938}

Confusion matrix:
[[ 29860  22088]
 [ 10261 182715]]

Classification report:
              precision    recall  f1-score   support

       False       0.74      0.57      0.65     51948
        

## Partial Dependence Plots

In [9]:
from sklearn.inspection import PartialDependenceDisplay
from matplotlib import pyplot as plt


def create_partial_dependence_plots(
        rf_model: RandomForestRegressor,
        used_predictors: pd.DataFrame,
        used_labels: pd.Series,
        model_name: str = ""):

    # Collective partial dependence plot.
    PartialDependenceDisplay.from_estimator(
        rf_model, used_predictors, used_labels)

    fig = plt.gcf()
    axs = fig.axes

    lines = []
    for ax in axs:
        __lines = list([(line.get_xdata(), line.get_ydata())
                        for line in ax.lines])
        lines.extend(__lines)

    for (x, y), label in zip(lines, used_predictors.columns):
        plt.clf()
        plt.plot(x, y, linestyle='-', color='#e69d00')
        plt.xlabel(label)
        plt.ylabel('Partial Dependence')
        plt.tight_layout()
        output_path = f"{figure_base_path}/partial-dependence/{model_name}/{label}.png"
        safe_save_fig(output_path)


In [10]:
create_partial_dependence_plots(
    full_rf_model,
    test_predictors,
    test_dependent,
    model_name="full_model"
)


ValueError: all features must be in [0, 36] or [-37, 0]