## Data loading

In [3]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import json

from python_proj.utils.util import safe_save_fig, subtract_dict, Counter
import python_proj.utils.exp_utils as exp_utils

# Locations of the input dataset and of the figures written by this notebook.
file_name = "dataset_90_days_started_11_07_23_preprocessed"
base_path = exp_utils.BASE_PATH
data_path = f'{base_path}/final_data/{file_name}.csv'
figure_base_path = f"{base_path}/figures/modelling/{file_name}/"

# Names of the columns that are treated specially, plus the counter that
# supplies the `random_state` of every model created further down.
pr_merged_key = 'PullRequestIsMerged'
ftc_key = 'SubmitterIsFirstTimeContributor'
seed_counter = Counter()


# Reads the complete dataset. When the stray 'Unnamed: 1' column is present,
# it is removed together with "Project Name.1".
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
if 'Unnamed: 1' in df.columns:
    df = df.drop(columns=['Unnamed: 1', "Project Name.1"])
print(df.columns)

# Rows whose first-time-contributor flag is set.
ftc_df = df[df[ftc_key]]
print(f'{len(df)=}')
print(f'{len(ftc_df)=}')

# Splits the columns into metadata, the dependent variable, and predictors.
# The first-time-contributor flag is kept out of the predictors as well.
metadata_fields = ['ID', 'Project Name',
                   'Submitter ID', 'PR Number', 'Closed At']
dependent_fields = [pr_merged_key]
independent_fields = [
    field for field in df.columns
    if field not in (*metadata_fields, *dependent_fields, ftc_key)
]

df[independent_fields].describe()


Index(['Project Name', 'ID', 'Submitter ID', 'PR Number', 'Closed At',
       'PullRequestIsMerged', 'ControlIntegratedBySameUser',
       'ControlPullRequestHasComments', 'ControlHasHashTagInDescription',
       'IntraProjectSubmitterPullRequestSuccessRate',
       'EcosystemExperienceSubmitterPullRequestSuccessRate',
       'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'SubmitterIsFirstTimeContributor',
       'ControlPullRequestHasCommentByExternalUser',
       'ln(1 + ControlPullRequestLifeTimeInMinutes)',
       'ln(1 + ControlNumberOfCommitsInPullRequest)',
       'ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator)',
       'ln(1 + IntraProjectSubmitterPullRequestSubmissionCount)',
       'ln(1 + IntraProjectSubmitterPullRequestCommentCount)',
       'ln(1 + SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator)',
       'ln(1 + SharedExperiencePullReq

Unnamed: 0,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator),ln(1 + IntraProjectSubmitterPullRequestSubmissionCount),ln(1 + IntraProjectSubmitterPullRequestCommentCount),ln(1 + SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator),...,ln(1 + SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter),ln(1 + SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter),ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + EcosystemExperienceSubmitterIssueCommentCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality)
count,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,...,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0
mean,0.503273,0.4034645,0.0988974,0.05955645,0.4193296,0.1184218,0.2985197,0.1691039,0.1420826,0.05359085,...,0.01639801,0.03941424,0.0872459,0.1451003,0.0115162,0.01976215,0.009566235,0.01757943,0.08884955,0.07438409
std,0.4710991,0.4525011,0.2911891,0.229114,0.2257136,0.07155493,0.1868852,0.1844545,0.1853726,0.1297009,...,0.07423245,0.1070026,0.1245128,0.1815019,0.0541512,0.08022241,0.05261483,0.08245254,0.1050997,0.09868875
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.236372,0.07525668,0.1658047,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.6875,0.0,0.0,0.0,0.4430126,0.07525668,0.2972018,0.09318519,0.0,0.0,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.0,0.0,0.05426778,0.03550956
75%,1.0,0.9399033,0.0,0.0,0.5858949,0.1505134,0.4381723,0.3095545,0.2734346,0.0,...,0.0,0.0,0.1416667,0.2575606,0.0,0.0,0.0,0.0,0.136633,0.1119194
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
from sklearn.model_selection import train_test_split

# Holds out 20% of the rows for evaluation. The split is seeded so that a
# fresh "Restart & Run All" reproduces the same partition (and therefore the
# same metrics); without a seed every run shuffles the data differently.
train, test = train_test_split(df, test_size=0.2, random_state=42)

print(f'{len(train)=}, {len(test)=}')

# Sanity check: the two partitions together must cover the whole dataset.
assert len(train) + len(test) == len(df)

# Predictor matrix and dependent variable of the training partition.
train_predictors = train[independent_fields]
train_dependent = train[pr_merged_key]

# Predictor matrix and dependent variable of the held-out partition.
test_predictors = test[independent_fields]
test_dependent = test[pr_merged_key]


len(train)=979694, len(test)=244924


## Model Creation

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, roc_auc_score


def calculate_metrics(predicted_labels, true_labels, sample_weights=None):
    """Scores a set of predictions against the ground truth.

    Args:
        predicted_labels: labels produced by the model.
        true_labels: the corresponding ground-truth labels.
        sample_weights: optional per-sample weights, forwarded to every
            scorer as ``sample_weight``.

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    # Every scorer takes the ground truth first and the predictions second.
    return {
        metric_name: scorer(true_labels, predicted_labels,
                            sample_weight=sample_weights)
        for metric_name, scorer in scorers
    }


def create_model(
    train_predictors: pd.DataFrame,
    train_labels: pd.Series,
    test_predictors: pd.DataFrame,
    test_labels: pd.Series
):
    """Fits a random forest on the training data and evaluates it on the test data.

    The forest is a ``RandomForestRegressor``; its continuous predictions are
    turned into binary labels by thresholding at 0.5. The seed, the metrics,
    the confusion matrix and the classification report are printed.

    Args:
        train_predictors: predictor columns used for fitting.
        train_labels: dependent variable used for fitting.
        test_predictors: predictor columns used for evaluation.
        test_labels: dependent variable used for evaluation.

    Returns:
        A tuple ``(rf, metrics)`` with the fitted model and the dict produced
        by ``calculate_metrics``.
    """
    # Every call draws a new seed from the shared counter; it is printed so a
    # specific model can be traced back to its seed.
    random_state = seed_counter.get_next()
    print(f'{random_state=}')
    rf = RandomForestRegressor(
        n_estimators=100,
        random_state=random_state)

    rf.fit(train_predictors, train_labels)

    # Vectorised thresholding of the regression output into boolean labels
    # (avoids a Python-level loop over every test sample).
    predictions = rf.predict(test_predictors) >= 0.5

    # Accuracy, precision, recall and F1.
    metrics = calculate_metrics(predictions, test_labels)
    print(f'{metrics=}\n')

    # Confusion matrix and per-class report.
    conf = confusion_matrix(test_labels, predictions)
    print(f'Confusion matrix:\n{conf}\n')
    print("Classification report:")
    print(classification_report(test_labels, predictions))

    return rf, metrics


In [6]:
import regex as re
from python_proj.utils.util import subtract_dict

# Trains and evaluates the model that has access to every predictor.
full_rf_model, full_rf_metrics = create_model(
    train_predictors=train_predictors,
    train_labels=train_dependent,
    test_predictors=test_predictors,
    test_labels=test_dependent,
)

print(json.dumps(full_rf_metrics, indent=4))


random_state=43


KeyboardInterrupt: 

In [None]:

# Creates control model.
# The control predictors are the independent fields with "Control" in their
# name. They are selected from `independent_fields` because slicing
# `df.columns` with the `metadata_fields` list raises a TypeError.
control_fields = [field for field in independent_fields
                  if "Control" in field]
control_rf_model, control_rf_metrics = create_model(
    train[control_fields],
    train_dependent,
    test[control_fields],
    test_dependent
)

print(json.dumps(control_rf_metrics, indent=4))
print("Comparison between control model and full model (negatives indicate control is better):")
diff = subtract_dict(full_rf_metrics, control_rf_metrics)
print(json.dumps(diff, indent=4))


In [None]:
# Creates measured information model.
# Uses every independent field that is not a control variable. The fields are
# derived from `independent_fields` (rather than from a slice of `df.columns`)
# so that neither the dependent variable nor the first-time-contributor flag
# can end up among the predictors, and so that this cell does not rely on
# variables created by the control-model cell.
non_control_fields = [field for field in independent_fields
                      if "Control" not in field]
non_control_rf_model, non_control_rf_metrics = create_model(
    train[non_control_fields],
    train_dependent,
    test[non_control_fields],
    test_dependent
)

print(json.dumps(non_control_rf_metrics, indent=4))
print("Comparison between non-control model and full model (negatives indicate non-control is better):")
diff = subtract_dict(full_rf_metrics, non_control_rf_metrics)
print(json.dumps(diff, indent=4))


## Ablation Study

In [None]:
from typing import Iterator


def ablation_study(
    train_predictors: pd.DataFrame,
    train_labels: pd.Series,
    test_predictors: pd.DataFrame,
    test_labels: pd.Series,
    evaluated_fields: Iterator[str],
    baseline_metrics=None,
):
    """Trains one model per evaluated field, each time leaving that field out.

    Args:
        train_predictors: predictor columns used for fitting.
        train_labels: dependent variable used for fitting.
        test_predictors: predictor columns used for evaluation.
        test_labels: dependent variable used for evaluation.
        evaluated_fields: names of the predictor columns to exclude, one
            model per name.
        baseline_metrics: metrics dict the ablated models are compared with.
            Defaults to the notebook-level ``full_rf_metrics``.

    Returns:
        A dict mapping each excluded field to a dict holding the metrics of
        the model trained without it ('Weighted metrics') and the result of
        ``subtract_dict(baseline_metrics, <those metrics>)``
        ('Weighted metrics difference').
    """
    if baseline_metrics is None:
        # Falls back to the metrics of the full model created earlier.
        baseline_metrics = full_rf_metrics

    f1_differences_per_feature = {}

    for excluded_feature in evaluated_fields:
        print(f"Creating model without: {excluded_feature}.")
        # Drops the *column*; `drop(label, index=1)` combines `labels` with
        # `index`, which pandas rejects.
        _train_predictors = train_predictors.drop(columns=[excluded_feature])
        _test_predictors = test_predictors.drop(columns=[excluded_feature])

        _, excl_rf_metrics = create_model(
            _train_predictors, train_labels, _test_predictors, test_labels
        )

        diff = subtract_dict(baseline_metrics, excl_rf_metrics)

        f1_differences_per_feature[excluded_feature] = {
            'Weighted metrics': excl_rf_metrics,
            'Weighted metrics difference': diff,
        }

    return f1_differences_per_feature


# Runs the ablation study over every independent field.
f1_diffs_per_feature = ablation_study(
    train_predictors=train_predictors,
    train_labels=train_dependent,
    test_predictors=test_predictors,
    test_labels=test_dependent,
    evaluated_fields=independent_fields,
)

print("Performance metrics ablation study vs. the full model (negatives suggest the ablation model is better):")
print(f1_diffs_per_feature)


## Partial Dependence Plots

In [None]:
from sklearn.inspection import PartialDependenceDisplay
from matplotlib import pyplot as plt


def create_partial_dependence_plots(
        rf_model, 
        used_predictors,
        used_labels,
        model_name: str = ""):
    """Writes one partial dependence plot per predictor column to disk.

    Args:
        rf_model: fitted estimator the partial dependence is computed for.
        used_predictors: data frame with the predictor columns; every column
            gets its own plot.
        used_labels: unused; partial dependence is computed from the model
            and the predictors only. Kept so existing calls remain valid.
        model_name: name of the sub-directory the figures are stored in.
    """
    # `from_estimator` takes the features to plot as its third argument (not
    # the labels), so the predictor columns are passed explicitly.
    feature_names = list(used_predictors.columns)

    # Collective partial dependence plot.
    PartialDependenceDisplay.from_estimator(
        rf_model, used_predictors, feature_names)

    fig = plt.gcf()

    # Collects the (x, y) data of every line drawn in the collective figure.
    # NOTE(review): assumes the axes hold exactly one line per feature, in
    # the order of `feature_names` - confirm against the sklearn version used.
    lines = [
        (line.get_xdata(), line.get_ydata())
        for ax in fig.axes
        for line in ax.lines
    ]

    # Re-draws every curve on its own and stores it as a separate figure.
    for (x, y), label in zip(lines, feature_names):
        plt.clf()
        plt.plot(x, y, linestyle='-', color='#e69d00')
        plt.xlabel(label)
        plt.ylabel('Partial Dependence')
        plt.tight_layout()
        output_path = f"{figure_base_path}/partial-dependence/{model_name}/{label}.png"
        safe_save_fig(output_path)


In [None]:
# Partial dependence plots of the full model on the held-out data.
# The held-out labels are stored in `test_dependent`; `test_dependents` was
# never defined and raised a NameError.
create_partial_dependence_plots(
    full_rf_model,
    test_predictors,
    test_dependent,
    model_name="full_model"
)
