In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import json

from python_proj.utils.util import safe_save_fig, subtract_dict, Counter
import python_proj.utils.exp_utils as exp_utils

# Where the experiment's files live: the input CSV and the figure output folder
# are both derived from the project base path and the dataset name.
base_path = exp_utils.BASE_PATH
file_name = "dataset_90_days_started_11_07_23_preprocessed"
data_path = f'{base_path}/final_data/{file_name}.csv'
figure_base_path = f"{base_path}/figures/modelling/{file_name}/"

# Column keys used throughout the notebook, plus the shared seed source
# that model construction draws its random states from.
seed_counter = Counter()
pr_merged_key = 'PullRequestIsMerged'
ftc_key = 'SubmitterIsFirstTimeContributor'


# Reads the complete dataset; the first CSV row holds the column names.
df: pd.DataFrame = pd.read_csv(data_path, header=0)
if 'Unnamed: 1' in df.columns:
    # These two columns are removed together whenever 'Unnamed: 1' is present.
    df = df.drop(columns=['Unnamed: 1', "Project Name.1"])
print(df.columns)

# Rows for which the first-time-contributor column is truthy.
ftc_df = df[df[ftc_key]]
print(f'{len(df)=}')
print(f'{len(ftc_df)=}')

# Column groups: the dependent variable, bookkeeping metadata, and
# everything else (which is treated as a predictor).
dependent_fields = [pr_merged_key]
metadata_fields = ['ID', 'Project Name', 'Submitter ID', 'PR Number', 'Closed At']
independent_fields = [
    column for column in df.columns
    if not (column in metadata_fields or column in dependent_fields)
]

# Summary statistics of the predictors (rendered as the cell output).
df[independent_fields].describe()


In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed keeps the train/test partition identical across kernel restarts
# and re-runs of this cell; without it every run scores the models on a
# different hold-out set, so their metrics are not comparable between runs.
split_seed = 42
train, test = train_test_split(df, test_size=0.2, random_state=split_seed)

print(f'{len(train)=}, {len(test)=}')

# Predictors and dependent variable used for fitting.
train_predictors = train[independent_fields]
train_dependent = train[pr_merged_key]

# The same columns for the held-out 20% used for evaluation.
test_predictors = test[independent_fields]
test_dependent = test[pr_merged_key]


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, roc_auc_score


def calculate_metrics(predicted_labels, true_labels, sample_weights=None):
    """
    Scores predictions against the true labels.

    :param predicted_labels: labels produced by the model.
    :param true_labels: ground-truth labels, aligned with ``predicted_labels``.
    :param sample_weights: optional per-sample weights, forwarded unchanged
        to every scorer as ``sample_weight``.
    :return: dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    # Each scorer is called as scorer(y_true, y_pred, sample_weight=...).
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {
        name: scorer(true_labels, predicted_labels,
                     sample_weight=sample_weights)
        for name, scorer in scorers
    }


def create_model(
    train_predictors: pd.DataFrame,
    train_labels: pd.Series,
    test_predictors: pd.DataFrame,
    test_labels: pd.Series,
    n_estimators: int = 100,
    threshold: float = 0.5
):
    """
    Fits a random forest regressor, binarises its predictions on the test
    set, and reports how well those match the test labels.

    The random state is drawn from the module-level ``seed_counter``, so
    every call consumes one seed. The seed, the metrics, the confusion
    matrix and the classification report are printed as a side effect.

    :param train_predictors: predictor columns used for fitting.
    :param train_labels: dependent variable aligned with ``train_predictors``.
    :param test_predictors: predictor columns used for evaluation; must have
        the same columns as ``train_predictors``.
    :param test_labels: dependent variable aligned with ``test_predictors``.
    :param n_estimators: number of trees in the forest.
    :param threshold: regression outputs greater than or equal to this value
        are counted as a positive prediction.
    :return: tuple of the fitted ``RandomForestRegressor`` and the metrics
        dict produced by ``calculate_metrics``.
    """
    # Model creation and predictions.
    random_state = seed_counter.get_next()
    print(f'{random_state=}')
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state)

    rf.fit(train_predictors, train_labels)

    # The regressor emits continuous scores; one vectorised comparison
    # turns them into boolean labels.
    predictions = rf.predict(test_predictors) >= threshold

    # Accuracy / precision / recall / F1.
    metrics = calculate_metrics(predictions, test_labels)
    print(f'{metrics=}\n')

    # Other metrics.
    conf = confusion_matrix(test_labels, predictions)
    print(f'Confusion matrix:\n{conf}\n')
    print("Classification report:")
    print(classification_report(test_labels, predictions))

    return rf, metrics


In [None]:
import regex as re
from python_proj.utils.util import subtract_dict

# Baseline model: fitted on every independent field, evaluated on the
# held-out split.
full_rf_model, full_rf_metrics = create_model(
    train_predictors=train_predictors,
    train_labels=train_dependent,
    test_predictors=test_predictors,
    test_labels=test_dependent,
)

print(json.dumps(full_rf_metrics, indent=4))

In [None]:

# Creates control model.
# Only independent fields are candidates, so metadata and the dependent
# variable can never end up among the predictors. (Slicing `df.columns`
# with the `metadata_fields` list is not possible: slice bounds must be
# integers.)
control_fields = [field for field in independent_fields
                  if re.match(r'.*Control.*', field)]
control_rf_model, control_rf_metrics = create_model(
    train[control_fields],
    train_dependent,
    test[control_fields],
    test_dependent
)

print(json.dumps(control_rf_metrics, indent=4))
print("Comparison between control model and full model (negatives indicate control is better):")
# Full-model metrics minus control-model metrics, per metric.
diff = subtract_dict(full_rf_metrics, control_rf_metrics)
print(json.dumps(diff, indent=4))

In [None]:
# Creates measured information model.
# Every independent field whose name does not match the control pattern.
# Iterating `independent_fields` keeps metadata and the dependent variable
# out of the predictors; a list cannot be used as a slice bound on
# `df.columns`. The pattern is applied directly so this cell does not rely
# on the control-model cell having been run first.
non_control_fields = [field for field in independent_fields
                      if not re.match(r'.*Control.*', field)]
non_control_rf_model, non_control_rf_metrics = create_model(
    train[non_control_fields],
    train_dependent,
    test[non_control_fields],
    test_dependent
)

print(json.dumps(non_control_rf_metrics, indent=4))
print("Comparison between non-control model and full model (negatives indicate non-control is better):")
# Full-model metrics minus non-control-model metrics, per metric.
diff = subtract_dict(full_rf_metrics, non_control_rf_metrics)
print(json.dumps(diff, indent=4))