In [38]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import json

import regex as re
from python_proj.utils.util import safe_save_fig, subtract_dict, Counter
import python_proj.utils.exp_utils as exp_utils

# Dataset / output locations. Everything is derived from the project-wide
# base path so the notebook does not hardcode machine-specific paths.
file_name = "dataset_90_days_started_11_07_23_shared_experience"
base_path = exp_utils.BASE_PATH
data_path = f'{base_path}/final_data/{file_name}.csv'
figure_base_path = f"{base_path}/figures/modelling/{file_name}/"

# Meta stuff.
pr_merged_key = 'PullRequestIsMerged'
ftc_key = 'SubmitterIsFirstTimeContributor'
# NOTE(review): `Counter` is the project helper from python_proj.utils.util;
# presumably it hands out successive seeds -- confirm against its definition.
seed_counter = Counter()


# Loads regular dataframe.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
if 'Unnamed: 1' in df.columns:
    # Drop the duplicated helper columns. `errors='ignore'` keeps this from
    # raising a KeyError when only one of the two columns is present.
    df = df.drop(columns=['Unnamed: 1', "Project Name.1"], errors='ignore')
print(df.columns)

# Creates sub-datasets.
# Metadata columns identify a pull request; they are neither dependent nor
# independent variables.
metadata_fields = ['ID', 'Project Name',
                   'Submitter ID', 'PR Number', 'Closed At']
dependent_fields = [pr_merged_key]
# Every remaining column (not metadata, not the merge outcome) is treated as
# an independent variable.
independent_fields = [field for field in df.columns
                      if (field not in metadata_fields
                          and field != pr_merged_key)]

# NOTE(review): this expression is not the last one in the cell, so its result
# is computed but never displayed by the notebook.
df[independent_fields].describe()

# Ratio of True to False entries in the `pr_merged_key` column.
class_counts = df[pr_merged_key].value_counts()
class_imbalance = class_counts[True] / class_counts[False]
print(f'{class_imbalance=}')

# NOTE(review): equals len(metadata_fields) at the moment; presumably the
# number of leading metadata columns -- confirm before relying on it.
meta_header_count = 5

df.describe()

Index(['Project Name', 'ID', 'Submitter ID', 'PR Number', 'Closed At',
       'PullRequestIsMerged', 'ControlIntegratedBySameUser',
       'ControlPullRequestHasComments', 'ControlHasHashTagInDescription',
       'IntraProjectSubmitterPullRequestSuccessRate',
       'EcosystemExperienceSubmitterPullRequestSuccessRate',
       'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'SubmitterIsFirstTimeContributor',
       'ControlPullRequestHasCommentByExternalUser',
       'ln(1 + ControlPullRequestLifeTimeInMinutes)',
       'ln(1 + ControlNumberOfCommitsInPullRequest)',
       'ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator)',
       'ln(1 + IntraProjectSubmitterPullRequestSubmissionCount)',
       'ln(1 + IntraProjectSubmitterPullRequestCommentCount)',
       'ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)',
       'ln(1 + EcosystemExperienceSubmitterPullReques

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator),...,ln(1 + SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator),ln(1 + SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter),ln(1 + SharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator),ln(1 + SharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter),ln(1 + SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter),ln(1 + SharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator),ln(1 + SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter),ln(1 + SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality)
count,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,...,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0,362438.0
mean,0.389755,0.09176341,0.010994,0.638314,0.429614,0.098223,0.057801,0.447827,0.117983,0.342272,...,0.209133,0.047731,0.181829,0.054809,0.151391,0.089064,0.055443,0.133304,0.125692,0.106419
std,0.273468,0.1404224,0.027597,0.434835,0.44993,0.289794,0.225442,0.186685,0.067712,0.180285,...,0.181619,0.121037,0.186187,0.117186,0.170489,0.145192,0.128456,0.162044,0.11131,0.107209
min,0.0,5.022228e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.149408,0.008531515,0.000747,0.0,0.0,0.0,0.0,0.311083,0.075257,0.214396,...,0.109693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040483,0.025767
50%,0.35034,0.02906452,0.003001,0.904762,0.076923,0.0,0.0,0.460744,0.075257,0.345852,...,0.173859,0.0,0.118269,0.0,0.111153,0.0,0.0,0.070165,0.097443,0.074024
75%,0.603827,0.1126607,0.010099,1.0,0.941176,0.0,0.0,0.58267,0.150513,0.479503,...,0.329079,0.0,0.30572,0.0,0.267009,0.124825,0.0,0.242732,0.182396,0.155414
max,0.999949,0.9876457,0.999976,1.0,1.0,1.0,1.0,0.988177,0.993329,0.992047,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.878637,0.915254


In [2]:
import copy

# Shared keyword arguments for histogram calls: primary (orange) series.
default_plot_settings = dict(edgecolor='black', color="#e69d00")

# Secondary (blue) series: an independent dict with the same settings except
# for the fill colour.
default_plot_settings_2 = dict(default_plot_settings, color="#56b3e9")


def __fix_x_label_fontsize(__column):
    """Shrink the x-label font of the current axes so a long label fits.

    The candidate size is ``figure width * 100 / len(__column)`` (a heuristic
    scale factor), truncated to an integer. The label font is only ever
    reduced: the smaller of the candidate and the label's current size is
    applied.

    Operates on matplotlib's *current* figure and axes (``plt.gcf()`` /
    ``plt.gca()``), so it must be called while the axes carrying the label
    are current.

    :param __column: The label text (anything supporting ``len``); only its
        length is used. An empty label leaves the font size untouched.
    """
    fig = plt.gcf()
    ax = plt.gca()

    if not __column:
        # Nothing to fit, and the division below would fail on length zero.
        return

    # Clamp to at least 1 so an extremely long label never produces a
    # zero-sized font.
    fitted_fontsize = max(1, int(fig.get_figwidth() * 100 / len(__column)))
    current_fontsize = ax.xaxis.label.get_fontsize()
    ax.xaxis.label.set_fontsize(min(fitted_fontsize, current_fontsize))


def create_histogram(_df: pd.DataFrame, _column: str,
                     subfolder_name: str = "",
                     show_without_value = None):
    """Plot the distribution of one dataframe column and save it as a PNG.

    Three layouts are used:

    * Columns with a non-numeric dtype are plotted as a two-bin
      False/True histogram.
    * If ``show_without_value`` is given, the full data is plotted on the
      primary y-axis and, on a twin y-axis sharing the same bins, the data
      with that value filtered out.
    * Otherwise a plain 30-bin histogram is drawn.

    The figure is written to
    ``{figure_base_path}/distributions/{subfolder_name}/{_column}.png`` via
    ``safe_save_fig``. The current matplotlib figure is cleared first.

    :param _df: Dataframe holding the column.
    :param _column: Name of the column to plot; also used as x-label and
        file name.
    :param subfolder_name: Sub-directory of the ``distributions`` folder.
    :param show_without_value: When not ``None``, the value to exclude in
        the overlaid second histogram.
    """
    # Manual switch for skipping figure generation while iterating on the
    # notebook.
    can_create_feature_histograms = True

    if not can_create_feature_histograms:
        return

    # NOTE(review): every non-numeric dtype counts as "binary" here. That
    # covers bool columns, but object/string columns would match as well.
    binary_fields = _df.select_dtypes(exclude='number').columns

    print(_column)
    plt.clf()
    ax: plt.Axes = plt.gca()
    entries = _df[_column]

    # Label the primary axes up front, while they are still the current axes:
    # `twinx()` below makes the (x-axis hidden) twin current, and the
    # font-size helper acts on whatever axes are current.
    ax.set_xlabel(_column)
    __fix_x_label_fontsize(_column)

    if _column in binary_fields:
        entries = entries.replace({False: 0, True: 1})
        plt.xticks([0, 1], ['False', 'True'])
        ax.hist(entries, bins=2, **default_plot_settings)
        ax.set_ylabel('Frequency')
    elif show_without_value is not None:
        # NOTE(review): `label=` is passed to both histograms but no legend
        # is drawn anywhere in this function.
        _, bins, _ = ax.hist(entries, bins=30, alpha=1,
                             label="All Data", **default_plot_settings)
        ax.set_ylabel("Frequency")

        filtered_data = entries[entries != show_without_value]
        ax2 = ax.twinx()

        # Reuse the bin edges of the full data so both histograms align.
        ax2.hist(filtered_data, bins, alpha=0.5,
                 label=f'Excl. {show_without_value}', **default_plot_settings_2)
        ax2.set_ylabel(f"Frequency (excl. x = {show_without_value})")
        # Draw the filtered histogram on top of the full one.
        ax2.set_zorder(10)
    else:
        ax.hist(entries, bins=30, **default_plot_settings)
        ax.set_ylabel('Frequency')

    plt.tight_layout()

    output_path = f"{figure_base_path}/distributions/{subfolder_name}/{_column}.png"
    safe_save_fig(output_path)


In [40]:
print(f'{len(df)=}\n')

# For each binary control variable, show the True/False counts followed by
# the True-to-False ratio. 'ControlIntegratedBySameUser' is currently
# excluded from this summary.
for control_column in ('ControlPullRequestHasComments',
                       'ControlHasHashTagInDescription',
                       'ControlPullRequestHasCommentByExternalUser'):
    binary_values = df[control_column].value_counts()
    print(binary_values)
    ratio = binary_values[True] / binary_values[False]
    print(f'{ratio=:.03f}\n')



len(df)=362438

ControlPullRequestHasComments
True     244021
False    118417
Name: count, dtype: int64
ratio=2.061

ControlHasHashTagInDescription
False    223735
True     138703
Name: count, dtype: int64
ratio=0.620

ControlPullRequestHasCommentByExternalUser
False    275196
True      87242
Name: count, dtype: int64
ratio=0.317



In [36]:
# Keep only the rows whose ecosystem submission-count feature exceeds 0.3.
ecosystem_submission_column = 'ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)'
test_df = df.loc[df[ecosystem_submission_column] > 0.3]
print(f'{len(test_df)=}\n')

# For each binary control variable in the subset, show the True/False counts
# followed by the True-to-False ratio. 'ControlIntegratedBySameUser' is
# currently excluded from this summary.
for control_column in ('ControlPullRequestHasComments',
                       'ControlHasHashTagInDescription',
                       'ControlPullRequestHasCommentByExternalUser'):
    binary_values = test_df[control_column].value_counts()
    print(binary_values)
    ratio = binary_values[True] / binary_values[False]
    print(f'{ratio=:.03f}\n')



len(test_df)=54678

ControlIntegratedBySameUser
True     40281
False    14397
Name: count, dtype: int64
ratio=2.798

ControlPullRequestHasComments
False    27530
True     27148
Name: count, dtype: int64
ratio=0.986

ControlHasHashTagInDescription
False    36885
True     17793
Name: count, dtype: int64
ratio=0.482

ControlPullRequestHasCommentByExternalUser
False    39467
True     15211
Name: count, dtype: int64
ratio=0.385



In [22]:
# NOTE(review): relies on `test_df` as left behind by a previously executed
# cell; the result depends on which cell defined it last.
subset_size = len(test_df)
print(subset_size)

# True/False counts of the comment flag, then the True-to-False ratio.
binary_values = test_df['ControlPullRequestHasComments'].value_counts()
print(binary_values)
has_comments_ratio = binary_values[True] / binary_values[False]
print(has_comments_ratio)

71133
ControlPullRequestHasComments
True     36813
False    34320
Name: count, dtype: int64
1.0726398601398601


In [20]:
# Summary statistics for the pull-request lifetime control column only.
# The single-element list keeps the selection a one-column DataFrame.
lifetime_column = 'ln(1 + ControlPullRequestLifeTimeInMinutes)'
test_df = df.loc[:, [lifetime_column]]
test_df.describe()

Unnamed: 0,ln(1 + ControlPullRequestLifeTimeInMinutes)
count,223093.0
mean,0.367951
std,0.20509
min,0.0
25%,0.197168
50%,0.363286
75%,0.522944
max,0.980358
