In [3]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import json

import regex as re
from python_proj.utils.util import safe_save_fig, subtract_dict, Counter
import python_proj.utils.exp_utils as exp_utils

file_name = "dataset_90_days_started_11_07_23_preprocessed"
base_path = exp_utils.BASE_PATH
data_path = f'{base_path}/final_data/{file_name}.csv'
figure_base_path = f"{base_path}/figures/modelling/{file_name}/"

# Meta stuff.
pr_merged_key = 'PullRequestIsMerged'
ftc_key = 'SubmitterIsFirstTimeContributor'
seed_counter = Counter()


# Loads regular dataframe.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
if 'Unnamed: 1' in df.columns:
    df = df.drop(['Unnamed: 1', "Project Name.1"], axis=1)
print(df.columns)

# Creates sub-datasets.
metadata_fields = ['ID', 'Project Name',
                   'Submitter ID', 'PR Number', 'Closed At']
dependent_fields = [pr_merged_key]
independent_fields = [field for field in df.columns
                      if (field not in metadata_fields
                          and field != pr_merged_key)]

df[independent_fields].describe()

class_counts = df[pr_merged_key].value_counts()
class_imbalance = class_counts[True] / class_counts[False]
print(f'{class_imbalance=}')

meta_header_count = 5

df.describe()

Index(['Project Name', 'ID', 'Submitter ID', 'PR Number', 'Closed At',
       'PullRequestIsMerged', 'ControlIntegratedBySameUser',
       'ControlPullRequestHasComments', 'ControlHasHashTagInDescription',
       'IntraProjectSubmitterPullRequestSuccessRate',
       'EcosystemExperienceSubmitterPullRequestSuccessRate',
       'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
       'SubmitterIsFirstTimeContributor',
       'ControlPullRequestHasCommentByExternalUser',
       'ln(1 + ControlPullRequestLifeTimeInMinutes)',
       'ln(1 + ControlNumberOfCommitsInPullRequest)',
       'ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator)',
       'ln(1 + IntraProjectSubmitterPullRequestSubmissionCount)',
       'ln(1 + IntraProjectSubmitterPullRequestCommentCount)',
       'ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)',
       'ln(1 + EcosystemExperienceSubmitterPullReques

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator),...,ln(1 + SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator),ln(1 + SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter),ln(1 + SharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator),ln(1 + SharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter),ln(1 + SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter),ln(1 + SharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator),ln(1 + SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter),ln(1 + SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality)
count,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,...,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0,1224618.0
mean,0.4075076,0.08511044,0.009717735,0.5030779,0.4031832,0.09883051,0.05955435,0.4193429,0.1184018,0.2984268,...,0.06189767,0.01406902,0.05382167,0.01615051,0.04474016,0.02627888,0.01639452,0.03938143,0.08883798,0.07433466
std,0.275325,0.1332181,0.02750959,0.4710799,0.4523974,0.2910963,0.2290927,0.2256258,0.07146971,0.1867734,...,0.1373575,0.06915216,0.1309346,0.06829825,0.1155283,0.0886801,0.0742577,0.1069851,0.1050665,0.09858844
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.1638594,0.00779418,0.0005905393,0.0,0.0,0.0,0.0,0.2364295,0.07525668,0.1657954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.3784833,0.02730277,0.002386261,0.6842105,0.0,0.0,0.0,0.4430626,0.07525668,0.2971852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05423467,0.03550484
75%,0.6253565,0.104623,0.008231395,1.0,0.9393939,0.0,0.0,0.5857857,0.1505134,0.4381478,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1367061,0.1118951
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
import copy

default_plot_settings = {
    'edgecolor': 'black',
    'color': "#e69d00"
}

default_plot_settings_2 = copy.deepcopy(default_plot_settings)
default_plot_settings_2['color'] = "#56b3e9"


def __fix_x_label_fontsize(__column):
    # Adjusts x-label's fontsize to fit the text.
    fig = plt.gcf()
    fig_width = fig.get_figwidth()
    xlabel_fontsize = int(fig_width * 100 / len(__column))
    ax = plt.gca()
    orig_fontsize = ax.xaxis.label.get_fontsize()
    ax.xaxis.label.set_fontsize(min(xlabel_fontsize, orig_fontsize))


def create_histogram(_df: pd.DataFrame, _column: str,
                     subfolder_name: str = "",
                     show_without_value = None):
    can_create_feature_histograms = True

    if not can_create_feature_histograms:
        return

    binary_fields = _df.select_dtypes(exclude='number').columns

    print(_column)
    plt.clf()
    entries = _df[_column]

    if _column in binary_fields:
        entries = _df[_column].replace({False: 0, True: 1})
        plt.xticks([0, 1], ['False', 'True'])
        plt.hist(entries, bins=2, **default_plot_settings)
        plt.ylabel('Frequency')
    # elif __column in shown_fields_without_zeroes:
    elif not show_without_value is None:
        _, bins, _ = plt.hist(entries, bins=30, alpha=1,
                              label="All Data", **default_plot_settings)
        ax: plt.Axes = plt.gca()
        ax.set_ylabel("Frequency")
        ax.set_xlabel(_column)
        __fix_x_label_fontsize(_column)

        filtered_data = _df[_column][_df[_column] != show_without_value]
        ax2 = ax.twinx()

        ax2.hist(filtered_data, bins, alpha=0.5,
                 label=f'Excl. {show_without_value}', **default_plot_settings_2)
        ax2.set_ylabel(f"Frequency (excl. x = {show_without_value})")
        ax2.set_zorder(10)
        plt.tight_layout()
    else:
        plt.hist(entries, bins=30, **default_plot_settings)
        plt.ylabel('Frequency')

    plt.xlabel(_column)
    __fix_x_label_fontsize(_column)
    plt.tight_layout()

    output_path = f"{figure_base_path}/distributions/{subfolder_name}/{_column}.png"
    safe_save_fig(output_path)


In [5]:
print(f'{len(df)=}\n')

binary_values = df['ControlIntegratedBySameUser'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')

binary_values = df['ControlPullRequestHasComments'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')

binary_values = df['ControlHasHashTagInDescription'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')

binary_values = df['ControlPullRequestHasCommentByExternalUser'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')


binary_values = df['SubmitterIsFirstTimeContributor'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')



len(df)=1224618

ControlIntegratedBySameUser
False    701400
True     523218
Name: count, dtype: int64
ratio=0.746

ControlPullRequestHasComments
True     763756
False    460862
Name: count, dtype: int64
ratio=1.657

ControlHasHashTagInDescription
False    822549
True     402069
Name: count, dtype: int64
ratio=0.489

ControlPullRequestHasCommentByExternalUser
False    912555
True     312063
Name: count, dtype: int64
ratio=0.342

SubmitterIsFirstTimeContributor
False    796000
True     428618
Name: count, dtype: int64
ratio=0.538



In [6]:
test_df = df[df['ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)'].gt(0)]
print(f'{len(test_df)=}\n')

binary_values = test_df['ControlIntegratedBySameUser'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')

binary_values = test_df['ControlPullRequestHasComments'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')

binary_values = test_df['ControlHasHashTagInDescription'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')

binary_values = test_df['ControlPullRequestHasCommentByExternalUser'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')

binary_values = test_df['SubmitterIsFirstTimeContributor'].value_counts()
print(binary_values)
ratio = binary_values[True] / binary_values[False]
print(f'{ratio=:.03f}\n')


len(test_df)=616125

ControlIntegratedBySameUser
True     314331
False    301794
Name: count, dtype: int64
ratio=1.042

ControlPullRequestHasComments
True     369933
False    246192
Name: count, dtype: int64
ratio=1.503

ControlHasHashTagInDescription
False    406536
True     209589
Name: count, dtype: int64
ratio=0.516

ControlPullRequestHasCommentByExternalUser
False    454535
True     161590
Name: count, dtype: int64
ratio=0.356

SubmitterIsFirstTimeContributor
False    460504
True     155621
Name: count, dtype: int64
ratio=0.338



In [7]:
import scipy


def upscale_id(id: float) -> int:
    return scipy.e ** (id * scipy.log(8.297600e+04)) - 1


In [8]:
print(len(test_df))

binary_values = test_df['ControlPullRequestHasComments'].value_counts()
print(binary_values)
print(binary_values[True] / binary_values[False])

616125
ControlPullRequestHasComments
True     369933
False    246192
Name: count, dtype: int64
1.5026199064145058


In [9]:
test_df = df[['ln(1 + ControlPullRequestLifeTimeInMinutes)']]
test_df.describe()

Unnamed: 0,ln(1 + ControlPullRequestLifeTimeInMinutes)
count,1224618.0
mean,0.4193429
std,0.2256258
min,0.0
25%,0.2364295
50%,0.4430626
75%,0.5857857
max,1.0


In [10]:
instance = df.loc[645]
real_id = upscale_id(instance['ID'])
print(real_id)

515.6005864916907


  return scipy.e ** (1 + id * scipy.log(8.297600e+04))
