This notebook performs the logistic regression analysis using all of the data.
It:
- checks each of the assumptions of logistic regression
- transforms the data wherever necessary to satisfy those assumptions
- creates a classifier and evaluates its performance

### Data Loading

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import json

from python_proj.utils.util import safe_save_fig, subtract_dict, Counter

# Locations of the input dataset and of the figures produced by this notebook.
file_name = "dataset_90_days_started_09_07_23_v2"
base_path = '/workspaces/msc_thesis/data/'
# base_path= '/data/s4509412/data/data/'
data_path = f'{base_path}/final_data/{file_name}.csv'
figure_base_path = f"{base_path}/figures/modelling/{file_name}/"

# Meta stuff.
pr_merged_key = 'PullRequestIsMerged'
ftc_key = 'SubmitterIsFirstTimeContributor'
seed_counter = Counter()

# Debug switch: work on a 10% subsample to keep iteration fast.
# The sample is seeded so that "Restart & Run All" reproduces the same rows
# (and therefore the same statistics) on every run.
use_subsample = True
subsample_seed = 42

# Loads regular dataframe.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
df = df.drop(columns=['Unnamed: 0'])
if use_subsample:
    df = df.sample(math.floor(len(df) * 0.1), random_state=subsample_seed)
    print((5 * "#########\n") +
          "YOU'RE USING A RANDOM SUBSAMPLE OF 10%!!!\n" + (5 * "#########\n"))
print(df.columns)

# Loads first-time contributor data frame (rows where the ftc flag is set).
ftc_df = df[df[ftc_key]]
print(f'{len(df)=}')
print(f'{len(ftc_df)=}')

# Creates sub-datasets: every column that is neither metadata nor the
# dependent variable is treated as an independent variable.
metadata_fields = ['ID', 'Project Name',
                   'Submitter ID', 'PR Number', 'Closed At']
dependent_fields = [pr_merged_key]
independent_fields = [field for field in df.columns
                      if (field not in metadata_fields
                          and field not in dependent_fields)]

# Removes fields that contain only zeroes from the ftc dataset and reports
# which ones were dropped.
ftc_columns_old = ftc_df.columns
ftc_df = ftc_df.loc[:, (ftc_df != 0).any()]
ftc_columns_new = ftc_df.columns
ftc_removed = [field for field in ftc_columns_old
               if field not in ftc_columns_new]
print(f'{ftc_removed=}')

df[independent_fields].describe()


#########
#########
#########
#########
#########
YOU'RE USING A RANDOM SUBSAMPLE OF 10%!!!
#########
#########
#########
#########
#########

Index(['ID', 'Project Name', 'Submitter ID', 'PR Number', 'Closed At',
       'PullRequestIsMerged', 'ControlIntegratedBySameUser',
       'ControlPullRequestLifeTimeInMinutes', 'ControlPullRequestHasComments',
       'ControlNumberOfCommitsInPullRequest',
       'ControlPullRequestHasCommentByExternalUser',
       'ControlHasHashTagInDescription',
       'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In)',
       'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-Put)',
       'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRCommenterToSubmitter-In)',
       'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter-In)',
       'ControlIntraProjectPullRequestExperienceOfIntegrator',
       'IntraProjectSubmitterPullRequestSubmissionCount',
       'IntraProjectSubmitt

Unnamed: 0,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In),FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-Put),FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRCommenterToSubmitter-In),FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter-In),ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSubmissionCount,EcosystemExperienceSubmitterPullRequestCommentCount,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterIssueSubmissionCount,EcosystemExperienceSubmitterIssueCommentCount
count,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0,182997.0
mean,31880.54,4.884375,78.13251,476.59319,386.156893,217.734772,192.670689,17.196533,0.53661,21.76351,0.371222,11.526052,15.079094,0.100773,0.050479,5.441051,40.884452
std,140854.9,65.449596,1238.175921,2848.530265,2597.61946,1265.64745,476.747907,56.567028,0.461972,65.557013,0.447245,50.297772,53.59721,0.293878,0.212329,12.390842,104.442327
min,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,52.78333,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,978.5333,1.0,0.0,9.0,4.0,1.0,30.0,2.0,0.8,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0
75%,7307.233,3.0,11.0,145.0,118.0,74.0,143.0,17.0,1.0,15.0,0.92,4.0,6.0,0.0,0.0,5.0,33.0
max,3899225.0,10000.0,80073.0,99501.0,121056.0,97610.0,4275.0,1693.0,1.0,2007.0,1.0,993.0,1226.0,1.0,1.0,572.0,1842.0


### Log-odds linearity (Box-Tidwell test)

In [2]:
import json
from numbers import Number
from typing import Callable
import numpy as np
import statsmodels.api as sm


def box_tidwill_test(__independents: pd.DataFrame,
                     var_transform: Callable[[Number], Number],
                     dependent: 'pd.Series | None' = None,
                     alpha: float = 0.05):
    """Box-Tidwell-style check of log-odds linearity, one variable at a time.

    For every numeric column ``x`` of ``__independents`` a binomial GLM is
    fitted on ``const + t(x) + t(x) * ln(t(x))``, where ``t`` is
    ``var_transform``. A column "fails" when the p-value of the
    ``t(x) * ln(t(x))`` term is below ``alpha``.

    Rows where the raw value is zero are excluded before transforming, and
    rows whose transformed value is not strictly positive and finite are
    excluded as well, because ``ln`` is undefined for them.

    Args:
        __independents: Frame holding the candidate variables; non-numeric
            columns are ignored.
        var_transform: Scalar transformation applied to each non-zero value
            before the test.
        dependent: Binary outcome, aligned with ``__independents`` by index
            label. Defaults to the notebook-level ``df[pr_merged_key]``.
        alpha: Significance level used to flag a variable as failed.

    Returns:
        List of column names whose interaction term is significant.
    """
    if dependent is None:
        # Backward-compatible default: the notebook's global outcome column.
        dependent = df[pr_merged_key]

    # select_dtypes already returns a new frame, so no extra copy is needed.
    continuous_vars = __independents.select_dtypes(include='number')
    total_entries = len(continuous_vars)

    failed = []

    for field in continuous_vars.columns:
        column = continuous_vars[field]

        # Drops zeroes first (so transforms such as 1 / x never see them),
        # then keeps only values for which ln(.) is defined.
        transformed = column[column != 0].apply(var_transform)
        if not transformed.empty:
            transformed = transformed[np.isfinite(transformed)
                                      & (transformed > 0)]
        if transformed.empty:
            print(f'(skipped) {field}: no positive finite values to test.')
            continue

        # Selects the transformed field and its x * ln(x) interaction term.
        ln_field = f'ln(.) x {field}'
        test_independents = pd.DataFrame({
            field: transformed,
            ln_field: np.log(transformed) * transformed,
        })
        test_dependent = dependent.loc[transformed.index]

        # Adds constant
        test_independents = sm.add_constant(test_independents)

        # Does the test.
        logit_results = sm.GLM(test_dependent,
                               test_independents,
                               family=sm.families.Binomial()).fit()

        p_value = logit_results.pvalues[ln_field]
        rejected = p_value < alpha
        if rejected:
            failed.append(field)

        is_significant = "significant" if rejected else "insignificant"

        print(f'({is_significant}) {field}: p={p_value} (used {len(transformed)}/{total_entries} entries).')

    return failed


# Baseline run: every independent variable is tested untransformed (identity).
failed = box_tidwill_test(df[independent_fields],
                          var_transform=lambda value: value)

# Summary of the variables whose interaction term came out significant.
print()
print(f'{len(failed)=}')
print(f'{failed=}')


(significant) ControlPullRequestLifeTimeInMinutes: p=0.0 (used 182997/182997 entries).
(significant) ControlNumberOfCommitsInPullRequest: p=4.8134508920447555e-51 (used 182610/182997 entries).
(significant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In): p=1.8239512112021574e-07 (used 83525/182997 entries).
(significant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-Put): p=4.1090622397170733e-07 (used 110255/182997 entries).
(significant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRCommenterToSubmitter-In): p=8.560932142845168e-06 (used 97942/182997 entries).
(insignificant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter-In): p=0.9351481985498956 (used 91827/182997 entries).
(significant) ControlIntraProjectPullRequestExperienceOfIntegrator: p=1.2153524090212365e-66 (used 170295/182997 entries).
(significant) IntraProjectSubmitterPullRequestSubmissionCount: p=2.9654816058029476e-130 (us

In [3]:
from scipy.stats import yeojohnson


# Work on a copy of the failed variables so ``df`` itself stays untouched.
df_failed = df[failed].copy()
print(f'{len(df_failed)=}')

# Candidate scalar transformations to try on the variables that failed.
transformations = [
    math.sqrt,
    math.cbrt,
    lambda x: math.log(1 + x),
    lambda x: math.log10(1 + x),
    lambda x: 1 / x,
    lambda x: x**2,
    lambda x: x**3,
]

for transformation in transformations:
    print(transformation)
    try:
        box_tidwill_test(df_failed, transformation)
    except Exception as error:
        # Best effort: a failing fit must not stop the remaining
        # transformations, but it has to be visible instead of being
        # swallowed silently (and Ctrl-C must still interrupt the loop).
        print(f'Box-Tidwell test aborted: {type(error).__name__}: {error}')
    print()

print("Yeo-Johnson")
for field in failed:
    # scipy documents the input as 1-dimensional, so the column is passed
    # as a Series; lambda then comes back as a scalar.
    df_failed[field], _lambda = yeojohnson(df_failed[field])
    print(f'{field}: {_lambda=}')
print()
box_tidwill_test(df_failed, lambda x: x)

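# Variables and the transformation(s) under which their Box-Tidwell interaction term turned insignificant:
# IntraProjectSubmitterPullRequestSuccessRate                                       (cbrt)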
# FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In)    (ln, log10, squared)
# EcosystemExperienceSubmitterPullRequestSuccessRate                                (ln, log10)
# IntraProjectSubmitterPullRequestCommentCount                                      (recip)
# IntraProjectSubmitterPullRequestSubmissionCount                                   (squared)
# EcosystemExperienceSubmitterIssueCommentCount                                     (cubed)
# EcosystemExperienceSubmitterIssueSubmissionCount                                  (yj)


len(df_failed)=182997
<built-in function sqrt>
(significant) ControlPullRequestLifeTimeInMinutes: p=2.2615559464734852e-102 (used 182997/182997 entries).
(significant) ControlNumberOfCommitsInPullRequest: p=2.1283194971958317e-57 (used 182610/182997 entries).
(significant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In): p=6.902615022402876e-10 (used 83525/182997 entries).
(significant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-Put): p=4.5201471161441983e-07 (used 110255/182997 entries).
(significant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRCommenterToSubmitter-In): p=1.657475879845142e-06 (used 97942/182997 entries).
(significant) ControlIntraProjectPullRequestExperienceOfIntegrator: p=0.0 (used 170295/182997 entries).
(significant) IntraProjectSubmitterPullRequestSubmissionCount: p=1.4894113807995506e-226 (used 118612/182997 entries).
(significant) IntraProjectSubmitterPullRequestSuccessRate: p=0.00058508990

  t = np.exp(-z)


(significant) EcosystemExperienceSubmitterIssueSubmissionCount: p=0.0 (used 105858/182997 entries).
(insignificant) EcosystemExperienceSubmitterIssueCommentCount: p=0.23144451591057524 (used 124542/182997 entries).

Yeo-Johnson
ControlPullRequestLifeTimeInMinutes: _lambda=array([0.00693047])
ControlNumberOfCommitsInPullRequest: _lambda=array([-1.41193138])
FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In): _lambda=array([-0.47130602])
FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-Put): _lambda=array([-0.13329203])
FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRCommenterToSubmitter-In): _lambda=array([-0.17480616])
ControlIntraProjectPullRequestExperienceOfIntegrator: _lambda=array([-0.03020394])
IntraProjectSubmitterPullRequestSubmissionCount: _lambda=array([-0.24549937])
IntraProjectSubmitterPullRequestSuccessRate: _lambda=array([1.08656033])
IntraProjectSubmitterPullRequestCommentCount: _lambda=array([-0.35624344])
Ecosy

['ControlPullRequestLifeTimeInMinutes',
 'ControlNumberOfCommitsInPullRequest',
 'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In)',
 'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRIntegratorToSubmitter-Put)',
 'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRCommenterToSubmitter-In)',
 'ControlIntraProjectPullRequestExperienceOfIntegrator',
 'IntraProjectSubmitterPullRequestSubmissionCount',
 'IntraProjectSubmitterPullRequestSuccessRate',
 'IntraProjectSubmitterPullRequestCommentCount',
 'EcosystemExperienceSubmitterPullRequestSuccessRate',
 'EcosystemExperienceSubmitterPullRequestSubmissionCount',
 'EcosystemExperienceSubmitterPullRequestCommentCount',
 'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
 'EcosystemExperienceSubmitterIssueCommentCount']