This notebook runs the complete logistic regression analysis using all of the data.
It:
- checks each of the assumptions of logistic regression
- transforms the data wherever necessary to satisfy those assumptions
- creates a classifier and evaluates its performance

### Data Loading

In [6]:
import pandas as pd 
import numpy as np
import math
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import yeojohnson

# Sampling configuration. A fixed seed makes the 10% subsample (and with it
# every downstream statistic) reproducible across kernel restarts.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

file_name = "dataset_all_days_started_30_06_23"
data_path = f'/workspaces/msc_thesis/data/final_data/{file_name}.csv'

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
df = df.sample(n=math.floor(len(df) * SAMPLE_FRACTION),
               random_state=SAMPLE_SEED)
print(f'{len(df)=}')

pr_merged_key = 'PullRequestIsMerged'

# The outcome variable of the regression.
dependent = df[pr_merged_key]
dropped_fields = [
        # Meta fields
        'ID', 'Project Name', 'Submitter ID', 'PR Number', 'Closed At',

        # dependent and control field
        pr_merged_key, 'SubmitterIsFirstTimeContributor',

        # Useless fields because they are (almost) all 0
        "DependencyEcosystemExperienceSubmitterIssueCommentCount",
        "DependencyEcosystemExperienceSubmitterIssueSubmissionCount",
        'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
        "DependencyEcosystemExperienceSubmitterPullRequestCommentCount",
        "DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount",

        "IntraProjectSubmitterIssueCommentCount",
        "IntraProjectSubmitterIssueSubmissionCount",

        "InversedDependencyEcosystemExperienceSubmitterIssueCommentCount",
        "InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount",
        'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
        "InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount",
        "InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount",

        "SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter",
        'SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter',
        "SharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator",

        "SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter",
        "SharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter",
        "SharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator",
    ]

# Everything that is not dropped is a candidate predictor. A single drop call
# avoids re-copying the frame once per field and still raises KeyError when a
# listed field is missing from the data.
independent = df.drop(columns=dropped_fields)

print(len(independent.columns))
print(independent.columns)

len(df)=182997
17
Index(['ControlIntegratedBySameUser', 'ControlPullRequestLifeTimeInMinutes',
       'ControlPullRequestHasComments', 'ControlNumberOfCommitsInPullRequest',
       'ControlPullRequestHasCommentByExternalUser',
       'ControlHasHashTagInDescription',
       'ControlIntraProjectPullRequestExperienceOfIntegrator',
       'IntraProjectSubmitterPullRequestSubmissionCount',
       'IntraProjectSubmitterPullRequestSuccessRate',
       'IntraProjectSubmitterPullRequestCommentCount',
       'EcosystemExperienceSubmitterPullRequestSuccessRate',
       'EcosystemExperienceSubmitterPullRequestSubmissionCount',
       'EcosystemExperienceSubmitterPullRequestCommentCount',
       'SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator',
       'SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter',
       'EcosystemExperienceSubmitterIssueSubmissionCount',
       'EcosystemExperienceSubmitterIssueCommentCount'],
      dtype='object')


### Log-odds linearity

In [29]:
import json
from numbers import Number
from typing import Callable


def box_tidwill_test(__independents: pd.DataFrame,
                     var_transform: Callable[[Number], Number],
                     outcome: "pd.Series | None" = None):
    """Box-Tidwell style check of the linearity-of-the-logit assumption.

    For every numeric column of ``__independents`` a separate binomial GLM is
    fitted on a constant, the transformed column ``x`` and the interaction
    term ``x * ln(x)``. The p-value of the interaction term is printed per
    column; a p-value below 0.05 is labelled "significant".

    Args:
        __independents: Candidate predictors; non-numeric columns are skipped.
        var_transform: Scalar function applied to every non-zero value before
            the test (pass ``lambda x: x`` to test the raw values).
        outcome: Outcome series for the binomial GLM, matched to the
            predictors by index label. Defaults to the notebook-level
            ``dependent`` series so existing two-argument calls keep working.

    Returns:
        None. One result line is printed per numeric column.
    """
    if outcome is None:
        outcome = dependent

    numeric_predictors = __independents.select_dtypes(include='number')
    total_rows = len(numeric_predictors)

    for field in numeric_predictors.columns:
        column = numeric_predictors[field]

        # Drops zeroes, then transforms the remaining values. Only this one
        # column is touched; the rest of the frame is never copied.
        transformed = column[column != 0].apply(var_transform)

        # ln(x) is only defined for strictly positive x, and inf/NaN values
        # cannot be fitted, so such rows are excluded from the test as well.
        usable = transformed[(transformed > 0) & np.isfinite(transformed)]

        # Builds the design matrix: x and the vectorised x * ln(x) term.
        ln_field = f'ln(.) x {field}'
        test_independents = pd.DataFrame({
            field: usable,
            ln_field: usable * np.log(usable),
        })

        # Adds constant
        test_independents = sm.add_constant(test_independents)

        # Keeps exactly the outcome rows that survived the filtering.
        test_dependent = outcome.loc[usable.index]

        # Does the test.
        logit_results = sm.GLM(test_dependent,
                               test_independents,
                               family=sm.families.Binomial()).fit()

        p_value = logit_results.pvalues[ln_field]
        is_significant = "significant" if p_value < 0.05 else "insignificant"

        print(f'({is_significant}) {field}: p={p_value} (used {len(usable)}/{total_rows} entries).')


# Baseline run: the identity transform leaves every predictor value unchanged.
box_tidwill_test(independent, lambda x: x)


(significant) ControlPullRequestLifeTimeInMinutes: p=0.0 (used 182997/182997 entries).
(significant) ControlNumberOfCommitsInPullRequest: p=8.222901599325988e-67 (used 182648/182997 entries).
(significant) ControlIntraProjectPullRequestExperienceOfIntegrator: p=2.6296332022630186e-178 (used 179407/182997 entries).
(significant) IntraProjectSubmitterPullRequestSubmissionCount: p=6.836823268292763e-148 (used 129487/182997 entries).
(significant) IntraProjectSubmitterPullRequestSuccessRate: p=2.3108788109802652e-38 (used 119685/182997 entries).
(significant) IntraProjectSubmitterPullRequestCommentCount: p=1.4218884435444112e-58 (used 113561/182997 entries).
(significant) EcosystemExperienceSubmitterPullRequestSuccessRate: p=2.6127274965449894e-16 (used 117319/182997 entries).
(significant) EcosystemExperienceSubmitterPullRequestSubmissionCount: p=1.9676127472113513e-201 (used 126063/182997 entries).
(significant) EcosystemExperienceSubmitterPullRequestCommentCount: p=1.9443449795331844e-7

In [32]:
continuous_vars = independent.select_dtypes(include='number').columns
print(f'{len(continuous_vars)=}')

tr_independent = independent.copy()

# Candidate transformations keyed by a readable label, so each block of output
# can be attributed to a transformation (a lambda's repr is just an address).
named_transformations = {
    'sqrt': math.sqrt,
    'cbrt': math.cbrt,
    'ln(1 + x)': lambda x: math.log(1 + x),
    'log10(1 + x)': lambda x: math.log10(1 + x),
    '1 / x': lambda x: 1 / x,
    'x^2': lambda x: x**2,
    'x^3': lambda x: x**3,
}
# Kept as a plain list of callables, in the same order as before.
transformations = list(named_transformations.values())

for label, transformation in named_transformations.items():
    print(label)
    try:
        box_tidwill_test(tr_independent, transformation)
    except Exception as error:
        # Report the failure instead of hiding it; KeyboardInterrupt is no
        # longer swallowed, so a long-running sweep can still be interrupted.
        print(f'Box-Tidwell test aborted for {label}: {error!r}')
    print()

print("Yeo-Johnson")
for field in continuous_vars:
    # yeojohnson is documented for 1-dimensional input, so pass the column
    # values as a 1-D array rather than a single-column DataFrame.
    tr_independent[field], _lambda = yeojohnson(tr_independent[field].to_numpy())
    print(f'{field}: {_lambda=}')
box_tidwill_test(tr_independent, lambda x: x)

# Fields with, in parentheses, the transformations noted for them
# (presumably those that made the ln-term insignificant in the recorded run):
# IntraProjectSubmitterPullRequestSuccessRate                               (sqrt)
# EcosystemExperienceSubmitterPullRequestSuccessRate                        (sqrt)
# SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter     (sqrt, cbrt, yeo-johnson)
# IntraProjectSubmitterPullRequestCommentCount                              (cubed, yeo-johnson)


len(continuous_vars)=13
<built-in function sqrt>
(significant) ControlPullRequestLifeTimeInMinutes: p=2.7971993658573044e-109 (used 182997/182997 entries).
(significant) ControlNumberOfCommitsInPullRequest: p=9.296146741691211e-50 (used 182648/182997 entries).
(significant) ControlIntraProjectPullRequestExperienceOfIntegrator: p=0.0 (used 179407/182997 entries).
(significant) IntraProjectSubmitterPullRequestSubmissionCount: p=3.630794088871638e-215 (used 129487/182997 entries).
(insignificant) IntraProjectSubmitterPullRequestSuccessRate: p=0.5217396480424269 (used 119685/182997 entries).
(significant) IntraProjectSubmitterPullRequestCommentCount: p=7.106464992025869e-170 (used 113561/182997 entries).
(insignificant) EcosystemExperienceSubmitterPullRequestSuccessRate: p=0.6175652975885051 (used 117319/182997 entries).
(significant) EcosystemExperienceSubmitterPullRequestSubmissionCount: p=9.51778281269065e-125 (used 126063/182997 entries).
(significant) EcosystemExperienceSubmitterPullR