This script does all the logistic regression stuff using all of the data.
It:
- goes through all assumptions for logistic regression
- alters the data wherever necessary to make things work
- creates a classifier and evaluates its performance

### Data Loading

In [1]:
import math
import pandas as pd

# Some general field names
pr_merged_key = 'PullRequestIsMerged'
first_time_contributor_key = 'SubmitterIsFirstTimeContributor'

# Loads all data
data_file_name = "dataset_90_days_started_09_07_23_v2"
data_path = f'/workspaces/msc_thesis/data/final_data/{data_file_name}.csv'
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
df = df.loc[:, (df != 0).any(axis=0)]  # Drops columns with only zeroes.

# Loads first-time contributor data and merges it with dataframe.
ftc_file_name = "dataset_90_days_started_28_06_23"
ftc_data_path = f'/workspaces/msc_thesis/data/final_data/{ftc_file_name}.csv'
ftc_flag_data: pd.DataFrame = pd.read_csv(ftc_data_path)
ftc_flag_data = ftc_flag_data[['ID', first_time_contributor_key]]
df = df.merge(ftc_flag_data, how='left', on='ID', suffixes=("_incorrect", ""))
old_ftc_key = f'{first_time_contributor_key}_incorrect'
df = df.drop(old_ftc_key, axis=1)

# Removes metadata
meta_header_count = 5
df = df.drop(df.columns[:5], axis=1)

df.describe()

df = df.sample(math.floor(len(df) * 0.1))


### Log-odds independence

In [6]:
import json
from numbers import Number
from typing import Callable
import numpy as np
import statsmodels.api as sm


def box_tidwill_test(__independents: pd.DataFrame,
                     var_transform: Callable[[Number], Number]):
    continuous_vars = __independents.select_dtypes(include='number')
    continuous_copy = continuous_vars.copy()

    failed = []

    for field in continuous_copy.columns:
        # Drops zeroes.
        rows_with_zeroes = continuous_copy.loc[continuous_copy[field] == 0].index
        independents_wo_zeroes = continuous_copy.drop(rows_with_zeroes)
        test_dependent = df[pr_merged_key].drop(rows_with_zeroes)

        # Selects and transforms fields.
        test_independents = pd.DataFrame()
        test_independents[field] = independents_wo_zeroes[field]\
            .apply(var_transform)
        ln_field = f'ln(.) x {field}'
        test_independents[ln_field] = test_independents[field]\
            .apply(lambda x: np.log(x) * x)

        # Adds constant
        test_independents = sm.add_constant(test_independents)

        # Does the test.
        logit_results = sm.GLM(test_dependent,
                               test_independents,
                               family=sm.families.Binomial()).fit()

        p_value = logit_results.pvalues[ln_field]
        if p_value < 0.05:
            failed.append(field)

        is_significant = "significant" if p_value < 0.05 else "insignificant"

        print(f'({is_significant}) {field}: p={p_value} (used {len(independents_wo_zeroes)}/{len(continuous_copy)} entries).')

    return failed


failed = box_tidwill_test(df, lambda x: x)

print()
print(f'{len(failed)=}')
print(f'{failed=}')


(significant) ControlPullRequestLifeTimeInMinutes: p=0.0 (used 182997/182997 entries).
(significant) ControlNumberOfCommitsInPullRequest: p=1.3232726803426835e-05 (used 182650/182997 entries).
(insignificant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.PRCommenterToSubmitter): p=0.05026005008269409 (used 103047/182997 entries).
(insignificant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToCommenter): p=0.9990745063377562 (used 1/182997 entries).
(insignificant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToCommenter).1: p=0.9990745063377562 (used 1/182997 entries).
(significant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter): p=9.939051970199687e-06 (used 98665/182997 entries).
(insignificant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter).1: p=nan (used 1/182997 entries).
(significant) ControlIntraProjectPullRequestExperienceOfIntegrator: p=2.1757783173710686e-55 (used 170265

In [5]:
from scipy.stats import yeojohnson


df_failed = df[failed].copy()
print(f'{len(df_failed)=}')

transformations = [
    math.sqrt,
    math.cbrt,
    lambda x: math.log(1 + x),
    lambda x: math.log10(1 + x),
    lambda x: 1 / x,
    lambda x: x**2,
    lambda x: x**3,
]

for transformation in transformations:
    print(transformation)
    try:
        box_tidwill_test(df_failed, transformation)
    except:
        ...
    print()

print("Yeo-Johnson")
for field in failed:
    df_failed[field], _lambda = yeojohnson(df_failed[[field]])
    print(f'{field}: {_lambda=}')
box_tidwill_test(df_failed, lambda x: x)

# IntraProjectSubmitterPullRequestSuccessRate                               (sqrt)
# EcosystemExperienceSubmitterPullRequestSuccessRate                        (sqrt)
# SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter     (sqrt, cbrt, yeo-johnson)
# IntraProjectSubmitterPullRequestCommentCount                              (cubed, yeo-johnson)


len(df_failed)=182997
<built-in function sqrt>
(significant) ControlPullRequestLifeTimeInMinutes: p=1.6590876016346648e-82 (used 182997/182997 entries).
(significant) ControlNumberOfCommitsInPullRequest: p=1.3668860326821578e-72 (used 182650/182997 entries).
(insignificant) FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter): p=0.9247120019526409 (used 98665/182997 entries).
(significant) ControlIntraProjectPullRequestExperienceOfIntegrator: p=0.0 (used 170265/182997 entries).
(significant) IntraProjectSubmitterPullRequestSubmissionCount: p=1.9207267202991267e-235 (used 118454/182997 entries).
(significant) IntraProjectSubmitterPullRequestSuccessRate: p=0.0030356895475305294 (used 108408/182997 entries).
(significant) IntraProjectSubmitterPullRequestCommentCount: p=7.681966725826618e-164 (used 100746/182997 entries).
(significant) SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator: p=2.7515801403718652e-45 (used 46485/182997 entries).
(insig

  t = np.exp(-z)


(significant) EcosystemExperienceSubmitterIssueSubmissionCount: p=0.0 (used 105452/182997 entries).
(insignificant) EcosystemExperienceSubmitterIssueCommentCount: p=0.05083826490756153 (used 124145/182997 entries).
(significant) DependencyEcosystemExperienceSubmitterIssueSubmissionCount: p=0.01035640977840778 (used 14184/182997 entries).
(insignificant) DependencyEcosystemExperienceSubmitterIssueCommentCount: p=0.776432029127857 (used 18436/182997 entries).
(insignificant) InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount: p=0.08591610515658366 (used 9603/182997 entries).
(insignificant) InversedDependencyEcosystemExperienceSubmitterIssueCommentCount: p=0.9213493588685342 (used 12393/182997 entries).

Yeo-Johnson
ControlPullRequestLifeTimeInMinutes: _lambda=array([0.00715288])
ControlNumberOfCommitsInPullRequest: _lambda=array([-1.4231825])
FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter): _lambda=array([-0.17683419])
ControlIntraProjectPul

['ControlPullRequestLifeTimeInMinutes',
 'ControlNumberOfCommitsInPullRequest',
 'FirstOrderDegreeCentrality(PRIntegratorToSubmitter.IssueCommenterToSubmitter)',
 'ControlIntraProjectPullRequestExperienceOfIntegrator',
 'IntraProjectSubmitterPullRequestSubmissionCount',
 'IntraProjectSubmitterPullRequestSuccessRate',
 'IntraProjectSubmitterPullRequestCommentCount',
 'SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator',
 'SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter',
 'EcosystemExperienceSubmitterPullRequestSubmissionCount',
 'EcosystemExperienceSubmitterPullRequestCommentCount',
 'DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount',
 'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
 'InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount',
 'InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount',
 'EcosystemExperienceSubmitterIssueCommentCount',
 'DependencyEcosystemExperienceSub