This notebook performs the logistic regression analysis using the complete dataset file (note: the data-loading cell below currently draws a random 10% sample of its rows).
It:
- goes through the assumptions of logistic regression
- transforms the data wherever necessary to satisfy those assumptions
- creates a classifier and evaluates its performance

### Data Loading

In [41]:
import pandas as pd 
import numpy as np
import math
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import yeojohnson

file_name = "dataset_all_days_started_30_06_23"
data_path = f'/workspaces/msc_thesis/data/final_data/{file_name}.csv'

# Fraction of the rows kept for the analysis, and the seed that makes the
# random draw repeatable across kernel restarts (without a seed every run
# analyses a different subset, so recorded findings cannot be reproduced).
SAMPLE_FRACTION = 0.1
RANDOM_SEED = 42

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)
df = df.sample(math.floor(len(df) * SAMPLE_FRACTION),
               random_state=RANDOM_SEED)
print(f'{len(df)=}')

pr_merged_key = 'PullRequestIsMerged'

# Response variable of the regression.
dependent = df[pr_merged_key]

# Columns that are removed from the predictor set.
dropped_fields = [
    # Meta fields
    'ID', 'Project Name', 'Submitter ID', 'PR Number', 'Closed At',

    # dependent and control field
    pr_merged_key, 'SubmitterIsFirstTimeContributor',

    # Useless fields because they are (almost) all 0
    "DependencyEcosystemExperienceSubmitterIssueCommentCount",
    "DependencyEcosystemExperienceSubmitterIssueSubmissionCount",
    'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
    "DependencyEcosystemExperienceSubmitterPullRequestCommentCount",
    "DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount",

    "IntraProjectSubmitterIssueCommentCount",
    "IntraProjectSubmitterIssueSubmissionCount",

    "InversedDependencyEcosystemExperienceSubmitterIssueCommentCount",
    "InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount",
    'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate',
    "InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount",
    "InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount",

    "SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter",
    'SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter',
    "SharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator",

    "SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter",
    "SharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter",
    "SharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator",
]

# Remove all excluded columns in a single call; as with the default
# `errors='raise'`, a KeyError is raised if any listed column is missing.
independent = df.drop(columns=dropped_fields)

print(len(independent.columns))
print(independent.columns)

len(df)=182997
17
Index(['ControlIntegratedBySameUser', 'ControlPullRequestLifeTimeInMinutes',
       'ControlPullRequestHasComments', 'ControlNumberOfCommitsInPullRequest',
       'ControlPullRequestHasCommentByExternalUser',
       'ControlHasHashTagInDescription',
       'ControlIntraProjectPullRequestExperienceOfIntegrator',
       'IntraProjectSubmitterPullRequestSubmissionCount',
       'IntraProjectSubmitterPullRequestSuccessRate',
       'IntraProjectSubmitterPullRequestCommentCount',
       'EcosystemExperienceSubmitterPullRequestSuccessRate',
       'EcosystemExperienceSubmitterPullRequestSubmissionCount',
       'EcosystemExperienceSubmitterPullRequestCommentCount',
       'SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator',
       'SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter',
       'EcosystemExperienceSubmitterIssueSubmissionCount',
       'EcosystemExperienceSubmitterIssueCommentCount'],
      dtype='object')


### Log-odds independence

In [42]:
def show_log_odds_independence(__independent: pd.DataFrame, tested_field: str):
    """Plot the fitted log-odds against a single predictor.

    Fits a binomial GLM of the module-level ``dependent`` on ``__independent``
    and shows a scatter plot of ``tested_field`` (x-axis) against the log-odds
    derived from the fitted probabilities (y-axis).

    Args:
        __independent: Predictor columns the model is fitted on.
        tested_field: Name of the column of ``__independent`` to plot.
    """
    logit_results = sm.GLM(dependent, __independent,
                           family=sm.families.Binomial()).fit()

    # Predict from the predictors the model was fitted on. The response
    # variable is not a valid design matrix for this model, so passing
    # `dependent` here (as was done before) was a bug.
    predicted = logit_results.predict(__independent)

    # Convert the predicted probabilities to log-odds.
    log_odds = np.log(predicted / (1 - predicted))

    # Visualize the tested predictor against the log-odds.
    plt.scatter(x=__independent[tested_field].values, y=log_odds)
    plt.xlabel(tested_field)
    plt.ylabel("Log-odds")
    plt.show()


def test_log_odds_independence(__independents: pd.DataFrame):
    """Fit a binomial GLM with added ``x * ln(1 + x)`` terms and print its summary.

    For every numeric column of ``__independents`` a companion column named
    ``ln(.) x <column>`` holding ``x * ln(1 + x)`` is added, a constant column
    is appended, and the module-level ``dependent`` is regressed on the
    result. Columns that are not numeric are left out of the model.

    Args:
        __independents: Candidate predictor columns.
    """
    # The name `continuous_vars` is echoed verbatim by the `=` f-string below.
    continuous_vars = __independents.select_dtypes(include='number')
    print(f'{len(continuous_vars.columns)=}')

    # One interaction column per numeric predictor; 1 is added inside the
    # logarithm to deal with zeroes.
    interaction_terms = {
        f'ln(.) x {name}':
            continuous_vars[name] * np.log(1 + continuous_vars[name])
        for name in continuous_vars
    }
    design = continuous_vars.assign(**interaction_terms)

    # The constant goes last, after the predictors and interaction terms.
    design_with_const = sm.add_constant(design, prepend=False)

    model = sm.GLM(dependent,
                   design_with_const,
                   family=sm.families.Binomial())
    fitted = model.fit()

    print(fitted.summary())


# Baseline run on the predictors as loaded, before any transformation.
# NOTE: Works for two (presumably meaning their `ln(.) x <field>` terms are not
# significant in the printed summary -- TODO confirm against the full output):
# - SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator
# - EcosystemExperienceSubmitterIssueCommentCount
test_log_odds_independence(independent)


len(continuous_vars.columns)=13
                  Generalized Linear Model Regression Results                  
Dep. Variable:     PullRequestIsMerged   No. Observations:               182997
Model:                             GLM   Df Residuals:                   182970
Model Family:                 Binomial   Df Model:                           26
Link Function:                   Logit   Scale:                          1.0000
Method:                           IRLS   Log-Likelihood:                -76978.
Date:                 Tue, 04 Jul 2023   Deviance:                   1.5396e+05
Time:                         09:16:02   Pearson chi2:                 1.88e+05
No. Iterations:                      7   Pseudo R-squ. (CS):             0.2107
Covariance Type:             nonrobust                                         
                                                                                    coef    std err          z      P>|z|      [0.025      0.975]
----------------------

In [55]:
# Names of the numeric predictor columns. Within this cell they are only
# referenced by the commented-out experiments below.
continuous_vars = independent.select_dtypes(include='number').columns

# Transformations are written to a copy so `independent` keeps its original values.
tr_independent = independent.copy()


# NOTE: Some (3) of the variables pass after this transformation.
# - EcosystemExperienceSubmitterPullRequestSubmissionCount
# - EcosystemExperienceSubmitterPullRequestCommentCount
# - EcosystemExperienceSubmitterIssueSubmissionCount
# for field in continuous_vars:
#     tr_independent[field] = independent[field].apply(
#         lambda x: math.sqrt(1 + x))
# test_log_odds_independence(tr_independent)

# # NOTE: one of the variables pass after this transformation:
# - EcosystemExperienceSubmitterIssueSubmissionCount
# for field in continuous_vars:
#     tr_independent[field] = independent[field].apply(
#         lambda x: math.log(1 + x))
# test_log_odds_independence(tr_independent)

# NOTE: this works for one
# - EcosystemExperienceSubmitterIssueSubmissionCount
# for field in continuous_vars:
#     tr_independent[field] = independent[field].apply(
#         lambda x: math.log10(1 + x))
# test_log_odds_independence(tr_independent)

# NOTE: this works for three:
# - EcosystemExperienceSubmitterPullRequestCommentCount
# - SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter
# - EcosystemExperienceSubmitterIssueSubmissionCount
# for field in continuous_vars:
#     tr_independent[field] = independent[field].apply(lambda x: 1/(x+1))
# test_log_odds_independence(tr_independent)

# NOTE: This works for three.
# - IntraProjectSubmitterPullRequestSubmissionCount
# - EcosystemExperienceSubmitterIssueSubmissionCount
# - EcosystemExperienceSubmitterIssueCommentCount
# for field in continuous_vars:
#     tr_independent[field] = independent[field].apply(lambda x: x**2)
# test_log_odds_independence(tr_independent)

# NOTE: This works for two:
# - SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter   _lambda=-5.845365889174542 (reciprocal with sign preservation)
# - EcosystemExperienceSubmitterIssueSubmissionCount                         _lambda=-0.12955260658759882 (power transform with sign preservation)
# for field in continuous_vars:
#     transformed, _lambda = yeojohnson(independent[field])
#     print(f'{field=}, {_lambda=}')
#     tr_independent[field] = transformed
# test_log_odds_independence(tr_independent)

# NOTE: This does nothing.
# for field in continuous_vars:
#     tr_independent[f'squared_{field}'] = independent[field].apply(lambda x: x**2)
#     tr_independent[f'cubed_{field}'] = independent[field].apply(lambda x: x**3)
# test_log_odds_independence(tr_independent)

# NOTE: works for six:
# - EcosystemExperienceSubmitterPullRequestSubmissionCount
# - EcosystemExperienceSubmitterPullRequestCommentCount
# - SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator
# - SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter
# - EcosystemExperienceSubmitterIssueSubmissionCount
# - EcosystemExperienceSubmitterIssueCommentCount
# Fields assigned to each transformation; an empty list means the
# transformation is currently not applied to any field.
sqrt_transform = [
    "EcosystemExperienceSubmitterPullRequestSubmissionCount",
    "EcosystemExperienceSubmitterPullRequestCommentCount",
    "EcosystemExperienceSubmitterIssueSubmissionCount",
]
log_transform = []
power_transform = []
recip_transform = [
    "SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter",
]

# Each field list paired with its element-wise transformation, processed in
# this order. Values are always read from the untransformed `independent`.
transform_plan = (
    (sqrt_transform, lambda x: math.sqrt(1 + x)),
    (log_transform, lambda x: math.log(1 + x)),
    (power_transform, lambda x: x ** 2),
    (recip_transform, lambda x: 1 / (1+x)),
)
for fields, transform in transform_plan:
    for field in fields:
        tr_independent[field] = independent[field].apply(transform)

test_log_odds_independence(tr_independent)


len(continuous_vars.columns)=13
                  Generalized Linear Model Regression Results                  
Dep. Variable:     PullRequestIsMerged   No. Observations:               182997
Model:                             GLM   Df Residuals:                   182970
Model Family:                 Binomial   Df Model:                           26
Link Function:                   Logit   Scale:                          1.0000
Method:                           IRLS   Log-Likelihood:                -76914.
Date:                 Tue, 04 Jul 2023   Deviance:                   1.5383e+05
Time:                         09:34:54   Pearson chi2:                 1.88e+05
No. Iterations:                      7   Pseudo R-squ. (CS):             0.2113
Covariance Type:             nonrobust                                         
                                                                                    coef    std err          z      P>|z|      [0.025      0.975]
----------------------