# Rare event testing

Identifies what features are relatively rare; i.e., whether it's a zero-inflated column.
This is just for insight, and isn't used for anything.

## Data loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

input_data_file_name = "dataset_transformed"

data_path_format = exp_utils.BASE_PATH + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

meta_header_count = 5
pr_merged_key = "PullRequestIsMerged"
first_time_contributor_key = "SubmitterIsFirstTimeContributor"

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'
len(df)=1216221.
len(df.columns)=43



Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality),ln(1 + IntegratorToSubmitterLinkIntensity),ln(1 + SubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147333200.0,5094727.0,808.2051,0.5028689,0.4028818,0.09854712,0.3752564,0.05918652,0.4195026,0.1184329,...,0.01145499,0.01968431,0.07960149,0.1344227,0.009530649,0.01751597,0.08878922,0.07431632,0.03897225,0.05331061
std,99529860.0,7972087.0,2292.125,0.4710872,0.4523492,0.2907154,0.4479614,0.2284392,0.2256374,0.0715142,...,0.05399123,0.08007008,0.1179461,0.1749526,0.05258238,0.08241773,0.105075,0.09861962,0.09868933,0.1123964
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59268780.0,467209.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136819600.0,1634092.0,199.0,0.6774194,0.0,0.0,0.0,0.0,0.4433434,0.07525668,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.05416341,0.03541864,0.0,0.0
75%,226067600.0,6254382.0,684.0,1.0,0.9390244,0.0,0.9230769,0.0,0.5860562,0.1505134,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.1365586,0.1118571,0.0,0.04670041
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Filter columns

In [70]:
from wmeijer_utils.regex import get_matching

pat = r'\b(?!=.*Dependency).*Ecosystem.*'

matching = list(get_matching(df.columns, pat))
print(matching)

df = df[matching]

['EcosystemExperienceSubmitterPullRequestSuccessRate', 'ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount)', 'ln(1 + EcosystemExperienceSubmitterIssueCommentCount)']


### Predictor Zero Counts

Counts how many fields have partial zero data to see if they are worth including in this study

In [71]:
import regex as re
from typing import Callable


def count_zeroes(_df: pd.DataFrame):
    zeroes = pd.DataFrame()
    zeroes['zeroes'] = _df.eq(0).sum()
    zeroes['non_zeroes'] = len(_df) - zeroes['zeroes']
    zeroes['percentage_zeroes'] = (zeroes['zeroes'] / len(_df)) * 100

    binary_fields = _df.select_dtypes(exclude='number').columns
    ratio_fields = [field for field in _df.columns if len(
        re.findall(r'.*SuccessRate.*', field)) > 0]

    def get_fields_with_requirements(req: Callable[[str, dict], bool]) -> list[str]:
        included_rows = []
        for name, row_data in zeroes.iterrows():
            # We're only interested in continuous data here.
            # The ratio fields have a meaningful 0.
            if name in binary_fields or name in ratio_fields:
                continue

            if req(name, row_data):
                included_rows.append(name)
                print(
                    f'{name}: {row_data["zeroes"]:.0f} zeroes, {row_data["non_zeroes"]:.0f} non-zeroes ({row_data["percentage_zeroes"]:.03f}% zeroes).')

        return included_rows

    rare_cutoff = 80
    print(f"\nRare events ({rare_cutoff}+% zeroes):")
    rare_events = get_fields_with_requirements(lambda _, row_data:
                                               row_data['percentage_zeroes'] > rare_cutoff)

    print("\nNon-rare events:")
    non_rare_events = get_fields_with_requirements(lambda name, _:
                                                   name not in rare_events)

    return rare_events, non_rare_events


### Rare events over all data

In [72]:
rare_events, non_rare_events = count_zeroes(df)

print("\nEvent counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount): 604727 zeroes, 611494 non-zeroes (49.722% zeroes).
ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount): 646736 zeroes, 569485 non-zeroes (53.176% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount): 689303 zeroes, 526918 non-zeroes (56.676% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueCommentCount): 580443 zeroes, 635778 non-zeroes (47.725% zeroes).

Event counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events for any variable.

In [73]:
import regex as re

df.select_dtypes(include='number')
independent_fields = [field for field in df.columns[meta_header_count:]
                      if len(list(re.findall(r'.*Control.*', field))) == 0]
print(independent_fields)
independent_df = df[df[independent_fields].gt(0).any(axis=1)]
print(f'{len(independent_df)=}\n')

rare_events, non_rare_events = count_zeroes(independent_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


[]
len(independent_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events over ecosystem experience

It performs the same test but now on entries that have some form of ecosystem experience.

In [74]:
import regex as re

ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*Ecosystem.*', field))) > 0]
print(ecosystem_fields)
ecosystem_df = df[df[ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

[]
len(ecosystem_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events in dependency ecosystems

In [75]:
import regex as re

dep_ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*DependencyEcosystem.*', field))) > 0]
print(dep_ecosystem_fields)
dep_ecosystem_df = df[df[dep_ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(dep_ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

[]
len(dep_ecosystem_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events in shared experience

In [76]:
import regex as re

shared_experience_fields = [field for field in df.columns[meta_header_count:]
                            if len(list(re.findall(r'.*SharedExperience.*', field))) > 0]
print(shared_experience_fields)
shared_experience_df = df[df[shared_experience_fields].gt(0).any(axis=1)]
print(f'{len(shared_experience_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


[]
len(shared_experience_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemExperienceSubmitterIssueCommentCount): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


In [77]:
import datetime

print(datetime.datetime.now())

2024-02-15 10:00:19.504430
