# Rare event testing

Identifies what features are relatively rare; i.e., whether it's a zero-inflated column.
This is just for insight, and isn't used for anything.

## Data loading

In [10]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

input_data_file_name = "dataset_transformed"

data_path_format = exp_utils.BASE_PATH + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

meta_header_count = 5
pr_merged_key = "PullRequestIsMerged"
first_time_contributor_key = "SubmitterIsFirstTimeContributor"

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'
len(df)=1216221.
len(df.columns)=51



Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147314300.0,5093466.0,808.2764,0.502795,0.4028468,0.09859457,0.3751558,0.05921727,0.419587,0.1184307,...,0.009537892,0.01753143,0.05184194,0.04184437,0.04874358,0.04224923,0.04721868,0.05028227,0.05627708,0.07173808
std,99536310.0,7967840.0,2296.849,0.471079,0.4523912,0.2907958,0.44798,0.2284907,0.2256942,0.07152478,...,0.052598,0.08246395,0.08803506,0.08230781,0.09031063,0.08361355,0.102868,0.1043392,0.1113767,0.117461
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59258680.0,467330.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2366019,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136723700.0,1634092.0,199.0,0.6666667,0.0,0.0,0.0,0.0,0.4433628,0.07525668,...,0.0,0.0,0.004480431,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226098700.0,6254382.0,684.0,1.0,0.9393939,0.0,0.9230769,0.0,0.586308,0.1505134,...,0.0,0.0,0.07179541,0.04785956,0.0593604,0.04587383,0.02909834,0.05172123,0.06458231,0.1096827
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Filter columns

In [11]:
from wmeijer_utils.regex import get_matching

pat1 = r'\b.*LinkIntensity.*'
# pat1 = r'\b.*DegreeCentrality.*'
# pat = r'\b(?!=.*Dependency).*Ecosystem.*'

matching = list(get_matching(df.columns, pat1))
print(matching)

df = df[matching]

['ln(1 + EcosystemIntegratorToSubmitterLinkIntensity)', 'ln(1 + EcosystemSubmitterToIntegratorLinkIntensity)', 'ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity)', 'ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)']


### Predictor Zero Counts

Counts how many fields have partial zero data to see if they are worth including in this study

In [12]:
import regex as re
from typing import Callable


def count_zeroes(_df: pd.DataFrame):
    zeroes = pd.DataFrame()
    zeroes['zeroes'] = _df.eq(0).sum()
    zeroes['non_zeroes'] = len(_df) - zeroes['zeroes']
    zeroes['percentage_zeroes'] = (zeroes['zeroes'] / len(_df)) * 100

    binary_fields = _df.select_dtypes(exclude='number').columns
    ratio_fields = [field for field in _df.columns if len(
        re.findall(r'.*SuccessRate.*', field)) > 0]

    def get_fields_with_requirements(req: Callable[[str, dict], bool]) -> list[str]:
        included_rows = []
        for name, row_data in zeroes.iterrows():
            # We're only interested in continuous data here.
            # The ratio fields have a meaningful 0.
            if name in binary_fields or name in ratio_fields:
                continue

            if req(name, row_data):
                included_rows.append(name)
                print(
                    f'{name}: {row_data["zeroes"]:.0f} zeroes, {row_data["non_zeroes"]:.0f} non-zeroes ({row_data["percentage_zeroes"]:.03f}% zeroes).')

        return included_rows

    rare_cutoff = 80
    print(f"\nRare events ({rare_cutoff}+% zeroes):")
    rare_events = get_fields_with_requirements(lambda _, row_data:
                                               row_data['percentage_zeroes'] > rare_cutoff)

    print("\nNon-rare events:")
    non_rare_events = get_fields_with_requirements(lambda name, _:
                                                   name not in rare_events)

    return rare_events, non_rare_events


### Rare events over all data

In [13]:
rare_events, non_rare_events = count_zeroes(df)

print("\nEvent counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 879387 zeroes, 336834 non-zeroes (72.305% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 848569 zeroes, 367652 non-zeroes (69.771% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 824271 zeroes, 391950 non-zeroes (67.773% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 679868 zeroes, 536353 non-zeroes (55.900% zeroes).

Event counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events for any variable.

In [14]:
import regex as re

df.select_dtypes(include='number')
independent_fields = [field for field in df.columns[meta_header_count:]
                      if len(list(re.findall(r'.*Control.*', field))) == 0]
print(independent_fields)
independent_df = df[df[independent_fields].gt(0).any(axis=1)]
print(f'{len(independent_df)=}\n')

rare_events, non_rare_events = count_zeroes(independent_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


[]
len(independent_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events over ecosystem experience

It performs the same test but now on entries that have some form of ecosystem experience.

In [15]:
import regex as re

ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*Ecosystem.*', field))) > 0]
print(ecosystem_fields)
ecosystem_df = df[df[ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

[]
len(ecosystem_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events in dependency ecosystems

In [16]:
import regex as re

dep_ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*DependencyEcosystem.*', field))) > 0]
print(dep_ecosystem_fields)
dep_ecosystem_df = df[df[dep_ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(dep_ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

[]
len(dep_ecosystem_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events in shared experience

In [17]:
import regex as re

shared_experience_fields = [field for field in df.columns[meta_header_count:]
                            if len(list(re.findall(r'.*SharedExperience.*', field))) > 0]
print(shared_experience_fields)
shared_experience_df = df[df[shared_experience_fields].gt(0).any(axis=1)]
print(f'{len(shared_experience_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


[]
len(shared_experience_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


In [18]:
import datetime

print(datetime.datetime.now())

2024-02-16 14:51:37.609150
