# Rare event testing

Identifies what features are relatively rare; i.e., whether it's a zero-inflated column.
This is just for insight, and isn't used for anything.

## Data loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

input_data_file_name = "dataset_transformed"

data_path_format = exp_utils.BASE_PATH + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

meta_header_count = 5
pr_merged_key = "PullRequestIsMerged"
first_time_contributor_key = "SubmitterIsFirstTimeContributor"

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'
len(df)=1216221.
len(df.columns)=55



Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.5029367,0.4028501,0.09853429,0.3751776,0.05922709,0.4195426,0.1183858,...,0.009542167,0.01752697,0.05187147,0.04183969,0.04879307,0.04224746,0.00739138,0.0126345,0.009940854,0.0302356
std,99544090.0,7968202.0,2304.617,0.4710606,0.4523397,0.2907111,0.4479337,0.2285036,0.2256901,0.07147764,...,0.05263118,0.08245257,0.08805299,0.0822795,0.09041389,0.08358805,0.04788073,0.06305883,0.04629326,0.08392113
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.68,0.0,0.0,0.0,0.0,0.4433349,0.07525668,...,0.0,0.0,0.004497554,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,1.0,0.9391304,0.0,0.9230769,0.0,0.5863557,0.1505134,...,0.0,0.0,0.07187211,0.04786833,0.05941411,0.04589478,0.0,0.0,0.0,0.0
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Filter columns

In [2]:
from wmeijer_utils.regex import get_matching

pat1 = r'\b.*LinkIntensity.*'
# pat1 = r'\b.*DegreeCentrality.*'
# pat = r'\b(?!=.*Dependency).*Ecosystem.*'

matching = list(get_matching(df.columns, pat1))
print(matching)

df = df[matching]

['ln(1 + EcosystemIntegratorToSubmitterLinkIntensity)', 'ln(1 + EcosystemSubmitterToIntegratorLinkIntensity)', 'ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity)', 'ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)']


### Predictor Zero Counts

Counts how many fields have partial zero data to see if they are worth including in this study

In [3]:
import regex as re
from typing import Callable


def count_zeroes(_df: pd.DataFrame):
    zeroes = pd.DataFrame()
    zeroes['zeroes'] = _df.eq(0).sum()
    zeroes['non_zeroes'] = len(_df) - zeroes['zeroes']
    zeroes['percentage_zeroes'] = (zeroes['zeroes'] / len(_df)) * 100

    binary_fields = _df.select_dtypes(exclude='number').columns
    ratio_fields = [field for field in _df.columns if len(
        re.findall(r'.*SuccessRate.*', field)) > 0]

    def get_fields_with_requirements(req: Callable[[str, dict], bool]) -> list[str]:
        included_rows = []
        for name, row_data in zeroes.iterrows():
            # We're only interested in continuous data here.
            # The ratio fields have a meaningful 0.
            if name in binary_fields or name in ratio_fields:
                continue

            if req(name, row_data):
                included_rows.append(name)
                print(
                    f'{name}: {row_data["zeroes"]:.0f} zeroes, {row_data["non_zeroes"]:.0f} non-zeroes ({row_data["percentage_zeroes"]:.03f}% zeroes).')

        return included_rows

    rare_cutoff = 80
    print(f"\nRare events ({rare_cutoff}+% zeroes):")
    rare_events = get_fields_with_requirements(lambda _, row_data:
                                               row_data['percentage_zeroes'] > rare_cutoff)

    print("\nNon-rare events:")
    non_rare_events = get_fields_with_requirements(lambda name, _:
                                                   name not in rare_events)

    return rare_events, non_rare_events


### Rare events over all data

In [4]:
rare_events, non_rare_events = count_zeroes(df)

print("\nEvent counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


Rare events (80+% zeroes):
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 1170511 zeroes, 45710 non-zeroes (96.242% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 1139728 zeroes, 76493 non-zeroes (93.711% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 1127214 zeroes, 89007 non-zeroes (92.682% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 982904 zeroes, 233317 non-zeroes (80.816% zeroes).

Non-rare events:

Event counts:
len(rare_events)=4
len(non_rare_events)=0


### Rare events for any variable.

In [5]:
import regex as re

df.select_dtypes(include='number')
independent_fields = [field for field in df.columns[meta_header_count:]
                      if len(list(re.findall(r'.*Control.*', field))) == 0]
print(independent_fields)
independent_df = df[df[independent_fields].gt(0).any(axis=1)]
print(f'{len(independent_df)=}\n')

rare_events, non_rare_events = count_zeroes(independent_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


[]
len(independent_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events over ecosystem experience

It performs the same test but now on entries that have some form of ecosystem experience.

In [6]:
import regex as re

ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*Ecosystem.*', field))) > 0]
print(ecosystem_fields)
ecosystem_df = df[df[ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

[]
len(ecosystem_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events in dependency ecosystems

In [7]:
import regex as re

dep_ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*DependencyEcosystem.*', field))) > 0]
print(dep_ecosystem_fields)
dep_ecosystem_df = df[df[dep_ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(dep_ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

[]
len(dep_ecosystem_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


### Rare events in shared experience

In [8]:
import regex as re

shared_experience_fields = [field for field in df.columns[meta_header_count:]
                            if len(list(re.findall(r'.*SharedExperience.*', field))) > 0]
print(shared_experience_fields)
shared_experience_df = df[df[shared_experience_fields].gt(0).any(axis=1)]
print(f'{len(shared_experience_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


[]
len(shared_experience_df)=0


Rare events (80+% zeroes):

Non-rare events:
ln(1 + EcosystemIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + EcosystemSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).
ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity): 0 zeroes, 0 non-zeroes (nan% zeroes).

Event Counts:
len(rare_events)=0
len(non_rare_events)=4


In [9]:
import datetime

print(datetime.datetime.now())

2024-02-19 10:19:53.905835
