# Rare event testing

Identifies what features are relatively rare; i.e., whether it's a zero-inflated column.
This is just for insight, and isn't used for anything.

## Data loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

input_data_file_name = "dataset_transformed"

data_path_format = exp_utils.BASE_PATH + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

meta_header_count = 5
pr_merged_key = "PullRequestIsMerged"
first_time_contributor_key = "SubmitterIsFirstTimeContributor"

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'
len(df)=1216221.
len(df.columns)=43



Unnamed: 0,ID,Project Name,Submitter ID,PR Number,Closed At,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderDegreeCentrality),ln(1 + EcosystemLinkIntensity),ln(1 + IntraProjectLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,908116.4,8677.039,0.0,0.0,0.0,0.5027905,0.4027001,0.09860918,0.3749753,0.05920373,...,0.01145965,0.0197203,0.07959556,0.1343719,0.009534855,0.01751229,0.06396949,0.06042839,0.02191224,0.04468485
std,524353.1,5806.587,0.0,0.0,0.0,0.4710578,0.452342,0.2908001,0.447916,0.2284681,...,0.0540139,0.08011601,0.1179346,0.1749533,0.05258916,0.08239552,0.09937732,0.1003326,0.08736962,0.1013914
min,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454250.0,3224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,907870.0,8413.0,0.0,0.0,0.0,0.6666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.008438514,0.0,0.0,0.0
75%,1362318.0,13570.0,0.0,0.0,0.0,1.0,0.9387755,0.0,0.9230769,0.0,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.0989143,0.08847414,0.0,0.02582313
max,1815970.0,20094.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Filter columns

In [2]:
# from wmutils.regex import get_matching

# pat1 = r'\b.*LinkIntensity.*'
# # pat1 = r'\b.*DegreeCentrality.*'
# # pat = r'\b(?!=.*Dependency).*Ecosystem.*'

# matching = list(get_matching(df.columns, pat1))
# print(matching)

# df = df[matching]

### Predictor Zero Counts

Counts how many fields have partial zero data to see if they are worth including in this study

In [3]:
import regex as re
from typing import Callable


def count_zeroes(_df: pd.DataFrame):
    zeroes = pd.DataFrame()
    zeroes['zeroes'] = _df.eq(0).sum()
    zeroes['non_zeroes'] = len(_df) - zeroes['zeroes']
    zeroes['percentage_zeroes'] = (zeroes['zeroes'] / len(_df)) * 100

    binary_fields = _df.select_dtypes(exclude='number').columns
    ratio_fields = [field for field in _df.columns if len(
        re.findall(r'.*SuccessRate.*', field)) > 0]

    def get_fields_with_requirements(req: Callable[[str, dict], bool]) -> list[str]:
        included_rows = []
        for name, row_data in zeroes.iterrows():
            # We're only interested in continuous data here.
            # The ratio fields have a meaningful 0.
            if name in binary_fields or name in ratio_fields:
                continue

            if req(name, row_data):
                included_rows.append(name)
                print(
                    f'{name}: {row_data["zeroes"]:.0f} zeroes, {row_data["non_zeroes"]:.0f} non-zeroes ({row_data["percentage_zeroes"]:.03f}% zeroes).')

        return included_rows

    rare_cutoff = 80
    print(f"\nRare events ({rare_cutoff}+% zeroes):")
    rare_events = get_fields_with_requirements(lambda _, row_data:
                                               row_data['percentage_zeroes'] > rare_cutoff)

    print("\nNon-rare events:")
    non_rare_events = get_fields_with_requirements(lambda name, _:
                                                   name not in rare_events)

    return rare_events, non_rare_events


### Rare events over all data

In [4]:
rare_events, non_rare_events = count_zeroes(df)

print("\nEvent counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


Rare events (80+% zeroes):
Submitter ID: 1216221 zeroes, 0 non-zeroes (100.000% zeroes).
PR Number: 1216221 zeroes, 0 non-zeroes (100.000% zeroes).
Closed At: 1216221 zeroes, 0 non-zeroes (100.000% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount): 1081753 zeroes, 134468 non-zeroes (88.944% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterPullRequestCommentCount): 1100176 zeroes, 116045 non-zeroes (90.459% zeroes).
ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount): 1134875 zeroes, 81346 non-zeroes (93.312% zeroes).
ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount): 1142067 zeroes, 74154 non-zeroes (93.903% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount): 1147466 zeroes, 68755 non-zeroes (94.347% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount): 1123197 zeroes, 93024 non-zeroes (92.351% zeroes).
ln(1 + InversedDependencyEcosystemExperienceS

### Rare events for any variable.

In [5]:
import regex as re

df.select_dtypes(include='number')
independent_fields = [field for field in df.columns[meta_header_count:]
                      if len(list(re.findall(r'.*Control.*', field))) == 0]
print(independent_fields)
independent_df = df[df[independent_fields].gt(0).any(axis=1)]
print(f'{len(independent_df)=}\n')

rare_events, non_rare_events = count_zeroes(independent_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


['PullRequestIsMerged', 'IntraProjectSubmitterPullRequestSuccessRate', 'EcosystemExperienceSubmitterPullRequestSuccessRate', 'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'SubmitterIsFirstTimeContributor', 'ln(1 + IntraProjectSubmitterPullRequestSubmissionCount)', 'ln(1 + IntraProjectSubmitterPullRequestCommentCount)', 'ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + DependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1

### Rare events over ecosystem experience

It performs the same test but now on entries that have some form of ecosystem experience.

In [6]:
import regex as re

ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*Ecosystem.*', field))) > 0]
print(ecosystem_fields)
ecosystem_df = df[df[ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

['EcosystemExperienceSubmitterPullRequestSuccessRate', 'DependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + DependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount)', 'ln(1 + EcosystemExperienceSubmitterIssueCommentCount)', 'ln(1 + DependencyEcosystem

### Rare events in dependency ecosystems

In [7]:
import regex as re

dep_ecosystem_fields = [field for field in df.columns[meta_header_count:] if len(list(
    re.findall(r'.*DependencyEcosystem.*', field))) > 0]
print(dep_ecosystem_fields)
dep_ecosystem_df = df[df[dep_ecosystem_fields].gt(0).any(axis=1)]
print(f'{len(dep_ecosystem_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')

['DependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate', 'ln(1 + DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + DependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)', 'ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount)', 'ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount)', 'ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount)', 'ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount)', 'ln(1 + InversedDependencyEcosystemExperienceSu

### Rare events in shared experience

In [8]:
import regex as re

shared_experience_fields = [field for field in df.columns[meta_header_count:]
                            if len(list(re.findall(r'.*SharedExperience.*', field))) > 0]
print(shared_experience_fields)
shared_experience_df = df[df[shared_experience_fields].gt(0).any(axis=1)]
print(f'{len(shared_experience_df)=}\n')

rare_events, non_rare_events = count_zeroes(dep_ecosystem_df)

print("\nEvent Counts:")
print(f'{len(rare_events)=}')
print(f'{len(non_rare_events)=}')


[]
len(shared_experience_df)=0


Rare events (80+% zeroes):
Submitter ID: 793273 zeroes, 0 non-zeroes (100.000% zeroes).
PR Number: 793273 zeroes, 0 non-zeroes (100.000% zeroes).
Closed At: 793273 zeroes, 0 non-zeroes (100.000% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount): 658805 zeroes, 134468 non-zeroes (83.049% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterPullRequestCommentCount): 677228 zeroes, 116045 non-zeroes (85.371% zeroes).
ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount): 711927 zeroes, 81346 non-zeroes (89.746% zeroes).
ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount): 719119 zeroes, 74154 non-zeroes (90.652% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount): 724518 zeroes, 68755 non-zeroes (91.333% zeroes).
ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount): 700249 zeroes, 93024 non-zeroes (88.273% zeroes).
ln(1 + InversedDepende

In [9]:
import datetime

print(datetime.datetime.now())

2024-03-21 12:14:46.606844
