# Feature Construction

This script constructs the following features:
- Is First-time contributor field
- Second-order degree centrality
- Link intensity

These fields are added to the dataset, and the fields they are composed of are removed (in the case of aggregated features).
The final dataset is written to a `.csv` file.

## Dataset loading

In [27]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

# Name (without extension) of the dataset that is loaded from the ``final_data`` folder.
input_data_file_name = "non_ftc_data"

# Path template; it is formatted again later on to build the output path.
data_path_format = exp_utils.BASE_PATH + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")


# The first line of the file holds the column names.
df: pd.DataFrame = pd.read_csv(data_path, header=0)

# Reports the dimensions of the loaded dataset.
print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

# Summary statistics, displayed as the cell's output.
df.describe()

data_path='/workspaces/msc_thesis/data//final_data/non_ftc_data.csv'
len(df)=1815972.
len(df.columns)=97



Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In),FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.PRCommenterToSubmitter-In),FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.PRCommenterToCommenter-In),FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.IssueCommenterToCommenter-In),FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.IssueCommenterToSubmitter-In),...,SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter,SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter,EcosystemExperienceSubmitterIssueSubmissionCount,EcosystemExperienceSubmitterIssueCommentCount,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount
count,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,...,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0
mean,147475100.0,5198276.0,3807.207,31901.11,4.947501,42.6489,13.826,15.39485,10.21316,10.22213,...,0.4025249,7.930369,2.615631,17.85944,0.1743463,0.8645552,2.274088,15.70265,0.1854588,1.355166
std,100429400.0,8100370.0,8071.615,139933.5,72.83362,2501.879,353.9189,473.5927,378.2511,204.3811,...,3.021098,74.31202,7.98554,67.00519,1.580686,11.86131,7.064174,61.3108,2.446396,16.67164
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,58073220.0,463157.0,100.0,52.9,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,137194000.0,1620265.0,548.0,979.9167,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,227802600.0,6384100.0,2817.0,7328.35,3.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,2.0,7.0,0.0,0.0,2.0,6.0,0.0,0.0
max,361775600.0,59761230.0,82976.0,4489490.0,10000.0,346128.0,87892.0,122792.0,109504.0,47640.0,...,219.0,19511.0,2332.0,7630.0,373.0,927.0,2332.0,7630.0,741.0,2764.0


## Construct first-time contributor

This is done by merging the datasets generated by `sliding_window_2` and `sliding_window_3`.
The latter cannot generate FTC status accurately because it is multithreaded.

In [28]:
def replace(
    df1: pd.DataFrame, df2: pd.DataFrame, match_column: str, overwritten_column: str
) -> pd.DataFrame:
    """Overwrites a column of ``df1`` with the values found in ``df2``.

    Rows are paired on ``match_column`` using a left join, so every row of
    ``df1`` is kept. The change in the value counts of ``overwritten_column``
    is printed to show the impact of the replacement.

    Because of the left join, rows of ``df1`` without a match in ``df2`` end
    up with a missing value in ``overwritten_column``.

    :param df1: dataframe whose column is overwritten; it is not modified in place.
    :param df2: dataframe providing the replacement values.
    :param match_column: column present in both dataframes, used to pair rows.
    :param overwritten_column: column present in both dataframes that is replaced.
    :return: a new dataframe with the columns of ``df1``, where
        ``overwritten_column`` holds the values of ``df2`` and is moved to
        the last position.
    """
    replacement = df2[[match_column, overwritten_column]]

    old_counts = df1[overwritten_column].value_counts()

    # The column coming from ``df1`` gets the "_incorrect" suffix and is
    # dropped; the one coming from ``df2`` keeps the original name.
    # NOTE(review): duplicate ``match_column`` values in ``df2`` would multiply
    # the matching rows of ``df1``; the keys are assumed to be unique.
    merged = df1.merge(
        replacement, how="left", on=match_column, suffixes=("_incorrect", "")
    )
    merged = merged.drop(columns=f"{overwritten_column}_incorrect")

    new_counts = merged[overwritten_column].value_counts()

    # A value that occurs on only one side counts as zero on the other side.
    # A plain subtraction would align the indices and report NaN instead.
    diff = new_counts.sub(old_counts, fill_value=0).astype("int64")
    print(f"Impact overwriting '{overwritten_column}':\n{diff}")

    return merged


# Dataset that holds the accurate first-time contributor (FTC) status.
other_data_name = "ftc_data"
other_data_path = f"{exp_utils.BASE_PATH}/final_data/{other_data_name}.csv"
print(f"{other_data_path=}\n")

first_time_contributor_key = "SubmitterIsFirstTimeContributor"

# Only the join key and the FTC flag are used by ``replace``, so the other
# columns of the file are not loaded.
other_df: pd.DataFrame = pd.read_csv(
    other_data_path, usecols=["ID", first_time_contributor_key]
)

# Overwrites the FTC flag of ``df`` with the one stored in the other dataset,
# pairing the rows on their "ID".
df = replace(df, other_df, "ID", first_time_contributor_key)

# The helper dataframe is no longer needed.
del other_df

other_data_path='/workspaces/msc_thesis/data//final_data/ftc_data.csv'

Impact overwriting 'SubmitterIsFirstTimeContributor':
SubmitterIsFirstTimeContributor
False    215739
True    -215739
Name: count, dtype: int64


## Construct second-order degree centrality

Calculates 2nd-order degree centrality by aggregating the 2nd-order in-/out-degree values, weighting them according to their prevalence using the normalized complement of each edge type's share of the total edge count; i.e., such that more common edges are weighted less.

_Note, the fields are named 1st-order degree centrality. This is a mistake, but I don't want to rewrite all the data retrieval code._

In [29]:
import regex as re

# Selects the columns holding in-degree values. The fields are named
# "FirstOrderDegree" although they hold the 2nd-order degrees.
fo_in_exp = r".*FirstOrderDegree.*In\).*"
so_degree_in_features = [
    column for column in df.columns if re.match(fo_in_exp, column)
]

# Selects the columns holding out-degree values.
fo_out_exp = r".*FirstOrderDegree.*Out\).*"
so_degree_out_features = [
    column for column in df.columns if re.match(fo_out_exp, column)
]

print(f"{len(so_degree_in_features)=}, {len(so_degree_out_features)=}.")

len(so_degree_in_features)=25, len(so_degree_out_features)=25.


In [30]:
import json

# Calculates the edge weights.

# NOTE: These numbers are not derived automatically; they were copied by hand
# from the terminal output of ``sliding_window_2.py`` when run on the dataset.
edge_counts = {
    "PRIntegratorToSubmitter": 1043559,
    "PRCommenterToSubmitter": 2808889,
    "PRCommenterToCommenter": 24595282,
    "IssueCommenterToCommenter": 73500466,
    "IssueCommenterToSubmitter": 5806894,
}
total_edges = sum(edge_counts.values())

# Alternative per-edge weights that are currently not applied:
# ahp_weights = {
#     "PRIntegratorToSubmitter": 0.09,
#     "PRCommenterToSubmitter": 0.16,
#     "PRCommenterToCommenter": 0.30,
#     "IssueCommenterToCommenter": 0.16,
#     "IssueCommenterToSubmitter": 0.30
# }


def calculate_weight(value):
    """Returns the normalized complement of an edge count.

    The complement of ``value``'s share of ``total_edges`` is divided by the
    number of edge types minus one, so that more common edges weigh less.
    """
    share_of_total = value / (total_edges)
    return (1 - share_of_total) / (len(edge_counts) - 1)


# Weight per edge type before normalization (optionally: * ahp_weights[edge]).
raw_edge_weights = {
    edge: calculate_weight(count) for edge, count in edge_counts.items()
}

# Rescales the weights such that they sum to one.
total_weight = sum(raw_edge_weights.values())
edge_weights = {
    edge: raw_weight / total_weight for edge, raw_weight in raw_edge_weights.items()
}

print("Edge weights:\n", json.dumps(edge_weights, indent=4))

Edge weights:
 {
    "PRIntegratorToSubmitter": 0.24757886379195637,
    "PRCommenterToSubmitter": 0.24348316399717174,
    "PRCommenterToCommenter": 0.19293707610471117,
    "IssueCommenterToCommenter": 0.07947333160781547,
    "IssueCommenterToSubmitter": 0.23652756449834528
}


In [31]:
from functools import partial
import itertools


# Feature construction
# Template of the column names that hold the individual degree values.
base_field_name = (
    "FirstOrderDegreeCentralityV2({connecting_edge}.{experience_edge}-{direction})"
)


def calculate_weighted_so_centrality(series: pd.Series, direction: str):
    """Aggregates the 2nd-order degrees into one centrality measure.

    Every combination of a connecting edge and an experience edge contributes
    the value of its field, multiplied by the weights of both edge types.

    :param series: holds the degree fields; only label lookups and arithmetic
        are applied to it.
    :param direction: the direction placed in the field names, i.e., "In" or "Out".
    :return: the weighted sum of all degree fields of the given direction.
    """

    weighted_total = 0.0

    # Visits the pairs with the connecting edge in the outer loop and the
    # experience edge in the inner loop.
    for connecting_edge, connection_weight in edge_weights.items():
        for experience_edge, experience_weight in edge_weights.items():
            field = base_field_name.format(
                connecting_edge=connecting_edge,
                experience_edge=experience_edge,
                direction=direction,
            )
            weighted_total += connection_weight * experience_weight * series[field]

    return weighted_total


# The aggregation only performs column lookups and arithmetic, so it is
# applied to whole columns at once instead of being called once per row.
weighted_in_fod_key = "WeightedFirstOrderInDegreeCentrality"
df[weighted_in_fod_key] = calculate_weighted_so_centrality(
    df[so_degree_in_features], direction="In"
)

weighted_out_fod_key = "WeightedFirstOrderOutDegreeCentrality"
df[weighted_out_fod_key] = calculate_weighted_so_centrality(
    df[so_degree_out_features], direction="Out"
)

# Summary statistics of the constructed fields, displayed as the cell's output.
centr_fields = [weighted_in_fod_key, weighted_out_fod_key]
df[centr_fields].describe()

Unnamed: 0,WeightedFirstOrderInDegreeCentrality,WeightedFirstOrderOutDegreeCentrality
count,1815972.0,1815972.0
mean,50.62849,51.59898
std,1718.804,2461.762
min,0.0,0.0
25%,0.06129529,0.0
50%,1.398842,0.7980457
75%,6.308026,4.22292
max,1035309.0,1488839.0


In [32]:
# The individual degree fields are replaced by the two aggregated fields.
features_to_drop = so_degree_in_features + so_degree_out_features
print(f"{len(features_to_drop)=}")
print(features_to_drop)
df = df.drop(columns=features_to_drop)

len(features_to_drop)=50
['FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.PRIntegratorToSubmitter-In)', 'FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.PRCommenterToSubmitter-In)', 'FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.PRCommenterToCommenter-In)', 'FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.IssueCommenterToCommenter-In)', 'FirstOrderDegreeCentralityV2(PRIntegratorToSubmitter.IssueCommenterToSubmitter-In)', 'FirstOrderDegreeCentralityV2(PRCommenterToSubmitter.PRIntegratorToSubmitter-In)', 'FirstOrderDegreeCentralityV2(PRCommenterToSubmitter.PRCommenterToSubmitter-In)', 'FirstOrderDegreeCentralityV2(PRCommenterToSubmitter.PRCommenterToCommenter-In)', 'FirstOrderDegreeCentralityV2(PRCommenterToSubmitter.IssueCommenterToCommenter-In)', 'FirstOrderDegreeCentralityV2(PRCommenterToSubmitter.IssueCommenterToSubmitter-In)', 'FirstOrderDegreeCentralityV2(PRCommenterToCommenter.PRIntegratorToSubmitter-In)', 'FirstOrderDegreeCentralityV2(PRCommenterToCommenter

## Construct Link intensity

Calculates link intensity by aggregating the individual fields, weighting them using the normalized complement of each edge type's share of the total edge count (the same weights as used for 2nd-order degree centrality).

In [33]:
# Fields aggregated into the submitter-to-integrator link intensity.
# NOTE: the order matters. The weights are passed as ``edge_weights.values()``
# and paired with these fields by position; presumably each entry corresponds
# to the edge type at the same position in ``edge_counts``.
sub_to_int_fields = [
    "SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator",
    "SharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator",
    "SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter",
    "SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter",
    "SharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator",
]

# Fields aggregated into the integrator-to-submitter link intensity, in the
# same positional order. The two discussion participation fields are shared
# with ``sub_to_int_fields``.
int_to_sub_fields = [
    "SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter",
    "SharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter",
    "SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter",
    "SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter",
    "SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter",
]

In [34]:
def calculate_intensity(
    series: pd.Series, source_fields: list[str], source_weights: list[float]
) -> float:
    """Calculates weighted sum of link intensity fields.

    Fields and weights are paired by position. Only label lookups and
    arithmetic are applied to ``series``, so passing a dataframe yields the
    weighted sum per row.

    :param series: holds the values of the link intensity fields.
    :param source_fields: names of the fields that are aggregated.
    :param source_weights: weight per field; any iterable works, e.g., the
        values view of a dictionary.
    :return: the sum of each field's value multiplied by its weight.
    :raises ValueError: if the number of fields and weights differ.
    """
    fields = list(source_fields)
    weights = list(source_weights)

    # ``zip`` would silently ignore the surplus fields or weights.
    if len(fields) != len(weights):
        raise ValueError(
            f"Expected one weight per field; got {len(fields)} fields "
            f"and {len(weights)} weights."
        )

    return sum(series[field] * weight for field, weight in zip(fields, weights))


# The aggregation only performs column lookups and arithmetic, so it is
# applied to whole columns at once instead of being called once per row.
# The results agree with the row-wise version up to floating-point rounding.
its_key = "IntegratorToSubmitterLinkIntensity"
df[its_key] = calculate_intensity(
    df,
    source_fields=int_to_sub_fields,
    source_weights=edge_weights.values(),
)

tsi_key = "SubmitterToIntegratorLinkIntensity"
df[tsi_key] = calculate_intensity(
    df,
    source_fields=sub_to_int_fields,
    source_weights=edge_weights.values(),
)

# Summary statistics of the constructed fields, displayed as the cell's output.
df[[its_key, tsi_key]].describe()

Unnamed: 0,IntegratorToSubmitterLinkIntensity,SubmitterToIntegratorLinkIntensity
count,1815972.0,1815972.0
mean,3.33147,4.316357
std,29.71514,30.35006
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.07947333,0.6582846
max,4436.984,4238.017


In [35]:
# The individual fields are replaced by the two aggregated intensity fields.
# The fields shared by both lists occur twice in the combined list.
features_to_drop = sub_to_int_fields + int_to_sub_fields
print(f"{len(features_to_drop)=}")
print(features_to_drop)
df = df.drop(columns=features_to_drop)

len(features_to_drop)=10
['SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator', 'SharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator', 'SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter', 'SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter', 'SharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator', 'SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter', 'SharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter', 'SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter', 'SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter', 'SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter']


## Save dataset to file

In [36]:
# The output is stored in the same folder as the input, using the shared path template.
output_data_file_name = "dataset_with_constructed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# The dataframe's index is not written to the file.
df.to_csv(path_or_buf=output_path, index=False)

# Summary statistics of the final dataset, displayed as the cell's output.
df.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_with_constructed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedFirstOrderInDegreeCentrality,WeightedFirstOrderOutDegreeCentrality,IntegratorToSubmitterLinkIntensity,SubmitterToIntegratorLinkIntensity
count,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,...,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0
mean,147475100.0,5198276.0,3807.207,31901.11,4.947501,193.2087,16.7935,0.5352468,21.70048,0.3702776,...,0.1743463,0.8645552,2.274088,15.70265,0.1854588,1.355166,50.62849,51.59898,3.33147,4.316357
std,100429400.0,8100370.0,8071.615,139933.5,72.83362,482.7151,55.48605,0.462326,65.63799,0.4472133,...,1.580686,11.86131,7.064174,61.3108,2.446396,16.67164,1718.804,2461.762,29.71514,30.35006
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,58073220.0,463157.0,100.0,52.9,1.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.06129529,0.0,0.0,0.0
50%,137194000.0,1620265.0,548.0,979.9167,1.0,30.0,2.0,0.8,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.398842,0.7980457,0.0,0.0
75%,227802600.0,6384100.0,2817.0,7328.35,3.0,141.0,17.0,1.0,15.0,0.9186047,...,0.0,0.0,2.0,6.0,0.0,0.0,6.308026,4.22292,0.07947333,0.6582846
max,361775600.0,59761230.0,82976.0,4489490.0,10000.0,4281.0,1703.0,1.0,2038.0,1.0,...,373.0,927.0,2332.0,7630.0,741.0,2764.0,1035309.0,1488839.0,4436.984,4238.017


In [38]:
import datetime

# Prints the current local date and time.
current_time = datetime.datetime.now()
print(current_time)

2024-01-29 13:46:02.478242
