# Feature Construction

This script constructs the following features:
- Is First-time contributor field
- Second-order degree centrality
- Link intensity

The constructed features are added to the dataset. For aggregated features, the fields they are composed of are removed.
The final dataset is written to a `.csv` file.

## Dataset loading

In [77]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

# Name (without extension) of the input dataset.
input_data_file_name = "non_ftc_data"

# Path template of the data files; ``{data_file_name}`` is filled in per file.
data_path_format = f"{exp_utils.BASE_PATH}/final_data/{{data_file_name}}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

# The first line of the file is used as the header.
df: pd.DataFrame = pd.read_csv(data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

# Summary statistics of the loaded dataset.
df.describe()

data_path='/workspaces/msc_thesis/data//final_data/non_ftc_data.csv'
len(df)=1815972.
len(df.columns)=151



Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.PRIntegratorToSubmitterV2-In),IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToSubmitterV2-In),IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToCommenterV2-In),IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToCommenterV2-In),IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToSubmitterV2-In),...,EcosystemSharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator,EcosystemSharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter,EcosystemExperienceSubmitterIssueSubmissionCount,EcosystemExperienceSubmitterIssueCommentCount,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount
count,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,...,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0
mean,147475100.0,5198276.0,3807.207,31901.11,4.947501,39.26438,10.76084,12.62702,8.083162,8.484018,...,1.720615,1.699145,2.615631,17.85944,0.1743463,0.8645552,2.274088,15.70265,0.1854588,1.355166
std,100429400.0,8100370.0,8071.615,139933.5,72.83362,2497.234,318.6353,450.6868,365.2262,196.2613,...,9.780338,9.77201,7.98554,67.00519,1.580686,11.86131,7.064174,61.3108,2.446396,16.67164
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,58073220.0,463157.0,100.0,52.9,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,137194000.0,1620265.0,548.0,979.9167,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,227802600.0,6384100.0,2817.0,7328.35,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,7.0,0.0,0.0,2.0,6.0,0.0,0.0
max,361775600.0,59761230.0,82976.0,4489490.0,10000.0,346128.0,87892.0,122792.0,109504.0,47640.0,...,7556.0,7556.0,2332.0,7630.0,373.0,927.0,2332.0,7630.0,741.0,2764.0


In [78]:
# Prints the name of every column in the loaded dataset.
print(list(df.columns))

['ID', 'Project Name', 'Submitter ID', 'PR Number', 'Closed At', 'PullRequestIsMerged', 'SubmitterIsFirstTimeContributor', 'ControlIntegratedBySameUser', 'ControlPullRequestLifeTimeInMinutes', 'ControlPullRequestHasComments', 'ControlNumberOfCommitsInPullRequest', 'ControlPullRequestHasCommentByExternalUser', 'ControlHasHashTagInDescription', 'IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.PRIntegratorToSubmitterV2-In)', 'IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToSubmitterV2-In)', 'IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToCommenterV2-In)', 'IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToCommenterV2-In)', 'IntraProjectSecondOrderDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToSubmitterV2-In)', 'IntraProjectSecondOrderDegreeCentrality(PRCommenterToSubmitterV2.PRIntegratorToSubmitterV2-In)', 'IntraProjectSecondOrderDegreeCentrality(PRCommenterToSubmit

## Construct first-time contributor

The FTC field calculated by `sliding_window_3` is by definition wrong (because it's multithreaded and FTC is calculated using a global operator whereas each thread only sees a subset of the data). This error is corrected here.

This can be done in two ways: (1) with a pre-existing dataset (e.g., one generated by `sliding_window_2`), or (2) by inferring it from the data. The method is picked automatically based on whether a pre-existing FTC data file exists. Because inferring it is a little slow, the inferred result is written to an FTC data file, which is re-used in future runs.

In [79]:
def replace(
    df1: pd.DataFrame, df2: pd.DataFrame, match_column: str, overwritten_column: str
) -> pd.DataFrame:
    """
    Overwrites a column of ``df1`` with the values of that column in ``df2``.

    Rows are matched on ``match_column`` using a left merge, so every row of
    ``df1`` is kept; rows without a match in ``df2`` end up with a missing
    value. The overwritten column is placed after the other columns of
    ``df1`` in the result.

    :param df1: the dataframe of which a column is overwritten.
    :param df2: the dataframe providing the new values.
    :param match_column: the column used to match rows of both dataframes.
    :param overwritten_column: the column that is overwritten.
    :return: a new dataframe; ``df1`` itself is not modified.
    """
    replace_data = df2[[match_column, overwritten_column]]

    old_counts = df1[overwritten_column].value_counts()

    # The old column gets the "_incorrect" suffix, the new one keeps its name.
    df1 = df1.merge(
        replace_data, how="left", on=match_column, suffixes=("_incorrect", "")
    )
    df1 = df1.drop(columns=f"{overwritten_column}_incorrect")

    new_counts = df1[overwritten_column].value_counts()

    # A value that occurs only before or only after the overwrite counts as
    # zero on the other side; a plain subtraction would report NaN for it.
    diff = new_counts.sub(old_counts, fill_value=0).astype("int64")
    print(f"Impact overwriting '{overwritten_column}':\n{diff}")

    return df1


# Location of the FTC data file.
pre_existing_ftc_data_path = f"{exp_utils.BASE_PATH}/final_data/ftc_data.csv"
# Name of the first-time contributor column in the dataset.
ftc_key = "SubmitterIsFirstTimeContributor"


def replace_with_pre_existing_data():
    """
    Overwrites the FTC column of the global ``df`` with the values
    stored in the FTC data file, matching rows on their ``ID``.
    """
    global df
    print(f"{pre_existing_ftc_data_path=}\n")

    stored_ftc_df: pd.DataFrame = pd.read_csv(pre_existing_ftc_data_path)
    df = replace(
        df1=df,
        df2=stored_ftc_df,
        match_column="ID",
        overwritten_column=ftc_key,
    )

    # The loaded file is not needed once its values are merged in.
    del stored_ftc_df

In [80]:
def replace_without_pre_existing_data():
    """
    Infers the first-time contributor (FTC) flag from the global ``df``
    and writes the corrected flags to the FTC data file.

    Per (project, submitter) pair, only the first row that is flagged as
    FTC keeps its flag; every later row of that pair that is flagged as FTC
    is set to False. Rows that are not flagged are left untouched, as the
    flag can only illegally go back to FTC. "First" and "later" refer to
    the row order of ``df``.
    """

    global df

    # Pairs are compared on their actual values instead of on a hash of
    # them, so two different pairs can never be mistaken for each other.
    contributor_key = ["Project Name", "Submitter ID"]

    # Same truthiness test as a row-wise ``if not is_ftc`` check.
    flagged_as_ftc = df[ftc_key].astype(bool)

    # Among the flagged rows, every occurrence of a pair after its first is
    # a stale flag. This avoids a Python-level loop over every row.
    is_repeat = df.loc[flagged_as_ftc, contributor_key].duplicated(keep="first")
    stale_index = is_repeat.index[is_repeat]

    total_updated = len(stale_index)
    if total_updated:
        # NOTE(review): assumes the index labels of ``df`` are unique, as is
        # the case for a frame coming straight from ``pd.read_csv``.
        df.loc[stale_index, ftc_key] = False

    print(f"{total_updated=}")

    # Stores the first six columns together with the corrected flag, such
    # that the file can be re-used in future runs.
    ftc_df: pd.DataFrame = df[[*df.columns[:6], ftc_key]]
    ftc_df.to_csv(pre_existing_ftc_data_path, index=False)

In [81]:
from os import path

# Infers the FTC data only when no FTC data file is available yet;
# otherwise the stored file is used.
if not path.exists(pre_existing_ftc_data_path):
    print("Inferring FTC data.")
    replace_without_pre_existing_data()
else:
    print("Using FTC data file.")
    replace_with_pre_existing_data()

Using FTC data file.
pre_existing_ftc_data_path='/workspaces/msc_thesis/data//final_data/ftc_data.csv'



Impact overwriting 'SubmitterIsFirstTimeContributor':
SubmitterIsFirstTimeContributor
False    215739
True    -215739
Name: count, dtype: int64


## Construct second-order degree centrality

Calculates 2nd-order degree centrality by aggregating the 2nd-order in-/out-degree values, weighting them according to the prevalence of their edge types. Each weight is the normalized complement of the edge type's share of the total edge count (one minus the share, scaled so the weights sum to one); i.e., more common edges are weighted less.

_Note, the fields are named 1st-order degree centrality. This is a mistake, but I don't want to rewrite all the data retrieval code._

### Calculate edge weights

In [82]:
import json


# NOTE: There's nothing automated about these numbers;
# I copy-pasted them from the terminal after running ``sliding_window_3.py`` on the dataset.
# The order of these scores matters (the link intensity features assume order).
# Maps each edge type to the number of edges of that type.
edge_counts = {
    "PRIntegratorToSubmitterV2": 1043559,
    "PRCommenterToSubmitterV2": 2808889,
    "PRCommenterToCommenterV2": 24595282,
    "IssueCommenterToCommenterV2": 73500466,
    "IssueCommenterToSubmitterV2": 5806894,
}
# Number of edges across all edge types.
total_edges = sum(edge_counts.values())


def calculate_weight(value):
    """
    Returns the normalized complement of an edge count: one minus its
    share of ``total_edges``, divided by the number of other edge types.
    """
    share_of_total = value / (total_edges)
    number_of_other_edge_types = len(edge_counts) - 1
    return (1 - share_of_total) / number_of_other_edge_types


# Weight of every edge type, keeping the key order of ``edge_counts``.
edge_weights = {key: calculate_weight(value) for key, value in edge_counts.items()}
# Rescales the weights by their sum, such that they add up to one.
total_weight = sum(edge_weights.values())
edge_weights = {key: value / total_weight for key, value in edge_weights.items()}

print("Edge weights:\n", json.dumps(edge_weights, indent=4))

Edge weights:
 {
    "PRIntegratorToSubmitterV2": 0.24757886379195637,
    "PRCommenterToSubmitterV2": 0.24348316399717174,
    "PRCommenterToCommenterV2": 0.19293707610471117,
    "IssueCommenterToCommenterV2": 0.07947333160781547,
    "IssueCommenterToSubmitterV2": 0.23652756449834528
}


### Calculate centrality

In [83]:
from itertools import product

scopes = ["Ecosystem", "IntraProject"]
edge_types = edge_weights.keys()
directions = ["In", "Out"]
# Every (scope, connecting edge, experience edge, direction) combination
# corresponds to one edge column in the dataset.
combinations = product(scopes, edge_types, edge_types, directions)

BASE_EDGE_FIELD = "{scope}SecondOrderDegreeCentrality({connecting_edge}.{experience_edge}-{direction})"
BASE_TARGET_FIELD = "Weighted{scope}SecondOrder{direction}DegreeCentrality"

centrality_features = []
# Edge columns that were aggregated; they are dropped in one go after the
# loop, as dropping them one by one copies the dataframe on every iteration.
aggregated_edge_fields = []

# Iterates through all edge variables.
for scope, connecting_edge, experience_edge, direction in combinations:
    edge_field = BASE_EDGE_FIELD.format(
        scope=scope,
        connecting_edge=connecting_edge,
        experience_edge=experience_edge,
        direction=direction,
    )

    # The weight of an edge column is the product of the weights
    # of its connecting edge and its experience edge.
    # NOTE: this rebinds the module-level ``total_weight``.
    connecting_weight = edge_weights[connecting_edge]
    experience_weight = edge_weights[experience_edge]
    total_weight = connecting_weight * experience_weight

    target_field = BASE_TARGET_FIELD.format(scope=scope, direction=direction)
    # Sets initial value if it doesn't exist yet.
    if target_field not in df.columns:
        df[target_field] = 0
        centrality_features.append(target_field)

    # Adds the weighted contribution to the centrality. The weighted values
    # are not written back to the edge column, as it is dropped afterwards.
    df[target_field] = df[target_field].add(df[edge_field].multiply(total_weight))
    aggregated_edge_fields.append(edge_field)

# Drops the edge columns, as they are only necessary once.
df = df.drop(columns=aggregated_edge_fields)

print(f"{len(df.columns)=}")
df[centrality_features].describe()

len(df.columns)=55


Unnamed: 0,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality
count,1815972.0,1815972.0,1815972.0,1815972.0
mean,12.58389,10.38033,37.53797,40.61421
std,587.5369,517.6232,1609.692,2403.549
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.1567563,0.04697693
75%,1.027667,0.5467396,2.462693,1.611341
max,281122.0,272374.5,1035309.0,1488838.0


## Construct Link intensity

Calculates link intensity by aggregating the individual fields, weighting them using the normalized complement of each edge type's share of the total edge count (the same weights as used for 2nd-order degree centrality).

In [84]:
# NOTE: rebinds ``directions``; from here on it holds the link directions
# instead of the degree directions.
directions = ["IntegratorToSubmitter", "SubmitterToIntegrator"]

# The (unscoped) shared experience fields per link direction. Five fields are
# listed per direction; the "DiscussionParticipation" fields occur in both.
# NOTE(review): the position of a field presumably corresponds to the position
# of its edge type in ``edge_counts`` -- confirm.
direction_fields = {
    directions[0]: [
        "SharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter",
        "SharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter",
        "SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter",
        "SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter",
        "SharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter",
    ],
    directions[1]: [
        "SharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator",
        "SharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator",
        "SharedExperiencePullRequestDiscussionParticipationByIntegratorAndSubmitter",
        "SharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter",
        "SharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator",
    ],
}

from wmeijer_utils.collections.list_access import flatten


# Scoped names of the fields above, limited to those that are
# present as a column in the dataset.
flt = list(flatten(direction_fields.values()))
flt = [f'{scope}{field}' for scope, field in product(scopes, flt) if f'{scope}{field}' in df.columns]
df[flt].describe()

Unnamed: 0,EcosystemSharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter,EcosystemSharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter,EcosystemSharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter,EcosystemSharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator,EcosystemSharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator,IntraProjectSharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter,IntraProjectSharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter,IntraProjectSharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter,IntraProjectSharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator,IntraProjectSharedExperiencePullRequestSubmittedBySubmitterCommentedOnByIntegrator,IntraProjectSharedExperienceIssueSubmittedBySubmitterCommentedOnByIntegrator
count,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0
mean,6.105694,3.927382,1.699145,4.113105,1.720615,10.91525,5.984465,2.256623,12.78633,7.292949,2.418588
std,38.96921,25.70824,9.77201,25.79123,9.780338,52.54254,22.17244,10.51257,52.83821,22.77851,10.60089
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,6.0,2.0,0.0,10.0,5.0,1.0
max,786.0,1021.0,7556.0,1021.0,7556.0,1703.0,1034.0,1824.0,1703.0,1034.0,1824.0


In [85]:
BASE_TARGET_FIELD = "{scope}{direction}LinkIntensity"

combinations = product(scopes, directions)
link_intensity_fields = []
# Fields that were aggregated; they are dropped in one go after the loop.
# Dropping a field right after its first use would break the second
# direction for fields that are listed under both directions.
aggregated_fields = []

# The weight at position i is applied to the field at position i.
edge_weights_list = list(edge_weights.values())
for scope, direction in combinations:
    target_field = BASE_TARGET_FIELD.format(scope=scope, direction=direction)
    fields = direction_fields[direction]

    # Adds initial value for the field.
    df[target_field] = 0
    link_intensity_fields.append(target_field)

    for index, field in enumerate(fields):

        if index in {0, 2, 3}:
            # TODO: REMOVE THIS; THIS SHOULDN'T BE PRESENT IN THE FINAL VERSION.
            # While this is present, only the fields at position 1 and 4
            # contribute to the link intensity.
            continue

        field = f"{scope}{field}"

        # Adds the weighted field contribution to link intensity. The weighted
        # values are not written back, as the field is dropped afterwards.
        field_weight = edge_weights_list[index]
        df[target_field] = df[target_field].add(df[field].multiply(field_weight))

        if field not in aggregated_fields:
            aggregated_fields.append(field)

# Drops the aggregated fields, as they are only used for the link intensity.
df = df.drop(columns=aggregated_fields)

print(f"{len(df.columns)=}")
df[link_intensity_fields].describe()

len(df.columns)=51


Unnamed: 0,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1815972.0,1815972.0,1815972.0,1815972.0
mean,1.358146,1.408445,1.99087,2.347773
std,7.069803,7.091583,6.74343,6.874004
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.2365276
75%,0.2365276,0.2434832,0.9600215,1.669604
max,1787.202,1787.202,431.9132,431.9132


## Save dataset to file

In [86]:
# The output is written using the same path template as the input dataset.
output_data_file_name = "dataset_with_constructed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# The index is left out of the written file.
df.to_csv(output_path, index=False)

# Number of rows and columns of the written dataset.
print(f'{len(df)}')
print(f'{len(df.columns)}')

df.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_with_constructed.csv'
1815972
51


Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,IntraProjectSharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator,...,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,...,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0
mean,147475100.0,5198276.0,3807.207,31901.11,4.947501,193.2087,16.7935,0.5352468,21.70048,12.78633,...,0.1854588,1.355166,12.58389,10.38033,37.53797,40.61421,1.358146,1.408445,1.99087,2.347773
std,100429400.0,8100370.0,8071.615,139933.5,72.83362,482.7151,55.48605,0.462326,65.63799,52.83821,...,2.446396,16.67164,587.5369,517.6232,1609.692,2403.549,7.069803,7.091583,6.74343,6.874004
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,58073220.0,463157.0,100.0,52.9,1.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,137194000.0,1620265.0,548.0,979.9167,1.0,30.0,2.0,0.8,1.0,1.0,...,0.0,0.0,0.0,0.0,0.1567563,0.04697693,0.0,0.0,0.0,0.2365276
75%,227802600.0,6384100.0,2817.0,7328.35,3.0,141.0,17.0,1.0,15.0,10.0,...,0.0,0.0,1.027667,0.5467396,2.462693,1.611341,0.2365276,0.2434832,0.9600215,1.669604
max,361775600.0,59761230.0,82976.0,4489490.0,10000.0,4281.0,1703.0,1.0,2038.0,1703.0,...,741.0,2764.0,281122.0,272374.5,1035309.0,1488838.0,1787.202,1787.202,431.9132,431.9132


In [87]:
import datetime

# Prints the current local date and time.
print(datetime.datetime.now())

2024-02-16 15:23:50.177984
