# Feature transformation

Tests the impact of various feature transformations on the dataset and ultimately outputs a dataset with those transformations applied to it.

## Dataset loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

# Name (without extension) of the CSV file to load from the final_data directory.
input_data_file_name = "dataset_subsampled"

# Columns before this index are skipped when the predictors are selected.
meta_header_count = 5

# Path template; reused further down to build the output paths.
data_path_format = exp_utils.BASE_PATH + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

# The first row of the file holds the column names.
df: pd.DataFrame = pd.read_csv(data_path, header=0)

# Quick sanity check of the loaded frame's dimensions.
print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_subsampled.csv'
len(df)=1216221.
len(df.columns)=43



Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderDegreeCentrality,WeightedIntraProjectSecondOrderDegreeCentrality,EcosystemLinkIntensity,IntraProjectLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,36820.84,4.214399,40.2085,9.853376,0.5029367,10.18656,0.4028501,...,0.20159,0.8592304,2.50162,17.59048,0.2578898,1.804803,28.9286,60.84022,1.099021,2.639473
std,99544090.0,7968202.0,2304.617,153859.8,54.61975,116.919,33.99695,0.4710606,32.70153,0.4523397,...,1.705789,9.331199,7.584065,65.2604,2.926212,19.02319,1209.489,2846.961,9.258315,24.68806
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,36.91667,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,896.7333,1.0,11.0,1.0,0.68,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.1182791,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,8006.983,3.0,38.0,9.0,1.0,7.0,0.9391304,...,0.0,0.0,2.0,7.0,0.0,0.0,2.699642,2.679358,0.0,0.2475789
max,361775600.0,59734440.0,82976.0,4489490.0,10000.0,4260.0,1702.0,1.0,1820.0,1.0,...,373.0,927.0,2332.0,7630.0,741.0,2764.0,553496.5,2524147.0,904.8289,5249.874


In [2]:
# Everything after the leading `meta_header_count` columns is a predictor.
predictors = df.columns[meta_header_count:]
# Keep only the numeric predictors; non-numeric columns are filtered out here.
numeric_predictors = df.loc[:, predictors].select_dtypes(include="number")

## Log-transformation

Applies a plus-one log-transform, $\ln(1 + x)$, to the count predictors (PR submission count, etc.), i.e. every numeric predictor whose name does not contain `SuccessRate`.

In [3]:
import numpy as np
import regex as re

# Work on a copy so that `df` keeps the untransformed values.
df_transformed = df.copy()

# Fields whose name contains "SuccessRate" are excluded from the log-transform.
sr_re = r".*SuccessRate.*"

count_predictors = [
    field
    for field in numeric_predictors.columns
    if re.match(sr_re, field) is None
]

print(f"{count_predictors=}")

count_predictors=['ControlPullRequestLifeTimeInMinutes', 'ControlNumberOfCommitsInPullRequest', 'ControlIntraProjectPullRequestExperienceOfIntegrator', 'IntraProjectSubmitterPullRequestSubmissionCount', 'IntraProjectSubmitterPullRequestCommentCount', 'EcosystemExperienceSubmitterPullRequestSubmissionCount', 'EcosystemExperienceSubmitterPullRequestCommentCount', 'DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'DependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'IntraProjectSubmitterIssueSubmissionCount', 'IntraProjectSubmitterIssueCommentCount', 'EcosystemExperienceSubmitterIssueSubmissionCount', 'EcosystemExperienceSubmitterIssueCommentCount', 'DependencyEcosystemExperie

In [4]:
# Applies the plus-one log-transform, ln(1 + x), to every count predictor.
# `np.log1p` is evaluated on all columns at once instead of calling a Python
# lambda per element, and it is more accurate than `np.log(1 + x)` for values
# close to zero.
log_columns = np.log1p(df_transformed[count_predictors]).rename(
    columns=lambda field: f"ln(1 + {field})"
)

# Replace the original count columns with their transformed counterparts in a
# single step; the transformed columns are appended at the end in the order of
# `count_predictors`, matching the layout produced by a per-column add-and-drop.
df_transformed = pd.concat(
    [df_transformed.drop(columns=count_predictors), log_columns], axis=1
)

df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderDegreeCentrality),ln(1 + EcosystemLinkIntensity),ln(1 + IntraProjectLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.5029367,0.4028501,0.09853429,0.3751776,0.05922709,6.435834,1.090386,...,0.06780013,0.1345284,0.6174783,1.201789,0.06306752,0.1388976,0.845911,0.8903891,0.1494755,0.383146
std,99544090.0,7968202.0,2304.617,0.4710606,0.4523397,0.2907111,0.4479337,0.2285036,3.453221,0.6583405,...,0.3197231,0.5471591,0.9149239,1.564542,0.3478579,0.6534198,1.313748,1.478552,0.5952032,0.8685605
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0165293,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,0.0,0.0,0.0,0.0,0.0,3.635391,0.6931472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.68,0.0,0.0,0.0,0.0,6.799873,0.6931472,...,0.0,0.0,0.0,0.6931472,0.0,0.0,0.111791,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,1.0,0.9391304,0.0,0.9230769,0.0,8.988194,1.386294,...,0.0,0.0,1.098612,2.079442,0.0,0.0,1.308236,1.302738,0.0,0.2212048
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,15.31725,9.21044,...,5.924256,6.833032,7.75491,8.939974,6.609349,7.924796,13.22401,14.74141,6.80885,8.56615


In [5]:
# Re-derive the predictor columns, because the log-transform renamed and
# reordered the columns of the transformed dataframe.
transformed_predictors = df_transformed.columns[meta_header_count:]
transformed_numeric_predictors = df_transformed.loc[
    :, transformed_predictors
].select_dtypes(include="number")

## Feature scaling

Applies min-max scaling to the features, mapping each of them onto the range $[0, 1]$.
This is applied to both the log-transformed and the untransformed data.

In [6]:
def scale(_df: pd.DataFrame, scaled_fields: pd.Series) -> pd.DataFrame:
    """Min-max scales the given columns of a dataframe.

    Each listed column is rescaled as ``(x - min) / (max - min)``, using the
    minimum and maximum of that column. The input dataframe is not modified.

    Args:
        _df: The dataframe containing the columns to scale.
        scaled_fields: Iterable of column names that should be scaled;
            all other columns are copied over unchanged.

    Returns:
        A copy of ``_df`` in which every column in ``scaled_fields`` has been
        min-max scaled. A constant column (``max == min``) is mapped to 0.0
        instead of being turned into NaN by a division by zero.
    """
    scaled_df = _df.copy()

    for feature in scaled_fields:
        feature_min = scaled_df[feature].min()
        feature_max = scaled_df[feature].max()
        feature_delta = feature_max - feature_min

        # A constant column has a zero range; dividing by it would yield 0/0 = NaN
        # for every row. Dividing by 1 instead maps such a column to 0.0.
        if feature_delta == 0:
            feature_delta = 1

        scaled_df[feature] = (
            scaled_df[feature].subtract(feature_min).divide(feature_delta)
        )

    return scaled_df

In [7]:
# Min-max scale every numeric predictor of the log-transformed dataframe.
df_transformed = scale(
    _df=df_transformed,
    scaled_fields=transformed_numeric_predictors.columns,
)
df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderDegreeCentrality),ln(1 + EcosystemLinkIntensity),ln(1 + IntraProjectLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.5029367,0.4028501,0.09853429,0.3751776,0.05922709,0.4195426,0.1183858,...,0.0114445,0.01968795,0.07962418,0.1344287,0.009542167,0.01752697,0.0639678,0.06040052,0.02195312,0.04472791
std,99544090.0,7968202.0,2304.617,0.4710606,0.4523397,0.2907111,0.4479337,0.2285036,0.2256901,0.07147764,...,0.05396848,0.0800756,0.1179799,0.1750052,0.05263118,0.08245257,0.09934566,0.1002992,0.0874161,0.1013945
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.68,0.0,0.0,0.0,0.0,0.4433349,0.07525668,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.008453635,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,1.0,0.9391304,0.0,0.9230769,0.0,0.5863557,0.1505134,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.09892883,0.08837267,0.0,0.02582313
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Min-max scale every numeric predictor of the dataframe that was not log-transformed.
df = scale(_df=df, scaled_fields=numeric_predictors.columns)
df.describe()

Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderDegreeCentrality,WeightedIntraProjectSecondOrderDegreeCentrality,EcosystemLinkIntensity,IntraProjectLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.00820156,0.0004214399,0.009438615,0.005789293,0.5029367,0.00559701,0.4028501,...,0.0005404558,0.0009268936,0.001072736,0.002305436,0.0003480294,0.0006529677,5.22652e-05,2.410327e-05,0.001214617,0.0005027688
std,99544090.0,7968202.0,2304.617,0.03427111,0.005461975,0.02744578,0.01997471,0.4710606,0.01796787,0.4523397,...,0.004573162,0.01006602,0.003252172,0.008553132,0.003949004,0.006882486,0.002185179,0.00112789,0.01023212,0.0047026
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,8.219197e-06,0.0001,0.0007042254,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.0001997369,0.0001,0.00258216,0.0005875441,0.68,0.0,0.0,...,0.0,0.0,0.0,0.0001310616,0.0,0.0,2.136944e-07,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,0.001783491,0.0003,0.008920188,0.005287897,1.0,0.003846154,0.9391304,...,0.0,0.0,0.0008576329,0.0009174312,0.0,0.0,4.877433e-06,1.06149e-06,0.0,4.715901e-05
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Dataset save to file

In [9]:
# Write the log-transformed, min-max scaled dataframe to the same directory
# the input dataset was read from.
output_data_file_name = "dataset_transformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# index=False keeps the row index out of the CSV file.
df_transformed.to_csv(path_or_buf=output_path, index=False)

df_transformed.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderDegreeCentrality),ln(1 + EcosystemLinkIntensity),ln(1 + IntraProjectLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.5029367,0.4028501,0.09853429,0.3751776,0.05922709,0.4195426,0.1183858,...,0.0114445,0.01968795,0.07962418,0.1344287,0.009542167,0.01752697,0.0639678,0.06040052,0.02195312,0.04472791
std,99544090.0,7968202.0,2304.617,0.4710606,0.4523397,0.2907111,0.4479337,0.2285036,0.2256901,0.07147764,...,0.05396848,0.0800756,0.1179799,0.1750052,0.05263118,0.08245257,0.09934566,0.1002992,0.0874161,0.1013945
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.68,0.0,0.0,0.0,0.0,0.4433349,0.07525668,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.008453635,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,1.0,0.9391304,0.0,0.9230769,0.0,0.5863557,0.1505134,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.09892883,0.08837267,0.0,0.02582313
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Write the min-max scaled dataframe that was not log-transformed to the same
# directory the input dataset was read from.
output_data_file_name = "dataset_untransformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# index=False keeps the row index out of the CSV file.
df.to_csv(path_or_buf=output_path, index=False)

df.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_untransformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderDegreeCentrality,WeightedIntraProjectSecondOrderDegreeCentrality,EcosystemLinkIntensity,IntraProjectLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.00820156,0.0004214399,0.009438615,0.005789293,0.5029367,0.00559701,0.4028501,...,0.0005404558,0.0009268936,0.001072736,0.002305436,0.0003480294,0.0006529677,5.22652e-05,2.410327e-05,0.001214617,0.0005027688
std,99544090.0,7968202.0,2304.617,0.03427111,0.005461975,0.02744578,0.01997471,0.4710606,0.01796787,0.4523397,...,0.004573162,0.01006602,0.003252172,0.008553132,0.003949004,0.006882486,0.002185179,0.00112789,0.01023212,0.0047026
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,8.219197e-06,0.0001,0.0007042254,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.0001997369,0.0001,0.00258216,0.0005875441,0.68,0.0,0.0,...,0.0,0.0,0.0,0.0001310616,0.0,0.0,2.136944e-07,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,0.001783491,0.0003,0.008920188,0.005287897,1.0,0.003846154,0.9391304,...,0.0,0.0,0.0008576329,0.0009174312,0.0,0.0,4.877433e-06,1.06149e-06,0.0,4.715901e-05
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
import datetime

# Record the moment this cell was executed.
executed_at = datetime.datetime.now()
print(executed_at)

2024-07-18 13:06:27.136307
