# Feature transformation

This notebook is used to test the impact of various feature transformations on the dataset, and ultimately outputs datasets with those transformations applied to them.

## Dataset loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

# Name (without extension) of the CSV file to load from the final_data folder.
input_data_file_name = "dataset_subsampled"

# Number of leading columns that are treated as meta data rather than as
# predictors; every column after them is considered a predictor below.
meta_header_count = 5

# Path template that is reused for the output files at the end of the notebook.
data_path_format = exp_utils.BASE_PATH + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

# The first row of the file holds the column names.
df: pd.DataFrame = pd.read_csv(data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

# Summary statistics of the raw data, for comparison with the transformed
# versions shown further down.
df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_subsampled.csv'
len(df)=1216221.
len(df.columns)=47



Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,36820.84,4.214399,40.2085,9.853376,0.5029367,10.18656,0.4028501,...,0.2578898,1.804803,15.78182,13.14678,31.13959,29.70063,0.6723003,0.4267205,1.662326,0.9771472
std,99544090.0,7968202.0,2304.617,153859.8,54.61975,116.919,33.99695,0.4710606,32.70153,0.4523397,...,2.926212,19.02319,684.8061,601.8461,1310.139,1745.382,6.586612,3.542364,20.03288,5.201973
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,36.91667,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,896.7333,1.0,11.0,1.0,0.68,0.0,0.0,...,0.0,0.0,0.05805125,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,8006.983,3.0,38.0,9.0,1.0,7.0,0.9391304,...,0.0,0.0,1.463904,0.8204235,1.277089,0.9200011,0.0,0.0,0.0,0.07444943
max,361775600.0,59734440.0,82976.0,4489490.0,10000.0,4260.0,1702.0,1.0,1820.0,1.0,...,741.0,2764.0,281122.0,272374.5,1035309.0,1488838.0,709.0438,195.7852,4432.496,817.3781


In [2]:
# Everything after the meta columns is a predictor; only the numeric ones are
# eligible for the transformations applied below.
predictors = df.columns[meta_header_count:]
numeric_predictors = df.loc[:, predictors].select_dtypes(include="number")

## Log-transformation

Applies an add-one log-transform, $\ln(1 + x)$, to the count predictors (PR submission count, etc.). Success-rate predictors are left untransformed.

In [3]:
import numpy as np
import regex as re

# The transformations are applied to a copy so that the untransformed data
# stays available in ``df``.
df_transformed = df.copy()

# Fields with "SuccessRate" in their name are excluded from the log-transform.
sr_re = r".*SuccessRate.*"
success_rate_pattern = re.compile(sr_re)

# NOTE: despite the name, this holds every numeric predictor that is not a
# success rate, not only literal counts.
count_predictors = [
    field
    for field in numeric_predictors.columns
    if success_rate_pattern.match(field) is None
]

print(f"{count_predictors=}")

count_predictors=['ControlPullRequestLifeTimeInMinutes', 'ControlNumberOfCommitsInPullRequest', 'ControlIntraProjectPullRequestExperienceOfIntegrator', 'IntraProjectSubmitterPullRequestSubmissionCount', 'IntraProjectSubmitterPullRequestCommentCount', 'EcosystemExperienceSubmitterPullRequestSubmissionCount', 'EcosystemExperienceSubmitterPullRequestCommentCount', 'DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'DependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'IntraProjectSubmitterIssueSubmissionCount', 'IntraProjectSubmitterIssueCommentCount', 'EcosystemExperienceSubmitterIssueSubmissionCount', 'EcosystemExperienceSubmitterIssueCommentCount', 'DependencyEcosystemExperie

In [4]:
# Applies the add-one log-transform, ln(1 + x), to every count predictor.
#
# ``np.log1p`` is evaluated once on the whole block of columns instead of
# calling ``np.log(1 + x)`` element by element through ``Series.apply``; this
# is vectorised and more accurate for values close to zero.
log_transformed = np.log1p(df_transformed[count_predictors]).rename(
    columns=lambda field: f"ln(1 + {field})"
)

# Replaces the raw columns with their transformed counterparts in one step
# (dropping them one by one copies the frame on every iteration). The
# transformed columns are appended after the remaining columns, in the order
# of ``count_predictors``.
df_transformed = pd.concat(
    [df_transformed.drop(columns=count_predictors), log_transformed], axis=1
)

df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.5029367,0.4028501,0.09853429,0.3751776,0.05922709,6.435834,1.090386,...,0.06306752,0.1388976,0.6508078,0.5236211,0.6757944,0.6004845,0.1074344,0.09898465,0.2532331,0.2733463
std,99544090.0,7968202.0,2304.617,0.4710606,0.4523397,0.2907111,0.4479337,0.2285036,3.453221,0.6583405,...,0.3478579,0.6534198,1.104761,1.029723,1.252251,1.188079,0.4893979,0.4273641,0.7140225,0.6361024
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0165293,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,0.0,0.0,0.0,0.0,0.0,3.635391,0.6931472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.68,0.0,0.0,0.0,0.0,6.799873,0.6931472,...,0.0,0.0,0.05642878,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,1.0,0.9391304,0.0,0.9230769,0.0,8.988194,1.386294,...,0.0,0.0,0.9017469,0.5990692,0.822898,0.6523258,0.0,0.0,0.0,0.07180837
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,15.31725,9.21044,...,6.609349,7.924796,12.54655,12.51494,13.85021,14.21351,6.565327,5.282113,8.396944,6.707324


In [5]:
# The log-transform renamed and reordered the predictor columns, so the
# numeric predictor selection is redone for the transformed dataframe.
transformed_predictors = df_transformed.columns[meta_header_count:]
transformed_numeric_predictors = df_transformed.loc[
    :, transformed_predictors
].select_dtypes(include="number")

## Feature scaling

Applies min-max scaling to the features.
This is applied to both the log-transformed and the untransformed data.

In [6]:
def scale(_df: pd.DataFrame, scaled_fields: pd.Series) -> pd.DataFrame:
    """Min-max scales the given columns of a dataframe to the range [0, 1].

    Each listed column is rescaled as ``(x - min) / (max - min)`` using that
    column's own minimum and maximum. Columns that are not listed are left
    untouched, and the input dataframe itself is never modified.

    A constant column (``max == min``) has no range to scale over; it is set
    to ``0.0`` instead of being divided by zero, which would otherwise turn
    the whole column into NaN.

    :param _df: The dataframe holding the columns to scale.
    :param scaled_fields: Labels of the columns to scale; any iterable of
        column labels (e.g. a ``pd.Index`` or a list) works.
    :return: A copy of ``_df`` in which the listed columns are scaled.
    """
    scaled_df = _df.copy()

    for feature in scaled_fields:
        column = scaled_df[feature]
        feature_min = column.min()
        feature_delta = column.max() - feature_min

        # Every value equals the minimum here, so subtracting it yields the
        # all-zero column directly (missing values stay missing).
        if feature_delta == 0:
            scaled_df[feature] = column.subtract(feature_min).astype(float)
            continue

        scaled_df[feature] = column.subtract(feature_min).divide(feature_delta)

    return scaled_df

In [7]:
# Min-max scales the numeric predictors of the log-transformed data.
df_transformed = scale(
    _df=df_transformed,
    scaled_fields=transformed_numeric_predictors.columns,
)
df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.5029367,0.4028501,0.09853429,0.3751776,0.05922709,0.4195426,0.1183858,...,0.009542167,0.01752697,0.05187147,0.04183969,0.04879307,0.04224746,0.01636391,0.0187396,0.03015777,0.0407534
std,99544090.0,7968202.0,2304.617,0.4710606,0.4523397,0.2907111,0.4479337,0.2285036,0.2256901,0.07147764,...,0.05263118,0.08245257,0.08805299,0.0822795,0.09041389,0.08358805,0.07454282,0.08090779,0.08503362,0.09483697
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.68,0.0,0.0,0.0,0.0,0.4433349,0.07525668,...,0.0,0.0,0.004497554,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,1.0,0.9391304,0.0,0.9230769,0.0,0.5863557,0.1505134,...,0.0,0.0,0.07187211,0.04786833,0.05941411,0.04589478,0.0,0.0,0.0,0.01070596
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Min-max scales the numeric predictors of the data that was not
# log-transformed. NOTE: this rebinds ``df``, so from here on ``df`` holds the
# scaled values rather than the raw ones.
df = scale(_df=df, scaled_fields=numeric_predictors.columns)
df.describe()

Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.00820156,0.0004214399,0.009438615,0.005789293,0.5029367,0.00559701,0.4028501,...,0.0003480294,0.0006529677,5.613869e-05,4.826731e-05,3.007758e-05,1.994886e-05,0.0009481789,0.002179534,0.0003750315,0.001195465
std,99544090.0,7968202.0,2304.617,0.03427111,0.005461975,0.02744578,0.01997471,0.4710606,0.01796787,0.4523397,...,0.003949004,0.006882486,0.002435975,0.002209627,0.001265457,0.001172311,0.009289429,0.01809311,0.004519548,0.006364219
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,8.219197e-06,0.0001,0.0007042254,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.0001997369,0.0001,0.00258216,0.0005875441,0.68,0.0,0.0,...,0.0,0.0,2.064984e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,0.001783491,0.0003,0.008920188,0.005287897,1.0,0.003846154,0.9391304,...,0.0,0.0,5.20736e-06,3.012116e-06,1.233534e-06,6.179322e-07,0.0,0.0,0.0,9.108322e-05
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Dataset save to file

In [9]:
# Writes the log-transformed and min-max scaled dataset to the final_data
# folder, using the same path template as the input file.
output_data_file_name = "dataset_transformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# The row index is not written to the file.
df_transformed.to_csv(path_or_buf=output_path, index=False)

# Summary of the data that was just written.
df_transformed.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.5029367,0.4028501,0.09853429,0.3751776,0.05922709,0.4195426,0.1183858,...,0.009542167,0.01752697,0.05187147,0.04183969,0.04879307,0.04224746,0.01636391,0.0187396,0.03015777,0.0407534
std,99544090.0,7968202.0,2304.617,0.4710606,0.4523397,0.2907111,0.4479337,0.2285036,0.2256901,0.07147764,...,0.05263118,0.08245257,0.08805299,0.0822795,0.09041389,0.08358805,0.07454282,0.08090779,0.08503362,0.09483697
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.68,0.0,0.0,0.0,0.0,0.4433349,0.07525668,...,0.0,0.0,0.004497554,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,1.0,0.9391304,0.0,0.9230769,0.0,0.5863557,0.1505134,...,0.0,0.0,0.07187211,0.04786833,0.05941411,0.04589478,0.0,0.0,0.0,0.01070596
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Writes the dataset without log-transform to the final_data folder.
# NOTE: "untransformed" only refers to the log-transform; by this point ``df``
# has been min-max scaled.
output_data_file_name = "dataset_untransformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# The row index is not written to the file.
df.to_csv(path_or_buf=output_path, index=False)

# Summary of the data that was just written.
df.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_untransformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147309200.0,5092504.0,808.913,0.00820156,0.0004214399,0.009438615,0.005789293,0.5029367,0.00559701,0.4028501,...,0.0003480294,0.0006529677,5.613869e-05,4.826731e-05,3.007758e-05,1.994886e-05,0.0009481789,0.002179534,0.0003750315,0.001195465
std,99544090.0,7968202.0,2304.617,0.03427111,0.005461975,0.02744578,0.01997471,0.4710606,0.01796787,0.4523397,...,0.003949004,0.006882486,0.002435975,0.002209627,0.001265457,0.001172311,0.009289429,0.01809311,0.004519548,0.006364219
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59248670.0,467471.0,50.0,8.219197e-06,0.0001,0.0007042254,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136671400.0,1633887.0,199.0,0.0001997369,0.0001,0.00258216,0.0005875441,0.68,0.0,0.0,...,0.0,0.0,2.064984e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226099400.0,6254382.0,684.0,0.001783491,0.0003,0.008920188,0.005287897,1.0,0.003846154,0.9391304,...,0.0,0.0,5.20736e-06,3.012116e-06,1.233534e-06,6.179322e-07,0.0,0.0,0.0,9.108322e-05
max,361775600.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
import datetime

# Prints the current local time, marking when this notebook was last run.
finished_at = datetime.datetime.now()
print(finished_at)

2024-02-19 11:10:45.511294
