# Feature transformation

This notebook tests the impact of various feature transformations on the dataset and ultimately writes out datasets with those transformations applied.

## Dataset loading

In [47]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

input_data_file_name = "dataset_subsampled"

# Template for every CSV this notebook reads or writes; later cells fill in
# `data_file_name` to derive their output paths. A trailing "/" on BASE_PATH is
# stripped first so the resulting path does not contain a doubled separator
# (e.g. ".../data//final_data/...").
data_path_format = exp_utils.BASE_PATH.rstrip("/") + "/final_data/{data_file_name}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

# Number of leading columns that are skipped when selecting predictors
# (later cells use `df.columns[meta_header_count:]`).
# NOTE(review): the loaded frame contains an "Unnamed: 0" column, which suggests
# the input CSV was written together with its index -- confirm that this count
# still lines up with the intended metadata columns.
meta_header_count = 5

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_subsampled.csv'
len(df)=1216221.
len(df.columns)=43



Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedFirstOrderInDegreeCentrality,WeightedFirstOrderOutDegreeCentrality,IntegratorToSubmitterLinkIntensity,SubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147333200.0,5094727.0,808.2051,36775.7,4.2427,40.22172,9.851682,0.5028689,10.17565,0.4028818,...,0.2018975,0.8584813,2.498322,17.58419,0.2573463,1.802841,47.4015,44.52494,2.32646,3.080777
std,99529860.0,7972087.0,2292.125,153599.9,56.50046,117.6947,34.01228,0.4710872,32.4744,0.4523492,...,1.71068,9.28874,7.560292,65.32814,2.924358,19.00969,1523.153,1889.718,21.05608,21.67002
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59268780.0,467209.0,50.0,36.91667,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136819600.0,1634092.0,199.0,896.85,1.0,11.0,1.0,0.6774194,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.11737,0.6543746,0.0,0.0
75%,226067600.0,6254382.0,684.0,7970.367,3.0,38.0,9.0,1.0,7.0,0.9390244,...,0.0,0.0,2.0,7.0,0.0,0.0,5.628415,3.903172,0.0,0.47684
max,361773400.0,59734440.0,82976.0,4489490.0,10000.0,4273.0,1699.0,1.0,1887.0,1.0,...,373.0,927.0,2332.0,7630.0,741.0,2764.0,1035309.0,1488839.0,4426.61,4225.219


In [48]:
# Every column after the leading metadata columns is considered a predictor.
predictors = df.columns[meta_header_count:]

# Restrict the predictors to the numeric ones; only those can be
# log-transformed and scaled further down.
numeric_predictors = df.loc[:, predictors].select_dtypes(include="number")

## Log-transformation

Applies an add-one log-transform, ln(1 + x), to the count predictors (PR submission count, etc.). Success-rate predictors are left as they are.

In [49]:
import numpy as np
import regex as re

# Work on a copy so the frame held in `df` is not affected by the
# log-transformation applied below.
df_transformed = df.copy()

# Pattern that identifies the success-rate predictors, which are excluded
# from the log-transformation.
sr_re = r".*SuccessRate.*"

# Every numeric predictor whose name does not match the success-rate pattern.
count_predictors = [
    column_name
    for column_name in numeric_predictors.columns
    if re.match(sr_re, column_name) is None
]

print(f"{count_predictors=}")

count_predictors=['ControlPullRequestLifeTimeInMinutes', 'ControlNumberOfCommitsInPullRequest', 'ControlIntraProjectPullRequestExperienceOfIntegrator', 'IntraProjectSubmitterPullRequestSubmissionCount', 'IntraProjectSubmitterPullRequestCommentCount', 'EcosystemExperienceSubmitterPullRequestSubmissionCount', 'EcosystemExperienceSubmitterPullRequestCommentCount', 'DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'DependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'IntraProjectSubmitterIssueSubmissionCount', 'IntraProjectSubmitterIssueCommentCount', 'EcosystemExperienceSubmitterIssueSubmissionCount', 'EcosystemExperienceSubmitterIssueCommentCount', 'DependencyEcosystemExperie

In [50]:
# Applies the add-one log-transform, ln(1 + x), to every count predictor.
# All columns are transformed in a single vectorised call: `np.log1p` avoids a
# Python-level lambda call per cell and stays accurate for values close to zero.
log_columns = np.log1p(df_transformed[count_predictors])
log_columns.columns = [f"ln(1 + {field})" for field in count_predictors]

# Swap the raw columns for their transformed counterparts in one step instead of
# copying the full frame once per dropped column. The resulting column order is
# the same as before: the untouched columns first, followed by the "ln(1 + ...)"
# columns in `count_predictors` order.
df_transformed = pd.concat(
    [df_transformed.drop(columns=count_predictors), log_columns], axis=1
)

df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality),ln(1 + IntegratorToSubmitterLinkIntensity),ln(1 + SubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147333200.0,5094727.0,808.2051,0.5028689,0.4028818,0.09854712,0.3752564,0.05918652,6.435221,1.090819,...,0.06786232,0.1345035,0.6173024,1.201735,0.06299139,0.1388105,1.22975,1.056296,0.327196,0.4450936
std,99529860.0,7972087.0,2292.125,0.4710872,0.4523492,0.2907154,0.4479614,0.2284392,3.452414,0.6586773,...,0.3198579,0.5471214,0.9146615,1.564071,0.3475353,0.6531437,1.455311,1.401731,0.8285576,0.9384045
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0165293,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59268780.0,467209.0,50.0,0.0,0.0,0.0,0.0,0.0,3.635391,0.6931472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136819600.0,1634092.0,199.0,0.6774194,0.0,0.0,0.0,0.0,6.800003,0.6931472,...,0.0,0.0,0.0,0.6931472,0.0,0.0,0.7501748,0.5034231,0.0,0.0
75%,226067600.0,6254382.0,684.0,1.0,0.9390244,0.0,0.9230769,0.0,8.983611,1.386294,...,0.0,0.0,1.098612,2.079442,0.0,0.0,1.891366,1.589882,0.0,0.3899047
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,15.31725,9.21044,...,5.924256,6.833032,7.75491,8.939974,6.609349,7.924796,13.85021,14.21351,8.395615,8.349063


In [51]:
# Re-derive the predictor columns for the transformed frame: the
# log-transformation replaced the original count columns with new
# "ln(1 + ...)" columns, so the selection made on `df` no longer applies.
transformed_predictors = df_transformed.columns[meta_header_count:]
transformed_numeric_predictors = df_transformed.loc[
    :, transformed_predictors
].select_dtypes(include="number")

## Feature scaling

Applies min-max scaling, $x' = (x - \min) / (\max - \min)$, to the numeric predictors.
This is applied to both the log-transformed and the untransformed data.

In [52]:
def min_max_scale(min, max, x):
    """Linearly rescale ``x`` so that ``min`` maps to 0 and ``max`` maps to 1.

    The arithmetic is element-wise, so ``x`` may be a scalar, a NumPy array or
    a pandas Series. The parameter names shadow the built-ins of the same name;
    they are kept so that existing keyword callers keep working.

    No guard is applied for ``max == min``; callers are expected to handle a
    zero range themselves (``scale`` does).
    """
    return (x - min) / (max - min)


def scale(_df: pd.DataFrame, scaled_fields: pd.Series):
    """Return a copy of ``_df`` with the given columns min-max scaled to [0, 1].

    Args:
        _df: Frame to scale; it is not modified.
        scaled_fields: Iterable of column labels to scale (the notebook passes
            ``DataFrame.columns`` of the numeric predictors).

    Returns:
        A new DataFrame in which every listed column is rescaled using that
        column's own minimum and maximum. Columns that are not listed are
        copied unchanged.
    """
    scaled_df = _df.copy()

    for feature in scaled_fields:
        column = scaled_df[feature]
        mn = column.min()
        mx = column.max()

        if mx == mn:
            # Constant column: the range is zero, so the regular formula would
            # evaluate 0 / 0 and turn the whole column into NaN. Treat the
            # range as 1 instead, which maps every value to 0 while keeping
            # any missing values missing.
            scaled_df[feature] = (column - mn).astype(float)
            continue

        # Vectorised over the whole column; yields the same values as scaling
        # each element individually.
        scaled_df[feature] = min_max_scale(mn, mx, column)

    return scaled_df

In [53]:
# Min-max scale the numeric predictors of the log-transformed frame.
df_transformed = scale(
    _df=df_transformed,
    scaled_fields=transformed_numeric_predictors.columns,
)
df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality),ln(1 + IntegratorToSubmitterLinkIntensity),ln(1 + SubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147333200.0,5094727.0,808.2051,0.5028689,0.4028818,0.09854712,0.3752564,0.05918652,0.4195026,0.1184329,...,0.01145499,0.01968431,0.07960149,0.1344227,0.009530649,0.01751597,0.08878922,0.07431632,0.03897225,0.05331061
std,99529860.0,7972087.0,2292.125,0.4710872,0.4523492,0.2907154,0.4479614,0.2284392,0.2256374,0.0715142,...,0.05399123,0.08007008,0.1179461,0.1749526,0.05258238,0.08241773,0.105075,0.09861962,0.09868933,0.1123964
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59268780.0,467209.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136819600.0,1634092.0,199.0,0.6774194,0.0,0.0,0.0,0.0,0.4433434,0.07525668,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.05416341,0.03541864,0.0,0.0
75%,226067600.0,6254382.0,684.0,1.0,0.9390244,0.0,0.9230769,0.0,0.5860562,0.1505134,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.1365586,0.1118571,0.0,0.04670041
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [54]:
# Min-max scale the numeric predictors of the frame that was not
# log-transformed.
df = scale(_df=df, scaled_fields=numeric_predictors.columns)
df.describe()

Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedFirstOrderInDegreeCentrality,WeightedFirstOrderOutDegreeCentrality,IntegratorToSubmitterLinkIntensity,SubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147333200.0,5094727.0,808.2051,0.008191507,0.00042427,0.009412992,0.005798518,0.5028689,0.005392499,0.4028818,...,0.0005412802,0.0009260856,0.001071322,0.002304612,0.000347296,0.0006522579,4.578487e-05,2.990582e-05,0.0005255625,0.0007291402
std,99529860.0,7972087.0,2292.125,0.03421321,0.005650046,0.02754381,0.020019,0.4710872,0.01720954,0.4523492,...,0.004586273,0.01002022,0.003241978,0.00856201,0.003946502,0.006877602,0.001471206,0.001269256,0.004756705,0.005128734
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59268780.0,467209.0,50.0,8.219197e-06,0.0001,0.0007020828,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136819600.0,1634092.0,199.0,0.0001997629,0.0001,0.002574304,0.0005885815,0.6774194,0.0,0.0,...,0.0,0.0,0.0,0.0001310616,0.0,0.0,1.079262e-06,4.395202e-07,0.0,0.0
75%,226067600.0,6254382.0,684.0,0.001775335,0.0003,0.008893049,0.005297234,1.0,0.003709592,0.9390244,...,0.0,0.0,0.0008576329,0.0009174312,0.0,0.0,5.436457e-06,2.621622e-06,0.0,0.0001128557
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Saving the datasets to file

In [55]:
# Write the log-transformed, min-max scaled frame to disk, using the same
# path template as the input file.
output_data_file_name = "dataset_transformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# index=False keeps the row index out of the written CSV.
df_transformed.to_csv(path_or_buf=output_path, index=False)

df_transformed.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedFirstOrderInDegreeCentrality),ln(1 + WeightedFirstOrderOutDegreeCentrality),ln(1 + IntegratorToSubmitterLinkIntensity),ln(1 + SubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147333200.0,5094727.0,808.2051,0.5028689,0.4028818,0.09854712,0.3752564,0.05918652,0.4195026,0.1184329,...,0.01145499,0.01968431,0.07960149,0.1344227,0.009530649,0.01751597,0.08878922,0.07431632,0.03897225,0.05331061
std,99529860.0,7972087.0,2292.125,0.4710872,0.4523492,0.2907154,0.4479614,0.2284392,0.2256374,0.0715142,...,0.05399123,0.08007008,0.1179461,0.1749526,0.05258238,0.08241773,0.105075,0.09861962,0.09868933,0.1123964
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59268780.0,467209.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2365158,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136819600.0,1634092.0,199.0,0.6774194,0.0,0.0,0.0,0.0,0.4433434,0.07525668,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.05416341,0.03541864,0.0,0.0
75%,226067600.0,6254382.0,684.0,1.0,0.9390244,0.0,0.9230769,0.0,0.5860562,0.1505134,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.1365586,0.1118571,0.0,0.04670041
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [56]:
# Write the min-max scaled frame that was not log-transformed to disk, using
# the same path template as the input file.
output_data_file_name = "dataset_untransformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# index=False keeps the row index out of the written CSV.
df.to_csv(path_or_buf=output_path, index=False)

df.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_untransformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,EcosystemExperienceSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedFirstOrderInDegreeCentrality,WeightedFirstOrderOutDegreeCentrality,IntegratorToSubmitterLinkIntensity,SubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147333200.0,5094727.0,808.2051,0.008191507,0.00042427,0.009412992,0.005798518,0.5028689,0.005392499,0.4028818,...,0.0005412802,0.0009260856,0.001071322,0.002304612,0.000347296,0.0006522579,4.578487e-05,2.990582e-05,0.0005255625,0.0007291402
std,99529860.0,7972087.0,2292.125,0.03421321,0.005650046,0.02754381,0.020019,0.4710872,0.01720954,0.4523492,...,0.004586273,0.01002022,0.003241978,0.00856201,0.003946502,0.006877602,0.001471206,0.001269256,0.004756705,0.005128734
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59268780.0,467209.0,50.0,8.219197e-06,0.0001,0.0007020828,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136819600.0,1634092.0,199.0,0.0001997629,0.0001,0.002574304,0.0005885815,0.6774194,0.0,0.0,...,0.0,0.0,0.0,0.0001310616,0.0,0.0,1.079262e-06,4.395202e-07,0.0,0.0
75%,226067600.0,6254382.0,684.0,0.001775335,0.0003,0.008893049,0.005297234,1.0,0.003709592,0.9390244,...,0.0,0.0,0.0008576329,0.0009174312,0.0,0.0,5.436457e-06,2.621622e-06,0.0,0.0001128557
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [57]:
import datetime

# Print the current local date and time as the final cell output.
print(datetime.datetime.now())

2024-01-29 14:02:53.576409
