# Feature transformation

This notebook tests the impact of various feature transformations on the dataset and ultimately outputs datasets with those transformations applied.

## Dataset loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

input_data_file_name = "dataset_subsampled"

# BASE_PATH may end in a path separator; strip it so the joined path does not
# contain a doubled "/" (e.g. ".../data//final_data/...").
data_path_format = (
    exp_utils.BASE_PATH.rstrip("/") + "/final_data/{data_file_name}.csv"
)
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

# Number of leading columns that are skipped when selecting predictors; later
# cells slice `df.columns[meta_header_count:]` to get the predictor columns.
meta_header_count = 5

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)

# Quick sanity check on the size of the loaded frame.
print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='/workspaces/msc_thesis/data//final_data/dataset_subsampled.csv'
len(df)=1216221.
len(df.columns)=51



Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,IntraProjectSharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator,...,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147314300.0,5093466.0,808.2764,36851.36,4.194136,40.19659,9.860225,0.502795,10.1891,7.86663,...,0.2574277,1.805102,15.83814,13.33045,31.20385,31.10127,1.676589,1.7392,1.122238,1.352802
std,99536310.0,7967840.0,2296.849,153871.4,53.5766,116.4114,34.60383,0.471079,32.97641,33.19649,...,2.923879,19.01014,675.8305,605.5197,1377.436,1938.964,8.313866,8.337128,4.751919,4.800951
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59258680.0,467330.0,50.0,36.96667,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136723700.0,1634092.0,199.0,897.1167,1.0,11.0,1.0,0.6666667,0.0,1.0,...,0.0,0.0,0.05782397,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226098700.0,6254382.0,684.0,8001.133,3.0,38.0,9.0,1.0,7.0,6.0,...,0.0,0.0,1.461533,0.8202236,1.275396,0.9194294,0.2434832,0.4730551,0.4800107,0.9461103
max,361773400.0,59734440.0,82976.0,4489490.0,10000.0,4274.0,1703.0,1.0,1984.0,1703.0,...,741.0,2764.0,281122.0,272374.5,1035309.0,1488838.0,1787.202,1787.202,431.9132,431.9132


In [2]:
# Every column after the leading `meta_header_count` columns is a candidate
# predictor; only the numeric ones are kept for transformation and scaling.
predictors = df.columns[meta_header_count:]
numeric_predictors = df.loc[:, predictors].select_dtypes(include=["number"])

## Log-transformation

Applies an add-one log-transform, $\ln(1 + x)$, to the count predictors (PR submission count, etc.).

In [3]:
import numpy as np
import regex as re

# Work on a copy so the untransformed frame `df` stays available.
df_transformed = df.copy()

# Any numeric predictor whose name does NOT match this pattern is treated as a
# count predictor (and is therefore log-transformed later on).
sr_re = r".*SuccessRate.*"
success_rate_pattern = re.compile(sr_re)

count_predictors = list(
    filter(
        lambda name: success_rate_pattern.match(name) is None,
        numeric_predictors.columns,
    )
)

print(f"{count_predictors=}")

count_predictors=['ControlPullRequestLifeTimeInMinutes', 'ControlNumberOfCommitsInPullRequest', 'ControlIntraProjectPullRequestExperienceOfIntegrator', 'IntraProjectSubmitterPullRequestSubmissionCount', 'IntraProjectSubmitterPullRequestCommentCount', 'IntraProjectSharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator', 'IntraProjectSharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter', 'EcosystemSharedExperiencePullRequestSubmittedByIntegratorIntegratedBySubmitter', 'EcosystemSharedExperiencePullRequestSubmittedByIntegratorCommentedOnBySubmitter.1', 'EcosystemExperienceSubmitterPullRequestSubmissionCount', 'EcosystemExperienceSubmitterPullRequestCommentCount', 'DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'DependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'InversedDependencyEcosyste

In [4]:
# Applies the add-one log-transform, ln(1 + x), to every count predictor.
# np.log1p is vectorised over the whole selection (no per-element Python
# lambda) and is more accurate than np.log(1 + x) for values close to zero.
log_features = np.log1p(df_transformed[count_predictors]).rename(
    columns=lambda field: f"ln(1 + {field})"
)

# Swap the raw count columns for their transformed counterparts in one step
# instead of copying the full frame once per column. As before, the new
# columns end up after the remaining ones, in the order of `count_predictors`.
df_transformed = pd.concat(
    [df_transformed.drop(columns=count_predictors), log_features], axis=1
)

df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147314300.0,5093466.0,808.2764,0.502795,0.4028468,0.09859457,0.3751558,0.05921727,6.436513,1.090798,...,0.06303926,0.138933,0.6504374,0.5236796,0.6751089,0.6005097,0.3536191,0.3765622,0.3416321,0.4354887
std,99536310.0,7967840.0,2296.849,0.471079,0.4523912,0.2907958,0.44798,0.2284907,3.453285,0.6587747,...,0.3476386,0.65351,1.104536,1.030077,1.250821,1.188442,0.770375,0.7813927,0.6761163,0.7130512
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0165293,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59258680.0,467330.0,50.0,0.0,0.0,0.0,0.0,0.0,3.636709,0.6931472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136723700.0,1634092.0,199.0,0.6666667,0.0,0.0,0.0,0.0,6.8003,0.6931472,...,0.0,0.0,0.05621394,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226098700.0,6254382.0,684.0,1.0,0.9393939,0.0,0.9230769,0.0,8.987463,1.386294,...,0.0,0.0,0.9007845,0.5989593,0.8221541,0.652028,0.2179164,0.3873386,0.3920493,0.6658326
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,15.31725,9.21044,...,6.609349,7.924796,12.54655,12.51494,13.85021,14.21351,7.488966,7.488966,6.070537,6.070537


In [5]:
# Re-collect the numeric predictor columns from the transformed frame, whose
# column names differ from those of the original frame.
transformed_predictors = df_transformed.columns[meta_header_count:]
transformed_numeric_predictors = df_transformed.loc[
    :, transformed_predictors
].select_dtypes(include=["number"])

## Feature scaling

Applies min-max scaling to the features.
The scaling is applied to both the log-transformed and the untransformed data.

In [6]:
def scale(_df: pd.DataFrame, scaled_fields: pd.Series):
    """Min-max scales the given columns of a dataframe to the range [0, 1].

    Each listed column is rescaled independently as
    ``(x - min) / (max - min)``. The input frame is not modified.

    Args:
        _df: The dataframe holding the columns to scale.
        scaled_fields: The labels of the columns to scale; any iterable of
            column labels works (e.g. a ``pd.Index`` or a list).

    Returns:
        A copy of ``_df`` in which every listed column is min-max scaled.
        Columns that are not listed are left untouched. A constant column
        (``max == min``) is mapped to 0.0 instead of being divided by zero.
    """
    scaled_df = _df.copy()

    for feature in scaled_fields:
        column = scaled_df[feature]
        feature_min = column.min()
        feature_delta = column.max() - feature_min
        shifted = column.subtract(feature_min)

        if feature_delta == 0:
            # Constant column: dividing would yield 0 / 0 = NaN for every row,
            # so keep the shifted values (all zero) as floats instead.
            scaled_df[feature] = shifted.astype(float)
        else:
            scaled_df[feature] = shifted.divide(feature_delta)

    return scaled_df

In [7]:
# Min-max scale every numeric predictor of the transformed frame.
transformed_fields = transformed_numeric_predictors.columns
df_transformed = scale(df_transformed, scaled_fields=transformed_fields)
df_transformed.describe()

Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147314300.0,5093466.0,808.2764,0.502795,0.4028468,0.09859457,0.3751558,0.05921727,0.419587,0.1184307,...,0.009537892,0.01753143,0.05184194,0.04184437,0.04874358,0.04224923,0.04721868,0.05028227,0.05627708,0.07173808
std,99536310.0,7967840.0,2296.849,0.471079,0.4523912,0.2907958,0.44798,0.2284907,0.2256942,0.07152478,...,0.052598,0.08246395,0.08803506,0.08230781,0.09031063,0.08361355,0.102868,0.1043392,0.1113767,0.117461
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59258680.0,467330.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2366019,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136723700.0,1634092.0,199.0,0.6666667,0.0,0.0,0.0,0.0,0.4433628,0.07525668,...,0.0,0.0,0.004480431,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226098700.0,6254382.0,684.0,1.0,0.9393939,0.0,0.9230769,0.0,0.586308,0.1505134,...,0.0,0.0,0.07179541,0.04785956,0.0593604,0.04587383,0.02909834,0.05172123,0.06458231,0.1096827
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Min-max scale every numeric predictor of the untransformed frame as well.
# Note that this rebinds `df` to the scaled copy.
untransformed_fields = numeric_predictors.columns
df = scale(df, scaled_fields=untransformed_fields)
df.describe()

Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,IntraProjectSharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator,...,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147314300.0,5093466.0,808.2764,0.00820836,0.0004194136,0.009404911,0.005789915,0.502795,0.005135633,0.004619278,...,0.0003474058,0.000653076,5.633901e-05,4.894162e-05,3.013965e-05,2.088962e-05,0.0009381081,0.0009731412,0.002598294,0.003132114
std,99536310.0,7967840.0,2296.849,0.03427369,0.00535766,0.0272371,0.02031933,0.471079,0.01662117,0.01949295,...,0.003945856,0.006877763,0.002404047,0.002223115,0.001330459,0.001302334,0.004651889,0.004664904,0.01100202,0.01111554
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59258680.0,467330.0,50.0,8.230334e-06,0.0001,0.0007019186,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136723700.0,1634092.0,199.0,0.0001998223,0.0001,0.002573701,0.0005871991,0.6666667,0.0,0.0005871991,...,0.0,0.0,2.056899e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226098700.0,6254382.0,684.0,0.001782188,0.0003,0.008890969,0.005284792,1.0,0.003528226,0.003523194,...,0.0,0.0,5.198929e-06,3.011382e-06,1.231899e-06,6.175482e-07,0.000136237,0.0002646903,0.001111359,0.00219051
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Save datasets to file

In [9]:
# Write the transformed frame to CSV, using the same path template as the
# input dataset; the dataframe index is not written.
output_data_file_name = "dataset_transformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")
df_transformed.to_csv(path_or_buf=output_path, index=False)

# Summary of what was just written.
df_transformed.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_transformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,ln(1 + ControlPullRequestLifeTimeInMinutes),ln(1 + ControlNumberOfCommitsInPullRequest),...,ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderInDegreeCentrality),ln(1 + WeightedEcosystemSecondOrderOutDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderInDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderOutDegreeCentrality),ln(1 + EcosystemIntegratorToSubmitterLinkIntensity),ln(1 + EcosystemSubmitterToIntegratorLinkIntensity),ln(1 + IntraProjectIntegratorToSubmitterLinkIntensity),ln(1 + IntraProjectSubmitterToIntegratorLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147314300.0,5093466.0,808.2764,0.502795,0.4028468,0.09859457,0.3751558,0.05921727,0.419587,0.1184307,...,0.009537892,0.01753143,0.05184194,0.04184437,0.04874358,0.04224923,0.04721868,0.05028227,0.05627708,0.07173808
std,99536310.0,7967840.0,2296.849,0.471079,0.4523912,0.2907958,0.44798,0.2284907,0.2256942,0.07152478,...,0.052598,0.08246395,0.08803506,0.08230781,0.09031063,0.08361355,0.102868,0.1043392,0.1113767,0.117461
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59258680.0,467330.0,50.0,0.0,0.0,0.0,0.0,0.0,0.2366019,0.07525668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136723700.0,1634092.0,199.0,0.6666667,0.0,0.0,0.0,0.0,0.4433628,0.07525668,...,0.0,0.0,0.004480431,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226098700.0,6254382.0,684.0,1.0,0.9393939,0.0,0.9230769,0.0,0.586308,0.1505134,...,0.0,0.0,0.07179541,0.04785956,0.0593604,0.04587383,0.02909834,0.05172123,0.06458231,0.1096827
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Write the untransformed frame to CSV, using the same path template as the
# input dataset; the dataframe index is not written.
output_data_file_name = "dataset_untransformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")
df.to_csv(path_or_buf=output_path, index=False)

# Summary of what was just written.
df.describe()

Outputting to '/workspaces/msc_thesis/data//final_data/dataset_untransformed.csv'


Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,IntraProjectSubmitterPullRequestCommentCount,IntraProjectSharedExperiencePullRequestSubmittedBySubmitterIntegratedByIntegrator,...,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderInDegreeCentrality,WeightedEcosystemSecondOrderOutDegreeCentrality,WeightedIntraProjectSecondOrderInDegreeCentrality,WeightedIntraProjectSecondOrderOutDegreeCentrality,EcosystemIntegratorToSubmitterLinkIntensity,EcosystemSubmitterToIntegratorLinkIntensity,IntraProjectIntegratorToSubmitterLinkIntensity,IntraProjectSubmitterToIntegratorLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,147314300.0,5093466.0,808.2764,0.00820836,0.0004194136,0.009404911,0.005789915,0.502795,0.005135633,0.004619278,...,0.0003474058,0.000653076,5.633901e-05,4.894162e-05,3.013965e-05,2.088962e-05,0.0009381081,0.0009731412,0.002598294,0.003132114
std,99536310.0,7967840.0,2296.849,0.03427369,0.00535766,0.0272371,0.02031933,0.471079,0.01662117,0.01949295,...,0.003945856,0.006877763,0.002404047,0.002223115,0.001330459,0.001302334,0.004651889,0.004664904,0.01100202,0.01111554
min,687.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59258680.0,467330.0,50.0,8.230334e-06,0.0001,0.0007019186,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,136723700.0,1634092.0,199.0,0.0001998223,0.0001,0.002573701,0.0005871991,0.6666667,0.0,0.0005871991,...,0.0,0.0,2.056899e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,226098700.0,6254382.0,684.0,0.001782188,0.0003,0.008890969,0.005284792,1.0,0.003528226,0.003523194,...,0.0,0.0,5.198929e-06,3.011382e-06,1.231899e-06,6.175482e-07,0.000136237,0.0002646903,0.001111359,0.00219051
max,361773400.0,59734440.0,82976.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
import datetime

# Record when the notebook finished running.
finished_at = datetime.datetime.now()
print(finished_at)

2024-02-16 14:47:34.213568
