# Feature transformation

This notebook is used to test the impact of various feature transformations on the dataset, and ultimately outputs two datasets: one with those transformations applied (log-transformed and scaled) and one that is only scaled.

## Dataset loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

input_data_file_name = "dataset_subsampled"

# Strip a trailing separator from BASE_PATH before appending the sub-path so
# the result does not contain a doubled slash (e.g. "./data//final_data/...").
data_path_format = (
    exp_utils.BASE_PATH.rstrip("/") + "/final_data/{data_file_name}.csv"
)
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

# Number of leading columns that are skipped when selecting predictors;
# later cells slice the column index with this value.
meta_header_count = 5

df: pd.DataFrame = pd.read_csv(filepath_or_buffer=data_path, header=0)

print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

df.describe()

data_path='./data//final_data/dataset_subsampled.csv'


len(df)=1216221.
len(df.columns)=43



Unnamed: 0,ID,Project Name,Submitter ID,PR Number,Closed At,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderDegreeCentrality,WeightedIntraProjectSecondOrderDegreeCentrality,EcosystemLinkIntensity,IntraProjectLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,908116.4,8677.039,0.0,0.0,0.0,36850.69,4.210476,40.14674,9.869163,0.5027905,...,0.2019806,0.8583325,2.498338,17.57198,0.2573036,1.79815,29.18379,64.03151,1.098749,2.662755
std,524353.1,5806.587,0.0,0.0,0.0,153795.0,55.12267,116.082,34.58581,0.4710578,...,1.710005,9.254259,7.565769,65.27831,2.921277,18.91477,1216.683,3492.823,9.274709,27.789
min,0.0,43.0,0.0,0.0,0.0,0.01666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454250.0,3224.0,0.0,0.0,0.0,37.08333,1.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,907870.0,8413.0,0.0,0.0,0.0,898.7833,1.0,11.0,1.0,0.6666667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.1180555,0.0,0.0,0.0
75%,1362318.0,13570.0,0.0,0.0,0.0,7997.85,3.0,38.0,9.0,1.0,...,0.0,0.0,2.0,7.0,0.0,0.0,2.698931,2.684865,0.0,0.2475789
max,1815970.0,20094.0,0.0,0.0,0.0,4489490.0,10000.0,4271.0,1699.0,1.0,...,373.0,927.0,2332.0,7630.0,741.0,2764.0,553496.5,2524147.0,904.8289,5249.874


In [2]:
# Every column after the leading metadata columns is considered a predictor.
predictors = df.columns[meta_header_count:]

# Of those, only the numeric ones are transformed and scaled further on.
numeric_predictors = df.loc[:, predictors].select_dtypes(include="number")

## Log-transformation

Applies an add-one log-transform, `ln(1 + x)`, to the count predictors (PR submission count, etc.), i.e. to every numeric predictor that is not a success rate.

In [3]:
import numpy as np
import regex as re

# Work on a copy so the original dataframe remains available unchanged.
df_transformed = df.copy()

# Matches any column name that contains "SuccessRate".
sr_re = r".*SuccessRate.*"

# Every numeric predictor that is not a success rate is log-transformed later.
count_predictors = [
    column_name
    for column_name in numeric_predictors.columns
    if re.match(sr_re, column_name) is None
]

print(f"{count_predictors=}")

count_predictors=['ControlPullRequestLifeTimeInMinutes', 'ControlNumberOfCommitsInPullRequest', 'ControlIntraProjectPullRequestExperienceOfIntegrator', 'IntraProjectSubmitterPullRequestSubmissionCount', 'IntraProjectSubmitterPullRequestCommentCount', 'EcosystemExperienceSubmitterPullRequestSubmissionCount', 'EcosystemExperienceSubmitterPullRequestCommentCount', 'DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'DependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount', 'InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount', 'IntraProjectSubmitterIssueSubmissionCount', 'IntraProjectSubmitterIssueCommentCount', 'EcosystemExperienceSubmitterIssueSubmissionCount', 'EcosystemExperienceSubmitterIssueCommentCount', 'DependencyEcosystemExperie

In [4]:
# Applies the add-one log-transform, ln(1 + x), to all count predictors in a
# single vectorized pass. np.log1p avoids calling a Python lambda per element
# and is more accurate than np.log(1 + x) for values close to zero.
log_columns = np.log1p(df_transformed[count_predictors])
log_columns.columns = [f"ln(1 + {field})" for field in count_predictors]

# Swap the raw columns for their transformed counterparts. The new columns are
# appended at the end of the frame, in the order given by count_predictors.
df_transformed = pd.concat(
    [df_transformed.drop(columns=count_predictors), log_columns], axis=1
)

df_transformed.describe()

Unnamed: 0,ID,Project Name,Submitter ID,PR Number,Closed At,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderDegreeCentrality),ln(1 + EcosystemLinkIntensity),ln(1 + IntraProjectLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,908116.4,8677.039,0.0,0.0,0.0,0.5027905,0.4027001,0.09860918,0.3749753,0.05920373,...,0.06788991,0.1347494,0.6172564,1.201281,0.06301919,0.1387813,0.8459333,0.8907999,0.1491971,0.3827771
std,524353.1,5806.587,0.0,0.0,0.0,0.4710578,0.452342,0.2908001,0.447916,0.2284681,...,0.3199921,0.5474352,0.9145723,1.564078,0.3475801,0.6529677,1.314167,1.479044,0.5948867,0.868534
min,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454250.0,3224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,907870.0,8413.0,0.0,0.0,0.0,0.6666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.6931472,0.0,0.0,0.111591,0.0,0.0,0.0
75%,1362318.0,13570.0,0.0,0.0,0.0,1.0,0.9387755,0.0,0.9230769,0.0,...,0.0,0.0,1.098612,2.079442,0.0,0.0,1.308044,1.304234,0.0,0.2212048
max,1815970.0,20094.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,5.924256,6.833032,7.75491,8.939974,6.609349,7.924796,13.22401,14.74141,6.80885,8.56615


In [5]:
# Gathers the numeric predictor columns of the transformed dataframe,
# skipping the leading metadata columns.
transformed_predictors = df_transformed.columns[meta_header_count:]
transformed_numeric_predictors = df_transformed.loc[
    :, transformed_predictors
].select_dtypes(include="number")

## Feature scaling

Applies min-max scaling to the numeric predictors, mapping each of them to the range [0, 1].
This is applied to both the log-transformed and the untransformed data.

In [6]:
def scale(_df: pd.DataFrame, scaled_fields: pd.Series) -> pd.DataFrame:
    """Min-max scales the selected columns of a dataframe.

    Args:
        _df: Dataframe to scale. It is copied; the input is not modified.
        scaled_fields: Iterable of column labels that should be scaled
            (for example a column index).

    Returns:
        A copy of ``_df`` in which every listed column is replaced by
        ``(x - min) / (max - min)``. A column whose minimum equals its
        maximum is mapped to 0.0 instead of being divided by zero.
    """
    scaled_df = _df.copy()

    for feature in scaled_fields:
        column = scaled_df[feature]
        feature_min = column.min()
        feature_delta = column.max() - feature_min
        shifted = column.subtract(feature_min)

        if feature_delta == 0:
            # Constant column: dividing would compute 0 / 0 and fill the
            # column with NaN, so keep the shifted (all-zero) values.
            scaled_df[feature] = shifted.astype(float)
        else:
            scaled_df[feature] = shifted.divide(feature_delta)

    return scaled_df

In [7]:
# Min-max scale every numeric predictor of the log-transformed dataset.
df_transformed = scale(
    _df=df_transformed,
    scaled_fields=transformed_numeric_predictors.columns,
)
df_transformed.describe()

Unnamed: 0,ID,Project Name,Submitter ID,PR Number,Closed At,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderDegreeCentrality),ln(1 + EcosystemLinkIntensity),ln(1 + IntraProjectLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,908116.4,8677.039,0.0,0.0,0.0,0.5027905,0.4027001,0.09860918,0.3749753,0.05920373,...,0.01145965,0.0197203,0.07959556,0.1343719,0.009534855,0.01751229,0.06396949,0.06042839,0.02191224,0.04468485
std,524353.1,5806.587,0.0,0.0,0.0,0.4710578,0.452342,0.2908001,0.447916,0.2284681,...,0.0540139,0.08011601,0.1179346,0.1749533,0.05258916,0.08239552,0.09937732,0.1003326,0.08736962,0.1013914
min,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454250.0,3224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,907870.0,8413.0,0.0,0.0,0.0,0.6666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.008438514,0.0,0.0,0.0
75%,1362318.0,13570.0,0.0,0.0,0.0,1.0,0.9387755,0.0,0.9230769,0.0,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.0989143,0.08847414,0.0,0.02582313
max,1815970.0,20094.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Min-max scale every numeric predictor of the dataset without log-transform.
df = scale(
    _df=df,
    scaled_fields=numeric_predictors.columns,
)
df.describe()

Unnamed: 0,ID,Project Name,Submitter ID,PR Number,Closed At,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderDegreeCentrality,WeightedIntraProjectSecondOrderDegreeCentrality,EcosystemLinkIntensity,IntraProjectLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,908116.4,8677.039,0.0,0.0,0.0,0.00820821,0.0004210476,0.009399846,0.005808807,0.5027905,...,0.0005415028,0.000925925,0.001071328,0.002303012,0.0003472383,0.0006505608,5.272624e-05,2.536758e-05,0.001214317,0.0005072037
std,524353.1,5806.587,0.0,0.0,0.0,0.03425667,0.005512267,0.02717911,0.02035657,0.4710578,...,0.004584464,0.009983019,0.003244327,0.00855548,0.003942344,0.006843261,0.002198177,0.001383764,0.01025023,0.005293271
min,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454250.0,3224.0,0.0,0.0,0.0,8.256321e-06,0.0001,0.0007024116,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,907870.0,8413.0,0.0,0.0,0.0,0.0002001935,0.0001,0.002575509,0.0005885815,0.6666667,...,0.0,0.0,0.0,0.0001310616,0.0,0.0,2.132904e-07,0.0,0.0,0.0
75%,1362318.0,13570.0,0.0,0.0,0.0,0.001781457,0.0003,0.008897214,0.005297234,1.0,...,0.0,0.0,0.0008576329,0.0009174312,0.0,0.0,4.876149e-06,1.063672e-06,0.0,4.715901e-05
max,1815970.0,20094.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Dataset save to file

In [9]:
# Writes the log-transformed, min-max scaled dataset using the same path
# format as the input file.
output_data_file_name = "dataset_transformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# index=False keeps the row index out of the CSV.
df_transformed.to_csv(path_or_buf=output_path, index=False)

df_transformed.describe()

Outputting to './data//final_data/dataset_transformed.csv'


Unnamed: 0,ID,Project Name,Submitter ID,PR Number,Closed At,IntraProjectSubmitterPullRequestSuccessRate,EcosystemExperienceSubmitterPullRequestSuccessRate,DependencyEcosystemExperienceSubmitterPullRequestSuccessRate,NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate,...,ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount),ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount),ln(1 + WeightedEcosystemSecondOrderDegreeCentrality),ln(1 + WeightedIntraProjectSecondOrderDegreeCentrality),ln(1 + EcosystemLinkIntensity),ln(1 + IntraProjectLinkIntensity)
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,908116.4,8677.039,0.0,0.0,0.0,0.5027905,0.4027001,0.09860918,0.3749753,0.05920373,...,0.01145965,0.0197203,0.07959556,0.1343719,0.009534855,0.01751229,0.06396949,0.06042839,0.02191224,0.04468485
std,524353.1,5806.587,0.0,0.0,0.0,0.4710578,0.452342,0.2908001,0.447916,0.2284681,...,0.0540139,0.08011601,0.1179346,0.1749533,0.05258916,0.08239552,0.09937732,0.1003326,0.08736962,0.1013914
min,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454250.0,3224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,907870.0,8413.0,0.0,0.0,0.0,0.6666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07753347,0.0,0.0,0.008438514,0.0,0.0,0.0
75%,1362318.0,13570.0,0.0,0.0,0.0,1.0,0.9387755,0.0,0.9230769,0.0,...,0.0,0.0,0.1416667,0.2326004,0.0,0.0,0.0989143,0.08847414,0.0,0.02582313
max,1815970.0,20094.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Writes the dataset that was min-max scaled but not log-transformed, using
# the same path format as the input file.
output_data_file_name = "dataset_untransformed"
output_path = data_path_format.format(data_file_name=output_data_file_name)
print(f"Outputting to '{output_path}'")

# index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf=output_path, index=False)

df.describe()

Outputting to './data//final_data/dataset_untransformed.csv'


Unnamed: 0,ID,Project Name,Submitter ID,PR Number,Closed At,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,ControlIntraProjectPullRequestExperienceOfIntegrator,IntraProjectSubmitterPullRequestSubmissionCount,IntraProjectSubmitterPullRequestSuccessRate,...,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount,WeightedEcosystemSecondOrderDegreeCentrality,WeightedIntraProjectSecondOrderDegreeCentrality,EcosystemLinkIntensity,IntraProjectLinkIntensity
count,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,...,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0,1216221.0
mean,908116.4,8677.039,0.0,0.0,0.0,0.00820821,0.0004210476,0.009399846,0.005808807,0.5027905,...,0.0005415028,0.000925925,0.001071328,0.002303012,0.0003472383,0.0006505608,5.272624e-05,2.536758e-05,0.001214317,0.0005072037
std,524353.1,5806.587,0.0,0.0,0.0,0.03425667,0.005512267,0.02717911,0.02035657,0.4710578,...,0.004584464,0.009983019,0.003244327,0.00855548,0.003942344,0.006843261,0.002198177,0.001383764,0.01025023,0.005293271
min,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,454250.0,3224.0,0.0,0.0,0.0,8.256321e-06,0.0001,0.0007024116,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,907870.0,8413.0,0.0,0.0,0.0,0.0002001935,0.0001,0.002575509,0.0005885815,0.6666667,...,0.0,0.0,0.0,0.0001310616,0.0,0.0,2.132904e-07,0.0,0.0,0.0
75%,1362318.0,13570.0,0.0,0.0,0.0,0.001781457,0.0003,0.008897214,0.005297234,1.0,...,0.0,0.0,0.0008576329,0.0009174312,0.0,0.0,4.876149e-06,1.063672e-06,0.0,4.715901e-05
max,1815970.0,20094.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
import datetime

# Prints the current local date and time as the last output of the notebook.
print(datetime.datetime.now())

2024-02-21 16:45:01.702181
