# Data obfuscation

This script removes personally identifiable information from the dataset.
This regards the following features:
- Submitter ID
- Project Name
- PR Number
- Closed At

Each of these variables is set to 0.

In addition, the rows of the dataframe are shuffled and the data point IDs are reassigned, so no information can be derived from the order in which the data points are presented.

Because the `SubmitterIsFirstTimeContributor` field is handled separately in `feature_construction`, it is merged into the dataset here to make sure the outputted obfuscated dataset is correct. Consequently, for this script to work properly, you need to have run the `feature_construction` script on the non-obfuscated data at least once already.

### Dataset loading

In [1]:
import pandas as pd
import python_proj.utils.exp_utils as exp_utils

# Resolve the location of the non-obfuscated input dataset.
input_data_file_name = "non_ftc_data"
data_path_format = f"{exp_utils.BASE_PATH}/final_data/{{data_file_name}}.csv"
data_path = data_path_format.format(data_file_name=input_data_file_name)
print(f"{data_path=}")

# The first row of the CSV holds the column names.
df: pd.DataFrame = pd.read_csv(data_path, header=0)

# Report the number of rows and columns that were loaded.
print(f"{len(df)=}.")
print(f"{len(df.columns)=}\n")

# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

data_path='/workspaces/msc_thesis/data//final_data/non_ftc_data.csv'
len(df)=1815972.
len(df.columns)=155



Unnamed: 0,ID,Submitter ID,PR Number,ControlPullRequestLifeTimeInMinutes,ControlNumberOfCommitsInPullRequest,IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.PRIntegratorToSubmitterV2-In),IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToSubmitterV2-In),IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToCommenterV2-In),IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToCommenterV2-In),IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToSubmitterV2-In),...,EcosystemSharedExperienceIssueSubmittedByIntegratorCommentedOnBySubmitter,EcosystemSharedExperienceIssueDiscussionParticipationByIntegratorAndSubmitter,EcosystemExperienceSubmitterIssueSubmissionCount,EcosystemExperienceSubmitterIssueCommentCount,DependencyEcosystemExperienceSubmitterIssueSubmissionCount,DependencyEcosystemExperienceSubmitterIssueCommentCount,NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount,NonDependencyEcosystemExperienceSubmitterIssueCommentCount,InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount,InversedDependencyEcosystemExperienceSubmitterIssueCommentCount
count,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,...,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0,1815972.0
mean,147475100.0,5198276.0,3807.207,31901.11,4.947501,39.26438,10.76084,12.62702,8.083162,8.484018,...,0.0859413,1.475467,2.615631,17.85944,0.1743463,0.8645552,2.274088,15.70265,0.1854588,1.355166
std,100429400.0,8100370.0,8071.615,139933.5,72.83362,2497.234,318.6353,450.6868,365.2262,196.2613,...,1.285316,23.82904,7.98554,67.00519,1.580686,11.86131,7.064174,61.3108,2.446396,16.67164
min,687.0,1.0,1.0,0.01666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,58073220.0,463157.0,100.0,52.9,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,137194000.0,1620265.0,548.0,979.9167,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,227802600.0,6384100.0,2817.0,7328.35,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,7.0,0.0,0.0,2.0,6.0,0.0,0.0
max,361775600.0,59761230.0,82976.0,4489490.0,10000.0,346128.0,87892.0,122792.0,109504.0,47640.0,...,219.0,5687.0,2332.0,7630.0,373.0,927.0,2332.0,7630.0,741.0,2764.0


In [2]:
# Show every column name of the loaded dataset.
print(df.columns.tolist())

['ID', 'Project Name', 'Submitter ID', 'PR Number', 'Closed At', 'PullRequestIsMerged', 'SubmitterIsFirstTimeContributor', 'ControlIntegratedBySameUser', 'ControlPullRequestLifeTimeInMinutes', 'ControlPullRequestHasComments', 'ControlNumberOfCommitsInPullRequest', 'ControlPullRequestHasCommentByExternalUser', 'ControlHasHashTagInDescription', 'IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.PRIntegratorToSubmitterV2-In)', 'IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToSubmitterV2-In)', 'IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.PRCommenterToCommenterV2-In)', 'IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToCommenterV2-In)', 'IntraProjectSecondOrderInDegreeCentrality(PRIntegratorToSubmitterV2.IssueCommenterToSubmitterV2-In)', 'IntraProjectSecondOrderInDegreeCentrality(PRCommenterToSubmitterV2.PRIntegratorToSubmitterV2-In)', 'IntraProjectSecondOrderInDegreeCentrality(PRCom

### Overwrite FTC data

In [3]:
def replace(
    df1: pd.DataFrame, df2: pd.DataFrame, match_column: str, overwritten_column: str
) -> pd.DataFrame:
    """Overwrites ``overwritten_column`` of ``df1`` with the values held by ``df2``.

    Rows are paired on ``match_column`` through a left merge, so every row of
    ``df1`` is kept. The change in value counts caused by the overwrite is
    printed, and a warning is printed when the overwrite introduced missing
    values (rows of ``df1`` without a counterpart in ``df2``).

    :param df1: Frame whose column is replaced; it is not modified in place.
    :param df2: Frame that supplies the replacement values.
    :param match_column: Column present in both frames, used to pair rows.
    :param overwritten_column: Column present in both frames that is replaced.
    :return: A new frame in which the replaced column is the last column.
    """
    # NOTE: if ``df2`` holds duplicate ``match_column`` values, the left merge
    # repeats the matching rows of ``df1``.
    replace_data = df2[[match_column, overwritten_column]]

    old_counts = df1[overwritten_column].value_counts()
    old_missing = int(df1[overwritten_column].isna().sum())

    # The old column receives the "_incorrect" suffix so it can be dropped,
    # while the incoming column keeps the original name.
    df1 = df1.merge(
        replace_data, how="left", on=match_column, suffixes=("_incorrect", "")
    )
    old_key = f"{overwritten_column}_incorrect"
    df1 = df1.drop(old_key, axis=1)

    new_counts = df1[overwritten_column].value_counts()

    # A value that exists on only one side counts as 0 on the other side;
    # plain subtraction would report NaN for it.
    diff = new_counts.sub(old_counts, fill_value=0).astype(int)
    print(f"Impact overwriting '{overwritten_column}':\n{diff}")

    # ``value_counts`` ignores missing values, so unmatched rows would
    # otherwise go unnoticed.
    new_missing = int(df1[overwritten_column].isna().sum())
    if new_missing > old_missing:
        print(
            f"Warning: overwriting '{overwritten_column}' introduced "
            f"{new_missing - old_missing} missing value(s)."
        )

    return df1


# Location of the previously generated first-time-contributor (FTC) data and
# the name of the column that is overwritten with it.
pre_existing_ftc_data_path = exp_utils.BASE_PATH + "/final_data/ftc_data.csv"
ftc_key = "SubmitterIsFirstTimeContributor"


def replace_with_pre_existing_data():
    """Overwrites the FTC column of the global ``df`` with the stored FTC data.

    Reads the CSV at ``pre_existing_ftc_data_path`` and, matching rows on
    ``"ID"``, replaces the ``ftc_key`` column of ``df`` via ``replace``. The
    global ``df`` is rebound to the resulting frame.
    """
    global df
    print(f"{pre_existing_ftc_data_path=}\n")

    # The loaded frame is only needed for this call, so it is never bound to
    # a name and can be freed as soon as ``replace`` returns.
    df = replace(df, pd.read_csv(pre_existing_ftc_data_path), "ID", ftc_key)


# Overwrite the FTC column of `df` with the values from the stored FTC data file.
print("Using FTC data file.")
replace_with_pre_existing_data()

Using FTC data file.
pre_existing_ftc_data_path='/workspaces/msc_thesis/data//final_data/ftc_data.csv'

Impact overwriting 'SubmitterIsFirstTimeContributor':
SubmitterIsFirstTimeContributor
False    215739
True    -215739
Name: count, dtype: int64


### Overwriting / removing problematic fields.

In [4]:
# Columns that are blanked out before the dataset is written.
problematic_columns = [
    "Project Name",
    "Submitter ID",
    "PR Number",
    "Closed At",
]

# Every listed column is replaced by the constant 0.
for column_name in problematic_columns:
    df[column_name] = 0

# Sanity check: all statistics of the overwritten columns should be 0.
df[problematic_columns].describe()

Unnamed: 0,Project Name,Submitter ID,PR Number,Closed At
count,1815972.0,1815972.0,1815972.0,1815972.0
mean,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0


In [5]:
# Resets the data point ID: the original "ID" column is discarded, the rows
# are put in a random order, and the index is renumbered from 0.
df = df.drop(columns=["ID"]).sample(frac=1).reset_index(drop=True)

In [6]:
import python_proj.utils.exp_utils as exp_utils

# Write the obfuscated dataset; the row index is stored as the "ID" column.
output_file = exp_utils.BASE_PATH + "/final_data/non_ftc_data_obfuscated.csv"
df.to_csv(path_or_buf=output_file, index_label="ID")

In [7]:
import datetime

# Print the current local date and time to record when this cell was run.
print(datetime.datetime.now())

2024-02-21 14:28:47.634145
