# Feature importance plot

Creates figures showing the feature importances of the first-time contributor (FTC), non-first-time contributor (nFTC), and full random forest experiments.

## Data loading

Loads the `feature_importance.txt` files written by the experiment notebooks.

In [16]:
%reload_ext autoreload
%autoreload 2

import python_proj.utils.exp_utils as exp_utils

# Location of the modelling output for the analysed dataset; every experiment
# keeps its results in a sub-directory named after the experiment.
base_path = exp_utils.BASE_PATH
file_name = "dataset_transformed"
figure_base_path = f"{base_path}/figures/modelling/{file_name}/"

experiments = ["rf_ftc", "rf_nftc", "rf_full"]


def _read_feature_importances(experiment: str) -> str:
    """Returns the unparsed contents of an experiment's ``feature_importance.txt``."""
    data_file = f"{figure_base_path}/{experiment}/feature_importance.txt"
    print(f'Loading from "{data_file}".')
    with open(data_file, "r", encoding="utf-8") as input_file:
        return input_file.read()


# Experiment name -> raw text of its feature importance file.
data = {
    experiment: _read_feature_importances(experiment) for experiment in experiments
}

Loading from "/workspaces/msc_thesis/data//figures/modelling/dataset_transformed//rf_ftc/feature_importance.txt".
Loading from "/workspaces/msc_thesis/data//figures/modelling/dataset_transformed//rf_nftc/feature_importance.txt".
Loading from "/workspaces/msc_thesis/data//figures/modelling/dataset_transformed//rf_full/feature_importance.txt".


## Name mappings

Translates the raw feature names used in the experiments to short, human-readable labels.

In [17]:
# Feature mapping: raw feature name (the text before the ":" on each line of
# the loaded ``feature_importance.txt`` files) -> short human-readable label.
#
# Prefixes used in the labels:
#   "Intra."   = IntraProject
#   "Eco."     = EcosystemExperience
#   "Non-dep." = NonDependencyEcosystemExperience
#   "Downstr." = DependencyEcosystemExperience
#   "Upstr."   = InversedDependencyEcosystemExperience
#
# NOTE(review): "Intra. Issue Comments" is capitalised differently from the
# other "... issue comments" labels, and "Non-dep PR merge rate" lacks the
# period used by the other "Non-dep." labels — confirm whether intentional.
name_mapping = {
    "ln(1 + ControlPullRequestLifeTimeInMinutes)": "PR lifetime",
    "ln(1 + ControlIntraProjectPullRequestExperienceOfIntegrator)": "Integrator exp.",
    "IntraProjectSubmitterPullRequestSuccessRate": "Intra. PR merge rate",
    "ln(1 + IntraProjectSubmitterPullRequestSubmissionCount)": "Intra. PRs",
    "ln(1 + WeightedEcosystemSecondOrderDegreeCentrality)": "SO degree",
    "ControlIntegratedBySameUser": "PR is self-integrated",
    "ln(1 + ControlNumberOfCommitsInPullRequest)": "PR commit count",
    "EcosystemExperienceSubmitterPullRequestSuccessRate": "Eco. PR merge rate",
    "ControlPullRequestHasComments": "PR has comments",
    "ln(1 + IntraProjectSubmitterPullRequestCommentCount)": "Intra. PR comments",
    "ln(1 + IntraProjectSubmitterIssueCommentCount)": "Intra. Issue Comments",
    "ln(1 + EcosystemLinkIntensity)": "Link strength",
    "ln(1 + EcosystemExperienceSubmitterIssueCommentCount)": "Eco. issue comments",
    "ln(1 + NonDependencyEcosystemExperienceSubmitterIssueCommentCount)": "Non-dep. issue comments",
    "SubmitterIsFirstTimeContributor": "First-time contr.",
    "ln(1 + EcosystemExperienceSubmitterPullRequestSubmissionCount)": "Eco. PRs",
    "ln(1 + EcosystemExperienceSubmitterPullRequestCommentCount)": "Eco. PR comments",
    "ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestCommentCount)": "Non-dep. PR comments",
    "ControlHasHashTagInDescription": "PR contains '#'",
    "ln(1 + NonDependencyEcosystemExperienceSubmitterIssueSubmissionCount)": "Non-dep. issues",
    "ln(1 + IntraProjectSubmitterIssueSubmissionCount)": "Intra. issues",
    "ln(1 + EcosystemExperienceSubmitterIssueSubmissionCount)": "Eco. issues",
    "ln(1 + NonDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)": "Non-dep. PRs",
    "ControlPullRequestHasCommentByExternalUser": "PR has com. ext. contr.",
    "NonDependencyEcosystemExperienceSubmitterPullRequestSuccessRate": "Non-dep PR merge rate",
    "ln(1 + DependencyEcosystemExperienceSubmitterPullRequestCommentCount)": "Downstr. PR comments",
    "ln(1 + DependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)": "Downstr. PRs",
    "ln(1 + DependencyEcosystemExperienceSubmitterIssueCommentCount)": "Downstr. issue comments",
    "ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueCommentCount)": "Upstr. issue comments",
    "DependencyEcosystemExperienceSubmitterPullRequestSuccessRate": "Downstr. PR merge rate",
    "ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestCommentCount)": "Upstr. PR comments",
    "ln(1 + InversedDependencyEcosystemExperienceSubmitterPullRequestSubmissionCount)": "Upstr. PRs",
    "InversedDependencyEcosystemExperienceSubmitterPullRequestSuccessRate": "Upstr. PR merge rate",
    "ln(1 + DependencyEcosystemExperienceSubmitterIssueSubmissionCount)": "Downstr. issues",
    "ln(1 + InversedDependencyEcosystemExperienceSubmitterIssueSubmissionCount)": "Upstr. issues",
}

## Data parsing

In [18]:
from wmutils.collections.safe_dict import SafeDict

# Turns every "<feature>: <score>" line of the loaded files into two lookups
# per experiment:
#   importances        keyed by the human-readable label (see name_mapping),
#   clean_importances  keyed by the raw feature name.
importances = SafeDict(default_value=dict)
clean_importances = SafeDict(default_value=dict)

for experiment_name, raw_text in data.items():
    for line in raw_text.split("\n"):
        parts = line.split(":")
        feature = parts[0].strip()
        # Only lines that start with a feature name carry a score.
        if len(feature) != 0:
            display_name = name_mapping[feature]
            importance = float(parts[1].strip())
            importances[experiment_name][display_name] = importance
            clean_importances[experiment_name][feature] = importance

# Label-keyed importances per model, used when drawing the figure.
full_importances = importances["rf_full"]
ftc_importances = importances["rf_ftc"]
non_ftc_importances = importances["rf_nftc"]

## Feature group comparison

In [19]:
# NOTE(review): despite its name this is a set containing only the empty
# string (not a dict), and nothing in the cells visible here reads it —
# confirm whether it is still needed.
group_to_feature_prefix = {""}

# Feature-name templates, one per type of contributor experience. The
# ``{group}`` placeholder is filled in with a feature group via ``str.format``.
exp_types = [
    "{group}SubmitterPullRequestSuccessRate",
    "ln(1 + {group}SubmitterPullRequestSubmissionCount)",
    "ln(1 + {group}SubmitterPullRequestCommentCount)",
    "ln(1 + {group}SubmitterIssueSubmissionCount)",
    "ln(1 + {group}SubmitterIssueCommentCount)",

]

# Feature groups (name prefixes) that are substituted into the templates above.
groups = [
    "IntraProject",
    "EcosystemExperience",
    "NonDependencyEcosystemExperience",
    "DependencyEcosystemExperience",
    "InversedDependencyEcosystemExperience",
]

In [20]:
from typing import Dict


def compare_feature_groups(
    importances: Dict[str, float],
    group_a: str,
    group_b: str,
    feature_templates=None,
) -> None:
    """
    Prints how many of the group a features score above
    their group b counterparts, as a count and a percentage.

    A feature pair is only compared when both features are present
    in ``importances`` and both have a non-zero importance.

    :param importances: feature importance per raw feature name.
    :param group_a: feature group (name prefix) on the left-hand side.
    :param group_b: feature group (name prefix) on the right-hand side.
    :param feature_templates: feature-name templates containing a ``{group}``
        placeholder; defaults to the module-level ``exp_types``.
    """

    if feature_templates is None:
        feature_templates = exp_types

    higher = 0
    total = 0

    for template in feature_templates:
        score_a = importances.get(template.format(group=group_a))
        score_b = importances.get(template.format(group=group_b))

        # Skips pairs in which either feature is absent or has zero importance.
        if score_a is None or score_b is None or score_a == 0 or score_b == 0:
            continue

        total += 1
        if score_a > score_b:
            higher += 1

    # Without any comparable pair there is no meaningful percentage
    # (and dividing by ``total`` would raise a ZeroDivisionError).
    if total == 0:
        print(f"{group_a} > {group_b}: {higher} / {total} = n/a")
        return

    rat = higher / total * 100
    print(f"{group_a} > {group_b}: {higher} / {total} = {rat:.2f}%")

In [21]:
def compare_feature_groups_in_model(model: str):
    """
    Prints the pairwise feature group comparisons for one model.

    :param model: experiment name used as key in ``clean_importances``.
    """

    # (group a, group b) pairs, compared in this order. "Dependency" features
    # are labelled downstream and "InversedDependency" features upstream.
    group_pairs = (
        # Intra-project vs. ecosystem.
        ("IntraProject", "EcosystemExperience"),
        # Non-dependency vs. upstream.
        ("NonDependencyEcosystemExperience", "InversedDependencyEcosystemExperience"),
        # Non-dependency vs. downstream.
        ("NonDependencyEcosystemExperience", "DependencyEcosystemExperience"),
        # Intra-project vs. upstream.
        ("IntraProject", "InversedDependencyEcosystemExperience"),
        # Intra-project vs. downstream.
        ("IntraProject", "DependencyEcosystemExperience"),
        # Intra-project vs. non-dependency.
        ("IntraProject", "NonDependencyEcosystemExperience"),
        # Downstream vs. upstream.
        ("DependencyEcosystemExperience", "InversedDependencyEcosystemExperience"),
    )

    print(f"{model=}")
    for left_group, right_group in group_pairs:
        compare_feature_groups(clean_importances[model], left_group, right_group)

    # Blank line separating the output of consecutive models.
    print()

In [22]:
# Runs the group comparison for every model, full model first.
for compared_model in ("rf_full", "rf_ftc", "rf_nftc"):
    compare_feature_groups_in_model(compared_model)

model='rf_full'
IntraProject > EcosystemExperience: 5 / 5 = 100.00%
NonDependencyEcosystemExperience > InversedDependencyEcosystemExperience: 5 / 5 = 100.00%
NonDependencyEcosystemExperience > DependencyEcosystemExperience: 5 / 5 = 100.00%
IntraProject > InversedDependencyEcosystemExperience: 5 / 5 = 100.00%
IntraProject > DependencyEcosystemExperience: 5 / 5 = 100.00%
IntraProject > NonDependencyEcosystemExperience: 5 / 5 = 100.00%
DependencyEcosystemExperience > InversedDependencyEcosystemExperience: 5 / 5 = 100.00%

model='rf_ftc'
IntraProject > EcosystemExperience: 1 / 3 = 33.33%
NonDependencyEcosystemExperience > InversedDependencyEcosystemExperience: 5 / 5 = 100.00%
NonDependencyEcosystemExperience > DependencyEcosystemExperience: 5 / 5 = 100.00%
IntraProject > InversedDependencyEcosystemExperience: 3 / 3 = 100.00%
IntraProject > DependencyEcosystemExperience: 3 / 3 = 100.00%
IntraProject > NonDependencyEcosystemExperience: 1 / 3 = 33.33%
DependencyEcosystemExperience > InversedD

In [23]:
import pandas as pd
from itertools import product

# Every (experience type, feature group) combination as a raw feature name;
# the experience type varies slowest, the group fastest.
fields = [
    template.format(group=group_name)
    for template in exp_types
    for group_name in groups
]
models = ["rf_full", "rf_ftc", "rf_nftc"]

# One column with the human-readable labels, followed by one column of
# importances per model.
columns = {"fields": [name_mapping[raw_name] for raw_name in fields]}
for model in models:
    columns[model] = [clean_importances[model][raw_name] for raw_name in fields]

df = pd.DataFrame(columns)

df

Unnamed: 0,fields,rf_full,rf_ftc,rf_nftc
0,Intra. PR merge rate,0.080382,0.0,0.154959
1,Eco. PR merge rate,0.028069,0.031635,0.01362
2,Non-dep PR merge rate,0.006601,0.0069,0.007121
3,Downstr. PR merge rate,0.001373,0.002361,0.001787
4,Upstr. PR merge rate,0.00119,0.001722,0.001249
5,Intra. PRs,0.076593,0.0,0.163854
6,Eco. PRs,0.012907,0.013942,0.011276
7,Non-dep. PRs,0.010569,0.012093,0.010184
8,Downstr. PRs,0.001791,0.001199,0.00255
9,Upstr. PRs,0.00134,0.001193,0.001502


## Figure creation code

In [24]:
from typing import Any
import matplotlib.pyplot as plt
from wmutils.pandas.figures.base import safe_save_fig


from python_proj.utils.exp_utils import BASE_PATH


def create_figure(max_entries: int):
    """
    Creates and saves a dot plot of the most important features.

    The features are ranked by their importance in the full model
    (``full_importances``) and at most ``max_entries`` of them are plotted.
    For each of them, the importance in the full, FTC and non-FTC model is
    drawn on the same row; a feature that is absent from the FTC or non-FTC
    model is drawn at 0 for that model.

    The figure is passed to ``safe_save_fig`` with the path
    ``{BASE_PATH}/figures/importance_figure_{max_entries}.png`` and
    ``output_format=['pdf', 'png']``.

    :param max_entries: maximum number of features (rows) in the plot.
    """

    # Ranks features by full-model importance, highest first, and keeps the
    # top entries. ``max(..., 0)`` keeps a negative limit equal to "no entries"
    # instead of turning it into a slice from the end.
    ranked = sorted(full_importances.items(), key=lambda item: -item[1])
    top_features = ranked[: max(max_entries, 0)]

    # Reverses the order so the most important feature is plotted last.
    top_features.reverse()
    feature_labels = [label for label, _ in top_features]

    # (legend label, marker, x-values) per model, in plotting order.
    # Each model is looked up in its own importances; the FTC series was
    # previously guarded by the non-FTC keys, which could raise a KeyError
    # or wrongly plot an existing FTC importance as 0.
    series = [
        ("Full", "o", [score for _, score in top_features]),
        ("FTC", "s", [ftc_importances.get(label, 0) for label in feature_labels]),
        (
            "Non-FTC",
            "D",
            [non_ftc_importances.get(label, 0) for label in feature_labels],
        ),
    ]

    # The figure height grows linearly with the requested number of entries.
    height = 1.75 / 10 * max_entries + 1.25
    plt.figure(figsize=(5.35, height))

    # Plot each data series
    for legend_label, marker, scores in series:
        plt.plot(
            scores,
            feature_labels,
            marker,
            fillstyle="none",
            markersize=6,
            label=legend_label,
        )

    # Add axis labels
    plt.xlabel("Mean Decrease in Gini")
    plt.ylabel("Predictors")
    plt.subplots_adjust(left=0.45)

    # Add a legend
    plt.legend()

    # NOTE(review): tight_layout() recomputes the subplot margins, so it
    # presumably overrides the left=0.45 adjustment above — confirm which
    # of the two is intended.
    plt.tight_layout()

    output_path = f"{BASE_PATH}/figures/importance_figure_{max_entries}.png"
    print(f"{output_path=}")
    safe_save_fig(output_path, output_format=['pdf', 'png'])

## Figure creation

In [25]:
with plt.style.context('ggplot'):
    create_figure(10)

output_path='/workspaces/msc_thesis/data//figures/importance_figure_10.png'


In [26]:
with plt.style.context('ggplot'):
    create_figure(len(name_mapping))

output_path='/workspaces/msc_thesis/data//figures/importance_figure_35.png'
