Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add threshold function that uses entropy to maximize uncertainty #88

Open
gsheni opened this issue Jul 6, 2023 · 0 comments
Open

Add threshold function that uses entropy to maximize uncertainty #88

gsheni opened this issue Jul 6, 2023 · 0 comments
Assignees

Comments

@gsheni
Copy link
Contributor

gsheni commented Jul 6, 2023

    def find_threshold_to_maximize_uncertainty(
        self,
        df,
        label_col,
        entity_col,
        max_num_unique_values=10,
        max_number_of_rows=2000,
        random_state=None,
    ):
        """Return the candidate threshold whose labelled output has the most entropy.

        Each distinct value found in ``df[label_col]`` is tried in turn as
        ``threshold``. For every candidate, ``self.label_function`` is applied
        to ``df`` grouped by ``entity_col`` and the ``label_col`` column of the
        result is scored with ``entropy_of_list``. The candidate with the
        strictly highest score wins; on ties the first one encountered is kept.

        The operation's threshold is temporarily changed during the search and
        is always reset to its value on entry, even if a candidate raises.

        Args:
            df: Data to search over; must support ``df[label_col]`` and
                ``df.groupby(entity_col).apply(...)``.
            label_col: Column supplying the candidate thresholds and scored
                after labelling.
            entity_col: Column used to group ``df`` before labelling.
            max_num_unique_values: Accepted for interface compatibility; not
                currently applied (see the review note in the body).
            max_number_of_rows: Accepted for interface compatibility; not
                currently applied (see the review note in the body).
            random_state: Accepted for interface compatibility; not currently
                applied (see the review note in the body).

        Returns:
            The candidate value with the highest entropy score, or ``0`` when
            no candidate scores above ``0`` (including when there are no
            candidates at all).
        """
        original_threshold = self.threshold

        # NOTE(review): an earlier draft sampled candidates with
        # sample_unique_values(df[label_col], max_num_unique_values,
        # random_state) and capped df at max_number_of_rows, but the sampled
        # candidates were immediately overwritten by the full set of distinct
        # values and the row cap was commented out. The effective behaviour
        # (every distinct value, every row) is kept here; decide whether the
        # sampling/capping should be reinstated before relying on those
        # parameters.
        candidate_thresholds = set(df[label_col])

        # More entropy means the labelled output is less predictable, which is
        # the property this search maximizes.
        best_entropy = 0
        best_parameter_value = 0

        try:
            for candidate in candidate_thresholds:
                self.set_parameters(threshold=candidate)

                output_df = df.groupby(entity_col).apply(self.label_function)
                current_entropy = entropy_of_list(output_df[label_col])

                # Strict comparison: a later candidate must beat, not merely
                # match, the best score seen so far.
                if current_entropy > best_entropy:
                    best_entropy = current_entropy
                    best_parameter_value = candidate
        finally:
            # Restore the caller's threshold even when a candidate fails, so a
            # failed search never leaves a trial value configured.
            self.set_parameters(threshold=original_threshold)

        return best_parameter_value
def test_find_threshold_to_maximize_uncertanity(df):
    """Check the entropy search picks 10 and leaves the configured threshold alone.

    ``df`` is supplied from outside this function (presumably a pytest
    fixture with ``col`` and ``id`` columns -- confirm against the fixture).
    """
    starting_threshold = 30.0
    search_kwargs = {
        "label_col": "col",
        "entity_col": "id",
        "random_state": 0,
        "max_num_unique_values": 2,
    }

    filter_op = GreaterFilterOp("col")
    filter_op.set_parameters(threshold=starting_threshold)

    chosen_threshold = filter_op.find_threshold_to_maximize_uncertainty(
        df,
        **search_kwargs,
    )

    # Per the original author's note, 10 is the lowest value in "col"; using it
    # as the threshold keeps most of the values and so maximizes
    # unpredictability.
    assert chosen_threshold == 10
    # The search must not leave a trial threshold configured on the operation.
    assert filter_op.threshold == starting_threshold
@gsheni gsheni self-assigned this Oct 8, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant