You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
def find_threshold_to_maximize_uncertainty(
    self,
    df,
    label_col,
    entity_col,
    max_num_unique_values=10,
    max_number_of_rows=2000,
    random_state=None,
):
    """Return the ``label_col`` value that maximizes label entropy as a threshold.

    Every distinct value found in ``df[label_col]`` is installed in turn as
    the threshold through ``set_parameters``. For each one,
    ``label_function`` is applied to every ``entity_col`` group and the
    entropy of the resulting ``label_col`` values is measured with
    ``entropy_of_list``. The candidate with the strictly highest entropy
    wins; more entropy means less predictable labels.

    The threshold configured on entry is always restored before this method
    exits, including when evaluating a candidate raises.

    Args:
        df: Table containing ``label_col`` and ``entity_col``. It is accessed
            via ``df[label_col]`` and ``df.groupby(entity_col)``, so it is
            presumably a pandas DataFrame.
        label_col: Name of the column providing the candidate thresholds and,
            after labeling, the values whose entropy is measured.
        entity_col: Name of the column used to group rows before labeling.
        max_num_unique_values: Forwarded to ``sample_unique_values``. See the
            NOTE(review) in the body: the sample does not currently limit
            the candidates that are evaluated.
        max_number_of_rows: Currently unused; the row down-sampling it was
            meant to control is disabled.
        random_state: Forwarded to ``sample_unique_values``.

    Returns:
        The candidate with the highest entropy, or ``0`` when no candidate
        yields an entropy greater than zero.
    """
    original_threshold = self.threshold

    # NOTE(review): the original code stored this sample in ``unique_vals``
    # and then overwrote it with ``set(df[label_col])`` before the loop, so
    # ``max_num_unique_values`` and ``random_state`` never affected the
    # result. The call is kept (result discarded) to preserve existing
    # behaviour exactly; confirm whether the sample was meant to be the
    # candidate set.
    sample_unique_values(
        df[label_col],
        max_num_unique_values,
        random_state,
    )

    # Row down-sampling to ``max_number_of_rows`` is intentionally disabled.

    best_entropy = 0
    best_parameter_value = 0

    # The goal of ML is to reduce uncertainty, so the most useful threshold
    # is the one whose labels contain the most randomness (highest entropy).
    try:
        for candidate in set(df[label_col]):
            self.set_parameters(threshold=candidate)
            output_df = df.groupby(entity_col).apply(self.label_function)
            current_entropy = entropy_of_list(output_df[label_col])
            if current_entropy > best_entropy:
                best_entropy = current_entropy
                best_parameter_value = candidate
    finally:
        # Never leave the operation configured with a trial threshold, even
        # if labeling or the entropy computation fails part-way through.
        self.set_parameters(threshold=original_threshold)

    return best_parameter_value
def test_find_threshold_to_maximize_uncertanity(df):
    """Check the threshold search result and that it restores the threshold.

    ``df`` is supplied by a fixture defined outside this block.
    """
    initial_threshold = 30.0
    filter_op = GreaterFilterOp("col")
    filter_op.set_parameters(threshold=initial_threshold)

    chosen_threshold = filter_op.find_threshold_to_maximize_uncertainty(
        df,
        label_col="col",
        entity_col="id",
        random_state=0,
        max_num_unique_values=2,
    )

    # Per the original author's note: 10 is the lowest number in "col", so
    # using it as the threshold keeps most of the values and maximizes
    # unpredictability.
    assert chosen_threshold == 10
    # The search must leave the previously configured threshold in place.
    assert filter_op.threshold == initial_threshold
The text was updated successfully, but these errors were encountered:
The text was updated successfully, but these errors were encountered: