Scope

- Estimate Accuracy for Binary Classification
- Single Partition - Segments from a single column
- External Visualization (Matplotlib,etc.) 

In [1]:
import pandas as pd

df = pd.read_csv("df_with_predictions.csv", index_col=0)

In [5]:
import numpy as np


def random_subsample_on_column(df, column, lower_pct=0.3 , upper_pct=1, classes = 'all'):
    """Subsample each class in a column to a random percentage of the total.

    The percentage is sampled uniformly between lower_pct and upper_pct.
    If classes is not 'all', then only subsample the classes in classes.

    Args:
        df (pd.DataFrame): The dataframe to subsample.
        column (str): The column to subsample on.
        lower_pct (float): The lower bound of the percentage to subsample.
        upper_pct (float): The upper bound of the percentage to subsample.
        classes (list): The classes to subsample. If 'all', then subsample all classes.

    """
    if classes == 'all':
        class_names = df[column].unique()
    elif isinstance(classes, list):
        assert all([c in df[column].unique() for c in classes]), "Classes must be in the column"
        class_names = classes
    for c in class_names:
        sub_df = df.loc[df[column]==c]
        n = int(len(sub_df) * (lower_pct + (upper_pct - lower_pct) * np.random.random()))
        # remove n rows from the class
        df = df.loc[df[column] != c].append(sub_df.sample(n=n))
    return df

df_2 = random_subsample_on_column(df, 'group', lower_pct=0.3, upper_pct=1, classes = ['black','white'])

df_2.groupby('group').size()
for value,sub_value in zip(df.groupby('group').size(),df_2.groupby('group').size()):
    print(sub_value/value)


1.0
0.7777777777777778
1.0
1.0
1.0
1.0
1.0
1.0
0.4482758620689655


In [6]:
#https://huggingface.co/s-nlp/roberta_toxicity_classifier

import whylogs as why
from whylogs.core.segmentation_partition import segment_on_column
from whylogs.core.schema import DatasetSchema

def log_with_metrics(df):
    segment_column = "group"
    segmented_schema = DatasetSchema(segments=segment_on_column(segment_column))
    results = why.log_classification_metrics(
        df,
        target_column = "output_toxicity",
        prediction_column = "output_prediction",
        score_column="output_score",
        schema=segmented_schema,
        log_full_data=True
    )
    return results
results = log_with_metrics(df)

In [2]:

def calculate_real_accuracy(df):
    metrics_df = df[['output_toxicity','output_prediction']]
    correct = 0
    incorrect = 0
    for index,row in metrics_df.iterrows():
        
        if row['output_toxicity'] == row['output_prediction']:
            correct += 1
        else:
            incorrect += 1
    acc = correct/(correct+incorrect)
    return acc


In [3]:
calculate_real_accuracy(df)

0.946

In [8]:
df_perturbed = random_subsample_on_column(df, 'group', lower_pct=0.3, upper_pct=1)
perturbed_results = log_with_metrics(df_perturbed)

In [58]:
def estimate_accuracy(reference_results, target_results):
    
    if len(reference_results.partitions)> 1 and len(target_results.partitions)> 1:
        warnings.warn("More than one partition found. Only the first partition will be used for the estimation.")
    if len(reference_results.partitions) != len(target_results.partitions):
        raise ValueError("The number of partitions in the reference and target results must be the same.")

    reference_partition = reference_results.partitions[0]
    target_partition = target_results.partitions[0]

    segmented_column = reference_partition.name
    if segmented_column != target_partition.name:
        raise ValueError("The segmented columns in the reference and target results must be the same.")

    reference_segments = reference_results.segments_in_partition(reference_partition)
    target_segments = target_results.segments_in_partition(target_partition)

    if any([len(segment.key)>1 for segment in reference_segments]):
        raise ValueError("Only single key segments are supported.")
    if any([len(segment.key)>1 for segment in target_segments]):
        raise ValueError("Only single key segments are supported.")


    # make a set out of the keys in the reference segments
    reference_keys = set([segment.key[0] for segment in reference_segments])
    target_keys = set([segment.key[0] for segment in target_segments])
    if reference_keys != target_keys:
        raise ValueError("The keys in the reference and target segments must be the same.")
    reference_accuracies = {}
    for reference_segment in reference_segments:
        id = reference_segment.parent_id
        reference_conf = reference_results._segments[id][reference_segment].model_performance_metrics.confusion_matrix.confusion_matrix
        target_segment = next(segment for segment in target_segments if segment.key[0] == reference_segment.key[0])
        tp = get_cell_from_confusion_matrix(reference_conf, (1,1))
        tn = get_cell_from_confusion_matrix(reference_conf, (0,0))
        fp = get_cell_from_confusion_matrix(reference_conf, (0,1))
        fn = get_cell_from_confusion_matrix(reference_conf, (1,0))
        reference_acc = (tp+tn)/(tp+tn+fp+fn)
        reference_accuracies[reference_segment.key[0]] = reference_acc
    

    target_counts = get_segment_counts(target_results)
    reference_counts = get_segment_counts(reference_results)

    target_proportions = get_proportions(target_counts)
    reference_proportions = get_proportions(reference_counts)

    reference_accuracy  = estimate_accuracy_based_on_proportions(reference_accuracies, reference_proportions)
    estimated_accuracy = estimate_accuracy_based_on_proportions(reference_accuracies, target_proportions)

    return reference_accuracy, estimated_accuracy

def get_segment_counts(results):
    partition = results.partitions[0]
    segments = results.segments_in_partition(partition)
    segmented_column = partition.name

    counts = {}
    for segment in segments:
        id = segment.parent_id
        profile = results._segments[id][segment]
        segment_count = profile._columns[segmented_column]._metrics['counts'].n.value
        counts[segment.key[0]] = segment_count
    return counts


def estimate_accuracy_based_on_proportions(reference_accuracies, target_proportions):
    estimated_accuracy = sum([reference_accuracies[k]*target_proportions[k] for k in reference_accuracies.keys()])
    return estimated_accuracy

def get_proportions(counts):
    total = sum(counts.values())
    proportions = {k: v/total for k, v in counts.items()}
    return proportions

def get_cell_from_confusion_matrix(confusion_matrix, key):
    dist_cell = confusion_matrix.get(key,None)
    return dist_cell.n if dist_cell is not None else 0

df_perturbed = random_subsample_on_column(df, 'group', lower_pct=0.3, upper_pct=0.5,classes=['white','black', 'LGBTQ', 'muslim'])
perturbed_results = log_with_metrics(df_perturbed)

perturbed_acc = calculate_real_accuracy(df_perturbed)
ref_acc = calculate_real_accuracy(df)
reference_accuracy, estimated_accuracy = estimate_accuracy(results, perturbed_results)
# print(f"Reference accuracy: {reference_accuracy}")
# print(f"Estimated accuracy: {estimated_accuracy}")
print(f"Reference accuracy: {real_acc}")
print(f"Perturbed accuracy: {perturbed_acc}")

Reference accuracy: 0.9485530546623794
Perturbed accuracy: 0.949671772428884


In [None]:
#ideas

from whylogs.experimental.performance_estimation import AccuracyEstimator

estimator = AccuracyEstimator(reference_result_set = reference_results)

perturbed_result_sets = [results_1, results_2, results_3]
# or perturbed_dfs = [df_1, df_2, df_3]

estimated_accuracies = [estimator.estimate_accuracy(results) for results in perturbed_result_set]
# or estimated_accuracies = [estimator.estimate_accuracy(df) for df in perturbed_dfs]

real_accuracies = [get_real_acc(df) for df in perturbed_dfs]

plot_estimated_accuracies(estimated_accuracies, real_accuracies)

# imagine beautiful plot here

In [None]:
# another idea

# schema.estimator = estimator
# results = why.log(df, schema=schema)