In [1]:
import numpy as np

def acc(y_true, y_pred):
    """
    Calculate clustering accuracy. Requires SciPy installed.

    Cluster ids are arbitrary, so predicted clusters are matched one-to-one
    to true classes with `scipy.optimize.linear_sum_assignment`, choosing the
    matching that maximises the number of agreeing samples. The accuracy is
    the fraction of samples covered by that matching.

    Side effect: prints the count table (rows = predicted label,
    columns = true label).

    # Arguments
        y_true: true labels, array-like with shape `(n_samples,)`
        y_pred: predicted labels, array-like with shape `(n_samples,)`
    # Return
        accuracy, in [0,1]
    # Raises
        AssertionError: if `y_true` and `y_pred` differ in size
        ValueError: if the inputs are empty or contain negative labels
    """
    from scipy.optimize import linear_sum_assignment

    # np.asarray discards any pandas index, so samples are paired by position
    # instead of by index label; both inputs get the same integer dtype so
    # they can be used directly as array indices.
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    assert y_pred.size == y_true.size
    if y_pred.size == 0:
        raise ValueError("acc() requires at least one sample")
    if y_pred.min() < 0 or y_true.min() < 0:
        # Negative values would wrap around as indices and corrupt the table.
        raise ValueError("acc() requires non-negative integer labels")

    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    # Unbuffered add: repeated (pred, true) pairs are each counted.
    np.add.at(w, (y_pred, y_true), 1)

    # linear_sum_assignment minimises cost, so flip counts into costs.
    row_inds, col_inds = linear_sum_assignment(w.max() - w)
    print("Table of predicted values:\n", w)
    return w[row_inds, col_inds].sum() * 1.0 / y_pred.size

In [2]:
# # Preprocessing data
# # NOTE: this whole cell is disabled and kept for reference only. When enabled
# # it reads "iris.csv", reports per-column null counts and writes the frame to
# # "preprocessed_iris.csv"; the dropna step is additionally commented out.
# import pandas as pd
# actual_data = pd.read_csv("iris.csv")
# actual_data.isnull().sum()
# # # #actual_data.dropna(inplace=True)
# actual_data.to_csv("preprocessed_iris.csv")

In [3]:
### Config parameters

# URL whose JSON response carries the ActableAI clustering result.
ACTABLEAI_RESULT_LINK = "https://app.actable.ai/tsne/api/task/aab56e30-9075-40e9-9dd3-1b6f5acab130"
# CSV files holding the Tableau clustering export and the labelled data.
TABLEAU_RESULT_FILE_NAME = "iris_tableau.csv"
ACTUAL_FILE_NAME = "iris.csv"

# Column of the labelled data holding the class name, and the numeric id
# assigned to each class name.
ACTUAL_COLUMN_NAME = "variety"
ACTUAL_LABEL_MAP = {
    "Setosa": 0,
    "Versicolor": 1,
    "Virginica": 2,
}

# Feature columns, in the order they are concatenated into the join key.
ORDERED_COLUMN = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]
# Tableau cluster id -> numeric label used for comparison.
CONVERTED_TABLEAU_CLUSTER_LABEL = {
    1: 0,
    2: 1,
    3: 2,
}

## ActableAI

In [4]:
# Transform the result from ActableAI to a dataframe.
# Local alias for the configured result URL used by the following cells.
actableai_result_link = ACTABLEAI_RESULT_LINK

In [5]:
import requests
import pandas as pd

In [6]:
# Download the ActableAI task result and keep its 'data' payload.
# The timeout stops the notebook from hanging forever on an unresponsive
# server, and raise_for_status() turns an HTTP error response into an explicit
# exception instead of a confusing JSON/KeyError failure further down.
actableai_response = requests.get(actableai_result_link, timeout=30)
actableai_response.raise_for_status()
actableai_result = actableai_response.json()['data']

In [7]:
# Flatten the per-cluster payload into one record per row, tagging every
# record with the id of the cluster it came from.
refined_actableai_result = []

for cluster in actableai_result:
    label = cluster['cluster_id']
    members = [entry['column'] for entry in cluster['value']]
    for record in members:
        # The row dict itself is tagged (not a copy) before being collected.
        record['label'] = label
    refined_actableai_result.extend(members)

actableai_df = pd.DataFrame(refined_actableai_result)

## Tableau

In [8]:
# Load clustering results from Tableau and normalise the header to
# lower-case names with underscores instead of spaces.
tableau_df = pd.read_csv(TABLEAU_RESULT_FILE_NAME)
tableau_df.columns = [
    name.lower().replace(' ', '_')
    for name in tableau_df.columns
]

## Actual data

In [9]:
# Column holding the category label, and the category -> numeric id mapping
convert_column = ACTUAL_COLUMN_NAME
convert_dict = ACTUAL_LABEL_MAP

# Load actual_data and convert its category label to a numeric label;
# values missing from the mapping become NaN.
actual_df = pd.read_csv(ACTUAL_FILE_NAME).assign(
    **{convert_column: lambda frame: frame[convert_column].map(convert_dict)}
)

## Merge all dataframes together

In [10]:
# Set the columns (and their order) that are concatenated into the join key
# by the key-transform cells.
ordered_columns = ORDERED_COLUMN

## ActableAI key transform

In [11]:
# Leave plain str/int cells untouched and coerce everything else to float so
# that the string form of a value is comparable across data sources.
actableai_int_df = actableai_df.applymap(
    lambda value: value if type(value) in (str, int) else float(value)
)
# Row fingerprint: string forms of the ordered feature columns, concatenated.
actableai_int_df['key'] = (
    actableai_int_df[ordered_columns]
    .astype(str)
    .apply(''.join, axis=1)
)
# keep=False discards every row that has an exact (key, label) duplicate.
actableai_transform_df = actableai_int_df[['key', 'label']].drop_duplicates(keep=False)
print(actableai_transform_df.shape)

(148, 2)


## Tableau key transform

In [12]:
# Leave plain str/int cells untouched and coerce everything else to float so
# that the string form of a value is comparable across data sources.
tableau_int_df = tableau_df.applymap(
    lambda value: value if type(value) in (str, int) else float(value)
)
# Row fingerprint: string forms of the ordered feature columns, concatenated.
tableau_int_df['key'] = (
    tableau_int_df[ordered_columns]
    .astype(str)
    .apply(''.join, axis=1)
)
# keep=False discards every row that has an exact (key, clusters) duplicate.
tableau_transform_df = tableau_int_df[['key', 'clusters']].drop_duplicates(keep=False)

## Actual key transform

In [13]:
# Leave plain str/int cells untouched and coerce everything else to float so
# that the string form of a value is comparable across data sources.
actual_int_df = actual_df.applymap(
    lambda value: value if type(value) in (str, int) else float(value)
)
# Row fingerprint: string forms of the ordered feature columns, concatenated.
actual_int_df['key'] = (
    actual_int_df[ordered_columns]
    .astype(str)
    .apply(''.join, axis=1)
)
# keep=False discards every row that has an exact (key, label) duplicate.
actual_transform_df = actual_int_df[['key', ACTUAL_COLUMN_NAME]].drop_duplicates(keep=False)

In [14]:
# Inner-join the three label tables on the row fingerprint `key`, so only
# rows whose key appears in the actual, ActableAI and Tableau tables remain.
combined_df = (
    actual_transform_df
    .merge(actableai_transform_df, on='key', how='inner')
    .merge(tableau_transform_df, on='key', how='inner')
)

In [15]:
convert_tableau_label = CONVERTED_TABLEAU_CLUSTER_LABEL
# One column per label source: ground truth, ActableAI and Tableau.
rename_column = {
    ACTUAL_COLUMN_NAME: 'actual',
    'label': 'actableai',
    'clusters': 'tableau',
}
# Rename first, then translate the Tableau cluster ids through the mapping.
combined_df = (
    combined_df
    .rename(columns=rename_column)
    .assign(tableau=lambda frame: frame['tableau'].map(convert_tableau_label))
)
combined_df

Unnamed: 0,key,actual,actableai,tableau
0,5.13.51.40.2,0,1,0
1,4.93.01.40.2,0,1,0
2,4.73.21.30.2,0,1,0
3,4.63.11.50.2,0,1,0
4,5.03.61.40.2,0,1,0
...,...,...,...,...
143,6.73.05.22.3,2,2,1
144,6.32.55.01.9,2,2,2
145,6.53.05.22.0,2,2,1
146,6.23.45.42.3,2,2,1


In [16]:
# Clustering accuracy of the ActableAI labels against the actual labels.
# acc() also prints its count table as a side effect.
actable_ai_acc = acc(combined_df['actual'], combined_df['actableai'])
print(actable_ai_acc)

Table of predicted values:
 [[ 0 49 11]
 [50  0  0]
 [ 0  1 37]]
0.918918918918919


In [17]:
# Clustering accuracy of the Tableau labels against the actual labels.
# acc() also prints its count table as a side effect.
tableau_acc = acc(combined_df['actual'], combined_df['tableau'])
print(tableau_acc)

Table of predicted values:
 [[50  0  0]
 [ 0  3 36]
 [ 0 47 12]]
0.8986486486486487
