In [1]:
import os
import sys
import numpy as np
from numpy.lib.arraysetops import unique
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from copy import deepcopy

from datetime import datetime
from pprint import pprint
from tqdm import tqdm
import ipdb
import pickle

plt.style.use("seaborn")
np.random.seed(1)

from training.utils import load_obj, save_obj
from training.data import load_data
from training.dataset import _preprocess_call_data, preprocess_and_make_dataset

from sklearn.cluster import KMeans, OPTICS, SpectralClustering
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from training.modelling.metrics import F1, Precision, Recall, BinaryAccuracy
from tensorflow.keras.models import load_model
from training.modelling.dataloader import get_train_val_test


2021-10-12 19:08:16,122 - INFO - _init_num_threads - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-10-12 19:08:16,123 - INFO - _init_num_threads - NumExpr defaulting to 8 threads.


In [4]:

stats = pd.read_csv("may_data/beneficiary_stats_v5.csv")
beneficiary_data = pd.read_csv("may_data/beneficiary/AIRegistration-20200501-20200731.csv")
b_data, call_data = load_data("may_data")
call_data = _preprocess_call_data(call_data)
all_beneficiaries = stats[stats['Group'].isin(["Google-AI-Control", "Google-AI-Calls"])]
transitions = pd.read_csv("may_data/RMAB_one_month/weekly_transitions_SI_single_group.csv")

2021-10-12 19:10:08,023 - INFO - load_data - Loading data from folder 'may_data'
2021-10-12 19:10:08,419 - INFO - load_data - Successfully loaded and cleaned beneficiary and call data.
2021-10-12 19:10:08,420 - INFO - load_data - Beneficiary data contains data for 26548 beneficiaries
2021-10-12 19:10:08,433 - INFO - load_data - Call data contains 1224245 call records for 26548 beneficiaries


In [3]:
with open('may_data/features_dataset.pkl', 'rb') as fw:
    features_dataset = pickle.load(fw)
fw.close()

In [22]:
def get_individual_transition_clusters(train_beneficiaries, train_transitions, features_dataset, n_clusters):
    cols = [
        "P(L, I, L)", "P(L, I, H)", "P(H, I, L)", "P(H, I, H)", "P(L, N, L)", "P(L, N, H)", "P(H, N, L)", "P(H, N, H)", 
    ]

    user_ids, dynamic_xs, gest_ages, static_xs, ngo_hosp_ids, labels = features_dataset
    
    train_ids = train_beneficiaries['user_id']
    idxes = [np.where(user_ids == x)[0][0] for x in train_ids]
    train_static_features = static_xs[idxes]
    train_static_features = train_static_features[:, : -8]

    # test_ids = test_beneficiaries['user_id']
    # idxes = [np.where(user_ids == x)[0][0] for x in test_ids]
    # test_static_features = static_xs[idxes]
    # test_static_features = test_static_features[:, : -8]
    all_transition_probabilities = get_all_transition_probabilities(train_beneficiaries, train_transitions)
    pass_to_kmeans_cols = ['P(L, N, L)', 'P(H, N, L)']

    train_labels, centroids, _, cls, num_clusters, max_iters = kmeans_missing(all_transition_probabilities[pass_to_kmeans_cols], n_clusters, max_iter=100)
    
    # ipdb.set_trace()
    train_beneficiaries['cluster'] = train_labels
    # test_beneficiaries['cluster'] = cls.predict(test_static_features)

    dt_clf = RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=30, n_jobs=-1, random_state=124)
    dt_clf.fit(train_static_features, train_labels)

    cluster_transition_probabilities = pd.DataFrame(columns=['cluster', 'count'] + cols)

    for i in range(n_clusters):
        cluster_beneficiaries = train_beneficiaries[train_beneficiaries['cluster'] == i]
        cluster_b_user_ids = cluster_beneficiaries['user_id']
        probs, _ = get_transition_probabilities(cluster_b_user_ids, train_transitions, min_support=3)
        probs['cluster'] = i
        probs['count'] = len(cluster_b_user_ids)
        cluster_transition_probabilities = cluster_transition_probabilities.append(probs, ignore_index=True)

    # ipdb.set_trace()

    return cluster_transition_probabilities, dt_clf

def get_transition_probabilities(beneficiaries, transitions, min_support=3):
    transitions = transitions[transitions['user_id'].isin(beneficiaries)]

    i_transitions = transitions[transitions['action']=='Intervention']
    n_i_transitions = transitions[transitions['action']=='No Intervention']

    i_L = i_transitions[i_transitions['pre-action state']=="L"]
    i_H = i_transitions[i_transitions['pre-action state']=="H"]

    i_L_L = i_L[i_L['post-action state']=="L"]
    i_L_H = i_L[i_L['post-action state']=="H"]

    i_H_L = i_H[i_H['post-action state']=="L"]
    i_H_H = i_H[i_H['post-action state']=="H"]

    n_i_L = n_i_transitions[n_i_transitions['pre-action state']=="L"]
    n_i_H = n_i_transitions[n_i_transitions['pre-action state']=="H"]

    n_i_L_L = n_i_L[n_i_L['post-action state']=="L"]
    n_i_L_H = n_i_L[n_i_L['post-action state']=="H"]

    n_i_H_L = n_i_H[n_i_H['post-action state']=="L"]
    n_i_H_H = n_i_H[n_i_H['post-action state']=="H"]

    transition_probabilities = dict()
    if i_L.shape[0] >= min_support:
        transition_probabilities['P(L, I, L)'] = i_L_L.shape[0] / i_L.shape[0]
        transition_probabilities['P(L, I, H)'] = i_L_H.shape[0] / i_L.shape[0]
    else:
        transition_probabilities['P(L, I, L)'] = np.nan
        transition_probabilities['P(L, I, H)'] = np.nan

    if i_H.shape[0] >= min_support:
        transition_probabilities['P(H, I, L)'] = i_H_L.shape[0] / i_H.shape[0]
        transition_probabilities['P(H, I, H)'] = i_H_H.shape[0] / i_H.shape[0]
    else:
        transition_probabilities['P(H, I, L)'] = np.nan
        transition_probabilities['P(H, I, H)'] = np.nan
    
    if n_i_L.shape[0] >= min_support:
        transition_probabilities['P(L, N, L)'] = n_i_L_L.shape[0] / n_i_L.shape[0]
        transition_probabilities['P(L, N, H)'] = n_i_L_H.shape[0] / n_i_L.shape[0]
    else:
        transition_probabilities['P(L, N, L)'] = np.nan
        transition_probabilities['P(L, N, H)'] = np.nan

    if n_i_H.shape[0] >= min_support:
        transition_probabilities['P(H, N, L)'] = n_i_H_L.shape[0] / n_i_H.shape[0]
        transition_probabilities['P(H, N, H)'] = n_i_H_H.shape[0] / n_i_H.shape[0]
    else:
        transition_probabilities['P(H, N, L)'] = np.nan
        transition_probabilities['P(H, N, H)'] = np.nan

    return transition_probabilities, {'P(L, I, L)': i_L_L.shape[0], 'P(L, I, H)': i_L_H.shape[0], 'P(H, I, L)': i_H_L.shape[0], 'P(H, I, H)': i_H_H.shape[0], 'P(L, N, L)': n_i_L_L.shape[0], 'P(L, N, H)': n_i_L_H.shape[0], 'P(H, N, L)': n_i_H_L.shape[0], 'P(H, N, H)': n_i_H_H.shape[0]}

def get_all_transition_probabilities(train_beneficiaries, transitions):
    cols = [
        "P(L, I, L)", "P(L, I, H)", "P(H, I, L)", "P(H, I, H)", "P(L, N, L)", "P(L, N, H)", "P(H, N, L)", "P(H, N, H)", 
    ]
    transition_probabilities = pd.DataFrame(columns = ['user_id'] + cols)
    user_ids = train_beneficiaries['user_id']

    for user_id in user_ids:
        probs, _ = get_transition_probabilities([user_id], transitions, min_support=1)
        probs['user_id'] = user_id

        transition_probabilities = transition_probabilities.append(probs, ignore_index=True)

    return transition_probabilities

def get_individual_transition_clusters(train_beneficiaries, train_transitions, features_dataset, n_clusters):
    cols = [
        "P(L, I, L)", "P(L, I, H)", "P(H, I, L)", "P(H, I, H)", "P(L, N, L)", "P(L, N, H)", "P(H, N, L)", "P(H, N, H)", 
    ]

    user_ids, dynamic_xs, gest_ages, static_xs, ngo_hosp_ids, labels = features_dataset
    
    train_ids = train_beneficiaries['user_id']
    idxes = [np.where(user_ids == x)[0][0] for x in train_ids]
    train_static_features = static_xs[idxes]
    train_static_features = train_static_features[:, : -8]

    # test_ids = test_beneficiaries['user_id']
    # idxes = [np.where(user_ids == x)[0][0] for x in test_ids]
    # test_static_features = static_xs[idxes]
    # test_static_features = test_static_features[:, : -8]
    all_transition_probabilities = get_all_transition_probabilities(train_beneficiaries, train_transitions)
    pass_to_kmeans_cols = ['P(L, N, L)', 'P(H, N, L)']
    train_labels, centroids, _, cls, num_clusters, max_iters = kmeans_missing(all_transition_probabilities[pass_to_kmeans_cols], n_clusters, max_iter=100)

    # ipdb.set_trace()
    train_beneficiaries['cluster'] = train_labels
    # test_beneficiaries['cluster'] = cls.predict(test_static_features)

    dt_clf = RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=30, n_jobs=-1, random_state=124)
    dt_clf.fit(train_static_features, train_labels)

    cluster_transition_probabilities = pd.DataFrame(columns=['cluster', 'count'] + cols)

    for i in range(n_clusters):
        cluster_beneficiaries = train_beneficiaries[train_beneficiaries['cluster'] == i]
        cluster_b_user_ids = cluster_beneficiaries['user_id']
        probs, _ = get_transition_probabilities(cluster_b_user_ids, train_transitions, min_support=3)
        print(i, probs, len(cluster_b_user_ids))
        probs['cluster'] = i
        probs['count'] = len(cluster_b_user_ids)
        cluster_transition_probabilities = cluster_transition_probabilities.append(probs, ignore_index=True)

    # ipdb.set_trace()

    return cluster_transition_probabilities, dt_clf, all_transition_probabilities, train_labels

def kmeans_missing(X, n_clusters, max_iter=10):
    n_clusters = CONFIG['clusters']
    missing = ~np.isfinite(X)
    mu = np.nanmean(X, 0, keepdims=1)
    X_hat = np.where(missing, mu, X)

    prev_labels = None
    for i in range(max_iter):
        if CONFIG['clustering'] == 'optics':
            cls = OPTICS(min_samples=4, n_jobs=-1)
        elif CONFIG['clustering'] == 'kmeans':
            cls = KMeans(n_clusters, n_jobs=-1, random_state=0)
        elif CONFIG['clustering'] == 'spectral':
            cls = SpectralClustering(n_clusters, n_jobs=-1, random_state=0)

        labels = cls.fit_predict(X_hat)

        if CONFIG['clustering'] == 'kmeans':
            centroids = cls.cluster_centers_
        else:
            if CONFIG['clustering'] == 'optics':
                labels = labels + 1
            unique_labels = len(set(labels))
            centroids = []
            for i in range(unique_labels):
                idxes = np.where(labels == i)[0]
                centroids.append(np.mean(X_hat[idxes], axis=0))
            centroids = np.array(centroids)

        X_hat[missing] = centroids[labels][missing]

        if i > 0 and np.all(labels == prev_labels):
            break

        prev_labels = labels

    return labels, centroids, X_hat, cls, len(set(labels)), i

In [23]:
aug_states = []
for i in range(6):
    if i % 2 == 0:
        aug_states.append('L{}'.format(i // 2))
    else:
        aug_states.append('H{}'.format(i // 2))
CONFIG = {
    "problem": {
        "orig_states": ['L', 'H'],
        "states": aug_states + ['L', 'H'],
        "actions": ["N", "I"],
    },
    "time_step": 7,
    "gamma": 0.99,
    "clusters": 40,
    "transitions": "weekly",
    "clustering": "kmeans",
}

cluster_transition_probabilities, cls, all_transition_probabilities, train_labels = get_individual_transition_clusters(all_beneficiaries, transitions, features_dataset, CONFIG['clusters'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_beneficiaries['cluster'] = train_labels


0 {'P(L, I, L)': 1.0, 'P(L, I, H)': 0.0, 'P(H, I, L)': nan, 'P(H, I, H)': nan, 'P(L, N, L)': 0.791015625, 'P(L, N, H)': 0.208984375, 'P(H, N, L)': 0.7925925925925926, 'P(H, N, H)': 0.2074074074074074} 46
1 {'P(L, I, L)': 0.14285714285714285, 'P(L, I, H)': 0.8571428571428571, 'P(H, I, L)': 0.2727272727272727, 'P(H, I, H)': 0.7272727272727273, 'P(L, N, L)': 0.3333333333333333, 'P(L, N, H)': 0.6666666666666666, 'P(H, N, L)': 0.08076358296622614, 'P(H, N, H)': 0.9192364170337739} 135
2 {'P(L, I, L)': 0.25, 'P(L, I, H)': 0.75, 'P(H, I, L)': 0.47058823529411764, 'P(H, I, H)': 0.5294117647058824, 'P(L, N, L)': 0.5791984732824428, 'P(L, N, H)': 0.4208015267175573, 'P(H, N, L)': 0.2384887839433294, 'P(H, N, H)': 0.7615112160566706} 96
3 {'P(L, I, L)': 0.4, 'P(L, I, H)': 0.6, 'P(H, I, L)': 0.08333333333333333, 'P(H, I, H)': 0.9166666666666666, 'P(L, N, L)': 0.0, 'P(L, N, H)': 1.0, 'P(H, N, L)': 0.0755982319768328, 'P(H, N, H)': 0.9244017680231672} 248
4 {'P(L, I, L)': 0.75, 'P(L, I, H)': 0.25, '

In [24]:
all_transition_probabilities

Unnamed: 0,user_id,"P(L, I, L)","P(L, I, H)","P(H, I, L)","P(H, I, H)","P(L, N, L)","P(L, N, H)","P(H, N, L)","P(H, N, H)"
0,2276450.0,,,,,0.444444,0.555556,0.200000,0.800000
1,2276467.0,,,,,0.826087,0.173913,0.666667,0.333333
2,2276476.0,,,,,0.000000,1.000000,0.037037,0.962963
3,2276487.0,,,,,0.000000,1.000000,0.074074,0.925926
4,2276490.0,,,,,0.333333,0.666667,0.038462,0.961538
...,...,...,...,...,...,...,...,...,...
4233,2302701.0,,,,,0.000000,1.000000,0.115385,0.884615
4234,2302848.0,1.0,0.0,,,0.333333,0.666667,0.142857,0.857143
4235,2302916.0,,,,,0.687500,0.312500,0.307692,0.692308
4236,2302927.0,,,,,0.750000,0.250000,0.444444,0.555556


In [28]:
all_transition_probabilities['cluster'] = train_labels
all_transition_probabilities.to_csv('outputs/may2020-data-transitions.csv')

In [51]:
all_transition_probabilities[all_transition_probabilities.cluster==7]

Unnamed: 0,user_id,"P(L, I, L)","P(L, I, H)","P(H, I, L)","P(H, I, H)","P(L, N, L)","P(L, N, H)","P(H, N, L)","P(H, N, H)",cluster
136,2277362.0,,,0.0,1.0,0.750000,0.250000,0.086957,0.913043,7
191,2277726.0,,,,,0.666667,0.333333,0.043478,0.956522,7
263,2278109.0,,,,,0.666667,0.333333,0.086957,0.913043,7
268,2278143.0,,,0.0,1.0,0.750000,0.250000,0.105263,0.894737,7
329,2278526.0,,,,,0.666667,0.333333,0.043478,0.956522,7
...,...,...,...,...,...,...,...,...,...,...
3915,2300201.0,,,,,0.777778,0.222222,0.100000,0.900000,7
3970,2300561.0,,,,,0.714286,0.285714,0.090909,0.909091,7
3979,2300627.0,,,,,0.666667,0.333333,0.086957,0.913043,7
4167,2301850.0,,,,,0.769231,0.230769,0.125000,0.875000,7


In [43]:
all_transition_probabilities.groupby('cluster').std().drop(columns=['user_id']).\
                                                           to_csv('outputs/may_2020_clustering_stddev.csv')  
                        

In [46]:
all_transition_probabilities.drop('cluster', 1).isna().\
                    groupby(all_transition_probabilities.cluster).sum().reset_index().\
                    to_csv('outputs/may_2020_missing_counts.csv')


In [31]:
cluster_transition_probabilities

Unnamed: 0,cluster,count,"P(L, I, L)","P(L, I, H)","P(H, I, L)","P(H, I, H)","P(L, N, L)","P(L, N, H)","P(H, N, L)","P(H, N, H)"
0,0.0,46.0,1.0,0.0,,,0.791016,0.208984,0.792593,0.207407
1,1.0,135.0,0.142857,0.857143,0.272727,0.727273,0.333333,0.666667,0.080764,0.919236
2,2.0,96.0,0.25,0.75,0.470588,0.529412,0.579198,0.420802,0.238489,0.761511
3,3.0,248.0,0.4,0.6,0.083333,0.916667,0.0,1.0,0.075598,0.924402
4,4.0,60.0,0.75,0.25,0.727273,0.272727,0.396471,0.603529,0.57093,0.42907
5,5.0,122.0,0.625,0.375,0.454545,0.545455,0.262931,0.737069,0.253499,0.746501
6,6.0,47.0,0.941176,0.058824,,,0.914259,0.085741,0.329365,0.670635
7,7.0,65.0,,,0.470588,0.529412,0.72479,0.27521,0.071273,0.928727
8,8.0,118.0,0.6,0.4,0.826087,0.173913,0.606061,0.393939,0.447263,0.552737
9,9.0,26.0,0.333333,0.666667,0.75,0.25,0.072072,0.927928,0.389961,0.610039


In [50]:
stddev_df = pd.read_csv('outputs/may_2020_clustering_stddev.csv')
p_cols = [col for col in stddev_df.columns if col.startswith('P')]
stddev_df = stddev_df.rename(columns={i: f'std-{i}' for i in p_cols})
stddev_df

Unnamed: 0,cluster,"std-P(L, I, L)","std-P(L, I, H)","std-P(H, I, L)","std-P(H, I, H)","std-P(L, N, L)","std-P(L, N, H)","std-P(H, N, L)","std-P(H, N, H)"
0,0,0.0,0.0,0.0,0.0,0.040746,0.040746,0.045287,0.045287
1,1,0.377964,0.377964,0.467099,0.467099,0.0,0.0,0.028978,0.028978
2,2,0.5,0.5,0.514496,0.514496,0.030307,0.030307,0.029482,0.029482
3,3,0.547723,0.547723,0.280306,0.280306,0.0,0.0,0.002689,0.002689
4,4,0.5,0.5,0.467099,0.467099,0.052432,0.052432,0.047541,0.047541
5,5,0.517549,0.517549,0.522233,0.522233,0.022073,0.022073,0.029371,0.029371
6,6,0.242536,0.242536,0.0,0.0,0.049492,0.049492,0.043659,0.043659
7,7,,,0.514496,0.514496,0.044619,0.044619,0.042394,0.042394
8,8,0.507093,0.507093,0.387553,0.387553,0.047703,0.047703,0.037724,0.037724
9,9,0.57735,0.57735,0.5,0.5,0.055725,0.055725,0.034113,0.034113


In [52]:
missing_df = pd.read_csv('outputs/may_2020_missing_counts.csv')
p_cols = [col for col in missing_df.columns if col.startswith('P')]
missing_df = missing_df.rename(columns={i: f'missing-{i}' for i in p_cols})
missing_df

Unnamed: 0.1,Unnamed: 0,cluster,user_id,"missing-P(L, I, L)","missing-P(L, I, H)","missing-P(H, I, L)","missing-P(H, I, H)","missing-P(L, N, L)","missing-P(L, N, H)","missing-P(H, N, L)","missing-P(H, N, H)"
0,0,0,0.0,28.0,28.0,44.0,44.0,0.0,0.0,0.0,0.0
1,1,1,0.0,128.0,128.0,124.0,124.0,0.0,0.0,0.0,0.0
2,2,2,0.0,92.0,92.0,79.0,79.0,0.0,0.0,0.0,0.0
3,3,3,0.0,243.0,243.0,212.0,212.0,0.0,0.0,0.0,0.0
4,4,4,0.0,56.0,56.0,49.0,49.0,0.0,0.0,0.0,0.0
5,5,5,0.0,114.0,114.0,111.0,111.0,0.0,0.0,0.0,0.0
6,6,6,0.0,30.0,30.0,45.0,45.0,0.0,0.0,6.0,6.0
7,7,7,0.0,65.0,65.0,48.0,48.0,0.0,0.0,0.0,0.0
8,8,8,0.0,103.0,103.0,95.0,95.0,0.0,0.0,0.0,0.0
9,9,9,0.0,23.0,23.0,22.0,22.0,0.0,0.0,0.0,0.0


In [55]:
big_summary_df = pd.merge(pd.merge(cluster_transition_probabilities, stddev_df ),
         missing_df.drop(columns=['Unnamed: 0', 'user_id']))

In [56]:
big_summary_df.to_csv('outputs/may_2020_clustering_summary.csv', index=False)

In [57]:
big_summary_df

Unnamed: 0,cluster,count,"P(L, I, L)","P(L, I, H)","P(H, I, L)","P(H, I, H)","P(L, N, L)","P(L, N, H)","P(H, N, L)","P(H, N, H)",...,"std-P(H, N, L)","std-P(H, N, H)","missing-P(L, I, L)","missing-P(L, I, H)","missing-P(H, I, L)","missing-P(H, I, H)","missing-P(L, N, L)","missing-P(L, N, H)","missing-P(H, N, L)","missing-P(H, N, H)"
0,0.0,46.0,1.0,0.0,,,0.791016,0.208984,0.792593,0.207407,...,0.045287,0.045287,28.0,28.0,44.0,44.0,0.0,0.0,0.0,0.0
1,1.0,135.0,0.142857,0.857143,0.272727,0.727273,0.333333,0.666667,0.080764,0.919236,...,0.028978,0.028978,128.0,128.0,124.0,124.0,0.0,0.0,0.0,0.0
2,2.0,96.0,0.25,0.75,0.470588,0.529412,0.579198,0.420802,0.238489,0.761511,...,0.029482,0.029482,92.0,92.0,79.0,79.0,0.0,0.0,0.0,0.0
3,3.0,248.0,0.4,0.6,0.083333,0.916667,0.0,1.0,0.075598,0.924402,...,0.002689,0.002689,243.0,243.0,212.0,212.0,0.0,0.0,0.0,0.0
4,4.0,60.0,0.75,0.25,0.727273,0.272727,0.396471,0.603529,0.57093,0.42907,...,0.047541,0.047541,56.0,56.0,49.0,49.0,0.0,0.0,0.0,0.0
5,5.0,122.0,0.625,0.375,0.454545,0.545455,0.262931,0.737069,0.253499,0.746501,...,0.029371,0.029371,114.0,114.0,111.0,111.0,0.0,0.0,0.0,0.0
6,6.0,47.0,0.941176,0.058824,,,0.914259,0.085741,0.329365,0.670635,...,0.043659,0.043659,30.0,30.0,45.0,45.0,0.0,0.0,6.0,6.0
7,7.0,65.0,,,0.470588,0.529412,0.72479,0.27521,0.071273,0.928727,...,0.042394,0.042394,65.0,65.0,48.0,48.0,0.0,0.0,0.0,0.0
8,8.0,118.0,0.6,0.4,0.826087,0.173913,0.606061,0.393939,0.447263,0.552737,...,0.037724,0.037724,103.0,103.0,95.0,95.0,0.0,0.0,0.0,0.0
9,9.0,26.0,0.333333,0.666667,0.75,0.25,0.072072,0.927928,0.389961,0.610039,...,0.034113,0.034113,23.0,23.0,22.0,22.0,0.0,0.0,0.0,0.0


In [1]:
import pandas as pd
df = pd.read_csv('outputs/may_2020_clustering_summary.csv')
df

Unnamed: 0,cluster,count,"P(L, I, L)","P(L, I, H)","P(H, I, L)","P(H, I, H)","P(L, N, L)","P(L, N, H)","P(H, N, L)","P(H, N, H)",...,"std-P(H, N, L)","std-P(H, N, H)","missing-P(L, I, L)","missing-P(L, I, H)","missing-P(H, I, L)","missing-P(H, I, H)","missing-P(L, N, L)","missing-P(L, N, H)","missing-P(H, N, L)","missing-P(H, N, H)"
0,0.0,46.0,1.0,0.0,,,0.791016,0.208984,0.792593,0.207407,...,0.045287,0.045287,28.0,28.0,44.0,44.0,0.0,0.0,0.0,0.0
1,1.0,135.0,0.142857,0.857143,0.272727,0.727273,0.333333,0.666667,0.080764,0.919236,...,0.028978,0.028978,128.0,128.0,124.0,124.0,0.0,0.0,0.0,0.0
2,2.0,96.0,0.25,0.75,0.470588,0.529412,0.579198,0.420802,0.238489,0.761511,...,0.029482,0.029482,92.0,92.0,79.0,79.0,0.0,0.0,0.0,0.0
3,3.0,248.0,0.4,0.6,0.083333,0.916667,0.0,1.0,0.075598,0.924402,...,0.002689,0.002689,243.0,243.0,212.0,212.0,0.0,0.0,0.0,0.0
4,4.0,60.0,0.75,0.25,0.727273,0.272727,0.396471,0.603529,0.57093,0.42907,...,0.047541,0.047541,56.0,56.0,49.0,49.0,0.0,0.0,0.0,0.0
5,5.0,122.0,0.625,0.375,0.454545,0.545455,0.262931,0.737069,0.253499,0.746501,...,0.029371,0.029371,114.0,114.0,111.0,111.0,0.0,0.0,0.0,0.0
6,6.0,47.0,0.941176,0.058824,,,0.914259,0.085741,0.329365,0.670635,...,0.043659,0.043659,30.0,30.0,45.0,45.0,0.0,0.0,6.0,6.0
7,7.0,65.0,,,0.470588,0.529412,0.72479,0.27521,0.071273,0.928727,...,0.042394,0.042394,65.0,65.0,48.0,48.0,0.0,0.0,0.0,0.0
8,8.0,118.0,0.6,0.4,0.826087,0.173913,0.606061,0.393939,0.447263,0.552737,...,0.037724,0.037724,103.0,103.0,95.0,95.0,0.0,0.0,0.0,0.0
9,9.0,26.0,0.333333,0.666667,0.75,0.25,0.072072,0.927928,0.389961,0.610039,...,0.034113,0.034113,23.0,23.0,22.0,22.0,0.0,0.0,0.0,0.0


In [2]:
df['count'].sum()

4238.0