In [None]:
import os
import io
import sys
import json
import boto3
import pymongo 
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from PIL import Image
from collections import Counter

sys.path.append("../")
import cabutils

In [None]:
from matplotlib import rcParams

# Apply the seaborn theme first, then the explicit matplotlib overrides, so the
# overrides are the last writers of these rcParams entries.
sns.set_style("darkgrid")
sns.set_context("talk")

rcParams.update({
    "font.size": 18,
    "figure.titlesize": 26,
})

In [None]:
# Project / experiment identifiers. `projName` selects the database opened
# below (projName + "_outputs"); `experimentName` is derived from `experiment`.
projName = "mlve"
experiment = "nsd"
experimentName = f"{experiment}-surface-normals"
S3_BUCKET_NAME = "mlve-v1"

# NOTE(review): presumably the trial-record field names for the attention-check
# flag and the batch index; neither constant is referenced in the visible cells.
attention_check = "attentionCheck"
batch_idx = "batchIdx"

In [None]:
# Open a connection through the project helper and select the
# "<projName>_outputs" database.
# NOTE(review): presumably a pymongo client (list_collection_names() and
# find() are called on its objects later) - confirm in cabutils.
conn = cabutils.get_db_connection()
db = conn[projName + "_outputs"]

In [None]:
# Sorted list of every collection name in the selected database.
names = sorted(db.list_collection_names())

In [None]:
# Per-collection summary statistics, keyed by collection name.
summaries = {}

In [None]:
def results_to_df(iterName=None, collection=None):
    """Load the documents of a database collection into a DataFrame.

    Parameters
    ----------
    iterName : str, optional
        When given, only documents whose ``iterationName`` field equals this
        value are fetched; ``None`` (the default) fetches every document.
        NOTE(review): the field name is taken from the ``iterationName``
        column used when summarising the exported data - confirm it matches
        the stored documents.
    collection : optional
        Collection object exposing ``find(query)``. Defaults to the
        notebook-level ``col`` variable, which keeps existing zero-argument
        calls working; pass it explicitly to avoid relying on hidden state.

    Returns
    -------
    pandas.DataFrame
        One row per fetched document (empty if nothing matched).
    """
    if collection is None:
        # Backward-compatible fallback: historical callers rebind the global
        # `col` right before calling this function.
        collection = col

    query = {} if iterName is None else {"iterationName": iterName}
    cursor = collection.find(query)
    # tqdm only wraps the cursor to show progress while it is drained.
    return pd.DataFrame(list(tqdm(cursor)))

In [None]:
# The CSV cache directory must exist before the first to_csv() call below;
# on a fresh checkout it does not, and to_csv() would raise.
os.makedirs("datasets", exist_ok=True)

for col_name in tqdm(names):
    # results_to_df() reads the notebook-level `col`, so rebind it per collection.
    col = db[col_name]
    summaries[col_name] = {}
    print("Getting summary data for ", col_name)

    # Reuse the cached CSV export when present; otherwise pull every document
    # from the database and cache it for later cells.
    cache_path = f"datasets/{col_name}.csv"
    if os.path.exists(cache_path):
        df = pd.read_csv(cache_path)
    else:
        df = results_to_df()
        df.to_csv(cache_path)

    total_records = len(df)
    n_participants = len(df["userID"].dropna().unique())
    iteration_names = df["iterationName"].unique()

    # The batch column name differs between collections; take the first
    # column whose name contains "batch" (IndexError if there is none).
    batch_key = [column for column in df.columns if "batch" in column][0]
    n_batches = len(df[batch_key].dropna().unique())

    print(f"Total Records: {total_records}, n_participants: {n_participants}, n_points: {n_batches}, iterations: {len(iteration_names)}")

    summaries[col_name]["n_participants"] = n_participants
    summaries[col_name]["n_points"] = n_batches
    summaries[col_name]["total_records"] = total_records
    summaries[col_name]["n_iterations"] = len(iteration_names)
    summaries[col_name]["iteration_names"] = iteration_names
    

In [None]:
# Keep a handle on the unfiltered summaries, then rebuild `summaries` without
# any collection whose name contains "object-loc".
all_summaries = summaries
summaries = {
    name: stats
    for name, stats in all_summaries.items()
    if "object-loc" not in name
}

In [None]:
# Dict-of-dicts -> DataFrame: one column per collection, one row per statistic.
summary_df = pd.DataFrame(summaries)

In [None]:
# Display the per-collection summary table.
summary_df

In [None]:
# Invert the nesting of `summaries` (dataset -> stat) into stat -> dataset.
stat_names = ("n_participants", "n_points", "n_iterations", "total_records", "iteration_names")
stat_first = {stat: {} for stat in stat_names}
for dataset, dataset_stats in summaries.items():
    for stat, value in dataset_stats.items():
        stat_first[stat][dataset] = value

In [None]:
# Stat-major dict -> DataFrame: one row per dataset, one column per statistic.
stat_df = pd.DataFrame(stat_first)

In [None]:
# Persist the summary table next to the notebook (path is relative to the
# current working directory).
stat_df.to_csv("stat_df.csv")

In [None]:
# Display the per-dataset statistics table.
stat_df

In [None]:
# Sum of the per-dataset participant counts. Counts are unique userIDs within
# each dataset, so a user present in several datasets is counted once per dataset.
stat_df["n_participants"].sum()

In [None]:
import plotting
%load_ext autoreload
%autoreload 2

In [None]:
def filter_attention_check_fails(df, key="correct", threshold=0.5, remove_failures=True, remove_attention_trials=True):
    """Drop participants who failed the attention checks and/or the check trials.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial records. Must contain a ``userID`` column, the accuracy column
        named by ``key``, and a column whose name contains ``"attention"``
        flagging attention-check trials.
    key : str
        Column holding per-trial correctness; its mean over a participant's
        attention-check trials is compared against ``threshold``.
    threshold : float
        Participants whose mean attention-check accuracy is strictly below
        this value count as having failed.
    remove_failures : bool
        If True, remove every trial of the participants who failed.
    remove_attention_trials : bool
        If True, keep only rows whose attention flag equals False.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the input frame is not modified.

    Raises
    ------
    IndexError
        If no column name contains ``"attention"``.
    """
    # The flag column is located by substring; the first match wins.
    attention_key = [column for column in df.columns if "attention" in column][0]

    # `== True` (not truthiness) so that NaN flags are treated as "not a check".
    attention_trials = df.loc[df[attention_key] == True]
    below_threshold = attention_trials.groupby("userID")[key].mean() < threshold
    # Select failing users through the boolean mask itself. The result is indexed
    # by userID, so looking entries up with positional integers is wrong for
    # integer IDs and deprecated for string IDs.
    participants_failed = below_threshold[below_threshold].index.tolist()

    if remove_failures:
        if len(participants_failed) == 0:
            print("No one failed any attention checks!")
        else:
            df = df[~df["userID"].isin(participants_failed)]

    if remove_attention_trials:
        df = df[df[attention_key] == False]

    return df

def str_to_float_array(str_arr):
    """Parse a stringified list such as ``"[0.1, 0.2, 0.3]"`` into floats.

    Parameters
    ----------
    str_arr : str, list, float or None
        A list is returned unchanged. A string is stripped of surrounding
        square brackets and split on commas. Anything else (notably the
        float NaN that marks a missing CSV cell, or None) is treated as
        missing.

    Returns
    -------
    list
        The parsed floats, or ``[]`` when the value is missing, empty, the
        literal string ``"nan"``, or contains an element that is not a
        valid float.
    """
    if isinstance(str_arr, list):
        return str_arr

    # NaN never compares equal to anything (itself included), so missing
    # values cannot be caught with `==`; a type check handles them instead
    # of letting them reach .strip() and raise AttributeError.
    if not isinstance(str_arr, str) or not str_arr or str_arr == "nan":
        return []

    try:
        return [float(token) for token in str_arr.strip("[]").split(",")]
    except ValueError:
        # A single unparseable element invalidates the whole array.
        return []

In [None]:
# Accuracy and split-half plots for the depth and segmentation datasets,
# read from the CSV exports under datasets/.
for col_name in names:
    path = f"datasets/{col_name}.csv"
    if "depth" not in path and "segmentation" not in path:
        continue

    print(col_name)

    # These two collections are deliberately left out of the plots.
    if col_name in ("nsd-depth-estimation-pilot", "nsd-depth-estimation-split-half"):
        continue

    # The segmentation datasets store correctness under a different column.
    if col_name in ("nsd-segmentation", "tdw-segmentation"):
        accuracy_key = "segmentation_correct"
    else:
        accuracy_key = "correct"
    target_key = "userID"

    df = pd.read_csv(path)
    if col_name != "tdw-segmentation":
        df = filter_attention_check_fails(df, accuracy_key)
    # na=False: rows with a missing trial_type are not task trials, and a NaN
    # in the mask would make the boolean indexing raise.
    # .copy(): a column is reassigned below, so work on an independent frame
    # rather than a slice of the CSV frame (avoids SettingWithCopyWarning).
    df = df[df["trial_type"].str.contains("task", na=False)].copy()

    fig = plotting.plot_accuracy(df, col_name, accuracy_key=accuracy_key, target_key=target_key)
    plt.show()
    plt.close()
    fig = plotting.plot_accuracy(df, col_name, accuracy_key=accuracy_key, errorbar=None, target_key="stimulus")
    plt.show()
    plt.close()

    # Fall back to the segmentation-specific column when the generic one is
    # empty. pd.isna() is used because an identity test against np.nan misses
    # NaNs that are not that exact object (e.g. values from a float64 column).
    if pd.isna(df["response"].iloc[0]) and "segmentation_response" in df.columns:
        response_key = "segmentation_response"
    else:
        response_key = "response"

    df[response_key] = pd.to_numeric(df[response_key])

    fig = plotting.plot_split_half(df, response_key, col_name)
    print("\n\n")
        

In [None]:
# Split-half plots for the surface-normal datasets (hypersim is skipped).
for col_name in names:
    path = f"datasets/{col_name}.csv"
    if "surface-normals" not in path or "hypersim_surface-normals" in path:
        continue

    print(path)

    df = pd.read_csv(path)
    # List form of `subset` works across pandas versions.
    df = df.dropna(axis=0, subset=["indicatorFinalDirection"])

    # Flag columns are located by substring; the first match wins
    # (IndexError if a dataset has no such column).
    attention_key = [column for column in df.columns if "attention" in column][0]
    duplicate_key = [column for column in df.columns if "duplicate" in column][0]

    # Keep only trials that are neither duplicates nor attention checks.
    # .copy() because columns are reassigned below; writing into a filtered
    # slice triggers SettingWithCopyWarning.
    keep = (df[duplicate_key] == False) & (df[attention_key] == False)
    df = df[keep].copy()

    # Direction vectors are stored as strings in the CSV; parse them to lists.
    df["indicatorFinalDirection"] = df["indicatorFinalDirection"].apply(str_to_float_array)
    if "nsd" not in col_name:
        df["trueArrowDirection"] = df["trueArrowDirection"].apply(str_to_float_array)

    plotting.plot_surface_normal_split_half(df, col_name)

In [None]:
# Angular error / agreement plots for the surface-normal datasets (hypersim
# is skipped). The preprocessing mirrors the split-half cell.
for col_name in names:
    path = f"datasets/{col_name}.csv"
    if "surface-normals" not in path or "hypersim_surface-normals" in path:
        continue

    print(path)

    df = pd.read_csv(path)
    # List form of `subset` works across pandas versions.
    df = df.dropna(axis=0, subset=["indicatorFinalDirection"])

    # Flag columns are located by substring; the first match wins
    # (IndexError if a dataset has no such column).
    attention_key = [column for column in df.columns if "attention" in column][0]
    duplicate_key = [column for column in df.columns if "duplicate" in column][0]

    # Keep only trials that are neither duplicates nor attention checks.
    # .copy() because columns are reassigned below; writing into a filtered
    # slice triggers SettingWithCopyWarning.
    keep = (df[duplicate_key] == False) & (df[attention_key] == False)
    df = df[keep].copy()

    # Direction vectors are stored as strings in the CSV; parse them to lists.
    df["indicatorFinalDirection"] = df["indicatorFinalDirection"].apply(str_to_float_array)

    # Ground-truth error is only plotted for collections without "nsd" in
    # their name; for those the true direction column is parsed as well.
    if "nsd" not in col_name:
        df["trueArrowDirection"] = df["trueArrowDirection"].apply(str_to_float_array)
        for group_key in ("userID", "imageURL"):
            plotting.plot_mean_angular_error(df, "Ground Truth Angular Error, " + col_name, group_key)
            plt.show()
            plt.close()

    plotting.plot_mean_angular_agreement(df, "Angular Agreement, " + col_name)
    plt.show()
    plt.close()