In [None]:
# Enable IPython's autoreload extension.
%load_ext autoreload
# Mode 2: re-import all modules before each cell runs, so edits to local
# modules are picked up without restarting the kernel.
%autoreload 2

In [None]:
import os
import re
import io
import ast
import sys
import json
import boto3 
import pymongo 
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from PIL import Image
from urllib import request
from collections import Counter

sys.path.append("../")
import cabutils

In [None]:
# Notebook-wide seaborn appearance: "darkgrid" axes style, element sizes
# scaled for the "notebook" context.
sns.set_style("darkgrid")
sns.set_context("notebook")

In [None]:
from matplotlib import rcParams

# Larger default text and figure-level titles for every plot in this notebook.
rcParams.update({
    "font.size": 18,
    "figure.titlesize": 26,
})

In [None]:
# Identifiers of the data analysed in this notebook.
projName = "mlve"  # database name prefix: results live in "<projName>_outputs"
experimentName = "tdw_surface-normals"  # collection name; also used as the figure suptitle
S3_BUCKET_NAME = "mlve-v1"  # bucket read by download_from_s3

To download the data (@Yoni only), first open an SSH tunnel that forwards local port 8000 to port 27017 on the EC2 server:

```
ssh -i /Users/yoni/Desktop/cocosci_pem/Cocosci_WebExperiments.pem -nNL 8000:localhost:27017  ubuntu@ec2-3-90-78-57.compute-1.amazonaws.com
```

# Downloading data from the EC2 server (mostly just instructions for Thomas)

In `settings.conf`, change `MONGODB_PORT` to 8000 and `MONGODB_HOST` to `localhost`. Then open an SSH tunnel to the EC2 server:

```
ssh -i path/to/pem/key/maybe-named-something-like/Cocosci_WebExperiments.pem -fNL 8000:localhost:27017 ubuntu@ec2-54-91-252-25.compute-1.amazonaws.com
```

Replace the path with the location of your own `.pem` key; everything else in the command should stay the same.

In [None]:
# Connect through the project helper, then pick this experiment's collection
# inside the "<projName>_outputs" database.
conn = cabutils.get_db_connection()
db = conn[f"{projName}_outputs"]
col = db[experimentName]

In [None]:
def results_to_df(collection=None):
    """Load every document of a collection into a pandas DataFrame.

    Parameters
    ----------
    collection : optional
        Any object exposing ``find(query)`` that returns an iterable of
        dict-like documents. When omitted, the notebook-level ``col``
        collection is used, so existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by ``collection.find({})``.
    """
    if collection is None:
        # Fall back to the collection selected earlier in the notebook.
        collection = col
    # An empty filter matches every document; materialise the cursor at once.
    return pd.DataFrame(list(collection.find({})))

# Pull the whole collection into memory once; later cells modify this frame in place.
df = results_to_df()

In [None]:
# Print each free-text survey answer, one per line.
survey_rows = df["trial_type"] == "survey-text"
for response in df.loc[survey_rows, "response"]:
    print(response)

In [None]:
# Column names, dtypes and non-null counts of the raw results.
df.info()

In [None]:
# Remove the columns that the rest of the notebook does not read.
unused_columns = ["trial_type", "rt", "response", "inputid"]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Discard rows that have no recorded final indicator direction.
missing_direction = df["indicatorFinalDirection"].isna()
df.drop(df.loc[missing_direction].index, inplace=True)

In [None]:
def download_from_s3(url, resource_type="image"):
    """Fetch an object from the ``S3_BUCKET_NAME`` bucket.

    ``url`` is used as the object key. With ``resource_type="image"`` (the
    default) the object is downloaded into memory and returned as a PIL
    image. For any other value the bucket object handle is returned as-is,
    without downloading anything.
    """
    bucket = boto3.resource("s3", region_name="us-east-2").Bucket(S3_BUCKET_NAME)
    s3_object = bucket.Object(url)
    if resource_type != "image":
        return s3_object

    buffer = io.BytesIO()
    s3_object.download_fileobj(buffer)
    return Image.open(buffer)

In [None]:
def download_from_url(url):
    """Download an image from ``url`` and return it as a PIL image.

    The response body is read into memory rather than fetched with
    ``urlretrieve``, which writes a temporary file for every call and leaves
    it on disk. The connection is closed as soon as the body has been read.
    """
    with request.urlopen(url) as response:
        payload = response.read()
    # PIL decodes lazily, so the image keeps a reference to this in-memory buffer.
    image = Image.open(io.BytesIO(payload))
    return image

In [None]:
def cos_similarity(a, b):
    """Cosine of the angle between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : sequence of numbers
        Vectors of equal length.

    Returns
    -------
    float
        ``a . b / (|a| * |b|)``. NaN is returned when either vector is
        missing (``None``), empty, or has zero length, matching how
        ``geodesic_dist`` treats missing directions instead of raising or
        dividing by zero.
    """
    for vec in (a, b):
        # Missing or empty direction: there is nothing to compare.
        if vec is None or (hasattr(vec, "__len__") and len(vec) == 0):
            return float("nan")
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction, so the cosine is undefined.
        return float("nan")
    return a.dot(b) / norm_product

In [None]:
def geodesic_dist(a, b):
    """Angle in radians between vectors ``a`` and ``b``.

    Both vectors are normalised first, so the result is the great-circle
    distance between their directions on the unit sphere, in ``[0, pi]``.

    Parameters
    ----------
    a, b : sequence of numbers
        Vectors of equal length. Lists and NumPy arrays are both accepted.

    Returns
    -------
    float
        The angle, or NaN when either vector is missing (``None``), empty,
        or has zero length.
    """
    for vec in (a, b):
        # An explicit check: `not vec` would raise on NumPy array input.
        if vec is None or (hasattr(vec, "__len__") and len(vec) == 0):
            return float("nan")
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        # A zero vector has no direction, so the angle is undefined.
        return float("nan")

    # Rounding can push the dot product of two unit vectors slightly outside
    # [-1, 1], where arccos returns NaN; clip it back into the valid domain.
    cosine = np.clip((a / norm_a).dot(b / norm_b), -1.0, 1.0)
    return np.arccos(cosine)

In [None]:
# Score every trial against the true arrow direction, pairing the two
# columns row by row.
response_and_truth = list(zip(df["indicatorFinalDirection"], df["trueArrowDirection"]))
df["geodesic_distance"] = [geodesic_dist(resp, truth) for resp, truth in response_and_truth]
df["cosine_similarity"] = [cos_similarity(resp, truth) for resp, truth in response_and_truth]

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(18, 9))

# One histogram per metric, left to right.
panels = (
    ("geodesic_distance", "Geodesic error across all trials"),
    ("cosine_similarity", "Cosine similarity across all trials"),
)
for ax, (column, title) in zip(axs, panels):
    sns.histplot(df[column], ax=ax)
    ax.set_title(title, fontsize=18)

plt.suptitle(experimentName)
plt.show()

# Generate cleaned version

In [None]:
# Preview the first rows, now including the two error-metric columns.
df.head()

In [None]:
# Per-participant quality control: collect the IDs of participants who did not
# finish or who failed the attention checks. Errors are geodesic distances in
# radians.
MIN_TRIALS = 100          # fewer rows than this counts as an unfinished session
MAX_MEAN_ERROR = 1.5      # overall mean error above this is reported as poor
MAX_ATTENTION_ERROR = 1   # mean error on attention checks above this is a failure

participants = df.groupby("userID")
participants_failed = []
batch_idxs = []
for user_id, user_results in participants:
    print(len(user_results))
    batch_idxs.append(user_results.iloc[0]["batch_idx"])

    if len(user_results) < MIN_TRIALS:
        print(f"USER ID: {user_id} did not finish the experiment")
        participants_failed.append(user_id)
        continue

    if user_results["geodesic_distance"].mean() > MAX_MEAN_ERROR:
        # NOTE(review): poor performers are only reported, not added to
        # participants_failed, and their attention checks are skipped.
        # Confirm this is intended.
        print(f"USER ID: {user_id} did not do too well")
        continue

    # Select the attention-check rows first, then average their error. The
    # comparison used to sit outside the indexing brackets, which produced a
    # boolean frame, so the "score" was a mean of booleans, not of distances.
    attention_checks = user_results[user_results["attention_check"] == True]
    # With no attention-check rows the mean is NaN and the comparison below is
    # False, so such a participant is not flagged.
    attention_score = attention_checks["geodesic_distance"].mean()
    if attention_score > MAX_ATTENTION_ERROR:
        # f-string: concatenating a str with the float score raised TypeError.
        print(f"USER ID: {user_id} scored: {attention_score}")
        participants_failed.append(user_id)

print(participants_failed)

# Exclusion filters, currently disabled:
# failed_participants = df["userID"].apply(lambda x: x in participants_failed)
# df = df[~failed_participants]
# attention_checks = df["stimulus"].apply(lambda x: "ground_truth" in x)
# df = df[~attention_checks]

# my_data = df["userID"] == "yoni_test2"
# df = df[~my_data]

In [None]:
# Remove the attention-check trials from the frame.
is_attention_check = df["attention_check"] == True
attention_indexes = df.loc[is_attention_check].index
df.drop(index=attention_indexes, inplace=True)

In [None]:
# Column names, dtypes and non-null counts after the attention-check rows were dropped.
df.info()

# To-Do:

1. Visualize answers
2. Check consistency
3. Rank participants by score
4. Check which trials had the lowest score
5. Check how often people oriented the arrow straight up (relative to baseline)
6. Heatmap of correct vs. incorrect?