# Imports

In [1]:
import json
import random

import ipywidgets as widgets
import matplotlib.colors as mcolors
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image as IpImage
from IPython.display import display
from ipywidgets import Button, HBox, VBox
from PIL import Image as PilImage
from siuba import _, filter, group_by, summarize
from sklearn.cluster import KMeans

# %matplotlib widget

# Wrangle dataframe

## Load source dataframe

In [56]:
df = (
    pd.read_csv(
        "../data_in/fancy-a-cup-of-marchantia-classifications_final.csv"
    )
    .sort_values(["user_name", "created_at", "subject_ids"])
    .reset_index()
)
df = df[df.workflow_name == "Draw rectangles around the gemma cups"]
df.head()

Unnamed: 0,index,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,gold_standard,expert,metadata,annotations,subject_data,subject_ids
0,43219,325816549,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:25:50 UTC,,,"{""source"":""api"",""session"":""3042ea96c815e61a800...","[{""task"":""T0"",""task_label"":""Draw a rectangle a...","{""54208765"":{""retired"":{""id"":80884704,""workflo...",54208765
1,43229,325816828,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:26:43 UTC,,,"{""source"":""api"",""session"":""3042ea96c815e61a800...","[{""task"":""T0"",""task_label"":""Draw a rectangle a...","{""54065181"":{""retired"":{""id"":80886304,""workflo...",54065181
2,43230,325816842,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:26:46 UTC,,,"{""source"":""api"",""session"":""3042ea96c815e61a800...","[{""task"":""T0"",""task_label"":""Draw a rectangle a...","{""54098414"":{""retired"":{""id"":80886330,""workflo...",54098414
3,43235,325817034,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:27:21 UTC,,,"{""source"":""api"",""session"":""3042ea96c815e61a800...","[{""task"":""T0"",""task_label"":""Draw a rectangle a...","{""54066915"":{""retired"":{""id"":80885061,""workflo...",54066915
4,43236,325817045,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:27:23 UTC,,,"{""source"":""api"",""session"":""3042ea96c815e61a800...","[{""task"":""T0"",""task_label"":""Draw a rectangle a...","{""54068404"":{""retired"":{""id"":80884316,""workflo...",54068404


In [57]:
df.columns

Index(['index', 'classification_id', 'user_name', 'user_id', 'user_ip',
       'workflow_id', 'workflow_name', 'workflow_version', 'created_at',
       'gold_standard', 'expert', 'metadata', 'annotations', 'subject_data',
       'subject_ids'],
      dtype='object')

In [58]:
df.shape

(67731, 15)

## Expand columns

### Expand metadata

In [88]:
metadata = df["metadata"].apply(lambda x: json.loads(x)).apply(pd.Series)
metadata.columns

Index(['source', 'session', 'viewport', 'started_at', 'user_agent',
       'utc_offset', 'finished_at', 'live_project', 'interventions',
       'user_language', 'user_group_ids', 'subject_dimensions',
       'subject_selection_state', 'workflow_translation_id', 'seen_before'],
      dtype='object')

In [89]:
subject_selection_state = metadata["subject_selection_state"].apply(pd.Series).drop([0], axis=1)
subject_selection_state.columns

Index(['already_seen', 'finished_workflow', 'retired', 'selected_at',
       'selection_state', 'user_has_finished_workflow'],
      dtype='object')

In [102]:
subject_selection_state.head()

Unnamed: 0,already_seen,finished_workflow,retired,selected_at,selection_state,user_has_finished_workflow
0,False,False,False,2021-04-19T16:25:27.121Z,normal,False
1,False,False,False,2021-04-19T16:25:27.121Z,normal,False
2,False,False,False,2021-04-19T16:25:27.121Z,normal,False
3,False,False,False,2021-04-19T16:25:27.121Z,normal,False
4,False,False,False,2021-04-19T16:25:27.121Z,normal,False


In [90]:
viewport = metadata["viewport"].apply(pd.Series)
viewport.columns

Index(['width', 'height'], dtype='object')

In [100]:
interventions = metadata["interventions"].apply(pd.Series).drop([0], axis=1)

In [101]:
interventions.head()

Unnamed: 0,messageShown,opt_in
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True


In [92]:
subject_dimensions = metadata["subject_dimensions"].apply(pd.Series)
subject_dimensions.columns

Index([0, 'clientHeight', 'clientWidth', 'naturalHeight', 'naturalWidth'], dtype='object')

In [95]:
subject_dimensions.head()

Unnamed: 0,0,clientHeight,clientWidth,naturalHeight,naturalWidth
0,"{'clientWidth': 549, 'clientHeight': 549, 'nat...",,,,
1,"{'clientWidth': 549, 'clientHeight': 549, 'nat...",,,,
2,,,,,
3,"{'clientWidth': 549, 'clientHeight': 549, 'nat...",,,,
4,,,,,


In [99]:
subject_dimensions[0].apply(pd.Series)

Unnamed: 0,0,clientHeight,clientWidth,naturalHeight,naturalWidth
0,,549.0,549.0,1600.0,1600.0
1,,549.0,549.0,1600.0,1600.0
2,,,,,
3,,549.0,549.0,1600.0,1600.0
4,,,,,
...,...,...,...,...,...
82539,,816.0,816.0,1600.0,1600.0
82540,,816.0,816.0,1600.0,1600.0
82541,,816.0,816.0,1600.0,1600.0
82542,,338.0,338.0,1600.0,1600.0


In [103]:
df_xµd = pd.concat(
    [
        df.drop(["metadata"], axis=1),
        subject_selection_state,
        viewport,
        metadata.drop(
            [
                "viewport",
                "interventions",
                "subject_dimensions",
                "subject_selection_state",
            ],
            axis=1,
        ),
    ],
    axis=1,
)
# df_xµd = pd.concat(
#     [
#         df_xµd.drop([0], axis=1),
#         df_xµd[0].apply(pd.Series),
#     ],
#     axis=1,
# )
df_xµd.head(3)

Unnamed: 0,index,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,gold_standard,...,session,started_at,user_agent,utc_offset,finished_at,live_project,user_language,user_group_ids,workflow_translation_id,seen_before
0,43219,325816549,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:25:50 UTC,,...,3042ea96c815e61a800e636480724f3e9cae48ce8e95eb...,2021-04-19T16:25:27.325Z,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:25:50.314Z,True,en,[],40817,
1,43229,325816828,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:26:43 UTC,,...,3042ea96c815e61a800e636480724f3e9cae48ce8e95eb...,2021-04-19T16:26:07.962Z,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:26:43.178Z,True,en,[],40817,
2,43230,325816842,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,2021-04-19 16:26:46 UTC,,...,3042ea96c815e61a800e636480724f3e9cae48ce8e95eb...,2021-04-19T16:26:44.907Z,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:26:46.178Z,True,en,[],40817,


In [104]:
df_xµd.columns

Index(['index', 'classification_id', 'user_name', 'user_id', 'user_ip',
       'workflow_id', 'workflow_name', 'workflow_version', 'created_at',
       'gold_standard', 'expert', 'annotations', 'subject_data', 'subject_ids',
       'already_seen', 'finished_workflow', 'retired', 'selected_at',
       'selection_state', 'user_has_finished_workflow', 'width', 'height',
       'source', 'session', 'started_at', 'user_agent', 'utc_offset',
       'finished_at', 'live_project', 'user_language', 'user_group_ids',
       'workflow_translation_id', 'seen_before'],
      dtype='object')

### Avoid same name column issues

In [105]:
df_xµd["retired_bool"] = df_xµd.retired
df_xµd = df_xµd.drop(["retired"], axis=1)
df_xµd["creation_time"] = df_xµd.created_at
df_xµd = df_xµd.drop(["created_at"], axis=1)
df_xµd

Unnamed: 0,index,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,gold_standard,expert,...,user_agent,utc_offset,finished_at,live_project,user_language,user_group_ids,workflow_translation_id,seen_before,retired_bool,creation_time
0,43219,325816549,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:25:50.314Z,True,en,[],40817,,False,2021-04-19 16:25:50 UTC
1,43229,325816828,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:26:43.178Z,True,en,[],40817,,False,2021-04-19 16:26:43 UTC
2,43230,325816842,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:26:46.178Z,True,en,[],40817,,False,2021-04-19 16:26:46 UTC
3,43235,325817034,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:27:21.042Z,True,en,[],40817,,False,2021-04-19 16:27:21 UTC
4,43236,325817045,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-19800,2021-04-19T16:27:23.144Z,True,en,[],40817,,False,2021-04-19 16:27:23 UTC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82539,63744,327077931,zooniverseuser2021,2297096.0,209a0180e112b16d6e5b,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-3600,2021-04-23T13:45:39.841Z,True,en,[],40817,,False,2021-04-23 13:45:41 UTC
82540,63748,327078182,zooniverseuser2021,2297096.0,209a0180e112b16d6e5b,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-3600,2021-04-23T13:46:41.281Z,True,en,[],40817,,False,2021-04-23 13:46:42 UTC
82541,63749,327078209,zooniverseuser2021,2297096.0,209a0180e112b16d6e5b,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,-3600,2021-04-23T13:46:47.871Z,True,en,[],40817,,False,2021-04-23 13:46:49 UTC
82542,52762,326444225,zosiaIB,2296228.0,ac81348b3c970a6d3736,17178,Draw rectangles around the gemma cups,5.8,,,...,Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like...,-7200,2021-04-21T14:17:16.415Z,True,en,[],40817,,False,2021-04-21 14:17:16 UTC


### Split subject_data and annotations

In [115]:
dfx = pd.concat(
    [
        df_xµd.drop(["subject_data", "annotations"], axis=1),
        df_xµd[
            "subject_data"
        ].apply(
            lambda x: list(json.loads(x).values())[0]
        ).apply(
            pd.Series
        ),
        df_xµd["annotations"].str[1:-1].apply(lambda x: json.loads(x)).apply(pd.Series),
    ],
    axis=1,
)
dfx = (
    pd.concat(
        [
            dfx.drop(["retired"], axis=1),
            dfx["retired"].apply(pd.Series),
        ],
        axis=1,
    )
    .rename(str.lower, axis="columns")
    .sort_values(by=["user_name", "filename", "created_at"])
)
dfx.head(3)

Unnamed: 0,index,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,gold_standard,expert,...,task_label,value,id,workflow_id.1,classifications_count,created_at,updated_at,retired_at,subject_id,retirement_reason
4,43236,325817045,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,,,...,Draw a rectangle around every visible gemma cup.,[],80884316.0,17178.0,16.0,2021-03-15T09:53:09.375Z,2021-04-24T22:27:41.308Z,2021-04-24T22:27:41.302Z,54068404.0,classification_count
1,43229,325816828,-1997,2293824.0,e4bc332fc37fb06e47e4,17178,Draw rectangles around the gemma cups,5.8,,,...,Draw a rectangle around every visible gemma cup.,"[{'x': 91.71228790283203, 'y': 68.973846435546...",80886304.0,17178.0,16.0,2021-03-15T11:25:16.208Z,2021-04-20T18:02:52.188Z,2021-04-20T18:02:52.181Z,54065181.0,classification_count
6,43244,325817487,-1997,2293824.0,93d03ff918f719771778,17178,Draw rectangles around the gemma cups,5.8,,,...,Draw a rectangle around every visible gemma cup.,"[{'x': 690.1336059570312, 'y': 667.39526367187...",80883738.0,17178.0,16.0,2021-03-15T09:31:24.629Z,2021-04-25T14:39:45.792Z,2021-04-25T14:39:45.786Z,54066772.0,classification_count


In [116]:
dfx.columns

Index(['index', 'classification_id', 'user_name', 'user_id', 'user_ip',
       'workflow_id', 'workflow_name', 'workflow_version', 'gold_standard',
       'expert', 'subject_ids', 'already_seen', 'finished_workflow',
       'selected_at', 'selection_state', 'user_has_finished_workflow', 'width',
       'height', 'source', 'session', 'started_at', 'user_agent', 'utc_offset',
       'finished_at', 'live_project', 'user_language', 'user_group_ids',
       'workflow_translation_id', 'seen_before', 'retired_bool',
       'creation_time', 'filename', 'task', 'task_label', 'value', 'id',
       'workflow_id', 'classifications_count', 'created_at', 'updated_at',
       'retired_at', 'subject_id', 'retirement_reason'],
      dtype='object')

In [117]:
df_xµd["annotations"].str[1:-1].apply(lambda x: json.loads(x)).apply(pd.Series)

Unnamed: 0,task,task_label,value
0,T0,Draw a rectangle around every visible gemma cup.,"[{'x': 468.6400146484375, 'y': 669.33825683593..."
1,T0,Draw a rectangle around every visible gemma cup.,"[{'x': 91.71228790283203, 'y': 68.973846435546..."
2,T0,Draw a rectangle around every visible gemma cup.,[]
3,T0,Draw a rectangle around every visible gemma cup.,"[{'x': 793.1087036132812, 'y': 640.1943359375,..."
4,T0,Draw a rectangle around every visible gemma cup.,[]
...,...,...,...
82539,T0,Draw a rectangle around every visible gemma cup.,"[{'x': 915.2613525390625, 'y': 603.70190429687..."
82540,T0,Draw a rectangle around every visible gemma cup.,"[{'x': 801.5772705078125, 'y': 531.17932128906..."
82541,T0,Draw a rectangle around every visible gemma cup.,[]
82542,T0,Draw a rectangle around every visible gemma cup.,"[{'x': 911.180908203125, 'y': 696.654968261718..."


In [118]:
df_xµd[
    "subject_data"
].apply(
    lambda x: list(json.loads(x).values())[0]
).apply(
    pd.Series
)

Unnamed: 0,retired,Filename
0,"{'id': 80884704, 'workflow_id': 17178, 'classi...",bwh8SqPdFkVi7WMuFmOhraSPmXu8.jpg
1,"{'id': 80886304, 'workflow_id': 17178, 'classi...",b8rpeAAfN-vyWalNTkMeLOQoUl7I.jpg
2,"{'id': 80886330, 'workflow_id': 17178, 'classi...",bcmoHuoQv4GV2X5ZweVhMONWmZE.jpg
3,"{'id': 80885061, 'workflow_id': 17178, 'classi...",b9u7P2hugHqX8F1Oh8vo9bXXeeQ.jpg
4,"{'id': 80884316, 'workflow_id': 17178, 'classi...",b-38OOnRNVd8OdKdmNYZXXy83E.jpg
...,...,...
82539,"{'id': 80885908, 'workflow_id': 17178, 'classi...",bsHodjH-u4U6ljks5WTA-sN78Bzk.jpg
82540,"{'id': 80886359, 'workflow_id': 17178, 'classi...",b7kxhmcKZj3QY14ZoifchFbWmKF4.jpg
82541,"{'id': 80884623, 'workflow_id': 17178, 'classi...",b4VpsbIvibdSZyxlZKNtnG4DgUgk.jpg
82542,"{'id': 80884401, 'workflow_id': 17178, 'classi...",bNPfgw-BpHMcmequRCFtkA0FSXc.jpg


### Fix user_agent

In [119]:
dfx.user_agent = dfx.user_agent.str[13:28]
dfx.user_agent.unique()

array(['Windows NT 10.0', 'p', 'Macintosh; Inte', 'Linux; Android ',
       'X11; Linux x86_', 'X11; CrOS x86_6', 'e App', 'Windows NT 6.1;',
       'iPhone; CPU iPh', 'Windows NT 6.3;', 'X11; CrOS aarch',
       'iPad; CPU OS 14', 'Windows NT 6.1)', 'X11; Ubuntu; Li',
       'X11; CrOS armv7', 'iPad; CPU OS 12'], dtype=object)

In [120]:
dfx.user_agent = np.select(
    [
        dfx.user_agent == "Windows NT 10.0",
        dfx.user_agent == "Macintosh; Inte",
        dfx.user_agent == "Linux; Android ",
        dfx.user_agent == "X11; CrOS x86_6",
        dfx.user_agent == "Windows NT 6.1;",
        dfx.user_agent == "iPhone; CPU iPh",
        dfx.user_agent == "X11; Linux x86_",
        dfx.user_agent == "Windows NT 6.3;",
        dfx.user_agent == "iPad; CPU OS 12",
    ],
    [
        "Windows 10",
        "OSX",
        "Android",
        "Chrome OS",
        "Windows 7",
        "iPhone",
        "Linux",
        "Windows 8.1",
        "iPad",
    ],
    default="Oops",
)
dfx.user_agent.unique()

array(['Windows 10', 'Oops', 'OSX', 'Android', 'Linux', 'Chrome OS',
       'Windows 7', 'iPhone', 'Windows 8.1', 'iPad'], dtype=object)

### Keep only needed columns

In [121]:
dfx.shape

(67731, 43)

In [122]:
dfx.columns

Index(['index', 'classification_id', 'user_name', 'user_id', 'user_ip',
       'workflow_id', 'workflow_name', 'workflow_version', 'gold_standard',
       'expert', 'subject_ids', 'already_seen', 'finished_workflow',
       'selected_at', 'selection_state', 'user_has_finished_workflow', 'width',
       'height', 'source', 'session', 'started_at', 'user_agent', 'utc_offset',
       'finished_at', 'live_project', 'user_language', 'user_group_ids',
       'workflow_translation_id', 'seen_before', 'retired_bool',
       'creation_time', 'filename', 'task', 'task_label', 'value', 'id',
       'workflow_id', 'classifications_count', 'created_at', 'updated_at',
       'retired_at', 'subject_id', 'retirement_reason'],
      dtype='object')

In [123]:
df_keep = dfx[
    [
        "user_name",
        "user_agent",
        "filename",
        "value",
        "classifications_count",
        "creation_time",
    ]
]
df_keep.head(3)

Unnamed: 0,user_name,user_agent,filename,value,classifications_count,creation_time
4,-1997,Windows 10,b-38OOnRNVd8OdKdmNYZXXy83E.jpg,[],16.0,2021-04-19 16:27:23 UTC
1,-1997,Windows 10,b8rpeAAfN-vyWalNTkMeLOQoUl7I.jpg,"[{'x': 91.71228790283203, 'y': 68.973846435546...",16.0,2021-04-19 16:26:43 UTC
6,-1997,Windows 10,b9Si9hywZC9837sedSMj0H1vmHOI.jpg,"[{'x': 690.1336059570312, 'y': 667.39526367187...",16.0,2021-04-19 16:28:38 UTC


### Count observations

In [124]:
df_keep["rect_count"] = df_keep.value.apply(lambda x: len(x))
df_keep.head()
# df_keep.value.apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keep["rect_count"] = df_keep.value.apply(lambda x: len(x))


Unnamed: 0,user_name,user_agent,filename,value,classifications_count,creation_time,rect_count
4,-1997,Windows 10,b-38OOnRNVd8OdKdmNYZXXy83E.jpg,[],16.0,2021-04-19 16:27:23 UTC,0
1,-1997,Windows 10,b8rpeAAfN-vyWalNTkMeLOQoUl7I.jpg,"[{'x': 91.71228790283203, 'y': 68.973846435546...",16.0,2021-04-19 16:26:43 UTC,1
6,-1997,Windows 10,b9Si9hywZC9837sedSMj0H1vmHOI.jpg,"[{'x': 690.1336059570312, 'y': 667.39526367187...",16.0,2021-04-19 16:28:38 UTC,1
3,-1997,Windows 10,b9u7P2hugHqX8F1Oh8vo9bXXeeQ.jpg,"[{'x': 793.1087036132812, 'y': 640.1943359375,...",16.0,2021-04-19 16:27:21 UTC,4
8,-1997,Windows 10,bGh8yswCmKlIDLd3nzjpsFQGa7EM.jpg,[],16.0,2021-04-19 16:28:51 UTC,0


### Tidy up

In [125]:
df_keep.columns

Index(['user_name', 'user_agent', 'filename', 'value', 'classifications_count',
       'creation_time', 'rect_count'],
      dtype='object')

In [126]:
tidy = (
    pd.concat(
        [
            df_keep.drop(["value"], axis=1), 
            df_keep.value.apply(pd.Series)
        ], 
        axis=1
    ).melt(
        id_vars=[
            "user_name",
            "filename",
            "user_agent",
            "rect_count",
            "classifications_count",
            "creation_time",
        ],
        var_name="dummy",
        value_name="rectangle",
    )
    .drop(["dummy"], axis=1)
    .dropna(subset=["user_name", "filename", "user_agent", "rect_count"])
)

tidy = pd.concat(
    [tidy.drop(["rectangle"], axis=1), tidy.rectangle.apply(pd.Series)], axis=1
)[
    [
        "user_name",
        "filename",
        "creation_time",
        "user_agent",
        "rect_count",
        "classifications_count",
        "x",
        "y",
        "width",
        "height",
    ]
]

tidy = (tidy >> filter(_.classifications_count >= 0)).reset_index().drop_duplicates()
tidy

Unnamed: 0,index,user_name,filename,creation_time,user_agent,rect_count,classifications_count,x,y,width,height
0,0,-1997,b-38OOnRNVd8OdKdmNYZXXy83E.jpg,2021-04-19 16:27:23 UTC,Windows 10,0,16.0,,,,
1,1,-1997,b8rpeAAfN-vyWalNTkMeLOQoUl7I.jpg,2021-04-19 16:26:43 UTC,Windows 10,1,16.0,91.712288,68.973846,1435.822685,1447.480438
2,2,-1997,b9Si9hywZC9837sedSMj0H1vmHOI.jpg,2021-04-19 16:28:38 UTC,Windows 10,1,16.0,690.133606,667.395264,242.865784,237.037170
3,3,-1997,b9u7P2hugHqX8F1Oh8vo9bXXeeQ.jpg,2021-04-19 16:27:21 UTC,Windows 10,4,16.0,793.108704,640.194336,54.401978,46.630249
4,4,-1997,bGh8yswCmKlIDLd3nzjpsFQGa7EM.jpg,2021-04-19 16:28:51 UTC,Windows 10,0,16.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2678755,2709235,zooniverseuser2021,bsHodjH-u4U6ljks5WTA-sN78Bzk.jpg,2021-04-23 13:45:41 UTC,Windows 10,4,16.0,,,,
2678756,2709236,zooniverseuser2021,bwBXWOt2GzMPB3SrfjsyRCIVHiXc.jpg,2021-04-22 17:03:32 UTC,Windows 10,5,16.0,,,,
2678757,2709237,zooniverseuser2021,byWvZHcGija9YzLS2tR9wYN5CChY.jpg,2021-04-22 17:04:04 UTC,Windows 10,0,16.0,,,,
2678758,2709238,zosiaIB,bJnTE48mxs8Joc0cgkhK3py0IvE.jpg,2021-04-21 14:18:03 UTC,iPhone,11,16.0,,,,


#  Explore the data

In [127]:
tidy[
    (tidy.user_name == "Brooker1957")
    & (tidy.filename == "b0xhA8TCuQtLRbirX369iE7dJvUE.jpg")
].dropna(
).reset_index(
).drop(
    ["index"], axis=1
).drop_duplicates(
).sort_values(
    ["user_name", "filename", "x", "y", "width", "height"]
).dropna()

Unnamed: 0,level_0,user_name,filename,creation_time,user_agent,rect_count,classifications_count,x,y,width,height
35,339731,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2021-02-04 17:27:52 UTC,Windows 10,15,27.0,525.379517,615.052002,79.688782,47.813232
36,406695,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2020-12-22 17:54:19 UTC,Windows 10,15,27.0,539.868469,632.438660,73.893127,33.324341
87,942450,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2021-02-03 17:42:31 UTC,Windows 10,15,27.0,542.766174,626.643066,52.159851,40.568848
68,741542,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2021-01-15 17:32:21 UTC,Windows 10,15,27.0,545.663940,623.745300,62.302063,55.057739
61,674572,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2020-12-22 18:01:09 UTC,Windows 10,15,27.0,552.908325,616.500916,60.853149,66.648682
...,...,...,...,...,...,...,...,...,...,...,...
1,4882,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2020-12-22 18:01:09 UTC,Windows 10,15,27.0,977.431885,807.753845,99.973145,133.297485
89,942452,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2021-02-04 17:27:52 UTC,Windows 10,15,27.0,978.880798,819.344910,68.097595,86.933167
19,205789,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2020-12-22 18:01:09 UTC,Windows 10,15,27.0,980.329712,590.421021,92.728638,95.626465
15,138822,Brooker1957,b0xhA8TCuQtLRbirX369iE7dJvUE.jpg,2021-02-03 17:42:31 UTC,Windows 10,15,27.0,980.329712,836.731567,79.688599,68.097534


In [128]:
no_rect_df = tidy[
    ["user_name", "filename", "creation_time", "user_agent", "rect_count"]
].drop_duplicates()
df_stats = (
    no_rect_df.groupby("filename")
    .rect_count.agg(
        count="count",
        min="min",
        max="max",
        mean="mean",
        median="median",
        std="std",
        mode=lambda x: x.mode(),
    )
    .reset_index()
    .sort_values("filename")
)
df_stats

Unnamed: 0,filename,count,min,max,mean,median,std,mode
0,b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg,16,0,11,2.7500,3.0,2.863564,0
1,b-38OOnRNVd8OdKdmNYZXXy83E.jpg,16,0,15,9.1250,10.0,3.324154,11
2,b-3Q-HdqeuB2sRxMIzzUPNjZfLSw.jpg,16,0,0,0.0000,0.0,0.000000,0
3,b-3wnyR8oNsu-V149ZYfCL-dfeDk.jpg,16,0,4,3.5625,4.0,1.209339,4
4,b-4-5gzYj0rmr9-dvGMkA3-FNkEs.jpg,16,0,14,7.8750,12.0,6.195428,12
...,...,...,...,...,...,...,...,...
3762,bzxDlaz3aqsr52KWjmGHJwUwssxk.jpg,16,0,3,1.8750,2.0,1.087811,3
3763,bzxxHFz3sNO2rEMRp7l5LOY2QaM0.jpg,16,0,8,2.0000,2.0,1.788854,2
3764,bzyUXItGRt98Cjh2dk1KeIyqjga8.jpg,16,0,6,4.1250,5.0,2.061553,5
3765,bzywiC3csPQ0738i9JSahLgzD9wE.jpg,16,0,6,2.0000,2.0,1.316561,2


In [129]:
observations = ["Select an observation"] + sorted(df_stats.filename.unique().tolist())
obs_selected = widgets.Dropdown(
    options=observations,
    description="Select an observation:",
)

user_selected = widgets.Dropdown(
    options=[],
    description="Select a user:",
)

date_selected = widgets.Dropdown(
    options=[],
    description="Select a date:",
)

shape_selected = widgets.Dropdown(
    options=["Rectangle", "Circle"],
    description="Draw shape:",
    value="Rectangle",
)

button = widgets.Button(description="Render")

image_with_rects = widgets.Output(layout={"border": "1px solid black"})
text_rects = widgets.Output(layout={"border": "1px solid black"})


dataframe: widgets.Output = widgets.Output(layout={"border": "1px solid black"})
stats_output = widgets.Output(layout={"border": "1px solid black"})


def update_overview(observation, user, date_, shape, update_user, update_date):
    dataframe.clear_output()
    with dataframe:
        display(
            no_rect_df.drop("filename", axis=1)[
                no_rect_df.filename == observation
            ].reset_index()
        )

    stats_output.clear_output()
    with stats_output:
        display(df_stats[df_stats.filename == observation].reset_index())

    if update_user == "update":
        user_selected.options = ["None", "All"] + sorted(
            tidy[tidy.filename == observation].user_name.unique().tolist()
        )
    elif update_user == "clear":
        user_selected.options = []
    if update_date == "update":
        date_selected.options = ["All"] + sorted(
            tidy[(tidy.filename == observation) & (tidy.user_name == user)]
            .creation_time.unique()
            .tolist()
        )
    elif update_date == "clear":
        date_selected.options = []

#     if observation == "Select an observation":
#         return
#     elif user == "None":
#         rects = None
#     elif user == "All":
    rects = tidy[(tidy.filename == observation)]
#     else:
#         rects = tidy[(tidy.filename == observation) & (tidy.user_name == user)]

#     if rects is None:
#         pass
#     elif date_ == "None":
#         rects = None
#     elif date_ != "All":
#         rects = rects[rects.creation_time == date_]

    if rects is not None:
        rects = rects.reset_index().dropna()[["x", "y", "width", "height"]]
    image_with_rects.clear_output()
    with image_with_rects:
        img = PilImage.open(f"../data_in/images/{obs_selected.value}")
        fig, ax = plt.subplots()
        fig.set_size_inches(14, 14)
        ax.set_axis_off()
        ax.imshow(img)
        if rects is not None:
            rects = rects.reset_index().dropna()[["x", "y", "width", "height"]]
            colors = [
                random.choice(list(mcolors.CSS4_COLORS.keys()))
                for _ in range(rects.shape[0])
            ]
            for x, y, w, h, c in zip(
                rects.x, rects.y, rects.width, rects.height, colors
            ):
                if shape == "Circle":
                    ax.add_patch(
                        patches.Circle(
                            (x + w // 2, y + h // 2),
                            8,
                            linewidth=8,
                            edgecolor=c,
                            facecolor="none",
                        )
                    )
                elif shape == "Rectangle":
                    ax.add_patch(
                        patches.Rectangle(
                            (x, y), w, h, linewidth=2, edgecolor=c, facecolor="none"
                        )
                    )
        plt.show()

    text_rects.clear_output()
    with text_rects:
        if rects is not None:
            display(rects)


def on_observation_selected(change):
    update_overview(
        observation=change.new,
        user=user_selected.value,
        date_=date_selected.value,
        shape=shape_selected.value,
        update_user="update",
        update_date="clear",
    )


def on_user_selected(change):
    update_overview(
        observation=obs_selected.value,
        user=change.new,
        date_=date_selected.value,
        shape=shape_selected.value,
        update_user="",
        update_date="update",
    )


def on_timestamp_selected(change):
    update_overview(
        observation=obs_selected.value,
        user=user_selected.value,
        date_=date_selected.value,
        shape=shape_selected.value,
        update_user="",
        update_date="",
    )


def on_shape_selected(change):
    update_overview(
        observation=obs_selected.value,
        user=user_selected.value,
        date_=date_selected.value,
        shape=change.new,
        update_user="",
        update_date="",
    )


obs_selected.observe(on_observation_selected, names="value")
# user_selected.observe(on_user_selected, names="value")
# date_selected.observe(on_timestamp_selected, names="value")
# shape_selected.observe(on_shape_selected, names="value")
# button.on_click(on_button_clicked)

display(
    HBox([obs_selected, user_selected, date_selected, shape_selected, button]),
    stats_output,
    HBox([dataframe, image_with_rects, text_rects]),
)

HBox(children=(Dropdown(description='Select an observation:', options=('Select an observation', 'b-1HoJ-Hqz5ST…

Output(layout=Layout(border='1px solid black'))

HBox(children=(Output(layout=Layout(border='1px solid black')), Output(layout=Layout(border='1px solid black')…

In [130]:
images_list = ["Select an observation"] + sorted(df_stats.filename.unique().tolist())
dd_image = widgets.Dropdown(
    options=images_list,
    description="Select an observation:",
)

is_print_all = widgets.Checkbox(
    value=False,
    description="Print all annotations centers",
    disabled=False,
    indent=False,
)
is_print_centers = widgets.Checkbox(
    value=False, description="Print all kmeans centers", disabled=False, indent=False
)
is_print_rectangles = widgets.Checkbox(
    value=True, description="Print rectangles", disabled=False, indent=False
)

image_output = widgets.Output(layout={"border": "1px solid black"})
image_stat_output = widgets.Output(layout={"border": "1px solid black"})
user_gt = widgets.Output(layout={"border": "1px solid black"})
value_counts = widgets.Output(layout={"border": "1px solid black"})


def print_ground_truth(
    observation: str,
    print_all: bool = False,
    print_centers: bool = False,
    print_rectangles: bool = True,
):
    image_stat_output.clear_output()
    with image_stat_output:
        display(df_stats[df_stats.filename == observation].reset_index())

    vc = no_rect_df[no_rect_df.filename == observation]["rect_count"].value_counts()
    if len(vc) > 1:
        v = vc.index.to_list()[:2]
        c = vc.to_list()[:2]
        if c[0] > 2 * c[1]:
            allowed_counts = [v[0]]
        else:
            allowed_counts = v
    else:
        allowed_counts = vc.index.to_list()

    rects = tidy[
        (tidy.filename == observation) & (tidy.rect_count.isin(allowed_counts))
    ].dropna()[["x", "y", "width", "height"]]
    rects = rects[
        (rects.width < 2 * rects.width.median())
        & (rects.height < 2 * rects.height.median())
    ].reset_index()
    image_output.clear_output()
    with image_output:
        img = PilImage.open(f"../data_in/images/{observation}")
        fig, ax = plt.subplots()
        fig.set_size_inches(14, 14)
        ax.set_axis_off()
        ax.imshow(img)
        if (rects is not None) and (rects.shape[0] > 0):
            rects = rects.assign(
                cx=rects.x + rects.width // 2,
                cy=rects.y + rects.height // 2,
            )
            X = [(cx, cy) for cx, cy in zip(rects.cx, rects.cy)]
            kmeans = KMeans(
                n_clusters=max(allowed_counts),
                random_state=42,
            ).fit(X)
            y_pred = kmeans.predict(X)
            if print_all:
                ax.scatter(rects.cx, rects.cy, c=y_pred, alpha=0.5)
            if print_rectangles:
                rects["y_pred"] = y_pred
                rects = rects.groupby("y_pred").median().reset_index()
                for x, y, w, h in zip(rects.x, rects.y, rects.width, rects.height):
                    ax.add_patch(
                        patches.Rectangle(
                            (x, y), w, h, linewidth=2, edgecolor="r", facecolor="none"
                        )
                    )
            if print_centers:
                centers = kmeans.cluster_centers_
                ax.scatter(centers[:, 0], centers[:, 1], c="b", s=200, alpha=0.5)
        plt.show()

    user_gt.clear_output()
    with user_gt:
        display(
            no_rect_df[no_rect_df.filename == observation]
            .reset_index()
            .drop(["filename", "user_agent"], axis=1)
        )

    value_counts.clear_output()
    with value_counts:
        display(
            no_rect_df[no_rect_df.filename == observation]["rect_count"].value_counts()
        )


def on_image_selected(change):
    print_ground_truth(
        observation=change.new,
        print_all=is_print_all.value,
        print_centers=is_print_centers.value,
        print_rectangles=is_print_rectangles.value,
    )


def on_print_all_changed(change):
    print_ground_truth(
        observation=dd_image.value,
        print_all=change.new,
        print_centers=is_print_centers.value,
        print_rectangles=is_print_rectangles.value,
    )


def on_print_centers_changed(change):
    print_ground_truth(
        observation=dd_image.value,
        print_all=is_print_all.value,
        print_centers=change.new,
        print_rectangles=is_print_rectangles.value,
    )


def on_print_rectangles_changed(change):
    print_ground_truth(
        observation=dd_image.value,
        print_all=is_print_all.value,
        print_centers=is_print_centers.value,
        print_rectangles=change.new,
    )


dd_image.observe(on_image_selected, names="value")

is_print_all.observe(on_print_all_changed, names="value")
is_print_centers.observe(on_print_centers_changed, names="value")
is_print_rectangles.observe(on_print_rectangles_changed, names="value")

display(
    HBox([dd_image, is_print_all, is_print_centers, is_print_rectangles]),
    image_stat_output,
    HBox([image_output, user_gt, value_counts]),
)

HBox(children=(Dropdown(description='Select an observation:', options=('Select an observation', 'b-1HoJ-Hqz5ST…

Output(layout=Layout(border='1px solid black'))

HBox(children=(Output(layout=Layout(border='1px solid black')), Output(layout=Layout(border='1px solid black')…

In [131]:
df_lst = []
for filename in df_stats.filename.unique().tolist():
    vc = no_rect_df[no_rect_df.filename == filename]["rect_count"].value_counts()
    if len(vc) > 1:
        v = vc.index.to_list()[:2]
        c = vc.to_list()[:2]
        if c[0] > 2 * c[1]:
            allowed_counts = [v[0]]
        else:
            allowed_counts = v
    else:
        allowed_counts = vc.index.to_list()

    rects = (
        tidy[(tidy.filename == filename) & (tidy.rect_count.isin(allowed_counts))]
        .drop(
            [
                "index",
                "user_name",
                "creation_time",
                "user_agent",
                "rect_count",
                "classifications_count",
            ],
            axis=1,
        )
        .dropna()
    )
    rects = rects[
        (rects.width < 2 * rects.width.median())
        & (rects.height < 2 * rects.height.median())
    ].reset_index()
    if (rects is not None) and (rects.shape[0] > 0):
        X = [
            (cx, cy)
            for cx, cy in zip(rects.x + rects.width // 2, rects.y + rects.height // 2)
        ]
        rects["y_pred"] = (
            KMeans(
                n_clusters=max(allowed_counts),
                random_state=42,
            )
            .fit(X)
            .predict(X)
        )
        df_lst.append(
            rects.assign(
                x=rects.groupby("y_pred", dropna=True,)[
                    "x"
                ].transform("median"),
                y=rects.groupby("y_pred", dropna=True,)[
                    "y"
                ].transform("median"),
                width=rects.groupby("y_pred", dropna=True,)[
                    "width"
                ].transform("median"),
                height=rects.groupby("y_pred", dropna=True,)[
                    "height"
                ].transform("median"),
            )
            .reset_index(drop=True)
            .drop(["y_pred", "index"], axis=1)
            .drop_duplicates()
            .reset_index(drop=True)
        )
    else:
        df_lst.append(
            pd.DataFrame(
                [[filename, np.NaN, np.NaN, np.NaN, np.NaN]],
                columns=["filename", "x", "y", "width", "height"],
            )
        )

FileNotFoundError: [Errno 2] No such file or directory: '../../output/zooniverse/boxes_final.csv'

In [135]:
df_final = pd.concat(df_lst)
df_final.to_csv(
    path_or_buf="../data_out/boxes_final.csv",
    index=False,
)
df_final

Unnamed: 0,filename,x,y,width,height
0,b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg,619.566864,756.123444,65.635284,63.824951
1,b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg,756.218201,591.466797,46.376831,38.321350
2,b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg,900.525146,730.371765,56.363098,56.363037
0,b-38OOnRNVd8OdKdmNYZXXy83E.jpg,631.533997,605.439514,37.705322,36.788940
1,b-38OOnRNVd8OdKdmNYZXXy83E.jpg,899.241516,891.426575,37.705261,35.149414
...,...,...,...,...,...
3,bzyUXItGRt98Cjh2dk1KeIyqjga8.jpg,761.688232,973.187805,80.784576,62.330414
4,bzyUXItGRt98Cjh2dk1KeIyqjga8.jpg,874.374115,922.859985,65.104553,77.646912
0,bzywiC3csPQ0738i9JSahLgzD9wE.jpg,927.864258,701.034180,87.356779,74.697632
1,bzywiC3csPQ0738i9JSahLgzD9wE.jpg,765.876777,568.327881,90.995261,83.236267


In [136]:
dd_final_image = widgets.Dropdown(options=sorted(df_final.filename.unique().tolist()))

final_image_output = widgets.Output(layout={"border": "1px solid black"})
rects_output = widgets.Output(layout={"border": "1px solid black"})


def print_final_rects(change):
    final_image_output.clear_output()
    rects = df_final[df_final.filename == change.new]
    with final_image_output:
        img = PilImage.open(f"../data_in/images/{change.new}")
        fig, ax = plt.subplots()
        fig.set_size_inches(14, 14)
        ax.set_axis_off()
        ax.imshow(img)
        if (rects is not None) and (rects.shape[0] > 0):
            for x, y, w, h in zip(rects.x, rects.y, rects.width, rects.height):
                ax.add_patch(
                    patches.Rectangle(
                        (x, y),
                        w,
                        h,
                        linewidth=2,
                        edgecolor="r",
                        facecolor="none",
                    )
                )
        plt.show()
    rects_output.clear_output()
    with rects_output:
        display(rects)


dd_final_image.observe(print_final_rects, names="value")
display(dd_final_image, HBox([final_image_output, rects_output]))

Dropdown(options=('b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg', 'b-38OOnRNVd8OdKdmNYZXXy83E.jpg', 'b-3Q-HdqeuB2sRxMIzzUPN…

HBox(children=(Output(layout=Layout(border='1px solid black')), Output(layout=Layout(border='1px solid black')…