In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
# real imports
import ipywidgets as widgets
from functools import partial

from IPython.display import clear_output

from mt2gf.utils import read_access_keys
                       

from mt2gf.gform import (get_gform_map,
                                get_batch_gform_map,
                                 get_drive_service,
                                 download_drive_txt,
                                 download_multi_csv,
                                 download_csv)

from mt2gf.preprocess import get_batch_indexes,create_batch_directories
from mt2gf.mturk import (create_mturk_client,
                                 Turker,MTurkParam)
from mt2gf.widgets import ControlPanel
from pathlib import Path
import pandas as pd
from mt2gf.watcher import Watcher

## Parameters

In [4]:
#################### File folder params ####################

# Root of the local checkout; every other path below is derived from it.
# NOTE(review): machine-specific absolute path - adapt it before running elsewhere.
repo_path = Path("/home/ymentha/Documents/Cours/dlab_project/mturk2gform/")

results_dir = repo_path / "data/results"

# Folder that holds credentials.json and where token.pk gets stored
creds_dir = repo_path / "creds"

use_batch = False

#################### Mturk params ####################

aws_key_path = creds_dir / "aws.csv"

production = False

# HIT layout id: one value when `production` is set, another one otherwise
hit_layout = "3ACG29O6JDJKYOPH2ORTS52TR57YKA" if production else "3XJFTJAV8QARKRU4KW7Q2OQT6WM9R4"

#################### Gforms params ####################

gform_map_path = results_dir / "gform_map.txt"

#################### Monitor params ####################

watcher_path = repo_path / "mt2gf/data/watcher.py"

watcher_forms_result_dir = repo_path / "data/monitor/"

# Upper bound on the number of forms a single worker may fill in for this project
max_forms_per_worker = 2

___

## Gform management

<div class="alert alert-success">
Adapt the following variables if you want a specific batch size / batch number. If <code>batch_number</code> is set to <b>None</b>, the batch number is computed automatically.
</div>

In [5]:
# file id to the gform mapping file (passed to get_gform_map / get_batch_gform_map below)
gform_map_id = "1QH7lTq_0mOhEmknMJwLJwZeZV1MSxf8ztN4AKtzrlTI"
# Drive service handle, reused by the gform-map download, the Watcher and the Turker below.
# NOTE(review): this credentials directory lives under the "emojivec" checkout, not under
# `creds_dir` (…/mturk2gform/creds) from the parameters cell - confirm which one is intended.
service = get_drive_service("/home/ymentha/Documents/Cours/dlab_project/emojivec/creds")

### Batch Implementation

In [6]:
# Select the forms to publish and the directory their results go to.
# The choice is driven by the `use_batch` flag of the parameters cell:
#   - batch mode: only the forms of the next batch, results in the batch directory
#   - otherwise: every form of the mapping file, results in `results_dir`
if use_batch:
    batch_size = 7
    # NOTE(review): this overrides the value set in the parameters cell - confirm it is intended
    max_forms_per_worker = 2

    create_batch_directories(results_dir,6)

    # retrieve the indexes for the next batch to compute
    # (batch_number=None lets the batch number be computed automatically)
    batch_dir,batch_number,form_indexes = get_batch_indexes(parent_dir=results_dir,
                                                  batch_number=None,
                                                  batch_size=batch_size,
                                                  MaxAssignments=max_forms_per_worker)
    print(f"Treating batch number {batch_number},forms:{form_indexes}")
    # target_dir is later handed to Turker as `formresdir`
    target_dir = batch_dir
    gform_map = get_batch_gform_map(service,gform_map_id,gform_map_path,form_indexes)
else:
    target_dir = results_dir
    gform_map = get_gform_map(service,gform_map_id,gform_map_path)

Download 100%


___

## Monitor

In [9]:
# Monitor object: receives the form map, the AWS key file and the Drive service.
# It is the source of the qualification requirement attached to the HITs
# (see watcher.get_qualif_requirement() in the next cell) and is handed to the ControlPanel.
# NOTE(review): presumably registers/looks up the qualification type named below on MTurk - confirm.
watcher = Watcher(form_results_dir=watcher_forms_result_dir,
                  gform_map=gform_map,
                  aws_key_path=aws_key_path,
                  drive_service=service,
                  max_forms_per_worker=max_forms_per_worker,
                  qualification_type_name="qualif test n5",
                  qualification_description="test qualification",          
                  production=production)

In [10]:
# Qualification requirement the monitor uses to tag workers.
# Wrapped in a list because it is passed as `QualificationRequirements` to MTurkParam below.
QualificationRequirements = [watcher.get_qualif_requirement()]

___

## MTurk management

In [11]:
# Settings of the HITs to create.
# `production` comes from the parameters cell (instead of a hard-coded False) so that
# the HITs, the `hit_layout` chosen there and the Watcher all target the same environment.
param = MTurkParam(
             production=production,
             MaxAssignments = 2,
             LifetimeInDays = 1 / 24 / 6,  # 10 min
             AutoApprovalDelayInDays=1,
             AssignmentDurationInSeconds=1000,
             Reward='0.01',
             HITTitle="Emojis Descriptions n",
             Keywords='emojis, description, sentiment, emotions',
             Description='Describe emojis by a single accurate word',
             aws_key_path=aws_key_path,
             # requirement provided by the Watcher (see the Monitor section)
             QualificationRequirements=QualificationRequirements,
             hit_layout=hit_layout
             )

### Callbacks instantiation

In [12]:
import Levenshtein
# Honeypot questions: maps a form column name to the list of answers accepted
# for that column (consumed by detect_honey_frauders below).
honeypots = {"3":["ab"],
             "4":["atm"],
             "18":['argentina'],
             "26":['back']
            }
def detect_honey_frauders(honeypots,form_df,dist_lshtein=2):
    """
    Returns the worker_ids of the workers who did not manage to find the honeypots

    A honeypot answer is accepted when its Levenshtein distance to at least one
    of the expected words is <= dist_lshtein. A missing answer (NaN/None) always
    counts as a failed honeypot.

    Args:
        honeypots (dict): column name of a honeypot question -> list of accepted words
        form_df (pd.df): as saved by download_all_csv_results; must contain a
            'WorkerID' column holding unique values
        dist_lshtein (int): distance tolerated to accept a honeypot

    Returns:
        set: WorkerIDs that failed at least one of the honeypots present in
            form_df; empty set when form_df contains no honeypot column

    Raises:
        AssertionError: if a WorkerID appears more than once in form_df
    """
    assert form_df['WorkerID'].is_unique, "form_df contains duplicated WorkerIDs"
    # set_index returns a new frame, the caller's dataframe is never modified
    answers = form_df.set_index('WorkerID')
    honey_columns = [col for col in answers.columns if col in honeypots]
    if len(honey_columns) == 0:
        return set()

    def _missed(word, corr_words):
        # Levenshtein.distance only works on strings: a missing cell is a wrong
        # answer, any other non-string value is compared through its text form.
        if not isinstance(word, str):
            if pd.isna(word):
                return True
            word = str(word)
        closest = min(Levenshtein.distance(word, corr_word) for corr_word in corr_words)
        return closest > dist_lshtein

    # One boolean column per honeypot (True = failed), built as a fresh frame
    # rather than by writing into a slice of the answers.
    failed = pd.DataFrame(
        {col: answers[col].apply(_missed, args=(honeypots[col],)) for col in honey_columns}
    )
    frauder_list = failed[failed.any(axis=1)].index.tolist()
    return set(frauder_list)
# Bind the honeypots so the resulting callback takes the form dataframe as its first argument
part_detect_honey_frauders = partial(detect_honey_frauders,honeypots)

def detect_repeat_frauders(form_df,threshold=0.8):
    """
    Flag the workers who kept repeating the same words in their answers.

    For every row, the number of distinct answers over all columns except
    'Timestamp' and 'WorkerID' is counted; a worker is flagged when that count
    is strictly below threshold * (number of answer columns).

    Args:
        form_df (pd.df): one row per worker, with a 'WorkerID' column
        threshold (float): minimal ratio of distinct answers to answer columns

    Returns:
        set: WorkerIDs of the flagged workers
    """
    answer_cols = [name for name in form_df.columns if name not in ['Timestamp','WorkerID']]
    min_vocab = threshold * len(answer_cols)
    # assign() works on a copy, so the caller's dataframe stays untouched
    scored = form_df.assign(vocsize=form_df[answer_cols].apply(lambda row: len(set(row)),axis=1))
    too_repetitive = scored['vocsize'] < min_vocab
    return set(scored[too_repetitive]['WorkerID'].values.tolist())

# (callback, label) pairs handed to Turker as `frauder_callbacks`.
# Each callback takes a form dataframe and returns a set of WorkerIDs.
# NOTE(review): the label is presumably the reason reported for the flagged workers - confirm in Turker.
frauder_callbacks = ((part_detect_honey_frauders,"Wrong honeypot confirmation"),
                     (detect_repeat_frauders,"Repeated vocabulary"))

In [13]:
def conf_code_generator(i):
    """
    Confirmation code generator.

    Maps the index `i` to the first three characters of the decimal
    representation of 837 * i + 763.
    """
    scrambled = 763 + 837 * i
    return f"{scrambled}"[:3]

In [14]:
# Turker gathers everything defined above: the MTurk parameters, the Drive service,
# the form map, the directory receiving the form results (`target_dir`, set in the
# batch cell) and the fraud-detection callbacks. It is driven by the ControlPanel below.
# NOTE(review): check_conf_code=True presumably makes Turker validate the codes produced
# by conf_code_generator - confirm in mt2gf.mturk.
turk = Turker(meta_dir = repo_path,
              param=param,
              gservice=service,
              gform_map=gform_map,
              formresdir=target_dir,
              conf_code_generator = conf_code_generator,
              frauder_callbacks = frauder_callbacks,
              check_conf_code=True)

Estimated cost:0.06 $


___

## Control Panel

In [15]:
# Button panel operating on the Turker and the Watcher created above
# (the recorded output lists the available actions: list/create hits, approve, start monitor, ...).
control_panel = ControlPanel(turk=turk,
                  watcher=watcher
                  )

HBox(children=(Button(description='list hits', style=ButtonStyle()), Button(description='create hits', style=B…

HBox(children=(Button(description='list assignments', style=ButtonStyle()), Button(description='approve all', …

HBox(children=(Button(description='approve correct (dry)', style=ButtonStyle(button_color='lightgreen')), Butt…

HBox(children=(Button(description='stop all hits', style=ButtonStyle(button_color='orange')), Button(button_st…

HBox(children=(Button(button_style='info', description='start monitor', style=ButtonStyle()), Button(button_st…

HBox(children=(Button(button_style='primary', description='list tagged workers', style=ButtonStyle()), Button(…

Text(value='', placeholder='Results HITid/formidx')

In [25]:
# Ad-hoc URL lookup for a hard-coded id, left over from an interactive session
# (execution count is out of sequence with the cells above).
# NOTE(review): the recorded output has no separator between "mturk" and the id that follows -
# verify how Turker.get_url builds the URL.
turk.get_url("3UDTAB6HH6KK32YZLP36Z5R8NM390H")

'https://workersandbox.mturk.com/mturk3A4WG0MNLT41BIOASONE1OFVLCUJF9'