# Setup

In [39]:
# path to the config file (see example). 
# will work with a config file for https://github.com/kimberli/mturk-template, 
# but a minimal example is included.
# Also supports an additional feature, the "variants" key, which is a list of dictionaries. 
# If "variants" is specified, for each dictionary it contains, those keys will be meshed with the "hitCreation"
# key and one task will be made per variant. Else, config["hitCreation"]["numTasks"] versions of the same
# task will be launched. 
CONFIG_PATH = "./config.json"

# where to save downloaded results 
SAVE_PATH = "./result.csv" 

# Whether to launch a hit per fold.txt in folder "files"
LAUNCH_HITS_FOR_ALL_FOLDS = False

## Define folds to use in the hit
The folds are txt files containing paths to the images to be used in the hit. This notebooks allows for creation of multiple hits, each with a different fold.txt .
In the folder "files", multiple txt files (folds) can be found. Each one contains 10 images, and there are no repeats among folds. Each fold has a piece of the full canva_scraping2 dataset.

In [48]:
from boto3 import client
import json
import copy

_USING_PROD = None

with open(CONFIG_PATH, 'r') as f:
    config = json.loads(f.read())
    hit_config = config['hitCreation']

if hit_config['production']:
    print("USING PROD")
    _USING_PROD = True
    endpoint_url = 'https://mturk-requester.us-east-1.amazonaws.com'
else:
    print("USING SANDBOX")
    _USING_PROD = False
    endpoint_url = 'https://mturk-requester-sandbox.us-east-1.amazonaws.com'
        
cl = client('mturk', region_name='us-east-1', endpoint_url=endpoint_url)

if hit_config['fold']:
    hit_config['taskUrl'] = hit_config['taskUrl'] + "?url=%s" % hit_config['fold']

if LAUNCH_HITS_FOR_ALL_FOLDS:
    all_folds = os.listdir('./all_folds')        
    
print("TASK URL:", hit_config['taskUrl'])


USING SANDBOX
TASK URL: https://cfosco.github.io/mturk-importance/?url=fold2_dropbox.txt


# Make new HIT

In [49]:
# Safety flags that prevent you from accidentally messing up your HITs. 
# Set to False except when you are performing these specific tasks. 
ALLOW_HIT_CREATION = True
ALLOW_ASSIGNMENT_ADDITION = False
ALLOW_CREATE_QUAL = False
ALLOW_UPDATE_EXPIRATION = False

In [50]:
# List of qualifications that you will use to filter potential workers. 
# These require that workers come from the US and have an approval rating >= 95%
QUALS = [
       {
           'QualificationTypeId': '00000000000000000071',
           'Comparator': 'EqualTo',
           'LocaleValues': [{
               'Country': 'US',
           }],
       },
        
       {
           'QualificationTypeId': '000000000000000000L0',
           'Comparator': 'GreaterThanOrEqualTo',
           'IntegerValues': [
               95
           ],
       },
    ]

In [51]:
# creates a HIT in the form of an External Question inside an iFrame
def create_hit(task):
    questionText = "<ExternalQuestion xmlns=\"http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/"
    questionText += "2006-07-14/ExternalQuestion.xsd\">\n<ExternalURL>" + task['taskUrl']
    questionText += "</ExternalURL>\n  <FrameHeight>700</FrameHeight>\n</ExternalQuestion>"

    response = cl.create_hit(
        MaxAssignments=task['numAssignments'],
        AutoApprovalDelayInSeconds=604800,
        LifetimeInSeconds=task['lifetime'],
        AssignmentDurationInSeconds=task['duration'],
        Reward=task['rewardAmount'],
        Title=task['title'],
        Keywords=task['keywords'],
        Description=task['description'],
        Question=questionText,
        QualificationRequirements=QUALS,
    )

    print(response)
    print("\n")

In [52]:
if ALLOW_HIT_CREATION: 
    if config.get('variants', None): 
        print("creating " + str(len(config['variants'])) + " variants")
        for var in config['variants']: 
            task = copy.deepcopy(config)
            task.update(var)
            create_hit(task)
    
    elif LAUNCH_HITS_FOR_ALL_FOLDS:
        print("creating", len(all_folds), "tasks")
        for fold in all_folds:
            hit_config['taskUrl'] = hit_config['taskUrl'] + "?url=%s" % fold
            create_hit(hit_config)
    else:
        print("creating " + str(hit_config['numTasks']) + " tasks")
        for i in range(hit_config['numTasks']):
            create_hit(hit_config)

creating 1 tasks
{'HIT': {'HITId': '3U18MJKL1U3I0R5JQXX0NCG3SJZCNI', 'HITTypeId': '33VSRLCQV1I71CR6MVQQ5U0SMIPXC7', 'HITGroupId': '3GBCJUK5C2I2RARWHU21Q3KS7CZPKT', 'CreationTime': datetime.datetime(2019, 1, 27, 6, 49, 23, tzinfo=tzlocal()), 'Title': 'Importance labeling', 'Description': 'Highlight the important parts of an image.', 'Question': '<ExternalQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd">\n<ExternalURL>https://cfosco.github.io/mturk-importance/?url=fold2_dropbox.txt</ExternalURL>\n  <FrameHeight>700</FrameHeight>\n</ExternalQuestion>', 'Keywords': 'labeling, importance, highlighting', 'HITStatus': 'Assignable', 'MaxAssignments': 2000, 'Reward': '0.00', 'AutoApprovalDelayInSeconds': 604800, 'Expiration': datetime.datetime(2019, 1, 28, 6, 49, 23, tzinfo=tzlocal()), 'AssignmentDurationInSeconds': 1200, 'QualificationRequirements': [{'QualificationTypeId': '00000000000000000071', 'Comparator': 'EqualTo', 'Locale

# HIT monitoring helpers

In [22]:
MAX_RESULTS = 10 # set equal to the number of outstanding hits you have 

hits = cl.list_hits(MaxResults=MAX_RESULTS)['HITs']

In [23]:
# Gets all assignments created for a HIT
def get_all_assignments(hitid): 
    assignments = []
    should_continue = True
    next_token = False
    while (should_continue): 
        args = {
            'HITId': hitid, 
            'MaxResults': 100
        }
        if (next_token): 
            args['NextToken'] = next_token
        r = cl.list_assignments_for_hit(**args)
        next_token = r.get('NextToken', False)
        assignments.extend(r["Assignments"])
        should_continue = len(r["Assignments"]) > 0
    return assignments

In [24]:
import datetime

# Summarizes all hits in `hits` in a human-readable way 
def summarize_hits(hits): 
    print(len(hits))
    ret = ""
    for hit in hits: 
        expiration = hit['Expiration'].replace(tzinfo=None)
        is_expired = expiration < datetime.datetime.now()
        description = ("Title: {title}\n" 
        "ID: {hid}\n"
        "\tAssignments left: {left}\n"
        "\tAssignments completed: {complete}\n"
        "\tAssignments pending: {pending}\n"
        "\tExpired: {exp}\n\n").format(
            title=hit['Title'], 
            hid=hit['HITId'], 
            left=hit['NumberOfAssignmentsAvailable'], 
            complete=hit['NumberOfAssignmentsCompleted'], 
            pending=hit['NumberOfAssignmentsPending'],
            exp=str(is_expired)
        )
        ret += description
    print(ret)

In [25]:
# Summarizes all pending/submitted/approved assignments for all hits in `hits`
def summarize_assignments(hits):
    ret = ""
    for hit in hits: 
        hid = hit['HITId']
        title =  hit['Title']
        name = "HIT %s: %s" % (hid, title)
        ret += name + "\n"
        assignments = get_all_assignments(hid)
        for a in assignments: 
            desc = "\tAssignment {aid}\n\t\tStatus: {status}\n".format(aid=a['AssignmentId'], status=a['AssignmentStatus'])
            ret += desc
    print(ret)

In [26]:
def refresh_hits(): 
    global hits 
    global MAX_RESULTS
    hits = cl.list_hits(MaxResults=MAX_RESULTS)['HITs']

# HIT monitoring

In [27]:
refresh_hits()
pass

In [28]:
refresh_hits()

summarize_hits(hits)

10
Title: Importance labeling
ID: 3S37Y8CWI8H5SH1IPJI50XH6TIDW47
	Assignments left: 1999
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Importance labeling
ID: 3PR3LXCWSFGLENCKGJAIJO6TOCXX9X
	Assignments left: 2000
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Importance labeling
ID: 3S8A4GJRD3K6SF5RIX6OBTI6I0HV6H
	Assignments left: 1999
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Importance labeling
ID: 3W3RSPVVGS8OYLJY0B2L4QYKUOXULZ
	Assignments left: 1999
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Importance labeling
ID: 3DFYDSXB2WHI3VVYXQQ8OBLSQZSJUM
	Assignments left: 2000
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Importance labeling
ID: 3CMV9YRYP3IFK584OCY7XOP9D8ZJLS
	Assignments left: 1999
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Importance labeling
ID: 3LG268AV38NICNHPFVKCMKLGD5MERB
	Assignments left

In [31]:
refresh_hits()
summarize_assignments(hits)
pass

HIT 34OWYT6U3WYOOSNPHXAHGEJEZ5CI9C: Importance labeling
HIT 3WGCNLZJKFPPRC48L6UDABLI9M91D9: Importance labeling
	Assignment 3NPI0JQDAPM2BDH1I0AK5VHJNRFPTG
		Status: Submitted
HIT 3W1K7D6QSBYT7BRWFLCZ7FHUJG9ZBO: Marking important regions in a graphic design
HIT 32TZXEA1OL1CRGLFSVTK77RCW9G14W: Marking important regions in a graphic design
HIT 3WUVMVA7OBKOO1M497C3C4XO1YOZAT: Marking important regions in a graphic design
	Assignment 37UQDCYH6YCSIZHQR9GI6REJKI97VR
		Status: Submitted
HIT 31KSVEGZ349CTN3G0VEI8QSGK08WR8: Marking important regions in a graphic design
	Assignment 32KTQ2V7REWD6VBASEDBXRMCOFL9MO
		Status: Submitted
HIT 3IZVJEBJ6A1N1ZA1JUVRWZV25FYZ63: Marking important regions in a graphic design
HIT 3WRKFXQBOBO3PTVXP92XFVC58COYI4: Marking important regions in a graphic design
	Assignment 3KGTPGBS6Y2LJJVKG7PIEEVSVUTU2M
		Status: Submitted
HIT 3KG2UQJ0MJ5MUSF2VFYV7H5KV21NQ6: Marking important regions in a graphic design
	Assignment 31IBVUNM9TG5XSWW9SEUUJR0I68FVJ
		Status: Submitted

# Approve HITs

In [29]:
# Approves all outstanding hits created for the HITs in hits 
def approve_all(hits): 
    num_approved = 0
    for hit in hits: 
        # make sure you keep getting assignments 
        assignments = get_all_assignments(hit["HITId"])
        #print(assignments)
        for a in assignments: 
            if a['AssignmentStatus'] != 'Approved':
                print("Approving assignment")
                num_approved += 1
                cl.approve_assignment(AssignmentId=a['AssignmentId'])
    print("Approved %d assignments" % num_approved)

In [30]:
refresh_hits()
approve_all(hits)

Approving assignment
Approving assignment
Approving assignment
Approving assignment
Approved 4 assignments


# Update expiration or num tasks

In [31]:
import datetime 

# changes the expiration date on a HIT to days_from_now days in the future
def update_expiration(hitid, days_from_now): 
    if ALLOW_UPDATE_EXPIRATION: 
        days = days_from_now*datetime.timedelta(days=1)
        expire_time = datetime.datetime.now() + days

        response = cl.update_expiration_for_hit(HITId=hitid, ExpireAt=expire_time)
        print(response)
        return response
    else: 
        raise RuntimeException("This action is not currently enabled; set `ALLOW_UPDATE_EXPIRATION` to true to proceed with this action")
    
def expire_hit(hit): 
    return update_expiration(hit, -10)

In [32]:
def add_assignments(hitid, num_assignments): 
    if ALLOW_ASSIGNMENT_ADDITION: 
        response = cl.create_additional_assignments_for_hit(
            HITId=hitid,
            NumberOfAdditionalAssignments=num_assignments
        )
        print(response)
        return response
    else: 
        raise RuntimException("This action is not currently enabled; set `ALLOW_ASSIGNMENT_ADDITION` to true to proceed with this action")

# Add custom qualifications 

## Add a qualification to disqualify workers who have done work before

- uses "negative qualification" method from https://github.com/cloudyr/MturkR/wiki/qualifications-as-blocks

### NOTE: quals are kept separate for the sandbox and prod. Make sure you are creating and assigning your quals in prod. 

In [33]:
# structure of a new qualification 
NEW_QUAL = {
    'Name': 'qualName',
    'Keywords': 'Keywords for qual',
    'Description': 'What is this qual, and why are you assigning it?',
    'QualificationTypeStatus': 'Active',
    'AutoGranted': False
}

In [34]:
def create_qual(new_qual):
    if ALLOW_CREATE_QUAL: 
        response = cl.create_qualification_type(**new_qual)
        print(response)
        Id = response['QualificationTypeId']
        print("id", Id)
        return Id
    else: 
        raise RuntimException("This action is not currently enabled; set `ALLOW_CREATE_QUAL` to true to proceed with this action")

In [38]:
# Gets all the custom quals you have created. 
def list_quals(): 
    response = cl.list_qualification_types(
            Query='hasCompletedVisualGraphRecallTask',
            MustBeRequestable=False
    )
    print(response)

list_quals()

{'NumResults': 0, 'QualificationTypes': [], 'ResponseMetadata': {'RequestId': '4ff9cff9-32f4-49b2-b79a-0137726813e0', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '4ff9cff9-32f4-49b2-b79a-0137726813e0', 'content-type': 'application/x-amz-json-1.1', 'content-length': '40', 'date': 'Tue, 27 Nov 2018 21:11:39 GMT'}, 'RetryAttempts': 0}}


In [39]:
def assign_qual(qual_id, worker_ids): 
    for worker in worker_ids: 
        response = cl.associate_qualification_with_worker(
                QualificationTypeId=qual_id, 
                WorkerId=worker,
                IntegerValue=1,
                SendNotification=False
        )
        print(response)
        assert response
        
def get_workers_for_hit(hitid): 
    a = get_all_assignments(hitid)
    workers = [a_['WorkerId'] for a_ in a]
    return workers
    
def confirm_quals(qual_id, worker_ids): 
    for w in worker_ids: 
        response = cl.get_qualification_score(
                QualificationTypeId=qual_id,
                WorkerId=w
        )
        response = response['Qualification']
        assert response['Status'] == 'Granted'
        assert response['IntegerValue'] == 1
        
# Assigns qual with `qual_id` to every worker who has completed an assignment for the hit with `hitid`
def assign_qual_for_hit(hitid, qual_id): 
    workers = get_workers_for_hit(hitid)
    print("got workers")
    assign_qual(qual_id, workers)
    print("assigned qual")
    confirm_quals(qual_id, workers)
    print("confirmed qual")

# Download data

In [35]:
from bs4 import BeautifulSoup as bs 
import pprint

def pretty_print(obj):
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(obj)
    pp = None

# Downloads all the assignments completed for `hits` as a list of dictionaries. 
# If a download_path is given, also saves that data as json 
def get_assignment_content(hits, download_path="", should_print=False): 
    all_responses = []
    for hit in hits: 
        hitid = hit['HITId']
        assignments = get_all_assignments(hitid)
        print(hitid)
        print(assignments)
        for a in assignments:
            #print(a)
            a_xml = a['Answer']
            #print(a_xml)
            soup = bs(a_xml, "lxml")
            answers = soup.find_all("answer")
            #print(answers)
            results = {'HITId': hitid}
            for ans in answers: 
                identifier = ans.find('questionidentifier').string
                answer = ans.find('freetext').string
                try: 
                    results[identifier] = json.loads(answer)
                except:
                    results[identifier] = answer
            all_responses.append(results)
    if should_print: 
        pretty_print(all_responses)
    if download_path: 
        with open(download_path, 'w') as outfile: 
            json.dump(all_responses, outfile)
    return all_responses
            

In [36]:
responses = get_assignment_content(hits, download_path='responses.json', should_print=False)
len(responses)

3S37Y8CWI8H5SH1IPJI50XH6TIDW47
[{'AssignmentId': '39GAF6DQWSHE7D2O2TZ95HN6H2A1V1', 'WorkerId': 'ACUP2HVXGZY46', 'HITId': '3S37Y8CWI8H5SH1IPJI50XH6TIDW47', 'AssignmentStatus': 'Approved', 'AutoApprovalTime': datetime.datetime(2019, 2, 1, 15, 11, 44, tzinfo=tzlocal()), 'AcceptTime': datetime.datetime(2019, 1, 25, 15, 10, 6, tzinfo=tzlocal()), 'SubmitTime': datetime.datetime(2019, 1, 25, 15, 11, 44, tzinfo=tzlocal()), 'ApprovalTime': datetime.datetime(2019, 1, 25, 15, 14, 41, tzinfo=tzlocal()), 'Answer': '<?xml version="1.0" encoding="ASCII"?><QuestionFormAnswers xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionFormAnswers.xsd"><Answer><QuestionIdentifier>workerId</QuestionIdentifier><FreeText>ACUP2HVXGZY46</FreeText></Answer><Answer><QuestionIdentifier>results</QuestionIdentifier><FreeText>{"inputs":[],"outputs":[["600,400,./files/certificates_1_13_MACS70yIlqs.png:3,2,1,258.35,117.5,258.35,118.35,258.35,120.85,258.35,123.35,258.35,128.35,258.35,1

3W3RSPVVGS8OYLJY0B2L4QYKUOXULZ
[{'AssignmentId': '373ERPL3YPPNNQXGVSE9AQHYOFNRT6', 'WorkerId': 'ACUP2HVXGZY46', 'HITId': '3W3RSPVVGS8OYLJY0B2L4QYKUOXULZ', 'AssignmentStatus': 'Approved', 'AutoApprovalTime': datetime.datetime(2019, 2, 1, 14, 58, 18, tzinfo=tzlocal()), 'AcceptTime': datetime.datetime(2019, 1, 25, 14, 57, 59, tzinfo=tzlocal()), 'SubmitTime': datetime.datetime(2019, 1, 25, 14, 58, 18, tzinfo=tzlocal()), 'ApprovalTime': datetime.datetime(2019, 1, 25, 15, 14, 42, tzinfo=tzlocal()), 'Answer': '<?xml version="1.0" encoding="ASCII"?><QuestionFormAnswers xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionFormAnswers.xsd"><Answer><QuestionIdentifier>workerId</QuestionIdentifier><FreeText>ACUP2HVXGZY46</FreeText></Answer><Answer><QuestionIdentifier>results</QuestionIdentifier><FreeText>{"inputs":[],"outputs":[["240,600,./files/infographics_1_0_free.png:3,2,1,320.85,380.85,;","464,600,./files/magazine-covers_14_31_free.png:","377,600,./files/

4