# Setup

In [2]:
# Path to the JSON config file (see example).
# Works with a config file for https://github.com/kimberli/mturk-template,
# but a minimal example is included.
# Also supports an additional feature, the "variants" key, which is a list of dictionaries.
# If "variants" is specified, the keys of each dictionary are merged with the "hitCreation"
# settings and one task is made per variant. Otherwise, config["hitCreation"]["numTasks"]
# copies of the same task are launched.
CONFIG_PATH = "./config.json"

# Where to save downloaded results.
SAVE_PATH = "./result.csv"

# Whether to launch one HIT per fold file instead of a single HIT.
# NOTE(review): the setup cell lists the fold files from './all_folds', not from a
# folder named "files" -- confirm which directory is intended.
LAUNCH_HITS_FOR_ALL_FOLDS = False

## Define folds to use in the hit
The folds are txt files containing paths to the images to be used in the HIT. This notebook allows the creation of multiple HITs, each with a different fold file (e.g. `fold.txt`).
In the folder "files", multiple txt files (folds) can be found. Each one contains 10 images, and there are no repeats among folds. Each fold holds a piece of the full canva_scraping2 dataset.

In [3]:
import copy
import json
import os
from xml.sax.saxutils import escape

from boto3 import client

# Records which MTurk environment this session talks to; set from the config below.
_USING_PROD = None

# Load the whole config; `hit_config` is the "hitCreation" section that describes the HIT.
with open(CONFIG_PATH, 'r') as f:
    config = json.load(f)
hit_config = config['hitCreation']

# Pick the production or sandbox requester endpoint according to the config.
if hit_config['production']:
    print("USING PROD")
    _USING_PROD = True
    endpoint_url = 'https://mturk-requester.us-east-1.amazonaws.com'
else:
    print("USING SANDBOX")
    _USING_PROD = False
    endpoint_url = 'https://mturk-requester-sandbox.us-east-1.amazonaws.com'

cl = client('mturk', region_name='us-east-1', endpoint_url=endpoint_url)

# Resolve the fold to show in the task, in order of precedence:
#   1. the optional "fold" key of the config,
#   2. a `fld` variable already defined in this session,
#   3. an interactive prompt.
# `globals().get` is used so that a fresh kernel (where `fld` does not exist yet)
# falls through to the prompt instead of raising NameError.
fold_name = hit_config.get('fold') or globals().get('fld')
if not fold_name:
    fld = input('Define fold (input name+.txt)')
    fold_name = fld

# The fold is always passed to the task page as the `url` query parameter.
hit_config['taskUrl'] = hit_config['taskUrl'] + "?url=%s" % fold_name

if LAUNCH_HITS_FOR_ALL_FOLDS:
    # Sorted so that the launch order is the same on every run.
    all_folds = sorted(os.listdir('./all_folds'))

print("TASK URL:", hit_config['taskUrl'])


USING PROD
TASK URL: https://cfosco.github.io/mturk-importance/?url=fold_resume.txt


# Make new HIT

In [4]:
# Safety flags that prevent you from accidentally messing up your HITs.
# Each flag gates one kind of action; set it to False except while you are
# deliberately performing that specific task.
ALLOW_HIT_CREATION = True  # checked by the HIT-creation cell before it calls create_hit
ALLOW_ASSIGNMENT_ADDITION = False  # checked by add_assignments
ALLOW_CREATE_QUAL = False  # checked by create_qual
ALLOW_UPDATE_EXPIRATION = False  # checked by update_expiration

In [14]:
# Qualification requirements attached to every HIT created by create_hit.
# Together they require that workers come from the US and have an approval rating >= 95%.
QUALS = [
    # Locale requirement: the worker's country must equal 'US'.
    dict(
        QualificationTypeId='00000000000000000071',
        Comparator='EqualTo',
        LocaleValues=[dict(Country='US')],
    ),
    # Approval-rating requirement: the value must be at least 95.
    dict(
        QualificationTypeId='000000000000000000L0',
        Comparator='GreaterThanOrEqualTo',
        IntegerValues=[95],
    ),
]

In [15]:
def create_hit(task, frame_height=700, auto_approval_delay=604800):
    """Create one HIT in the form of an ExternalQuestion shown inside an iframe.

    Args:
        task: dict with the keys 'taskUrl', 'numAssignments', 'lifetime',
            'duration', 'rewardAmount', 'title', 'keywords' and 'description'.
        frame_height: value written into the question's <FrameHeight> element.
        auto_approval_delay: seconds passed as AutoApprovalDelayInSeconds
            (the default 604800 is 7 days).

    Returns:
        The response of `cl.create_hit`, which is also printed.
    """
    # The URL becomes XML text, so characters such as '&' in a query string
    # must be escaped or the question document is not well-formed.
    question_xml = (
        '<ExternalQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/'
        '2006-07-14/ExternalQuestion.xsd">\n'
        '<ExternalURL>' + escape(task['taskUrl']) + '</ExternalURL>\n'
        '  <FrameHeight>' + str(frame_height) + '</FrameHeight>\n'
        '</ExternalQuestion>'
    )

    response = cl.create_hit(
        MaxAssignments=task['numAssignments'],
        AutoApprovalDelayInSeconds=auto_approval_delay,
        LifetimeInSeconds=task['lifetime'],
        AssignmentDurationInSeconds=task['duration'],
        Reward=task['rewardAmount'],
        Title=task['title'],
        Keywords=task['keywords'],
        Description=task['description'],
        Question=question_xml,
        QualificationRequirements=QUALS,
    )

    print(response)
    print("\n")
    # Returned so callers can keep the HIT id instead of copying it from the printout.
    return response

In [16]:
# Launch HITs, but only when the ALLOW_HIT_CREATION safety flag is set.
if ALLOW_HIT_CREATION:
    if config.get('variants', None):
        print("creating " + str(len(config['variants'])) + " variants")
        for var in config['variants']:
            # Each variant overrides keys of the "hitCreation" settings, which is
            # where create_hit looks up 'taskUrl', 'title', 'rewardAmount', etc.
            task = copy.deepcopy(hit_config)
            task.update(var)
            create_hit(task)

    elif LAUNCH_HITS_FOR_ALL_FOLDS:
        print("creating", len(all_folds), "tasks")
        # Strip any "?url=..." suffix already present so that every HIT gets exactly
        # one fold parameter instead of an ever-growing chain of them.
        base_url = hit_config['taskUrl'].split('?url=')[0]
        for fold in all_folds:
            # Work on a copy so the shared hit_config is not mutated per fold.
            task = copy.deepcopy(hit_config)
            task['taskUrl'] = base_url + "?url=%s" % fold
            create_hit(task)
    else:
        print("creating " + str(hit_config['numTasks']) + " tasks")
        for i in range(hit_config['numTasks']):
            create_hit(hit_config)

creating 1 tasks
{'HIT': {'HITId': '3RQVKZ7ZRKS8XDDQ7UZKWOWI5D027C', 'HITTypeId': '3Q1N1EPUS9B7W1G860U3ZBL7FUXNZP', 'HITGroupId': '329GGY5YRIF73PNW7HVJ4H6MJJG4TM', 'CreationTime': datetime.datetime(2019, 2, 1, 0, 52, 12, tzinfo=tzlocal()), 'Title': 'Annotate the most important regions on graphic designs', 'Description': 'Manually highlight the important parts of a graphic design. You will be shown a set of 10 images, and you will have to indicate which parts feel important to you.', 'Question': '<ExternalQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd">\n<ExternalURL>https://cfosco.github.io/mturk-importance/?url=fold_resume.txt</ExternalURL>\n  <FrameHeight>700</FrameHeight>\n</ExternalQuestion>', 'Keywords': 'labeling, importance, highlighting, graphic, designs, images', 'HITStatus': 'Assignable', 'MaxAssignments': 10, 'Reward': '1.00', 'AutoApprovalDelayInSeconds': 604800, 'Expiration': datetime.datetime(2019, 2, 3, 4,

# HIT monitoring helpers

In [5]:
# Upper bound on how many HITs list_hits is asked to return.
MAX_RESULTS = 3 # set equal to the number of outstanding hits you have

# Initial snapshot of the HITs; refresh_hits() re-fetches this list later on.
hits = cl.list_hits(MaxResults=MAX_RESULTS)['HITs']

In [6]:
# Sanity check: how many HITs the listing returned (at most MAX_RESULTS were requested).
print(len(hits))

3


In [7]:
def get_all_assignments(hitid):
    """Return every assignment listed for the HIT `hitid`, following pagination.

    Pages of up to 100 assignments are requested from
    `cl.list_assignments_for_hit` and concatenated in the order received.
    Paging stops as soon as a page is empty or carries no NextToken, so a
    final non-empty page without a token ends the loop instead of causing the
    first page to be requested again.

    Args:
        hitid: id of the HIT whose assignments are wanted.

    Returns:
        A list of the assignment dicts returned by the service.
    """
    assignments = []
    next_token = None
    while True:
        args = {
            'HITId': hitid,
            'MaxResults': 100
        }
        if next_token:
            args['NextToken'] = next_token
        page = cl.list_assignments_for_hit(**args)
        batch = page.get("Assignments", [])
        assignments.extend(batch)
        next_token = page.get('NextToken')
        if not batch or not next_token:
            break
    return assignments

In [8]:
import datetime

def summarize_hits(hits):
    """Print the number of HITs followed by a human-readable summary of each.

    For every HIT the title, id, assignment counters (available, completed,
    pending) and whether its expiration time has passed are printed.

    Args:
        hits: list of HIT dicts with the keys 'Title', 'HITId', 'Expiration'
            (a datetime), 'NumberOfAssignmentsAvailable',
            'NumberOfAssignmentsCompleted' and 'NumberOfAssignmentsPending'.
    """
    print(len(hits))
    ret = ""
    for hit in hits:
        expiration = hit['Expiration']
        # Take "now" in the expiration's own timezone. For a naive timestamp
        # tzinfo is None and this is the plain local time; for an aware one the
        # comparison is exact whatever zone the timestamp is expressed in.
        now = datetime.datetime.now(expiration.tzinfo)
        is_expired = expiration < now
        description = ("Title: {title}\n"
        "ID: {hid}\n"
        "\tAssignments left: {left}\n"
        "\tAssignments completed: {complete}\n"
        "\tAssignments pending: {pending}\n"
        "\tExpired: {exp}\n\n").format(
            title=hit['Title'],
            hid=hit['HITId'],
            left=hit['NumberOfAssignmentsAvailable'],
            complete=hit['NumberOfAssignmentsCompleted'],
            pending=hit['NumberOfAssignmentsPending'],
            exp=str(is_expired)
        )
        ret += description
    print(ret)

In [9]:
def summarize_assignments(hits):
    """Print, for each HIT in `hits`, its id and title followed by the id and
    status of every assignment returned by get_all_assignments for that HIT."""
    pieces = []
    for hit in hits:
        hit_id, hit_title = hit['HITId'], hit['Title']
        pieces.append("HIT %s: %s" % (hit_id, hit_title) + "\n")
        for assignment in get_all_assignments(hit_id):
            pieces.append(
                "\tAssignment {aid}\n\t\tStatus: {status}\n".format(
                    aid=assignment['AssignmentId'],
                    status=assignment['AssignmentStatus'],
                )
            )
    print("".join(pieces))

In [10]:
def refresh_hits():
    """Re-fetch up to MAX_RESULTS HITs and store them in the global `hits`."""
    global hits
    listing = cl.list_hits(MaxResults=MAX_RESULTS)
    hits = listing['HITs']

# HIT monitoring

In [11]:
# Re-fetch the global `hits` list so the cells below work on current data.
refresh_hits()
pass

In [12]:
# Re-fetch the HITs, then print the per-HIT summary (counters and expiry).
refresh_hits()

summarize_hits(hits)

3
Title: Annotate the most important regions on graphic designs
ID: 3RQVKZ7ZRKS8XDDQ7UZKWOWI5D027C
	Assignments left: 0
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Annotate the most important regions on graphic designs
ID: 3RWB1RTQDJW8PYJFJX226106R7XP88
	Assignments left: 0
	Assignments completed: 0
	Assignments pending: 0
	Expired: False

Title: Annotate the most important regions on graphic designs
ID: 3DGDV62G7OIX3JHQ3GT1WAN3575P25
	Assignments left: 0
	Assignments completed: 0
	Assignments pending: 0
	Expired: False




In [13]:
# Re-fetch the HITs, then print the status of every assignment of each HIT.
refresh_hits()
summarize_assignments(hits)
pass

HIT 3RQVKZ7ZRKS8XDDQ7UZKWOWI5D027C: Annotate the most important regions on graphic designs
	Assignment 3RJSC4XJ114P1JUNUGLA3ZLTX1N50V
		Status: Submitted
	Assignment 3NS0A6KXC5IKJ1PXF6HTDVF45HJZGY
		Status: Submitted
	Assignment 3ZQIG0FLQFQZ73ANTEW8ESOWBU4WVV
		Status: Submitted
	Assignment 3RXCAC0YISZ5Z5OA6GHG0BI068D8G3
		Status: Submitted
	Assignment 3UOUJI6MTEOEJOQDYTIPKP0K6P0UXW
		Status: Submitted
	Assignment 3H7Z272LX8H6RP1S3OG7ZPIYRZYPL9
		Status: Submitted
	Assignment 3FPRZHYEPZH2G58ZJUL69VDZD1A3VW
		Status: Submitted
	Assignment 34BBWHLWHBLULXQNU7XPZ3Y23SBWI1
		Status: Submitted
	Assignment 39RP059MEI3OTDI3KVZL4ISYA0DBMG
		Status: Submitted
	Assignment 35GCEFQ6I6YJPPD9KRA9U1QV1CI3ZY
		Status: Submitted
HIT 3RWB1RTQDJW8PYJFJX226106R7XP88: Annotate the most important regions on graphic designs
	Assignment 3RYC5T2D743HUNW588D3L1RXXRNRP1
		Status: Submitted
	Assignment 3M0NZ3JDP28M340DYAY29YOFF85Z5G
		Status: Submitted
	Assignment 3R5F3LQFV3U8BE6V3PU1I4O6W3AZOL
		Status: Submitted

# Approve HITs

In [14]:
def approve_all(hits):
    """Approve every assignment of the HITs in `hits` that is awaiting review.

    Only assignments whose status is 'Submitted' are approved. Assignments
    that are already 'Approved' are skipped, and so are 'Rejected' ones, so
    that this bulk helper never attempts to reverse an earlier rejection.

    Args:
        hits: list of HIT dicts; only the 'HITId' key is read.

    Prints one line per approval and the total number approved.
    """
    num_approved = 0
    for hit in hits:
        # Fetch the assignments fresh for each HIT rather than relying on a cached list.
        for a in get_all_assignments(hit["HITId"]):
            if a['AssignmentStatus'] != 'Submitted':
                continue
            print("Approving assignment")
            cl.approve_assignment(AssignmentId=a['AssignmentId'])
            # Counted only after the call returned, so the total reflects real approvals.
            num_approved += 1
    print("Approved %d assignments" % num_approved)

In [15]:
# Re-fetch the HITs, then run approve_all over their assignments.
refresh_hits()
approve_all(hits)

Approving assignment
Approving assignment
Approving assignment
Approving assignment
Approving assignment
Approving assignment
Approved 6 assignments


# Update expiration or num tasks

In [116]:
import datetime 

def update_expiration(hitid, days_from_now):
    """Change the expiration of a HIT to `days_from_now` days in the future.

    Args:
        hitid: id of the HIT to update.
        days_from_now: number of days from the current time; a negative value
            gives a time in the past.

    Returns:
        The response of `cl.update_expiration_for_hit`, which is also printed.

    Raises:
        RuntimeError: if the ALLOW_UPDATE_EXPIRATION safety flag is not set.
    """
    if not ALLOW_UPDATE_EXPIRATION:
        raise RuntimeError("This action is not currently enabled; set `ALLOW_UPDATE_EXPIRATION` to true to proceed with this action")

    # Use a timezone-aware timestamp so the instant sent to the service is
    # unambiguous and does not depend on how a naive local time is interpreted.
    now = datetime.datetime.now(datetime.timezone.utc)
    expire_time = now + datetime.timedelta(days=days_from_now)

    response = cl.update_expiration_for_hit(HITId=hitid, ExpireAt=expire_time)
    print(response)
    return response
    
def expire_hit(hit):
    """Set the expiration of `hit` (forwarded to update_expiration as the HIT
    id) to 10 days in the past and return that call's result."""
    days_offset = -10
    return update_expiration(hit, days_offset)

In [117]:
def add_assignments(hitid, num_assignments):
    """Add `num_assignments` additional assignments to the HIT `hitid`.

    Returns:
        The response of `cl.create_additional_assignments_for_hit`, which is
        also printed.

    Raises:
        RuntimeError: if the ALLOW_ASSIGNMENT_ADDITION safety flag is not set.
    """
    if not ALLOW_ASSIGNMENT_ADDITION:
        raise RuntimeError("This action is not currently enabled; set `ALLOW_ASSIGNMENT_ADDITION` to true to proceed with this action")

    response = cl.create_additional_assignments_for_hit(
        HITId=hitid,
        NumberOfAdditionalAssignments=num_assignments
    )
    print(response)
    return response

# Add custom qualifications 

## Add a qualification to disqualify workers who have done work before

- uses "negative qualification" method from https://github.com/cloudyr/MturkR/wiki/qualifications-as-blocks

### NOTE: quals are kept separate for the sandbox and prod. Make sure you are creating and assigning your quals in prod. 

In [33]:
# Template of a new qualification type; create_qual passes these entries as
# keyword arguments to the qualification-creation call. Replace the
# placeholder texts before use.
NEW_QUAL = dict(
    Name='qualName',
    Keywords='Keywords for qual',
    Description='What is this qual, and why are you assigning it?',
    QualificationTypeStatus='Active',
    AutoGranted=False,
)

In [34]:
def create_qual(new_qual):
    """Create a qualification type from `new_qual` and return its id.

    Args:
        new_qual: dict of keyword arguments for
            `cl.create_qualification_type` (see NEW_QUAL for the structure).

    Returns:
        The QualificationTypeId of the created qualification type.

    Raises:
        RuntimeError: if the ALLOW_CREATE_QUAL safety flag is not set.
    """
    if not ALLOW_CREATE_QUAL:
        raise RuntimeError("This action is not currently enabled; set `ALLOW_CREATE_QUAL` to true to proceed with this action")

    response = cl.create_qualification_type(**new_qual)
    print(response)
    # The id is reported inside the 'QualificationType' entry of the response;
    # fall back to the top level in case it is given there directly.
    Id = response.get('QualificationType', response)['QualificationTypeId']
    print("id", Id)
    return Id

In [38]:
def list_quals(query='hasCompletedVisualGraphRecallTask'):
    """Print the qualification types found when searching for `query`.

    Args:
        query: text passed as the `Query` argument of
            `cl.list_qualification_types`; the default keeps the search this
            notebook originally hard-coded.

    The raw response is printed; nothing is returned.
    """
    response = cl.list_qualification_types(
            Query=query,
            MustBeRequestable=False
    )
    print(response)

# Print the result of the qualification-type search defined in list_quals.
list_quals()

{'NumResults': 0, 'QualificationTypes': [], 'ResponseMetadata': {'RequestId': '4ff9cff9-32f4-49b2-b79a-0137726813e0', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '4ff9cff9-32f4-49b2-b79a-0137726813e0', 'content-type': 'application/x-amz-json-1.1', 'content-length': '40', 'date': 'Tue, 27 Nov 2018 21:11:39 GMT'}, 'RetryAttempts': 0}}


In [39]:
def assign_qual(qual_id, worker_ids):
    """Associate the qualification `qual_id` (with integer value 1 and without
    sending a notification) with each worker in `worker_ids`.

    Every response is printed and asserted to be truthy.
    """
    for worker_id in worker_ids:
        request = dict(
            QualificationTypeId=qual_id,
            WorkerId=worker_id,
            IntegerValue=1,
            SendNotification=False,
        )
        result = cl.associate_qualification_with_worker(**request)
        print(result)
        assert result
        
def get_workers_for_hit(hitid):
    """Return the 'WorkerId' of every assignment of the HIT `hitid`, in the
    order get_all_assignments returns them (one entry per assignment)."""
    worker_ids = []
    for assignment in get_all_assignments(hitid):
        worker_ids.append(assignment['WorkerId'])
    return worker_ids
    
def confirm_quals(qual_id, worker_ids):
    """Assert that each worker in `worker_ids` holds qualification `qual_id`
    with status 'Granted' and integer value 1.

    Raises:
        AssertionError: at the first worker for whom either check fails.
    """
    for worker_id in worker_ids:
        score = cl.get_qualification_score(
            QualificationTypeId=qual_id,
            WorkerId=worker_id,
        )
        qualification = score['Qualification']
        assert qualification['Status'] == 'Granted'
        assert qualification['IntegerValue'] == 1
        
def assign_qual_for_hit(hitid, qual_id):
    """Assign the qualification `qual_id` to every worker who has an assignment
    for the HIT `hitid`, then confirm that each of them holds it.

    A progress line is printed after each of the three steps.
    """
    hit_workers = get_workers_for_hit(hitid)
    print("got workers")

    assign_qual(qual_id, hit_workers)
    print("assigned qual")

    confirm_quals(qual_id, hit_workers)
    print("confirmed qual")

# Download data

In [16]:
from bs4 import BeautifulSoup as bs 
import pprint

def pretty_print(obj):
    """Pretty-print `obj` to standard output using an indentation of 4."""
    pprint.PrettyPrinter(indent=4).pprint(obj)

def get_assignment_content(hits, download_path="", should_print=False):
    """Collect the answers of all assignments of `hits` as a list of dicts.

    Each assignment yields one dict holding the 'HITId' plus one entry per
    answer, keyed by the answer's question identifier. An answer's free text
    is decoded as JSON when possible and kept as-is otherwise.

    Args:
        hits: list of HIT dicts; only the 'HITId' key is read.
        download_path: if non-empty, the collected list is also written to
            this path as JSON.
        should_print: if true, the collected list is pretty-printed.

    Returns:
        The list of per-assignment result dicts.
    """
    all_responses = []
    for hit in hits:
        hitid = hit['HITId']
        assignments = get_all_assignments(hitid)
        print(hitid)
        print(assignments)
        for a in assignments:
            # The answer arrives as an XML document. Tag names are looked up in
            # lowercase -- presumably because the "lxml" parser lower-cases
            # them; confirm before switching parsers.
            soup = bs(a['Answer'], "lxml")
            results = {'HITId': hitid}
            for ans in soup.find_all("answer"):
                identifier = ans.find('questionidentifier').string
                answer = ans.find('freetext').string
                try:
                    results[identifier] = json.loads(answer)
                except (TypeError, ValueError):
                    # Not JSON (plain text such as a worker id) or no text at
                    # all (None): keep the raw value. Other errors propagate.
                    results[identifier] = answer
            all_responses.append(results)
    if should_print:
        pretty_print(all_responses)
    if download_path:
        # json.dump emits ASCII-only text by default, so pinning the encoding
        # just makes the written file independent of the platform default.
        with open(download_path, 'w', encoding='utf-8') as outfile:
            json.dump(all_responses, outfile)
    return all_responses
            

In [17]:
# Collect the answers of all current HITs, save them as JSON to
# 'responses_prod_test.json', and show how many assignments were collected.
responses = get_assignment_content(hits, download_path='responses_prod_test.json', should_print=False)
len(responses)

3RQVKZ7ZRKS8XDDQ7UZKWOWI5D027C
[{'AssignmentId': '3RJSC4XJ114P1JUNUGLA3ZLTX1N50V', 'WorkerId': 'A2RSHFX4S8R33G', 'HITId': '3RQVKZ7ZRKS8XDDQ7UZKWOWI5D027C', 'AssignmentStatus': 'Approved', 'AutoApprovalTime': datetime.datetime(2019, 2, 8, 0, 59, 42, tzinfo=tzlocal()), 'AcceptTime': datetime.datetime(2019, 2, 1, 0, 52, 24, tzinfo=tzlocal()), 'SubmitTime': datetime.datetime(2019, 2, 1, 0, 59, 42, tzinfo=tzlocal()), 'ApprovalTime': datetime.datetime(2019, 2, 1, 3, 5, 44, tzinfo=tzlocal()), 'Answer': '<?xml version="1.0" encoding="ASCII"?><QuestionFormAnswers xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionFormAnswers.xsd"><Answer><QuestionIdentifier>workerId</QuestionIdentifier><FreeText>A2RSHFX4S8R33G</FreeText></Answer><Answer><QuestionIdentifier>results</QuestionIdentifier><FreeText>{"inputs":[],"outputs":[["0,0,https://www.dropbox.com/s/fcm6qjo2ui49zcn/CV_1_14_MACE-aAuMiY.png?raw=1:10,2,1,148.7,182.4,;10,2,1,148.7,182.4,;10,2,1,149.5,181.6,151

3RWB1RTQDJW8PYJFJX226106R7XP88
[{'AssignmentId': '3RYC5T2D743HUNW588D3L1RXXRNRP1', 'WorkerId': 'A14BO0JIQTNMOW', 'HITId': '3RWB1RTQDJW8PYJFJX226106R7XP88', 'AssignmentStatus': 'Approved', 'AutoApprovalTime': datetime.datetime(2019, 2, 8, 1, 6, 58, tzinfo=tzlocal()), 'AcceptTime': datetime.datetime(2019, 2, 1, 0, 51, 27, tzinfo=tzlocal()), 'SubmitTime': datetime.datetime(2019, 2, 1, 1, 6, 58, tzinfo=tzlocal()), 'ApprovalTime': datetime.datetime(2019, 2, 1, 3, 5, 45, tzinfo=tzlocal()), 'Answer': '<?xml version="1.0" encoding="ASCII"?><QuestionFormAnswers xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionFormAnswers.xsd"><Answer><QuestionIdentifier>workerId</QuestionIdentifier><FreeText>A14BO0JIQTNMOW</FreeText></Answer><Answer><QuestionIdentifier>results</QuestionIdentifier><FreeText>{"inputs":[],"outputs":[["240,600,https://www.dropbox.com/s/bi9qrcr1zxu4659/infographics_1_3_MAB2CQ_1UW0_notext.png?raw=1:","240,600,https://www.dropbox.com/s/8wx1aae

3DGDV62G7OIX3JHQ3GT1WAN3575P25
[{'AssignmentId': '3TR2532VIQ4SMTVNL75LDAEPS536JI', 'WorkerId': 'A8FLNGDLNJCUU', 'HITId': '3DGDV62G7OIX3JHQ3GT1WAN3575P25', 'AssignmentStatus': 'Approved', 'AutoApprovalTime': datetime.datetime(2019, 2, 8, 0, 52, 58, tzinfo=tzlocal()), 'AcceptTime': datetime.datetime(2019, 2, 1, 0, 47, 28, tzinfo=tzlocal()), 'SubmitTime': datetime.datetime(2019, 2, 1, 0, 52, 58, tzinfo=tzlocal()), 'ApprovalTime': datetime.datetime(2019, 2, 1, 3, 5, 45, tzinfo=tzlocal()), 'Answer': '<?xml version="1.0" encoding="ASCII"?><QuestionFormAnswers xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionFormAnswers.xsd"><Answer><QuestionIdentifier>workerId</QuestionIdentifier><FreeText>A8FLNGDLNJCUU</FreeText></Answer><Answer><QuestionIdentifier>results</QuestionIdentifier><FreeText>{"inputs":[],"outputs":[["240,600,https://www.dropbox.com/s/ievwpfwk2b7ri42/infographics_1_26_MACW2lF5fbY.png?raw=1:3,2,1,17,56,20,56,23,56,28,55,51,54,72,53,91,53,10

30