In [1]:
import boto3
import datetime
import json

import os
from pathlib import Path
import datetime
import uuid
import pandas as pd

# This notebook is an example to create HIT tasks for batch of image urls each HIT containing 20 images. 
# The csv file required is already creation using "generate_csv_image_urls" notebook.

## Guidelines to run this notebook
***
1. This file assumes you have downloaded the entire code folder from <a href="https://drive.google.com/drive/folders/1O8HrV5hgWwTM5dNJzhEzOaQddZSZThp5?usp=sharing" target="_blank"> this google drive folder</a>. 

2. This files assumes LINZ dataset in folder above 'code' folder. So please make sure to download the LINZ datatset before proceeding to run this notebook.<a href="https://drive.google.com/drive/u/0/folders/1SetuO2jiS2sEx105AjiRFYyJsKk9IpGV" target="_blank"> this google drive folder.</a>.

3. This notebook demonstrates for Sandbox requester account and creating  a batch of HITs for a given image_urls file. The layout is created already on MTurk. MTurk provides an option to upload a csv file. However, we are creating a batch from script.

In [2]:
def time_converter(o):
    if isinstance(o, datetime.datetime):
        return o.__str__()

rootpath = Path(os.getcwd())
assert os.path.exists(rootpath), "Output path does not exists."


In [3]:
output_path = os.path.join(rootpath, 'output')
if not os.path.exists(output_path):
    os.mkdir(output_path)
qualification_test_path = os.path.join(output_path, 'qualification_tests')
if not os.path.exists(qualification_test_path):
    os.mkdir(qualification_test_path)
if not os.path.exists(output_path):
    os.mkdir(output_path)
hits_path = os.path.join(output_path,'hits')
if not os.path.exists(hits_path):
    os.mkdir(hits_path)


In [4]:
layout_path = os.path.join(rootpath.parent.parent.parent,"layouts" )
assert os.path.exists(layout_path), "Layouts path does not exists. Please check the directory exists."
layout_path

'C:\\Users\\nerdybug\\Documents\\mturk-task-helper\\layouts'

## Load questions and answers key for classification task's qualification test.

In [5]:
questions = open(os.path.join(layout_path,'questions.xml'), mode='r').read()
answers = open(os.path.join(layout_path,'answer_key.xml'), mode='r').read()

In [6]:
assert os.path.exists(layout_path), "questions.xml does not exists. Please check the file exists."
assert os.path.exists(layout_path), "answers.xml path does not exists. Please check the file exists."

## Initialize mTurk client connection. Assumes you have configuration for AWS i.e. ~/.aws/ folder .

In [8]:
assert questions and answers, "Either Questions or Answers templates do not exist. Please check and try again!"

region_name = 'us-east-1'

endpoint_url = 'https://mturk-requester-sandbox.us-east-1.amazonaws.com'
#prod_url = "https://mturk-requester.us-east-1.amazonaws.com"

client = boto3.client(
    'mturk',
    endpoint_url=endpoint_url,
    region_name=region_name,
)
# This will return $10,000.00 in the MTurk Developer Sandbox
print(client.get_account_balance()['AvailableBalance'])

10000.00


## Create Qualification Test. 

In [10]:
qual_response = client.create_qualification_type(
                        Name='Vehicle Classification Qualification Test may 18: example',
                        Keywords='test, qualification, vehicles, boto',
                        Description='This is an qualification test',
                        QualificationTypeStatus='Active',
                        Test=questions,
                        AnswerKey=answers,
                        TestDurationInSeconds=400)
print(qual_response['QualificationType']['QualificationTypeId'])
qual_id = qual_response['QualificationType']['QualificationTypeId']
print( qual_response['QualificationType']['Name'])

assert qual_response and qual_id, "Qualification creation error!"

31YLGOADI2O9MRA31FE8DUYQGAOBUI
Vehicle Classification Qualification Test may 18: example


## Save Qualification Test to file.

In [11]:
with open(os.path.join(qualification_test_path, str(qual_id)+".json"), 'w') as qualification_json_file:
    json.dump(qual_response, qualification_json_file, default = time_converter)

## Given HIT Layout ID already created on MTurk requester website, and a Qualification test we already created, now we create a HIT and batch.

In [12]:
#set HIT task parameters
Reward = "2.5"
MaxAssignments = 3 #number of worker assignments per HIT
LifetimeInSeconds = 432000 # 5days
AssignmentDurationInSeconds = 1200 #10 minutes + 10mins grace period 
HITLayoutId= "375TY3MH02JRVLT08BQ9C9L58V0JK3" #
RequesterAnnotation='string'
QualificationRequirements = [{'QualificationTypeId': qual_id,
                                   'Comparator': 'EqualTo',
                                   'IntegerValues':[100]}]
AutoApprovalDelayInSeconds = 259200
Title='Classify each veichle to defined vehicle classes in the images'
Description='This HIT task requires a qualification.'
Keywords='vehicles, classify, qualification, test'

In [13]:
def save_batch(hits):
    with open(os.path.join(hits_path, "new_batch"+".json"), 'w') as hit_json_file:
         for hit in hits:
            json.dump(hit, hit_json_file, default = time_converter)

In [14]:
def create_hit(HITLayoutParameters):
    hit = client.create_hit(
        Reward=Reward,
        LifetimeInSeconds=LifetimeInSeconds,
        AssignmentDurationInSeconds=AssignmentDurationInSeconds,
        MaxAssignments=MaxAssignments,
        Title=Title,
        Description=Description,
        Keywords=Keywords,
        AutoApprovalDelayInSeconds=AutoApprovalDelayInSeconds,
        QualificationRequirements=QualificationRequirements,
        #Question=hit_template,
        HITLayoutId= HITLayoutId, #from HIT layout created but not published on Requester website
        RequesterAnnotation=RequesterAnnotation,
        HITLayoutParameters=HITLayoutParameters,
        )
    #print(hit)
    assert hit and hit['HIT']['HITId'], "HIT creation failed"
    return hit

##  Load Image URLS csv file we created using generate_csv_image_urls notebook

In [15]:
imageurl_df = pd.read_csv(os.path.join(rootpath.parent.parent,"input_data",'image_urls.csv'))
assert imageurl_df, "Input Image urls csv file do not exist. Please check and try again!"
imageurl_df

Unnamed: 0,image1_url,image2_url,image3_url,image4_url,image5_url,image6_url,image7_url,image8_url,image9_url,image10_url,...,image16_url,image17_url,image18_url,image19_url,image20_url,image21_url,image22_url,image23_url,image24_url,image25_url
0,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...,https://mturk-s3-cg.s3.amazonaws.com/task1/bat...


## We need 25 image_url variables for a single HIT. We get all column names. However we may have column names > 25, so we create only for first 25 columns.

In [16]:
colnames = imageurl_df.columns.values.tolist()
colnames

['image1_url',
 'image2_url',
 'image3_url',
 'image4_url',
 'image5_url',
 'image6_url',
 'image7_url',
 'image8_url',
 'image9_url',
 'image10_url',
 'image11_url',
 'image12_url',
 'image13_url',
 'image14_url',
 'image15_url',
 'image16_url',
 'image17_url',
 'image18_url',
 'image19_url',
 'image20_url',
 'image21_url',
 'image22_url',
 'image23_url',
 'image24_url',
 'image25_url']

## Now for each row in csv, create HITLayoutParameters  
### `[{'Name': colname, "Value": row_dict[colname]} for colname in colnames[:20] ] ` 
### is of format  
#### `[{'Name': image1_url, "Value": 'https://mturk-s3-cg.s3.amazonaws.com/task2-images/LINZ/0001_0001/0001_0001_2462-6635-small.png'},{'Name': image2_url, "Value": 'https://mturk-s3-cg.s3.amazonaws.com/task2-images/LINZ/0003_0002/0003_0002_120-10808-trailer_small.png'},{'Name': image3_url, "Value": 'https://mturk-s3-cg.s3.amazonaws.com/task2-images/LINZ/0004_0001/0004_0001_10899-12065-small.png'} ....{'Name': image20_url, "Value": 'https://mturk-s3-cg.s3.amazonaws.com/task2-images/LINZ/0008_0005/0008_0005_8688-7952-specialized.png'}]  `

In [17]:
HITLayoutParameters = []
hits = []
for row_dict in imageurl_df.to_dict(orient='records'):
    HITLayoutParameters = [{'Name': colname, "Value": row_dict[colname]} for colname in colnames[:25] ]
    
    #print(HITLayoutParameters)
    result = create_hit(HITLayoutParameters)
    assert result, "HIT creation failed. Please check and try again."
    hits.append(result)  
    #print([{'Name': colname, "value": row_dict[colname]} for colname in colnames ])

## A HIT response after successful creation of HIT looks as follows:

In [18]:
hits[0]['HIT']['Title'],time_converter(hits[0]['HIT']['CreationTime'])

('Classify each veichle to defined vehicle classes in the images',
 '2021-05-18 00:59:30-04:00')

## List all HITs of the calling requester

In [19]:
hitlist = client.list_hits()['HITs']
hitlist[0]['Title']

'Classify each veichle to defined vehicle classes in the images'

## List all HITs for a given Qualificaion Type

In [20]:
response = client.list_qualification_types(
        MustBeRequestable=True,
        MustBeOwnedByCaller=True,
    )
assert response and len(response['QualificationTypes']) > 0, "There are no qualification tests."
response['QualificationTypes'][1]['Name']

'Vehicle Classification Qualification Test: example'

In [21]:
response = client.list_hits_for_qualification_type(
    QualificationTypeId=qual_id,
    MaxResults=100
)
assert response and len(response['HITs']) > 0, "There are no HIT tasks."

In [22]:
response['HITs'][0]['Title']

'Classify each veichle to defined vehicle classes in the images'

In [23]:
len(response['HITs'])

1

In [24]:
hit_id = hits[0]['HIT']['HITId']
with open(os.path.join(hits_path, str(hit_id)+".json"), 'w') as hit_json_file:
    json.dump(hits[0], hit_json_file, default = time_converter)

In [25]:
hit = client.create_hit(
        Reward='1.25',
        LifetimeInSeconds=3600,
        AssignmentDurationInSeconds=600,
        MaxAssignments=3,
        Title='Classify each veichle to defined vehicle classes in the images',
        Description='A HIT that requires a qualification.',
        Keywords='vehicles, classify, qualification, test',
        AutoApprovalDelayInSeconds=0,
        QualificationRequirements=[{'QualificationTypeId': qual_id,
                                   'Comparator': 'EqualTo',
                                   'IntegerValues':[100]}],
        #Question=hit_template,
        HITLayoutId= "3PCCVH526IHZMHPEKU6K502VBAS7P7", #from HIT layout created but not published on Requester website
        RequesterAnnotation='string',
        )
print(hit['HIT']['HITId'])
assert hit and hit['HIT']['HITId'], "HIT creation failed"
hit_id = hit['HIT']['HITId']

3XUY87HIWWWU5MNJSQVDOQ7SUTNMMI


In [26]:
with open(os.path.join(hits_path, str(hit_id)+".json"), 'w') as hit_json_file:
    json.dump(hit, hit_json_file, default = time_converter)

## Delete some of HITs that are in assignable state for example demonstration.

In [27]:
for hit in hits[0:]:
    if hit['HIT']['HITStatus'] == 'Assignable':
        response = client.update_expiration_for_hit(
                HITId=hit['HIT']['HITId'],
                ExpireAt=datetime.datetime(2015, 1, 1)
            )   
    try:
        response = client.delete_hit(
            HITId=hit['HIT']['HITId']
        )
    except:
        print('Not deleted')
    else:
        print("deleted")

    response = client.get_hit(
        HITId=hit['HIT']['HITId']
    )
    print(response['HIT']['HITStatus']) #should be "Disposed" after deletion.

deleted
Disposed


## Deleting all HITs leaving out first 1 HITs for the given Qualification Test Type ID

In [29]:
response = client.list_hits()

if hitlist:
    for hit in response['HITs'][:1]:
        print(hit['QualificationRequirements'][0]['QualificationTypeId'] , qual_id)
        if hit['HITStatus'] == 'Assignable' and hit['QualificationRequirements'][0]['QualificationTypeId'] == qual_id:
            print(hit['HITId'])
            response = client.update_expiration_for_hit(
                    HITId=hit['HITId'],
                    ExpireAt=datetime.datetime(2015, 1, 1)
                )   
        try:
            response = client.delete_hit(
                HITId=hit['HITId']
            )
        except:
            print('Not deleted')
        else:
            print("deleted")

        response = client.get_hit(
            HITId=hit['HITId']
        )
        print(response['HIT']['HITStatus'])

31YLGOADI2O9MRA31FE8DUYQGAOBUI 31YLGOADI2O9MRA31FE8DUYQGAOBUI
3XUY87HIWWWU5MNJSQVDOQ7SUTNMMI
deleted
Disposed
