# Notebook to generate fold text files for the mturk task

This notebook generates fold.txt files for each element in the canva_scraping2 dataset.

Each fold.txt will contain 5-10 image names from the canva_scraping2 dataset. The image names will be links to Dropbox, as the canva_scraping2 dataset is hosted there.

Folds can be built with a mix of multiple classes in each one of them, so as to avoid having only book-covers or only web-ads in a single fold.

Example structure of a fold.txt file: 

https://www.dropbox.com/s/gp9snrmlc54d3vo/certificates_1_18_MACS_ooy31s.png?raw=1
https://www.dropbox.com/s/c32nwr8y37sw3or/certificates_1_10_MACTFpwVtmk.png?raw=1

.
.
.
.




In [1]:
# Imports
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import requests
import json


In [2]:
# Constants
# NOTE(review): USE_FULL_DATASET, CLASSES_TO_USE and WITH_TEXT are not read by
# any code visible in this notebook -- confirm whether they are still needed.
USE_FULL_DATASET = True
CLASSES_TO_USE = None
WITH_TEXT = True
# Intended number of image links per fold file (the intro mentions 5-10).
FILES_PER_FOLD = 10
# Root folder of the dataset; scanned below for one sub-directory per class.
DATASET_PATH = '../canva_scraping2_clean'
# Folder passed as output_dir when the fold .txt files are generated.
FOLD_OUTPUT_DIR = './files'

## Dataset stats

In [3]:
# Get statistics on canva_scraping2

# Sub-folder, inside every class folder, that holds the PNG designs to count.
SUBFOLDER = 'png_clean'

# Class folders are the directories directly under DATASET_PATH whose names
# contain neither '_' nor '.'. Sorted so the report order is reproducible
# (os.listdir returns entries in arbitrary order).
canva_scraping2_folders = sorted(
    f for f in os.listdir(DATASET_PATH)
    if '_' not in f and '.' not in f and os.path.isdir(os.path.join(DATASET_PATH, f))
)
print('Data folders in canva_scraping2:', canva_scraping2_folders)

num_elems_in_full_dataset = 0

# Count the PNG files of every class and accumulate the dataset total.
for fol in canva_scraping2_folders:
    # Use SUBFOLDER rather than repeating the literal, so changing the
    # constant above actually changes which folder is counted.
    class_dir = os.path.join(DATASET_PATH, fol, SUBFOLDER)
    num_elems = sum(1 for p in os.listdir(class_dir) if p.endswith('png'))
    print('Number of elements in folder %s: %d' % (fol, num_elems))
    num_elems_in_full_dataset += num_elems

print('Num elems in full dataset:', num_elems_in_full_dataset)

Data folders in canva_scraping2: ['book-covers', 'cd-covers', 'certificates', 'coupons', 'cvs', 'infographics', 'magazine-covers', 'posters', 'social-graphics', 'web-ads']
Number of elements in folder book-covers: 363
Number of elements in folder cd-covers: 28
Number of elements in folder certificates: 216
Number of elements in folder coupons: 175
Number of elements in folder cvs: 123
Number of elements in folder infographics: 148
Number of elements in folder magazine-covers: 37
Number of elements in folder posters: 1677
Number of elements in folder social-graphics: 1123
Number of elements in folder web-ads: 212
Num elems in full dataset: 4102


## Generate fold files

In [7]:
## Helper functions
def get_all_design_names(data_path, classes_to_use, subfolder='png'):
    '''
    Lists the PNG designs available on disk for each requested class.

    Inputs
    -----
    data_path: root folder of the dataset; it contains one folder per class
    classes_to_use: iterable of class (folder) names to scan
    subfolder: folder inside each class folder that holds the designs

    Returns
    -----
    dict mapping each class name to the list of file names ending in 'png'
    found in data_path/<class>/<subfolder>. A class without any matching file
    maps to an empty list.
    '''
    # Single pass over classes_to_use: the previous implementation iterated it
    # twice (once to pre-fill empty lists), which silently produced empty
    # lists for every class when given a one-shot iterable such as a generator.
    return {
        cl: [p for p in os.listdir(os.path.join(data_path, cl, subfolder)) if p.endswith('png')]
        for cl in classes_to_use
    }

## Generate fold files
def generate_fold_files(data_path, output_dir, classes_to_use = None, 
                        files_per_fold = 10, mix_classes = True, 
                        files_to_generate = 2, fold_prefix='fold', verbose = True, subfolder='png'):
    '''
    Writes up to `files_to_generate` fold files, each listing links to designs
    that have not been used in a previous fold of this run.

    Inputs
    -----
    data_path: root folder of the dataset (one folder per class)
    output_dir: folder where the fold .txt files are written; created if missing
    classes_to_use: class names to draw designs from. If None or empty, every
        directory under data_path whose name has no '_' and no '.' is used
    files_per_fold: maximum number of links written to each fold file
    mix_classes: accepted for compatibility; not used by this function
    files_to_generate: maximum number of fold files to write
    fold_prefix: fold files are named <fold_prefix><index>.txt, index from 0
    verbose: if truthy, prints a progress line after each fold
    subfolder: folder inside each class folder that holds the designs
    '''
    if not classes_to_use:
        # Discover the classes under the folder we were asked to read, not
        # under the notebook-level DATASET_PATH global.
        classes_to_use = [
            f for f in os.listdir(data_path)
            if '_' not in f and '.' not in f and os.path.isdir(os.path.join(data_path, f))
        ]

    # open() does not create missing parent folders. An empty output_dir
    # means "current directory", so there is nothing to create in that case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Drop classes without designs up front: sampling a design from an empty
    # list would raise inside generate_one_fold_file.
    unused_design_names = {
        cl: names
        for cl, names in get_all_design_names(data_path, classes_to_use, subfolder=subfolder).items()
        if names
    }

    num_folds_generated = 0

    # generate_one_fold_file removes the designs it uses (and exhausted
    # classes) from unused_design_names, so the dict becomes empty -- and the
    # loop stops -- once every design has been assigned to a fold.
    while unused_design_names and num_folds_generated<files_to_generate:
        fold_txt_name = fold_prefix+str(num_folds_generated)+'.txt'
        fold_path = os.path.join(output_dir, fold_txt_name)

        generate_one_fold_file(fold_path, unused_design_names, files_per_fold, subfolder=subfolder)

        num_folds_generated +=1

        if verbose:
            print('Fold %s generated. %d files generated so far.' % (fold_txt_name, num_folds_generated))

    print('Done.')
        
            
def generate_one_fold_file(fold_path, unused_design_names, files_per_fold, verbose =1, subfolder='png'):
    '''
    Writes one fold file with up to `files_per_fold` links, one per line and
    without a trailing newline.

    For every line a class is sampled uniformly among the classes still in
    `unused_design_names`, then a design is sampled uniformly inside that
    class. The chosen design is removed from `unused_design_names` in place,
    and a class is removed as soon as it has no designs left.

    Inputs
    -----
    fold_path: path of the fold file to write (overwritten if it exists)
    unused_design_names: dict mapping class name -> list of design file names
    files_per_fold: maximum number of links to write
    verbose: if truthy, prints the sampled class and the link that was added
    subfolder: forwarded to get_dropbox_link

    Returns
    -----
    The same `unused_design_names` dict, after the removals.
    '''
    with open(fold_path, 'w+') as fold_file:
        # Empty before the first link, a newline before every later one.
        line_prefix = ''

        for _ in range(files_per_fold):
            # Nothing left to sample from: the remaining iterations are no-ops.
            if not unused_design_names:
                break

            chosen_class = np.random.choice(list(unused_design_names.keys()))
            if verbose:
                print('chosen class:', chosen_class)

            remaining_designs = unused_design_names[chosen_class]
            chosen_design = np.random.choice(remaining_designs)
            link = get_dropbox_link(chosen_design, chosen_class, subfolder=subfolder)
            if verbose:
                print('Adding dropbox link to fold:', link)

            fold_file.write(line_prefix)
            fold_file.write(link)
            line_prefix = '\n'

            # Make sure the design cannot be picked again.
            remaining_designs.remove(chosen_design)
            if not remaining_designs:
                del unused_design_names[chosen_class]

    return unused_design_names


def _to_raw_link(shared_url):
    '''Returns `shared_url` rewritten so that it carries the raw=1 parameter.'''
    # Shared links normally end in 'dl=0'; swap that flag for 'raw=1'.
    if shared_url.endswith(('dl=0', 'dl=1')):
        return shared_url[:-4] + 'raw=1'
    # Unexpected URL shape: append the flag instead of blindly cutting the
    # last four characters off the link.
    separator = '&' if '?' in shared_url else '?'
    return shared_url + separator + 'raw=1'


def get_dropbox_link(design, cl, 
                     subfolder='png', 
                     create_link_first=True, 
                     base_folder = "/Graduate Studies/Harvard/AC299r Independent Research/canva_scraping2_clean/",
                     access_token=None):
    '''
    Uses the Dropbox API to get a shared dropbox link to a given design. The design must be located in:
    base_folder+cl+"/"+subfolder+"/"+design

    Inputs
    -----
    design: name of the design to get link for
    cl: class (folder) the design belongs to
    subfolder: subfolder where the design is located, usually one of 'png', 'png_clean'
    create_link_first: if True, first tries to create the link and falls back to listing
        the existing one. If False, first asks for the existing link and falls back to
        creating it. Use False when most designs already have a shared link.
    base_folder: Dropbox folder where the designs are located
    access_token: Dropbox API token. If None, it is read from the
        DROPBOX_ACCESS_TOKEN environment variable.

    Returns
    -----
    The shared link of the design, with its trailing dl flag replaced by raw=1.

    Raises
    -----
    RuntimeError: if no access token is available, or if Dropbox returns
        neither an existing nor a newly created link.
    '''

    print('Trying to get dropbox link for',design)

    # Credentials must never be committed with the notebook: take the token
    # from the caller or from the environment.
    if access_token is None:
        access_token = os.environ.get('DROPBOX_ACCESS_TOKEN')
    if not access_token:
        raise RuntimeError('No Dropbox access token: pass access_token or set DROPBOX_ACCESS_TOKEN.')

    create_url = "https://api.dropboxapi.com/2/sharing/create_shared_link_with_settings"
    list_url = "https://api.dropboxapi.com/2/sharing/list_shared_links"

    headers = {
        "Authorization": "Bearer " + access_token,
        "Content-Type": "application/json"
    }

    path = base_folder+cl+"/"+subfolder+"/"+design
    create_payload = {"path": path}
    # direct_only keeps links to parent folders out of the result, so the
    # first entry is a link to the design itself.
    list_payload = {"path": path, "direct_only": True}

    def post(url, payload):
        # A timeout keeps a stalled connection from hanging the whole run.
        return requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)

    def existing_link(response):
        links = response.json().get('links') or []
        return links[0]['url'] if links else None

    def created_link(response):
        body = response.json()
        if 'url' not in body:
            raise RuntimeError('Dropbox did not create a shared link for %s: %s' % (path, response.text))
        return body['url']

    if create_link_first:
        r = post(create_url, create_payload)
        print(r.text)
        if 'already_exists' in r.json().get('error_summary', ''):
            print('Error found! Link already exists. Getting existing link...')
            r = post(list_url, list_payload)
            shared_url = existing_link(r)
            if shared_url is None:
                raise RuntimeError('Dropbox returned no shared link for %s: %s' % (path, r.text))
        else:
            shared_url = created_link(r)
    else:
        r = post(list_url, list_payload)
        print(r.text)
        shared_url = existing_link(r)
        if shared_url is None:
            print('Error found! Shared link not created. Creating link...')
            shared_url = created_link(post(create_url, create_payload))

    return _to_raw_link(shared_url)


def dropbox_links_to_file_loc(dropbox_links, mid_folder='png'):
    '''
    Converts shared links into dataset-relative file locations.

    For each link, the file name is the last '/'-separated segment with
    anything from the first '?' onwards removed, and the class is the part of
    that file name before its first '_'.

    Inputs
    -----
    dropbox_links: iterable of link strings
    mid_folder: folder placed between the class and the file name

    Returns
    -----
    List of paths of the form <class>/<mid_folder>/<file name>, in input order.
    '''
    def _to_location(link):
        file_name = link.rsplit('/', 1)[-1].partition('?')[0]
        class_name = file_name.partition('_')[0]
        return os.path.join(class_name, mid_folder, file_name)

    return [_to_location(link) for link in dropbox_links]
        

In [8]:
# Smoke test for get_dropbox_link: look up the existing shared link of one
# cleaned poster first, creating it only if none is listed.
get_dropbox_link(
    design='posters_50_4_MACffVAezac.png',
    cl='posters',
    subfolder='png_clean',
    create_link_first=False,
)

In [None]:
# Generate the fold files for the cleaned poster designs.
DATASET_PATH = '../canva_scraping2_clean'

generate_fold_files(data_path=DATASET_PATH, 
                    output_dir=FOLD_OUTPUT_DIR, 
                    classes_to_use=['posters'], # if None, all classes will be used
                    files_per_fold=FILES_PER_FOLD, # fold size comes from the notebook constant
                    mix_classes=True, 
                    files_to_generate=15,
                    fold_prefix = 'fold_poster_clean',
                    subfolder = 'png_clean')