In [46]:
import os
import pandas as pd
import numpy as np
import re
import json

def convert_to_datapipeline(image_set, im_url_col):
    """
    Build the csv-ready input frame for the datapipeline tagging model
    (used to extract category/gender labels).

    Args:
        image_set  (dataframe): query set or reference set dataframe
        im_url_col (str): name of the column in image_set holding the image url

    Returns:
        image_set_dp (dataframe): columns ['im_url', 'concepts', 'metadatas'];
        'concepts' and 'metadatas' carry the same constant placeholder string on
        every row, and rows containing missing values are removed
    """
    # Constant placeholder payload written verbatim into every 'concepts' cell.
    placeholder_concepts = "[{'model_name': None, 'version': None, 'labels': {'system': None, 'ground_truth_source': None, 'source_id': None}, 'objects': [{'tags': [{'tag_group': None, 'tag': None, 'full_tag': None, 'score': None}], 'box': None, 'score': None}], 'reference': []}, {'model_name': None, 'version': None, 'labels': {'system': None, 'ground_truth_source': None, 'source_id': None}, 'objects': [{'tags': [{'tag_group': None, 'full_tag': None, 'tag': None, 'score': None}], 'box': None, 'score': None}], 'reference': []}]"

    dp_input = pd.DataFrame({'im_url': image_set[im_url_col]})
    dp_input['concepts'] = placeholder_concepts
    dp_input['metadatas'] = '{}'
    # The two constant columns are never null, so this only removes rows with a missing url.
    return dp_input.dropna()

def split_dev_test(data_set, dev_frac, category):
    """
    Split data_set into dev/test sets, sampling within each category so both
    halves keep the category distribution.

    Args:
        data_set  (dataframe): the data set to be split; its index must be unique,
                               because test rows are selected by dropping the dev
                               index labels
        dev_frac  (float): fraction of each category that goes to the dev set
        category  (string): column name; the split is done per value of this column

    Returns:
        (dev_set, test_set)  (tuple): tuple of dev/test set dataframes; both keep
        all columns and the original index labels of data_set
    """
    # Sample every group explicitly instead of going through groupby.apply:
    # apply's handling of the grouping column is deprecated in newer pandas and
    # would eventually drop `category` from the dev set. Using a fixed seed per
    # group keeps the split reproducible and identical to per-group sampling.
    dev_parts = [
        group.sample(frac=dev_frac, random_state=0)
        for _, group in data_set.groupby(category)
    ]
    if not dev_parts:
        # Nothing to sample (empty input or no non-null category values).
        return data_set.iloc[0:0].copy(), data_set.copy()
    dev_set = pd.concat(dev_parts)
    # Everything not picked for dev (including rows with a null category) is test.
    test_set = data_set.drop(dev_set.index)
    return dev_set, test_set

def extract_json_tag(element, is_gender=True):
    """
    Flatten one prediction record into a row.

    Args:
        element  (dictionary): element in the json list; must hold 'name' plus
                               'gender_detect' or 'product_tagging_flat_backpack'
        is_gender(bool): True -> extract gender label, False -> extract category label
    Returns:
        result (list):
        [im_name, label, scores...] where label is the tag of the first entry in
        the prediction list and the scores are ordered by tag name. For category
        records only tags starting with 'root||' contribute a score.
    """
    # Image name = last path component, without a trailing '.jpg'.
    im_name = element['name'].split('/')[-1]
    if im_name.endswith('.jpg'):
        im_name = im_name[:-4]

    if is_gender:
        predictions = element['gender_detect']
        tag_scores = {pred['tag']: pred['score'] for pred in predictions}
    else:
        predictions = element['product_tagging_flat_backpack']
        tag_scores = {
            pred['tag']: pred['score']
            for pred in predictions
            if pred['tag'].startswith('root||')
        }

    # The label comes from the first entry, regardless of the 'root||' filter.
    label = predictions[0]['tag']
    ordered_scores = [tag_scores[tag] for tag in sorted(tag_scores)]
    return [im_name, label] + ordered_scores

def json_to_df(json_folder, is_gender=True):
    """
    Load every prediction file in json_folder and flatten the records into a dataframe.

    Args:
        json_folder (string): path of json folder; every regular file in it is
                              parsed as a json list of prediction records
        is_gender   (bool): True -> extract gender label, False -> extract category label
    Returns:
        result_df (dataframe):
        if gender, columns = ['im_name', 'gender', tags in alphabetical order]
        if category, columns = ['im_name', 'category', 'root||' tags in alphabetical order]
        The tag columns are taken from the first record, so all records are
        expected to carry the same set of tags. When no records are found, an
        empty dataframe with only the two leading columns is returned.
    """
    if is_gender:
        label_col, tag_field = 'gender', 'gender_detect'
    else:
        label_col, tag_field = 'category', 'product_tagging_flat_backpack'

    result_json = []
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
    for result_file in sorted(os.listdir(json_folder)):
        result_path = os.path.join(json_folder, result_file)
        if not os.path.isfile(result_path):
            # Skip sub-directories (e.g. notebook checkpoint folders), which cannot be opened.
            continue
        with open(result_path, 'r') as f:
            result_json += json.load(f)

    cols = ['im_name', label_col]
    if not result_json:
        return pd.DataFrame(columns=cols)

    result_data = [extract_json_tag(element, is_gender) for element in result_json]

    keys = [item['tag'] for item in result_json[0][tag_field]]
    if not is_gender:
        keys = [key for key in keys if key.startswith('root||')]
    # extract_json_tag collects scores in a dict (one score per tag), so
    # de-duplicate here as well to keep the header aligned with the rows.
    cols += sorted(set(keys))
    result_df = pd.DataFrame(columns=cols, data=result_data)
    return result_df

def divide_to_img_list(img_folder, path_prefix, output_prefix, batch_size):
    """
    Split the images into image-list files for manually triggered multiprocessing
    when doing local recognition.

    One file is written per batch, named output_prefix + <batch number starting at 0>.
    Each file has one line per image: path_prefix joined with the image file name,
    without header or index.

    Args:
        img_folder  (string): path of the image folder to list
        path_prefix (string): path prefix in hydra2, prepended to every image name
        output_prefix (string): output path + name prefix for the image lists
        batch_size  (int): each image list contains at most batch_size images
    Returns:
        None
    Raises:
        ValueError: if batch_size is not positive
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    # os.listdir returns entries in arbitrary order; sort so the batches are reproducible.
    image_names = sorted(os.listdir(img_folder))

    # Stepping through the names by batch_size yields ceil(n / batch_size) batches.
    for shard_idx, start_idx in enumerate(range(0, len(image_names), batch_size)):
        batch_names = image_names[start_idx:start_idx + batch_size]
        # Join per file name; os.path.join cannot be applied to a whole column at once.
        batch_paths = [os.path.join(path_prefix, name) for name in batch_names]
        batch_df = pd.DataFrame(columns=['path'], data=batch_paths)
        batch_df.to_csv(output_prefix + str(shard_idx), index=False, header=False)
    print('done')

In [2]:
# Load the query-set exports, combine them, and remove duplicates.
data_folder = os.path.join(os.getcwd(), 'data', 'gender')
query_csvs = ['queryset_1347_gender_search_eval_v1.csv', 'queryset_1356_gender_search_eval_v3_ugc.csv']
# Read every export first and concatenate once, instead of growing a frame
# (seeded with an empty DataFrame) inside the loop.
query = pd.concat(
    [pd.read_csv(os.path.join(data_folder, query_csv)) for query_csv in query_csvs]
)
# Keep the first row per query URL and renumber the rows 0..n-1.
query = query.drop_duplicates(subset=['Query Set URL']).reset_index(drop=True)
# Image ids are stored as strings; they are later joined against the string
# image names produced by json_to_df.
query['Image Id'] = query['Image Id'].apply(str)

In [115]:
# Export the columns the download script needs (images are then downloaded and
# run through local recognition), using the script's column names.
query_download = query[['Image Id', 'S3 URL', 'Category']].rename(
    columns={'Image Id': 'im_name', 'S3 URL': 's3_url', 'Category': 'category'}
)
query_download.to_csv(
    os.path.join(data_folder, 'gender_query_download.csv'),
    index=False,
)

In [50]:
# Parse the hydra2 recognition results for the query images.
query_gender_predict = json_to_df(
    os.path.join(data_folder, 'query_gender_predict'), is_gender=True
)
query_category_predict = json_to_df(
    os.path.join(data_folder, 'query_category_predict'), is_gender=False
)

# Attach both prediction tables to the query set, matching 'Image Id' to the
# predicted image name; clashing column names get the '_predict' suffix.
query_w_predict = (
    query
    .join(query_gender_predict.set_index('im_name'), on='Image Id', rsuffix='_predict')
    .join(query_category_predict.set_index('im_name'), on='Image Id', rsuffix='_predict')
)

In [53]:
# Split the query set 50/50 into dev/test, stratified by 'Category'.
dev_query, test_query = split_dev_test(
    data_set=query_w_predict,
    dev_frac=0.5,
    category='Category',
)

# Write both halves to the data folder.
dev_query.to_csv(
    os.path.join(data_folder, 'dev_gender_query.csv'),
    index=False,
)
test_query.to_csv(
    os.path.join(data_folder, 'test_gender_query.csv'),
    index=False,
)

In [54]:
# Load the reference set, keep the first row per image url, and renumber rows 0..n-1.
reference = (
    pd.read_csv(os.path.join(data_folder, 'gender_ref_ops_export'))
    .drop_duplicates(subset=['im_url'])
    .reset_index(drop=True)
)

In [17]:
# Export the columns the download script needs; the images are then downloaded
# and run through local recognition.
reference.loc[:, ['im_name', 's3_url', 'gender']].to_csv(
    os.path.join(data_folder, 'gender_ref_download.csv'),
    index=False,
)

In [None]:
# After downloading the images to the hydra2 folder, split them into image lists
# so recognition can be run as several manually triggered processes.
# NOTE: these are machine-specific absolute paths; adjust them before running elsewhere.
img_folder = '/home/xz/hydra2_home_mnt/data/gender_ref'
path_prefix = '/home/zhangxiong/data/gender_ref'
output_prefix = '/home/xz/hydra2_home_mnt/data/gender_img_list/gender_img_list_'
batch_size = 30000

# Plain call statement: a trailing ':' here is a syntax error.
divide_to_img_list(img_folder, path_prefix, output_prefix, batch_size)

In [55]:
# Parse the hydra2 recognition results for the reference images.
ref_gender_predict = json_to_df(
    os.path.join(data_folder, 'ref_gender_predict'), is_gender=True
)
ref_category_predict = json_to_df(
    os.path.join(data_folder, 'ref_category_predict'), is_gender=False
)

# Attach both prediction tables to the reference set by image name (this frame is
# later trimmed for the dashboard upload); clashing column names get the
# '_predict' suffix.
reference_w_predict = (
    reference
    .join(ref_gender_predict.set_index('im_name'), on='im_name', rsuffix='_predict')
    .join(ref_category_predict.set_index('im_name'), on='im_name', rsuffix='_predict')
)

In [57]:
# Persist the reference set together with its predictions.
reference_w_predict.to_csv(
    os.path.join(data_folder, 'ref_w_predict.csv'),
    index=False,
)

In [60]:
# Keep only label columns for the dashboard upload (the per-tag score columns are
# left out) and expose the S3 url under the name 'im_url'.
upload_cols = ['im_name', 's3_url', 'gender', 'category', 'source', 'gender_predict', 'category_predict']
reference_upload = reference_w_predict[upload_cols].rename(columns={'s3_url': 'im_url'})

In [62]:
# Split the reference set 50/50 into dev/test, stratified by 'gender'.
dev_ref, test_ref = split_dev_test(
    data_set=reference_upload,
    dev_frac=0.5,
    category='gender',
)

# Write both halves to the data folder.
dev_ref.to_csv(
    os.path.join(data_folder, 'dev_gender_ref.csv'),
    index=False,
)
test_ref.to_csv(
    os.path.join(data_folder, 'test_gender_ref.csv'),
    index=False,
)