In [1]:
import numpy as np
import dask.dataframe as dd
import pandas as pd
from mlt.default import selector as s
import cv2, os
import matplotlib.pyplot as plt
%matplotlib inline

# Configs

In [2]:
# Input HDF5 files: the attribute table and the image table loaded further down.
ATTR_DATA_FILE = "/home/jovyan/vmldata/tmp_shrek/20190104_raw_data/raw_attribute_data_shrek_cleaned.h5"
IMAGE_DATA_FILE = "/home/jovyan/vmldata/tmp_shrek/20190104_raw_data/raw_image_data_patrick.h5"

# Please choose limited CATEGORIES. This will avoid Memory issues.
# NOTE: this longer list is overridden by the assignment right after it;
# only the last assignment to SELECTED_CATEGORIES takes effect.
SELECTED_CATEGORIES = ["HANDBAG", "HANDBAGS", 
                       "WATCH", "WATCHES",
                       "PANTS", "JEANS", "DRESS",
                       "TOP", "TOPS", "SWEATER", "SWEATERS", "COAT", "COATS"]

# Active selection (replaces the list above).
SELECTED_CATEGORIES = ["COAT", "COATS"]

# optional config. Make it empty list if not used.
# NOTE: the second assignment wins, so no extra attribute columns are selected.
SELECTED_ATTRIBUTES = ['Handbag Style', 'Handbag Size', 'Handbag Occasion', 'Handbag Material']
SELECTED_ATTRIBUTES = []

# Destination HDF5 file for the merged image/attribute frame.
SAVE_OUTPUT_AS = "/home/jovyan/vmldata/tmp_shrek/COAT_VALIDATION_DATA.h5"

# Configs Validation

In [3]:
# Warn loudly (without aborting) when no category filter has been configured.
if not SELECTED_CATEGORIES:
    print("ERROR: Please limit your SELECTED_CATEGORIES. Otherwise it takes too long. May also run out of memory.")
    # Ten "STOP" lines so the warning is hard to miss in the cell output.
    print("\n".join(["STOP"] * 10))

# Load IDM raw files

In [4]:
# Read the attribute table from ATTR_DATA_FILE and report how many rows it has.
raw_attr = pd.read_hdf(ATTR_DATA_FILE)
print("count raw_attr", raw_attr.shape[0])
raw_attr.head(5)

count raw_attr 24503412


Unnamed: 0,PROD_ID,ATTR_NAME,ATTR_VAL
0,9,NRF_ID,143
1,9,NRF_ID,46
2,9,NRF_ID,640
3,9,NRF_ID,650
4,45,Age Group,Adult


In [5]:
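# Normalise Category values, e.g. "Top's" -> "TOPS":
# upper-case them, replace spaces with underscores and remove single quotes,
# so that df.query strings built from them later work without issues.
# (The function below handles spaces and single quotes only; it does not strip brackets.)
#
# This step takes a long time, so its output has been saved as raw_attribute_data_shrek_cleaned.h5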
def clean_category_value(row):
    """Return the row's ATTR_VAL, normalised when the row describes a Category.

    For rows whose ATTR_NAME is 'Category' and whose ATTR_VAL is a string, the
    value is upper-cased, spaces become underscores and single quotes are
    removed (e.g. "Top's" -> "TOPS"). Every other value is returned unchanged,
    including Category values that are missing or not strings (None, NaN, ...),
    which previously raised AttributeError on ``.upper()``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'ATTR_NAME' and 'ATTR_VAL' (e.g. a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The (possibly normalised) attribute value.
    """
    name = row['ATTR_NAME']
    val = row['ATTR_VAL']
    # Only string Category values can be normalised; leave anything else as is.
    if name == 'Category' and isinstance(val, str):
        val = val.upper().replace(" ", "_").replace("'", "")
    return val

#raw_attr['ATTR_VAL'] = raw_attr.apply(clean_category_value, axis=1)
#raw_attr.head()

In [6]:
# Read the image table, then keep only the rows whose COLORWAY_IMAGE_ROLE_TYPE
# is 'CPRI' and only the two id columns, in a single selection.
raw_image = pd.read_hdf(IMAGE_DATA_FILE)
raw_image = raw_image.loc[
    raw_image['COLORWAY_IMAGE_ROLE_TYPE'] == 'CPRI',
    ['PRODUCT_ID', 'IMAGE_ID'],
]

print("count raw_image", raw_image.shape[0])

def make_image_filename(row):
    """Build the relative image path '<d1>/<d2>/<PRODUCT_ID>_<IMAGE_ID>.jpg'.

    <d1> and <d2> are the first and second characters of the stringified
    PRODUCT_ID, e.g. PRODUCT_ID 5923807 and IMAGE_ID 9359780 give
    '5/9/5923807_9359780.jpg'.
    """
    product_id = str(row['PRODUCT_ID'])
    first_dir, second_dir = product_id[0], product_id[1]
    image_id = str(row['IMAGE_ID'])
    return '{}/{}/{}_{}.jpg'.format(first_dir, second_dir, product_id, image_id)

# Derive one relative file name per row, then attach it as the IMAGE_FILE column.
image_file_names = raw_image.apply(make_image_filename, axis=1)
raw_image['IMAGE_FILE'] = image_file_names

raw_image.head(5)

count raw_image 705487


Unnamed: 0,PRODUCT_ID,IMAGE_ID,IMAGE_FILE
0,5923807,9359780,5/9/5923807_9359780.jpg
7,5303346,9014538,5/3/5303346_9014538.jpg
8,7118640,10436245,7/1/7118640_10436245.jpg
9,8220620,10436245,8/2/8220620_10436245.jpg
10,5303354,9014637,5/3/5303354_9014637.jpg


# Filter Attributes df for Selected Categories

In [7]:
# With no configured categories, fall back to every distinct Category value in the data.
if not SELECTED_CATEGORIES:
    UNIQUE_CATEGORIES = list(set(raw_attr.loc[raw_attr['ATTR_NAME'] == 'Category', 'ATTR_VAL']))
    SELECTED_CATEGORIES = UNIQUE_CATEGORIES

# One equality clause per selected category; several clauses are OR-ed inside
# parentheses, a single clause is appended as is.
category_clauses = ['ATTR_VAL == "{}"'.format(str(category)) for category in SELECTED_CATEGORIES]
query_string = 'ATTR_NAME == "Category" and '
if len(category_clauses) == 1:
    query_string += category_clauses[0]
else:
    remaining_clauses = ''.join(' or ' + clause for clause in category_clauses[1:])
    query_string += '(' + category_clauses[0] + remaining_clauses + ')'

print("\n", "query_string:", query_string)

# First find the products that carry one of the selected categories, then keep
# every attribute row (not only the Category rows) of those products.
cat_df = raw_attr.query(query_string)
cat_pids = list(set(cat_df['PROD_ID']))
cat_df = raw_attr[raw_attr['PROD_ID'].isin(cat_pids)]
print("\n", "df shape:", cat_df.shape)

cat_df.head(5)


 query_string: ATTR_NAME == "Category" and (ATTR_VAL == "COAT" or ATTR_VAL == "COATS")

 df shape: (304732, 3)


Unnamed: 0,PROD_ID,ATTR_NAME,ATTR_VAL
96413,118856,Age Group,Child
96414,118856,Age Range (Toys),2 years
96415,118856,Brand,Kidorable
96416,118856,Category,COAT
96417,118856,Category of Business,Baby


In [8]:
# Spot check: show every attribute row of product 144252.
raw_attr.loc[raw_attr['PROD_ID'] == 144252]

Unnamed: 0,PROD_ID,ATTR_NAME,ATTR_VAL
127214,144252,Age Group,Adult
127215,144252,Age Group,Teen
127216,144252,Brand,Eyeshadow
127217,144252,Category,TOPS
127218,144252,Category of Business,thisIT
127219,144252,Category of Business,Women's
127220,144252,Department Type,Shirts
127221,144252,Department Type,Tops
127222,144252,Gender,Women
127223,144252,More Colors Available,true


In [9]:
# Distinct product ids that have an image, and those that are also in the category selection.
image_pids = list(set(raw_image['PRODUCT_ID']))
inter = list(set(cat_pids) & set(image_pids))

# Report the three sizes in the order: category ids, image ids, overlap.
for _label, _pids in (("cat_pids", cat_pids), ("image_pids", image_pids), ("inter", inter)):
    print(_label, len(_pids))

cat_pids 22102
image_pids 417001
inter 2821


# Pivot cat_df

In [10]:
def dedup_rows(x):
    """Collapse a group of attribute values into a single scalar.

    Parameters
    ----------
    x : iterable
        The values of one group (e.g. the pandas Series handed over by
        ``groupby(...).agg``).

    Returns
    -------
    The lone value itself when the group has exactly one element; otherwise the
    distinct values, converted to ``str``, sorted and joined with ', '.
    An empty group yields ''.

    Notes
    -----
    * A scalar is returned in every case. Returning the group object itself for
      single-element groups is not a valid aggregation result.
    * Sorting makes the joined string reproducible; iterating a bare ``set`` of
      strings can produce a different order from run to run.
    * Values are passed through ``str`` so that non-string entries no longer
      make ``join`` raise TypeError.
    """
    values = list(x)
    if len(values) == 1:
        return values[0]
    return ', '.join(sorted(set(str(value) for value in values)))
    
# Collapse duplicate (PROD_ID, ATTR_NAME) rows with dedup_rows, then reshape to
# one row per product and one column per attribute name.
aggregations = dict(ATTR_VAL=dedup_rows)
cat_df_pivoted = (
    cat_df
    .groupby(['PROD_ID', 'ATTR_NAME'])
    .agg(aggregations)
    .reset_index()
    .pivot(index='PROD_ID', columns='ATTR_NAME', values='ATTR_VAL')
)
print("\n", "shape cat_df_pivoted", cat_df_pivoted.shape)
cat_df_pivoted.head(5)


 shape cat_df_pivoted (22102, 91)


ATTR_NAME,Accessories Type,Active Apparel Type,Activewear,Age Group,Age Range (Toys),Apparel Occasion,Body Type,Bottom Type,Brand,Category,...,Sub Brand,Suit Fit,Suit Style,Swim Style,T-Shirt Style,Top Style,Trends,Wallet Style,Weather,Wedding Occasion
PROD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
118856,,,,Child,2 years,,,,Kidorable,COAT,...,,,,,,,,,,
119976,,,,Child,2 years,,,,Kidorable,COAT,...,,,,,,,,,,
138727,,,,,,,,,Polo Ralph Lauren,COAT,...,,,,,,,,,Warm Weather,
162579,,Jackets,,Adult,,,,,Nautica,COAT,...,,,,,,,,,Warm Weather,
186774,,,,Child,,,,,Kidorable,COAT,...,,,,,,,,,,


# Merge ImageData with AttributeData

In [11]:
# Inner join: keep only image rows whose PRODUCT_ID matches a PROD_ID of the pivoted attributes.
merged_df = pd.merge(
    raw_image,
    cat_df_pivoted,
    how="inner",
    left_on="PRODUCT_ID",
    right_on="PROD_ID",
)
print("\n", "shape merged_df", merged_df.shape)
merged_df.tail(5)


 shape merged_df (7199, 94)


Unnamed: 0,PRODUCT_ID,IMAGE_ID,IMAGE_FILE,Accessories Type,Active Apparel Type,Activewear,Age Group,Age Range (Toys),Apparel Occasion,Body Type,...,Sub Brand,Suit Fit,Suit Style,Swim Style,T-Shirt Style,Top Style,Trends,Wallet Style,Weather,Wedding Occasion
7194,6491856,9818519,6/4/6491856_9818519.jpg,,,,Adult,,,,...,,,,,,,,,,
7195,6907404,10463320,6/9/6907404_10463320.jpg,,,,Adult,,,,...,,,,,,,,,,
7196,6907476,10463344,6/9/6907476_10463344.jpg,,,,Adult,,,,...,Tommy Hilfiger Adaptive,,,,,,,,,
7197,6907402,10463533,6/9/6907402_10463533.jpg,,,,Adult,,,,...,Tommy Hilfiger Adaptive,,,,,,,,,
7198,6417977,9726630,6/4/6417977_9726630.jpg,,,,Adult,,,,...,,,,,,,,,,


# Visualize final df

In [12]:
# Columns to inspect: the image file, its category, plus any optional attributes.
print(SELECTED_ATTRIBUTES)
columns_names = ['IMAGE_FILE', 'Category'] + list(SELECTED_ATTRIBUTES)
print(columns_names)

viz_df = merged_df.loc[:, columns_names]
print("\n", "shape viz_df", viz_df.shape)
print("\n", viz_df['Category'].unique())
viz_df.head(5)

[]
['IMAGE_FILE', 'Category']

 shape viz_df (7199, 2)

 ['COAT']


Unnamed: 0,IMAGE_FILE,Category
0,7/7/7763181_11114403.jpg,COAT
1,6/6/6663115_10101404.jpg,COAT
2,6/6/6663115_10101405.jpg,COAT
3,6/6/6663115_10101406.jpg,COAT
4,4/8/4826343_9777264.jpg,COAT


# Save df

In [13]:
# Persist the merged frame to SAVE_OUTPUT_AS under the key "macys_images".
merged_df.to_hdf(path_or_buf=SAVE_OUTPUT_AS, key="macys_images")
print("Saved")

Saved


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['IMAGE_FILE', 'Accessories Type', 'Active Apparel Type', 'Activewear', 'Age Group', 'Age Range (Toys)', 'Apparel Occasion', 'Body Type', 'Bottom Type', 'Brand', 'Category', 'Category of Business', 'Characters', 'Characters - EA', 'Coat Feature', 'Coat Length', 'Coat Style', 'Coat Weight', 'Cold Weather', 'Collar', 'Collection', 'College Team', 'Color Family', 'Dad Style', 'Denim Fit', 'Denim Wash', 'Department Type', 'Dress Length', 'Dress Style', 'Fabric', 'Fabric Pattern', 'Fabric Property', 'Fill', 'Gender', 'Gender/Age', 'Handbag Material', 'Handbag Style', 'INC Trends', 'Impulse Mens Trend', 'Impulse Trends', 'Jacket Closure', 'Jacket Length', 'Jacket Style', 'Kids Apparel Type', "Kids' Accessories", 'Kitchen Gadget', 'League', "Levi's Fabrication", "Levi's Style", 'Lids Clothing & Accessories', 'Linen/Linen Blend', 'Loungewear'

# Reload for QA

In [14]:
# Read the saved frame back to check that it round-trips.
# The key is given explicitly (the same one used when saving): to_hdf appends to an
# existing file by default, and read_hdf without a key raises if the file holds
# more than one dataset.
reloaded_df = pd.read_hdf(SAVE_OUTPUT_AS, key="macys_images")
print("\n", "shape reloaded", reloaded_df.shape)
reloaded_df.head()


 shape reloaded (7199, 94)


Unnamed: 0,PRODUCT_ID,IMAGE_ID,IMAGE_FILE,Accessories Type,Active Apparel Type,Activewear,Age Group,Age Range (Toys),Apparel Occasion,Body Type,...,Sub Brand,Suit Fit,Suit Style,Swim Style,T-Shirt Style,Top Style,Trends,Wallet Style,Weather,Wedding Occasion
0,7763181,11114403,7/7/7763181_11114403.jpg,,,,Adult,,,,...,,,,,,,,,,
1,6663115,10101404,6/6/6663115_10101404.jpg,,,,Adult,,,,...,,,,,,,,,,
2,6663115,10101405,6/6/6663115_10101405.jpg,,,,Adult,,,,...,,,,,,,,,,
3,6663115,10101406,6/6/6663115_10101406.jpg,,,,Adult,,,,...,,,,,,,,,,
4,4826343,9777264,4/8/4826343_9777264.jpg,,,,Adult,,,,...,,,,,,,,,Cold Weather,


In [15]:
# Count how many reloaded rows carry exactly each selected category value.
for cat in SELECTED_CATEGORIES:
    is_cat = reloaded_df['Category'] == cat
    tmp_df = reloaded_df.loc[is_cat]
    print(cat, tmp_df.shape[0])

COAT 7199
COATS 0
