In [None]:
import numpy as np
import dask.dataframe as dd
import pandas as pd
from mlt.default import selector as s
import cv2, os
import matplotlib.pyplot as plt
%matplotlib inline

# Configs

In [2]:
ATTR_DATA_FILE = "raw_attribute_data_shrek_cleaned.h5"
IMAGE_DATA_FILE = "raw_image_data_patrick.h5"



# Please choose limited CATEGORIES. This will avoid Memory issues.
SELECTED_CATEGORIES = ['BOOT', 'BOOTS', "PUMP", "PUMPS", "FLAT", "FLATS", 
                       "HANDBAG", "HANDBAGS", 
                       "WATCH", "WATCHES",
                       "PANTS", "JEANS", "DRESS",
                       "TOP", "TOPS", "SWEATER", "SWEATERS", "COAT", "COATS"]

SELECTED_CATEGORIES = ['BATH_RUGS', 'RUG', 'AREA_AREA_RUG', 'AREA_RUG', 'BATH_RUG']

# optional config. Make it empty list if not used.
SELECTED_ATTRIBUTES = ['Handbag Style', 'Handbag Size', 'Handbag Occasion', 'Handbag Material']
SELECTED_ATTRIBUTES = []

# CHANGE THIS
SAVE_OUTPUT_AS = "RUGS_VALIDATION_DATA.h5"

# Configs Validation

In [3]:
if len(SELECTED_CATEGORIES) == 0:
    print("ERROR: Please limit your SELECTED_CATEGORIES. Otherwise it takes too long. May also run out of memory.")
    for _ in range(10):
        print("STOP")

# Load IDM raw files

In [5]:
!ls *.h5

raw_attribute_data_shrek_cleaned.h5  raw_text_data_shrek.h5
raw_attribute_data_shrek.h5	     RUGS_IDM_1118_and_0119_DATA.h5
raw_image_data_denis.h5		     RUGS_IDM_Nov2018_DATA.h5
raw_image_data_patrick.h5	     RUGS_VALIDATION_DATA.h5


In [None]:
# load attr file
raw_attr = pd.read_hdf(ATTR_DATA_FILE)
print("count raw_attr", len(raw_attr))
raw_attr.head()

In [None]:
unique_cats = raw_attr[raw_attr['ATTR_NAME'] == 'Category']['ATTR_VAL']
unique_cats = list(set(unique_cats))
#unique_cats

In [25]:
[x for x in unique_cats if 'RUG' in x]

['BATH_RUGS', 'RUG', 'AREA_AREA_RUG', 'AREA_RUG', 'BATH_RUG']

In [18]:
# convert Top's -> TOPS 
# basically remove all single quotes and brackets so df_query will work without issues later
# 
# this step takes a long time. So I have saved its output as raw_attribute_data_shrek_cleaned.h5
def clean_category_value(row):
    name = row['ATTR_NAME']
    val = row['ATTR_VAL']
    if name == 'Category':
        val = val.upper()
        val = val.replace(" ", "_")
        val = val.replace("'", "")
    return val 

#raw_attr['ATTR_VAL'] = raw_attr.apply(clean_category_value, axis=1)
#raw_attr.head()

In [32]:
# apparently there are no duplicates ? strange. I guess if I subsetted by image id there would be
#print(raw_attr.shape, raw_attr.drop_duplicates(keep='first').shape)

In [33]:
# load image file
raw_image = pd.read_hdf(IMAGE_DATA_FILE)
raw_image = raw_image[raw_image['COLORWAY_IMAGE_ROLE_TYPE'] == 'CPRI']
raw_image = raw_image[['PRODUCT_ID', 'IMAGE_ID']]

print("count raw_image", len(raw_image))

def make_image_filename(row):
    pid = str(row['PRODUCT_ID'])
    return pid[0] + '/' + pid[1] + '/' + pid + '_' + str(row['IMAGE_ID']) + '.jpg'

raw_image['IMAGE_FILE'] = raw_image.apply(make_image_filename, axis=1)

raw_image.head()

count raw_image 705487


Unnamed: 0,PRODUCT_ID,IMAGE_ID,IMAGE_FILE
0,5923807,9359780,5/9/5923807_9359780.jpg
7,5303346,9014538,5/3/5303346_9014538.jpg
8,7118640,10436245,7/1/7118640_10436245.jpg
9,8220620,10436245,8/2/8220620_10436245.jpg
10,5303354,9014637,5/3/5303354_9014637.jpg


# Filter Attributes df for Selected Categories

In [34]:
# shoes_query_string = 'ATTR_NAME == "Category" and (ATTR_VAL == "SHOE" or ATTR_VAL == "SHOES")'
# print("\n", "query_string:", shoes_query_string)
# shoes_cat_df = raw_attr.query(shoes_query_string)
# shoes_cat_pids = list(set(shoes_cat_df['PROD_ID']))
# shoes_cat_df = raw_attr.loc[raw_attr['PROD_ID'].isin(shoes_cat_pids)]
# print("\n", "count shoes_cat_pids", len(shoes_cat_pids))

# boots_query_string = 'PROD_ID == @shoes_cat_pids and ATTR_NAME == "Shoe Type" and (ATTR_VAL == "Pump" or ATTR_VAL == "Pumps")'
# print("\n", "query_string:", boots_query_string)
# cat_df = raw_attr.query(boots_query_string)
# cat_pids = list(set(cat_df['PROD_ID']))
# cat_df = raw_attr.loc[raw_attr['PROD_ID'].isin(cat_pids)]
# print("\n", "count cat_pids", len(cat_pids))

rugs_query_string = 'PROD_ID == @rugs_cat_pids and ATTR_NAME == "Shoe Type" and (ATTR_VAL == "Pump" or ATTR_VAL == "Pumps")'
rugs_query_string = 'PROD_ID == @rugs_cat_pids and ATTR_NAME == "Shoe Type" and (ATTR_VAL == "Pump" or ATTR_VAL == "Pumps")'
rugs_query_string = 'PROD_ID == @rugs_cat_pids and ATTR_NAME == "Shoe Type" and (ATTR_VAL == "Pump" or ATTR_VAL == "Pumps")'

print("\n", "query_string:", RUGS_query_string)
cat_df = raw_attr.query(boots_query_string)
cat_pids = list(set(cat_df['PROD_ID']))
cat_df = raw_attr.loc[raw_attr['PROD_ID'].isin(cat_pids)]
print("\n", "count cat_pids", len(cat_pids))

print("\n", "df shape:", cat_df.shape)

cat_df.head()

NameError: name 'boots_query_string' is not defined

In [36]:
# if no categories are provided then use all unique categories from the data
if len(SELECTED_CATEGORIES) == 0:
    UNIQUE_CATEGORIES = list(set(raw_attr[raw_attr['ATTR_NAME'] == 'Category']['ATTR_VAL']))
    SELECTED_CATEGORIES = UNIQUE_CATEGORIES
    
# collect unique attributes for selected categories
if len(SELECTED_CATEGORIES) == 1:
    query_string = 'ATTR_NAME == "Category" and ATTR_VAL == "{}"'.format(str(SELECTED_CATEGORIES[0]))
else:
    query_string = 'ATTR_NAME == "Category" and (ATTR_VAL == "{}"'.format(str(SELECTED_CATEGORIES[0]))
    for idx, SELECTED_CATEGORY in enumerate(SELECTED_CATEGORIES):
        if idx == 0: continue
        query_string = query_string + ' or ATTR_VAL == "{}"'.format(str(SELECTED_CATEGORY))
    query_string = query_string + ")"

print("\n", "query_string:", query_string)

cat_df = raw_attr.query(query_string)
cat_pids = list(set(cat_df['PROD_ID']))
cat_df = raw_attr.loc[raw_attr['PROD_ID'].isin(cat_pids)] 
print("\n", "df shape:", cat_df.shape)

cat_df.head()


 query_string: ATTR_NAME == "Category" and (ATTR_VAL == "BATH_RUGS" or ATTR_VAL == "RUG" or ATTR_VAL == "AREA_AREA_RUG" or ATTR_VAL == "AREA_RUG" or ATTR_VAL == "BATH_RUG")

 df shape: (529262, 3)


Unnamed: 0,PROD_ID,ATTR_NAME,ATTR_VAL
21670,32016,Bath Category,Bath Rugs
21671,32016,Brand,Calvin Klein
21672,32016,Category,BATH_RUG
21673,32016,Category of Business,Bed & Bath
21674,32016,Category of Business,Home


In [37]:
raw_attr[raw_attr['PROD_ID'] == 32016]

Unnamed: 0,PROD_ID,ATTR_NAME,ATTR_VAL
21670,32016,Bath Category,Bath Rugs
21671,32016,Brand,Calvin Klein
21672,32016,Category,BATH_RUG
21673,32016,Category of Business,Bed & Bath
21674,32016,Category of Business,Home
21675,32016,Home Categories,Bath
21676,32016,Style,Classic
21677,32016,Style,Modern
21678,32016,Style,Traditional


In [38]:
image_pids = list(set(raw_image['PRODUCT_ID']))

print("cat_pids", len(cat_pids))
print("image_pids", len(image_pids))


inter = list(set(cat_pids) & set(image_pids))
print("inter", len(inter))


cat_pids 49502
image_pids 417001
inter 16078


# Pivot cat_df

In [39]:
def dedup_rows(x):
    if len(x) > 1:
        o = set([i for i in x])
        return ', '.join(o)
    else:
        return x
    
aggregations = {
    'ATTR_VAL': dedup_rows
}
cat_df_pivoted = cat_df.groupby(['PROD_ID', 'ATTR_NAME']).agg(aggregations).reset_index()
cat_df_pivoted = cat_df_pivoted.pivot(index='PROD_ID', columns='ATTR_NAME', values='ATTR_VAL')
print("\n", "shape cat_df_pivoted", cat_df_pivoted.shape)
cat_df_pivoted.head()


 shape cat_df_pivoted (49502, 87)


ATTR_NAME,Active Apparel Type,Age Group,Area Rugs,Bath Accessory Style,Bath Category,Bath Type,Bed & Bath Type,Bed Size,Bedding Color,Bedding Features,...,Table Linen Type,Textiles Bed Size,Top Style,Towel Fabric,Towel Size,Towel Style,Underwear Style,Weather,Wedding Registry,Window Type
PROD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32016,,,,,Bath Rugs,,,,,,...,,,,,,,,,,
32017,,,,,Bath Rugs,,,,,,...,,,,,,,,,,
65749,,Adult,,,Bath Rugs,,Bath,,"Green, Purple, Brown/Tan, White/Ivory, Grey",,...,,,,,,,,,,
92375,,,,,,,,,,,...,,,,,,,,,,
92530,,,,,Bath Rugs,,Bath,,,,...,,,,,,,,,,


# Merge ImageData with AttributeData

In [40]:
merged_df = raw_image.merge(cat_df_pivoted, left_on="PRODUCT_ID", right_on="PROD_ID")
print("\n", "shape merged_df", merged_df.shape)
merged_df.tail()


 shape merged_df (22949, 90)


Unnamed: 0,PRODUCT_ID,IMAGE_ID,IMAGE_FILE,Active Apparel Type,Age Group,Area Rugs,Bath Accessory Style,Bath Category,Bath Type,Bed & Bath Type,...,Table Linen Type,Textiles Bed Size,Top Style,Towel Fabric,Towel Size,Towel Style,Underwear Style,Weather,Wedding Registry,Window Type
22944,7797235,11181971,7/7/7797235_11181971.jpg,,,True,,,,,...,,,,,,,,,,
22945,8081323,11508662,8/0/8081323_11508662.jpg,,,True,,,,,...,,,,,,,,,,
22946,7689754,11090774,7/6/7689754_11090774.jpg,,,True,,,,,...,,,,,,,,,,
22947,8006588,11396879,8/0/8006588_11396879.jpg,,,True,,,,,...,,,,,,,,,,
22948,8006584,11396879,8/0/8006584_11396879.jpg,,,True,,,,,...,,,,,,,,,,


In [41]:
merged_df.columns

Index(['PRODUCT_ID', 'IMAGE_ID', 'IMAGE_FILE', 'Active Apparel Type',
       'Age Group', 'Area Rugs', 'Bath Accessory Style', 'Bath Category',
       'Bath Type', 'Bed & Bath Type', 'Bed Size', 'Bedding Color',
       'Bedding Features', 'Brand', 'Bridal', 'Case Shape', 'Category',
       'Category of Business', 'Certifications', 'Characters',
       'Characters - EA', 'Collection', 'College Team', 'Color Family',
       'Cookware Material', 'Department Type', 'Dress Occasion', 'Fabric',
       'Fabric Pattern', 'Fabric Property', 'Fill', 'Foam',
       'Furniture Category', 'Gender', 'Gender/Age', 'Gold Metal Weight',
       'Hat Style', 'Holiday', 'Holiday Decor Type', 'Home Categories',
       'Home Organization', 'Kids Bath', 'Kitchen Linen Type', 'Kitchen Type',
       'Laundry Organization Type', 'League', 'Lids Clothing & Accessories',
       'Lids For The Home', 'Makeup Category', 'Material', 'Memory Foam',
       'Memory Foam Rugs', 'Mens Product Type', 'Mom Style',
       'M

# Visualize final df

In [48]:
columns_names = [
    'PRODUCT_ID', 'IMAGE_ID','IMAGE_FILE', 'Area Rugs', 'Bath Accessory Style', 'Bath Category', 'Bath Type',
    'Bed & Bath Type', 'Brand', 'Category', 'Category of Business', 'Collection', 'Color Family', 'Department Type', 
    'Fabric', 'Fabric Pattern', 'Fabric Property', 'Fill', 'Furniture Category', 'Home Categories',
    'Home Organization', 'Kitchen Type', 'Material', 'Memory Foam', 'Memory Foam Rugs', 
    'NRF_ID', 'Outdoor Accessories', 'Product Color', 'Product Level', 'Room Type', 'Rug Color', 'Rug Construction', 
    'Rug Size', 'Rug Style', 'Rug Type', 'Rugs Pattern', 'Style', 'Sub Brand']

#columns_names = ['IMAGE_FILE', 'Category', 'Shoe Type']
print(SELECTED_ATTRIBUTES)
columns_names.extend(SELECTED_ATTRIBUTES)
print(columns_names)
viz_df = merged_df[columns_names].dropna(how='all',axis=1)
print("\n", "shape viz_df", viz_df.shape)
print("\n", viz_df['Rug Type'].unique())
viz_df.head()

[]
['PRODUCT_ID', 'IMAGE_ID', 'IMAGE_FILE', 'Area Rugs', 'Bath Accessory Style', 'Bath Category', 'Bath Type', 'Bed & Bath Type', 'Brand', 'Category', 'Category of Business', 'Collection', 'Color Family', 'Department Type', 'Fabric', 'Fabric Pattern', 'Fabric Property', 'Fill', 'Furniture Category', 'Home Categories', 'Home Organization', 'Kitchen Type', 'Material', 'Memory Foam', 'Memory Foam Rugs', 'NRF_ID', 'Outdoor Accessories', 'Product Color', 'Product Level', 'Room Type', 'Rug Color', 'Rug Construction', 'Rug Size', 'Rug Style', 'Rug Type', 'Rugs Pattern', 'Style', 'Sub Brand']

 shape viz_df (22949, 31)

 [nan 'Medium' 'Large' 'Small' 'Bath' 'Bath, Contour' 'Small, Contour'
 'Medium, Contour' 'Medium, Small' 'Medium, Holiday' 'Holiday']


Unnamed: 0,PRODUCT_ID,IMAGE_ID,IMAGE_FILE,Area Rugs,Bath Accessory Style,Bath Category,Bath Type,Bed & Bath Type,Brand,Category,...,Outdoor Accessories,Product Color,Rug Color,Rug Construction,Rug Size,Rug Style,Rug Type,Rugs Pattern,Style,Sub Brand
0,7797410,11181817,7/7/7797410_11181817.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,5x8,Indoor,,Oriental,Traditional,
1,7797410,11181813,7/7/7797410_11181813.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,5x8,Indoor,,Oriental,Traditional,
2,7797411,11181817,7/7/7797411_11181817.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,8x10,Indoor,,Oriental,Traditional,
3,7797411,11181813,7/7/7797411_11181813.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,8x10,Indoor,,Oriental,Traditional,
4,7797409,11181817,7/7/7797409_11181817.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,3x5,Indoor,,Oriental,Traditional,


In [58]:
viz_df = viz_df.drop_duplicates(subset=['IMAGE_FILE'],keep='first')

In [59]:
viz_df['Rug Type'].value_counts()

Medium             739
Small              342
Bath               234
Large              173
Medium, Contour     22
Bath, Contour       12
Small, Contour       8
Medium, Small        6
Holiday              3
Medium, Holiday      1
Name: Rug Type, dtype: int64

In [60]:
viz_df['Rug Type'].unique()

array([nan, 'Medium', 'Large', 'Small', 'Bath', 'Bath, Contour',
       'Small, Contour', 'Medium, Contour', 'Medium, Small',
       'Medium, Holiday', 'Holiday'], dtype=object)

# Save df

In [61]:
#merged_df.to_hdf(SAVE_OUTPUT_AS, key="macys_images")
viz_df.to_hdf(SAVE_OUTPUT_AS, key="macys_rug_images")
print("Saved")

Saved


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['IMAGE_FILE', 'Area Rugs', 'Bath Accessory Style', 'Bath Category', 'Bath Type', 'Bed & Bath Type', 'Brand', 'Category', 'Category of Business', 'Color Family', 'Fabric', 'Fabric Pattern', 'Fabric Property', 'Fill', 'Furniture Category', 'Home Categories', 'Kitchen Type', 'Material', 'NRF_ID', 'Outdoor Accessories', 'Product Color', 'Rug Color', 'Rug Construction', 'Rug Size', 'Rug Style', 'Rug Type', 'Rugs Pattern', 'Style', 'Sub Brand']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


# Reload for QA

In [62]:
reloaded_df = pd.read_hdf(SAVE_OUTPUT_AS,key="macys_rug_images")
print("\n", "shape reloaded", reloaded_df.shape)
reloaded_df.head()


 shape reloaded (22857, 31)


Unnamed: 0,PRODUCT_ID,IMAGE_ID,IMAGE_FILE,Area Rugs,Bath Accessory Style,Bath Category,Bath Type,Bed & Bath Type,Brand,Category,...,Outdoor Accessories,Product Color,Rug Color,Rug Construction,Rug Size,Rug Style,Rug Type,Rugs Pattern,Style,Sub Brand
0,7797410,11181817,7/7/7797410_11181817.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,5x8,Indoor,,Oriental,Traditional,
1,7797410,11181813,7/7/7797410_11181813.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,5x8,Indoor,,Oriental,Traditional,
2,7797411,11181817,7/7/7797411_11181817.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,8x10,Indoor,,Oriental,Traditional,
3,7797411,11181813,7/7/7797411_11181813.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,8x10,Indoor,,Oriental,Traditional,
4,7797409,11181817,7/7/7797409_11181817.jpg,True,,,,,Bob Mackie,AREA_RUG,...,,,Blue,,3x5,Indoor,,Oriental,Traditional,


In [63]:
reloaded_df['Category'].unique()

array(['AREA_RUG', 'BATH_RUG', 'RUG'], dtype=object)

In [64]:
prod_types = ['AREA_RUG', 'BATH_RUG', 'RUG']
for cat in SELECTED_CATEGORIES:
    tmp_df = reloaded_df[reloaded_df['Category'].isin(prod_types)]
    print(cat, len(tmp_df))

BATH_RUGS 22857
RUG 22857
AREA_AREA_RUG 22857
AREA_RUG 22857
BATH_RUG 22857


In [68]:
for attr in viz_df.columns[3:]:
    print(attr)
    print(viz_df[attr].value_counts())

Area Rugs
true    20192
Name: Area Rugs, dtype: int64
Bath Accessory Style
Bathroom Sets    7
Name: Bath Accessory Style, dtype: int64
Bath Category
Bath Rugs                          2088
Bath Rugs, Towels                    75
Bath Rugs, Bathroom Accessories      18
Bath Rugs, Shower Curtains           10
Towels                                9
Bathroom Accessories                  5
Name: Bath Category, dtype: int64
Bath Type
Solid      103
Kids         6
Holiday      2
Fashion      1
Name: Bath Type, dtype: int64
Bed & Bath Type
Bath    2210
Name: Bed & Bath Type, dtype: int64
Brand
Surya                             9174
Oriental Weavers                  1862
Macy's Fine Rug Gallery           1167
Loloi                             1107
Liora Manne'                       974
Karastan                           787
KM Home                            684
Castle Hill London                 653
Nourison                           562
Safavieh                           511
Kas             

In [70]:
final_rug_cols = [
    'PRODUCT_ID', 'IMAGE_ID', 'IMAGE_FILE', 'Brand', 'Category', 'Color Family', 'Fabric', 
    'Fabric Pattern', 'Fabric Property', 'Material','Product Color', 'Rug Color', 'Rug Construction', 
    'Rug Size', 'Rug Style', 'Rug Type', 'Rugs Pattern', 'Style', 'Sub Brand']
filtered_rug_df = viz_df[final_rug_cols]

In [75]:
print(filtered_rug_df.dropna(subset=['Rug Style']).shape)
filtered_rug_df['Rug Style'].value_counts()

(20202, 19)


Indoor                       14702
Bath                          1576
Outdoor, Indoor               1041
Kitchen, Indoor               1003
Shag                           731
Accent                         297
Outdoor                        284
Kids, Shag                      93
Doormats                        77
Accent, Bath                    72
Natural                         72
Natural, Indoor                 70
Rug Sets, Indoor                51
Doormats, Outdoor               33
Rug Sets                        28
Accent, Kitchen                 19
Rug Sets, Bath                  18
Outdoor, Doormats, Indoor       13
Novelty                          6
Novelty, Indoor                  5
Kids                             5
Shag, Indoor                     2
Kitchen                          2
Accent, Doormats                 1
Accent, Doormats, Outdoor        1
Name: Rug Style, dtype: int64

In [76]:
print(filtered_rug_df.dropna(subset=['Style']).shape)
filtered_rug_df['Style'].value_counts()

(18825, 19)


Modern                                                  8023
Traditional                                             4549
Freestyle                                               4214
Casual, Modern, Transitional                             382
Traditional, Modern                                      284
Transitional                                             226
Casual                                                   171
Casual, Modern                                           171
Modern, Freestyle                                        160
Modern, Transitional                                     120
Transitional, Freestyle, Modern, Casual, Traditional     119
Casual, Traditional, Modern                               91
Transitional, Freestyle                                   76
Traditional, Modern, Freestyle                            70
Casual, Modern, Transitional, Freestyle                   61
Modern, Transitional, Freestyle                           53
Casual, Transitional, Fr