In [1]:
##importing libraries
import numpy as np
import pandas as pd

from fuzzywuzzy import process
from nltk.tokenize import word_tokenize



In [2]:
##import the product catalogue dataset
articles=pd.read_csv('articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [3]:
##checking all columns in articles
articles.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [4]:
##list of columns used as attributes
att_cols=['product_type_name','product_group_name','graphical_appearance_name','colour_group_name',
          'perceived_colour_value_name','department_name','index_group_name','index_name','garment_group_name',
          'section_name']

In [5]:
##creating a dictionary with attributes and categories/subcategories mapped to their unique values
master_dict={}
for col in att_cols:
    master_dict[col]=articles[col].unique()
##show master dict
master_dict

{'product_type_name': array(['Vest top', 'Bra', 'Underwear Tights', 'Socks', 'Leggings/Tights',
        'Sweater', 'Top', 'Trousers', 'Hair clip', 'Umbrella',
        'Pyjama jumpsuit/playsuit', 'Bodysuit', 'Hair string', 'Unknown',
        'Hoodie', 'Sleep Bag', 'Hair/alice band', 'Belt', 'Boots',
        'Bikini top', 'Swimwear bottom', 'Underwear bottom', 'Swimsuit',
        'Skirt', 'T-shirt', 'Dress', 'Hat/beanie', 'Kids Underwear top',
        'Shorts', 'Shirt', 'Cap/peaked', 'Pyjama set', 'Sneakers',
        'Sunglasses', 'Cardigan', 'Gloves', 'Earring', 'Bag', 'Blazer',
        'Other shoe', 'Jumpsuit/Playsuit', 'Sandals', 'Jacket', 'Costumes',
        'Robe', 'Scarf', 'Coat', 'Other accessories', 'Polo shirt',
        'Slippers', 'Night gown', 'Alice band', 'Straw hat', 'Hat/brim',
        'Tailored Waistcoat', 'Necklace', 'Ballerinas', 'Tie',
        'Pyjama bottom', 'Felt hat', 'Bracelet', 'Blouse',
        'Outdoor overall', 'Watch', 'Underwear body', 'Beanie', 'Giftbox',
 

In [6]:
##function to get fuzzy matches from the query using the master dict
##gives the attributes/categories/subcategories the words in the query match to, and the values they match to
def get_matches(dct):
    ##take in query and convert into a list of strings
    query=word_tokenize(input())
    ##a dict to store all matches
    output_dct=dict()
    for word in query:
        for key in dct.keys():
            ##calculate matching score with top 3 fuzzy matches
            result=process.extract(word,dct[key],limit=10)
            for i in range(0,len(result)):
                if result[i][1]>=90:##threshold to be determined
                    if key not in output_dct.keys():
                        output_dct[key]=np.array(result[i][0])
                    elif key in output_dct.keys():
                        output_dct[key]=np.append(output_dct[key],result[i][0])
    return output_dct

In [7]:
get_matches(master_dict)

black denims for men


{'colour_group_name': array('Black', dtype='<U5'),
 'graphical_appearance_name': array(['Denim', 'Treatment', 'Placement print'], dtype='<U15'),
 'product_type_name': array('Garment Set', dtype='<U11'),
 'product_group_name': array(['Garment Upper body', 'Garment Lower body', 'Garment Full body',
        'Garment and Shoe care'], dtype='<U21'),
 'department_name': array(['Men Sport Woven', 'Men Sport Bottoms', 'Men Sport Acc',
        'Men Sport Tops', 'Denim Other Garments', 'Equatorial Assortment',
        'Asia Assortment'], dtype='<U21'),
 'index_group_name': array('Menswear', dtype='<U8'),
 'index_name': array('Menswear', dtype='<U8'),
 'section_name': array(['Womens Everyday Basics', 'Womens Lingerie', 'Men Underwear',
        'Womens Small accessories', 'Men H&M Sport', 'Mens Outerwear',
        'Womens Big accessories', 'Men Accessories',
        'Men Suits & Tailoring', 'Men Shoes'], dtype='<U24')}

In [8]:
get_matches(master_dict)

kurtas for special occasions


{'department_name': array(['EQ & Special Collections', 'Special Collection'], dtype='<U24'),
 'garment_group_name': array('Special Offers', dtype='<U14'),
 'section_name': array('Special Collections', dtype='<U19')}

In [9]:
get_matches(master_dict)

red floral skirt


{'product_type_name': array(['Tailored Waistcoat', 'Skirt'], dtype='<U18'),
 'colour_group_name': array(['Red', 'Dark Red', 'Other Red', 'Light Red'], dtype='<U9'),
 'department_name': array(['Everyday Waredrobe Denim', 'Skirt', 'Skirts', 'Trousers & Skirt',
        'Shorts & Skirts', 'Skirts DS'], dtype='<U24'),
 'garment_group_name': array(['Skirts', 'Dresses/Skirts girls'], dtype='<U20')}

In [21]:
##estimating threshold value for each column

In [69]:
threshold_dict={'product_type_name':90,
               'product_group_name':75,
               'graphical_appearance_name':75,
               'colour_group_name':90,
               'perceived_colour_value_name':90,
               'department_name':75,
               'index_group_name':70,
               'index_name':60,
               'garment_group_name':64,
               'section_name':95}

In [30]:
##'product_type_name'
sample=['shorts','top','polo tshirts','casual tshirts','shoes','trousers','boxers','chinos','jeans','skirts']

for word in sample:
    result=process.extract(word,master_dict['product_type_name'],limit=3)
    print(f'{word}:{result}')

shorts:[('Shorts', 100), ('Shirt', 73), ('Polo shirt', 66)]
top:[('Top', 100), ('Vest top', 90), ('Bikini top', 90)]
polo tshirts:[('Polo shirt', 91), ('Shirt', 90), ('T-shirt', 77)]
casual tshirts:[('Shirt', 90), ('T-shirt', 77), ('Shorts', 75)]
shoes:[('Flat shoes', 90), ('Other shoe', 80), ('Flat shoe', 80)]
trousers:[('Trousers', 100), ('Outdoor trousers', 90), ('Top', 60)]
boxers:[('Outdoor trousers', 60), ('Nipple covers', 60), ('Stain remover spray', 60)]
chinos:[('Cushion', 62), ('Moccasins', 60), ('Keychain', 57)]
jeans:[('Earrings', 62), ('Beanie', 55), ('Hat/beanie', 54)]
skirts:[('Skirt', 91), ('Shirt', 73), ('Shorts', 67)]


In [37]:
##'product_group_name'
sample=['accessory','cosmetics','home furniture','underwear','shoe care','socks','uppers','lowers','lower']

for word in sample:
    result=process.extract(word,master_dict['product_group_name'],limit=3)
    print(f'{word}:{result}')

accessory:[('Accessories', 80), ('Stationery', 42), ('Garment Lower body', 37)]
cosmetics:[('Cosmetic', 94), ('Items', 45), ('Socks & Tights', 43)]
home furniture:[('Furniture', 90), ('Fun', 60), ('Shoes', 57)]
underwear:[('Underwear', 100), ('Underwear/nightwear', 90), ('Fun', 60)]
shoe care:[('Garment and Shoe care', 90), ('Shoes', 76), ('Garment Lower body', 48)]
socks:[('Socks & Tights', 90), ('Shoes', 60), ('Accessories', 38)]
uppers:[('Garment Upper body', 78), ('Underwear', 45), ('Underwear/nightwear', 45)]
lowers:[('Garment Lower body', 78), ('Shoes', 55), ('Underwear', 50)]
lower:[('Garment Lower body', 90), ('Underwear', 54), ('Nightwear', 54)]


In [41]:
##'graphical_appearance_name'
sample=['solids','checked','stripes','striped','meshy','checkered','dotted','block printed','printed']

for word in sample:
    result=process.extract(word,master_dict['graphical_appearance_name'],limit=3)
    print(f'{word}:{result}')

solids:[('Solid', 91), ('Mixed solid/pattern', 78), ('Embroidery', 45)]
checked:[('Check', 83), ('Dot', 45), ('Lace', 45)]
stripes:[('Stripe', 92), ('Neps', 51), ('Mesh', 51)]
striped:[('Stripe', 92), ('Solid', 50), ('Dot', 45)]
meshy:[('Mesh', 89), ('Neps', 44), ('Treatment', 40)]
checkered:[('Check', 90), ('Dot', 45), ('Lace', 45)]
dotted:[('Dot', 90), ('All over pattern', 45), ('Other structure', 45)]
block printed:[('Front print', 58), ('Placement print', 57), ('Colour blocking', 50)]
printed:[('Front print', 75), ('Placement print', 75), ('Transparent', 56)]


In [43]:
#'colour_group_name'
sample=['blackish','blue','maroon','burgundy','red','reddish','pinkish','yellow','grey','green']

for word in sample:
    result=process.extract(word,master_dict['colour_group_name'],limit=3)
    print(f'{word}:{result}')

blackish:[('Black', 90), ('Dark Blue', 45), ('Yellowish Brown', 45)]
blue:[('Blue', 100), ('Light Blue', 90), ('Dark Blue', 90)]
maroon:[('Dark Orange', 47), ('Yellowish Brown', 45), ('Transparent', 45)]
burgundy:[('Gold', 45), ('Blue', 45), ('Yellowish Brown', 43)]
red:[('Red', 100), ('Dark Red', 90), ('Other Red', 90)]
reddish:[('Red', 90), ('Greyish Beige', 66), ('Greenish Khaki', 64)]
pinkish:[('Pink', 90), ('Greenish Khaki', 51), ('Dark Pink', 50)]
yellow:[('Yellow', 100), ('Yellowish Brown', 90), ('Dark Yellow', 90)]
grey:[('Grey', 100), ('Light Grey', 90), ('Dark Grey', 90)]
green:[('Green', 100), ('Greenish Khaki', 90), ('Dark Green', 90)]


In [46]:
#'perceived_colour_value_name'
sample=['darkish','lightish','dust','brightness','light','dusty','medium']

for word in sample:
    result=process.extract(word,master_dict['perceived_colour_value_name'],limit=3)
    print(f'{word}:{result}')

darkish:[('Dark', 90), ('Bright', 46), ('Light', 33)]
lightish:[('Light', 90), ('Bright', 57), ('Dusty Light', 53)]
dust:[('Dusty Light', 90), ('Medium Dusty', 90), ('Medium', 45)]
brightness:[('Bright', 90), ('Light', 72), ('Dusty Light', 38)]
light:[('Light', 100), ('Dusty Light', 90), ('Bright', 73)]
dusty:[('Dusty Light', 90), ('Medium Dusty', 90), ('Medium', 36)]
medium:[('Medium', 100), ('Medium Dusty', 90), ('Undefined', 40)]


In [52]:
#'department_name'
sample=['trousers','bottoms','belt','shoes','sunglasses','dress','shorts','pants','denims','jeans','formal','formals',
        'basic','scarf','hanky','hankies','jacket']

for word in sample:
    result=process.extract(word,master_dict['department_name'],limit=10)
    print(f'{word}:{result}')
    print('\n')

trousers:[('Trousers', 100), ('Trousers DS', 95), ('Trouser', 93), ('Denim Trousers', 90), ('Trousers & Skirt', 90), ('Denim trousers', 90), ('Young Boy Trouser', 84), ('Trouser S&T', 84), ('Kids Boy Trouser', 84), ('Kids Girl Trouser', 84)]


bottoms:[('Bottoms', 100), ('AK Bottoms', 95), ('Men Sport Bottoms', 90), ('Woven bottoms', 90), ('Ladies Sport Bottoms', 90), ('Projects Woven Bottoms', 90), ('Bottoms Girls', 90), ('Woven bottoms inactive from S.7', 90), ('Bottoms Boys', 90), ('Tops & Bottoms Other', 90)]


belt:[('Belts', 89), ('Everyday Waredrobe Denim', 57), ('Suit jacket', 51), ('Jacket', 51), ('Jacket Casual', 49), ('Shopbasket Lingerie', 49), ('EQ & Special Collections', 45), ('Men Sport Bottoms', 45), ('Shopbasket Socks', 45), ('Jacket Street', 45)]


shoes:[('Shoes', 100), ('Divided Shoes', 90), ('Shoes / Boots inactive from s5', 90), ('Shoes Other', 90), ('Kids Boy Shoes', 90), ('Young Boy Shoes', 90), ('Kids Girl Shoes', 90), ('Baby Shoes', 90), ('Young Girl Shoes', 9

In [55]:
#'index_group_name'
sample=['ladies','lady','man','men','babies','childrens','sports','sportsperson','sportswear','babywear']

for word in sample:
    result=process.extract(word,master_dict['index_group_name'],limit=3)
    print(f'{word}:{result}')

ladies:[('Ladieswear', 90), ('Baby/Children', 50), ('Divided', 46)]
lady:[('Ladieswear', 68), ('Baby/Children', 45), ('Menswear', 26)]
man:[('Menswear', 60), ('Ladieswear', 30), ('Baby/Children', 30)]
men:[('Menswear', 90), ('Baby/Children', 60), ('Ladieswear', 30)]
babies:[('Ladieswear', 60), ('Baby/Children', 53), ('Divided', 31)]
childrens:[('Baby/Children', 73), ('Divided', 38), ('Menswear', 35)]
sports:[('Sport', 91), ('Ladieswear', 32), ('Menswear', 29)]
sportsperson:[('Sport', 90), ('Menswear', 26), ('Baby/Children', 24)]
sportswear:[('Sport', 90), ('Menswear', 56), ('Ladieswear', 50)]
babywear:[('Ladieswear', 56), ('Menswear', 50), ('Baby/Children', 48)]


In [60]:
#'index_name'
sample=['ladies','lady','man','men','babies','childrens','sports','sportsperson','sportswear','babywear','swim','wear']

for word in sample:
    result=process.extract(word,master_dict['index_name'],limit=3)
    print(f'{word}:{result}')

ladies:[('Ladieswear', 90), ('Ladies Accessories', 90), ('Divided', 46)]
lady:[('Ladieswear', 68), ('Ladies Accessories', 68), ('Baby Sizes 50-98', 45)]
man:[('Menswear', 60), ('Ladieswear', 30), ('Lingeries/Tights', 30)]
men:[('Menswear', 90), ('Children Sizes 92-140', 60), ('Children Sizes 134-170', 60)]
babies:[('Ladieswear', 60), ('Baby Sizes 50-98', 60), ('Ladies Accessories', 60)]
childrens:[('Children Sizes 92-140', 80), ('Children Sizes 134-170', 80), ('Children Accessories, Swimwear', 80)]
sports:[('Sport', 91), ('Ladies Accessories', 60), ('Children Accessories, Swimwear', 60)]
sportsperson:[('Sport', 90), ('Ladies Accessories', 40), ('Children Accessories, Swimwear', 38)]
sportswear:[('Sport', 90), ('Menswear', 56), ('Ladieswear', 50)]
babywear:[('Ladieswear', 56), ('Menswear', 50), ('Baby Sizes 50-98', 45)]
swim:[('Children Accessories, Swimwear', 90), ('Ladieswear', 45), ('Lingeries/Tights', 45)]
wear:[('Ladieswear', 90), ('Menswear', 90), ('Children Accessories, Swimwear'

In [63]:
#'garment_group_name'
sample=['jersies','underwear','dress','blouse','knitted','shoe','outdoor','accessory','socks']

for word in sample:
    result=process.extract(word,master_dict['garment_group_name'],limit=3)
    print(f'{word}:{result}')

jersies:[('Jersey Basic', 66), ('Jersey Fancy', 66), ('Woven/Jersey/Knitted mix Baby', 64)]
underwear:[('Under-, Nightwear', 69), ('Knitwear', 59), ('Swimwear', 47)]
dress:[('Dresses Ladies', 90), ('Dresses/Skirts girls', 90), ('Dressed', 83)]
blouse:[('Blouses', 92), ('Trousers Denim', 60), ('Trousers', 57)]
knitted:[('Woven/Jersey/Knitted mix Baby', 90), ('Knitwear', 67), ('Under-, Nightwear', 49)]
shoe:[('Shoes', 89), ('Shorts', 68), ('Blouses', 51)]
outdoor:[('Outdoor', 100), ('Trousers', 40), ('Under-, Nightwear', 39)]
accessory:[('Accessories', 80), ('Shorts', 50), ('Jersey Basic', 46)]
socks:[('Socks and Tights', 90), ('Shoes', 60), ('Shorts', 55)]


In [70]:
#'section_name'
sample=['men','women','babies','boys','girls','lady','man']

for word in sample:
    result=process.extract(word,master_dict['section_name'],limit=5)
    print(f'{word}:{result}')

men:[('Womens Everyday Basics', 90), ('Womens Lingerie', 90), ('Men Underwear', 90), ('Womens Small accessories', 90), ('Men H&M Sport', 90)]
women:[('Womens Everyday Basics', 90), ('Womens Lingerie', 90), ('Womens Nightwear, Socks & Tigh', 90), ('Womens Small accessories', 90), ('Womens Big accessories', 90)]
babies:[('Womens Everyday Basics', 60), ('Baby Essentials & Complements', 60), ('Divided Basics', 60), ('Girls Underwear & Basics', 60), ('Boys Underwear & Basics', 60)]
boys:[('Boys Underwear & Basics', 90), ('Kids Boy', 77), ('Young Boy', 77), ('Baby Boy', 77), ('Womens Everyday Basics', 45)]
girls:[('Girls Underwear & Basics', 90), ('Young Girl', 80), ('Baby Girl', 80), ('Kids Girl', 80), ('Baby Essentials & Complements', 54)]
lady:[('Ladies Denim', 68), ('Ladies H&M Sport', 68), ('Ladies Other', 68), ('Womens Everyday Basics', 45), ('Womens Small accessories', 45)]
man:[('Womens Everyday Basics', 60), ('Womens Lingerie', 60), ('Men Underwear', 60), ('Womens Small accessories'

In [71]:
threshold_dict={'product_type_name':90,
               'product_group_name':75,
               'graphical_appearance_name':75,
               'colour_group_name':90,
               'perceived_colour_value_name':90,
               'department_name':75,
               'index_group_name':70,
               'index_name':60,
               'garment_group_name':64,
               'section_name':95}

In [72]:
##function to get fuzzy matches from the query using the master dict
##gives the attributes/categories/subcategories the words in the query match to, and the values they match to
def get_matches(dct):
    ##take in query and convert into a list of strings
    query=word_tokenize(input())
    ##a dict to store all matches
    output_dct=dict()
    for word in query:
        for key in dct.keys():
            ##calculate matching score with top 3 fuzzy matches
            result=process.extract(word,dct[key],limit=10)
            for i in range(0,len(result)):
                if result[i][1]>=threshold_dict[key]:##threshold value taken from threshold dict
                    if key not in output_dct.keys():
                        output_dct[key]=np.array(result[i][0])
                    elif key in output_dct.keys():
                        output_dct[key]=np.append(output_dct[key],result[i][0])
    return output_dct

In [74]:
get_matches(master_dict)

black denim for women


{'colour_group_name': array('Black', dtype='<U5'),
 'graphical_appearance_name': array('Denim', dtype='<U5'),
 'department_name': array(['Kids Boy Denim', 'Denim Other Garments',
        'Everyday Waredrobe Denim', 'Denim Trousers', 'Young Boy Denim',
        'Young Girl Denim', 'Denim shorts', 'Denim trousers',
        'Denim wardrobe H&M man inactive from S.6', 'Kids Girl Denim',
        'Woven'], dtype='<U40'),
 'garment_group_name': array(['Trousers Denim', 'Woven/Jersey/Knitted mix Baby'], dtype='<U29'),
 'index_name': array(['Ladies Accessories', 'Sport'], dtype='<U18')}

In [75]:
get_matches(master_dict)

yellow kurtas for special occasions


{'colour_group_name': array(['Yellow', 'Yellowish Brown', 'Dark Yellow', 'Other Yellow',
        'Light Yellow'], dtype='<U15'),
 'garment_group_name': array(['Skirts', 'Special Offers'], dtype='<U14'),
 'index_name': array(['Ladies Accessories', 'Sport'], dtype='<U18'),
 'department_name': array(['EQ & Special Collections', 'Special Collection', 'Woven Occasion',
        'Jersey Occasion'], dtype='<U24')}

In [76]:
get_matches(master_dict)

black solid jeans for men


{'colour_group_name': array('Black', dtype='<U5'),
 'graphical_appearance_name': array(['Solid', 'Mixed solid/pattern', 'Treatment', 'Placement print'],
       dtype='<U19'),
 'index_name': array(['Ladies Accessories', 'Sport', 'Menswear', 'Children Sizes 92-140',
        'Children Sizes 134-170'], dtype='<U22'),
 'product_type_name': array('Garment Set', dtype='<U11'),
 'product_group_name': array(['Garment Upper body', 'Garment Lower body', 'Garment Full body',
        'Garment and Shoe care'], dtype='<U21'),
 'department_name': array(['Men Sport Woven', 'Men Sport Bottoms', 'Men Sport Acc',
        'Men Sport Tops', 'Denim Other Garments', 'Equatorial Assortment',
        'Asia Assortment'], dtype='<U21'),
 'index_group_name': array('Menswear', dtype='<U8')}

In [77]:
get_matches(master_dict)

red dress for women


{'product_type_name': array(['Tailored Waistcoat', 'Dress', 'Underdress'], dtype='<U18'),
 'colour_group_name': array(['Red', 'Dark Red', 'Other Red', 'Light Red'], dtype='<U9'),
 'department_name': array(['Everyday Waredrobe Denim', 'Dress', 'Dresses DS',
        'Projects Dresses', 'Kids Dress-up/Football', 'Young Girl Dresses',
        'Blouse & Dress', 'Kids Girl Dresses', 'Dress-up Boys',
        'AK Dresses & Outdoor', 'Dresses', 'Woven'], dtype='<U24'),
 'index_name': array(['Lingeries/Tights', 'Ladies Accessories', 'Children Sizes 92-140',
        'Divided', 'Children Sizes 134-170', 'Ladies Accessories', 'Sport'],
       dtype='<U22'),
 'garment_group_name': array(['Dresses Ladies', 'Dresses/Skirts girls', 'Dressed',
        'Woven/Jersey/Knitted mix Baby'], dtype='<U29')}