In [1]:
##importing libraries
import numpy as np
import pandas as pd

from fuzzywuzzy import process
from nltk.tokenize import word_tokenize



In [2]:
##import the product catalogue dataset
articles=pd.read_csv('articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [3]:
##checking all columns in articles
articles.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [4]:
##list of columns used as attributes
att_cols=['product_type_name','product_group_name','graphical_appearance_name','colour_group_name',
          'perceived_colour_value_name','department_name','index_group_name','index_name','garment_group_name',
          'section_name']

In [5]:
##creating a dictionary with attributes and categories/subcategories mapped to their unique values
master_dict={}
for col in att_cols:
    master_dict[col]=articles[col].unique()
##show master dict
master_dict

{'product_type_name': array(['Vest top', 'Bra', 'Underwear Tights', 'Socks', 'Leggings/Tights',
        'Sweater', 'Top', 'Trousers', 'Hair clip', 'Umbrella',
        'Pyjama jumpsuit/playsuit', 'Bodysuit', 'Hair string', 'Unknown',
        'Hoodie', 'Sleep Bag', 'Hair/alice band', 'Belt', 'Boots',
        'Bikini top', 'Swimwear bottom', 'Underwear bottom', 'Swimsuit',
        'Skirt', 'T-shirt', 'Dress', 'Hat/beanie', 'Kids Underwear top',
        'Shorts', 'Shirt', 'Cap/peaked', 'Pyjama set', 'Sneakers',
        'Sunglasses', 'Cardigan', 'Gloves', 'Earring', 'Bag', 'Blazer',
        'Other shoe', 'Jumpsuit/Playsuit', 'Sandals', 'Jacket', 'Costumes',
        'Robe', 'Scarf', 'Coat', 'Other accessories', 'Polo shirt',
        'Slippers', 'Night gown', 'Alice band', 'Straw hat', 'Hat/brim',
        'Tailored Waistcoat', 'Necklace', 'Ballerinas', 'Tie',
        'Pyjama bottom', 'Felt hat', 'Bracelet', 'Blouse',
        'Outdoor overall', 'Watch', 'Underwear body', 'Beanie', 'Giftbox',
 

In [9]:
##function to get fuzzy matches from the query using the master dict
##gives the attributes/categories/subcategories the words in the query match to, and the values they match to
def get_matches(dct):
    ##take in query and convert into a list of strings
    query=word_tokenize(input())
    ##a dict to store all matches
    output_dct=dict()
    for word in query:
        for key in dct.keys():
            ##calculate matching score with top 3 fuzzy matches
            result=process.extract(word,dct[key],limit=10)
            for i in range(0,len(result)):
                if result[i][1]>=90:##threshold to be determined
                    if key not in output_dct.keys():
                        output_dct[key]=np.array(result[i][0])
                    elif key in output_dct.keys():
                        output_dct[key]=np.append(output_dct[key],result[i][0])
    return output_dct

In [10]:
get_matches(master_dict)

black denims for men


{'colour_group_name': array('Black', dtype='<U5'),
 'graphical_appearance_name': array(['Denim', 'Treatment', 'Placement print'], dtype='<U15'),
 'product_type_name': array('Garment Set', dtype='<U11'),
 'product_group_name': array(['Garment Upper body', 'Garment Lower body', 'Garment Full body',
        'Garment and Shoe care'], dtype='<U21'),
 'department_name': array(['Men Sport Woven', 'Men Sport Bottoms', 'Men Sport Acc',
        'Men Sport Tops', 'Denim Other Garments', 'Equatorial Assortment',
        'Asia Assortment'], dtype='<U21'),
 'index_group_name': array('Menswear', dtype='<U8'),
 'index_name': array('Menswear', dtype='<U8'),
 'section_name': array(['Womens Everyday Basics', 'Womens Lingerie', 'Men Underwear',
        'Womens Small accessories', 'Men H&M Sport', 'Mens Outerwear',
        'Womens Big accessories', 'Men Accessories',
        'Men Suits & Tailoring', 'Men Shoes'], dtype='<U24')}

In [11]:
get_matches(master_dict)

kurtas for special occasions


{'department_name': array(['EQ & Special Collections', 'Special Collection'], dtype='<U24'),
 'garment_group_name': array('Special Offers', dtype='<U14'),
 'section_name': array('Special Collections', dtype='<U19')}

In [None]:
get_matches(master_dict)