In [1]:
import json
import nltk
import re
import numpy as np
import pandas as pd
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from nltk.stem.snowball import SnowballStemmer
from scipy.spatial.distance import cosine, euclidean

In [2]:
# read all furniture jsons
def pick_product_keys(es_result):
    """Choose the parent-product key and the product key for one SKU.

    Parameters
    ----------
    es_result : dict
        The SKU's 'elasticsearch_result' entry; its 'parentProducts' and
        'products' values must be dicts (only their keys are used).

    Returns
    -------
    tuple
        (parent, product). `parent` is the first parentProducts key, or NaN
        when there is none. `product` is the first products key that differs
        from `parent`; if the only product key equals the parent, that key is
        used; if there are no product keys at all, NaN.
    """
    parent_keys = list(es_result['parentProducts'].keys())
    product_keys = list(es_result['products'].keys())

    parent = parent_keys[0] if parent_keys else np.nan
    if not product_keys:
        # Previously this case raised IndexError when a parent product existed.
        return parent, np.nan

    # When `parent` is NaN every key compares unequal to it, so this yields
    # the first product key. When the only key equals the parent, fall back to
    # it rather than indexing past the end of the list.
    product = next((key for key in product_keys if key != parent), product_keys[0])
    return parent, product


data = []
for file_name in glob('../../desktop/data/*.json'):
    with open(file_name) as f:
        temp = json.load(f)
    for sku in temp:
        # flatten the chosen keys onto the SKU record itself
        sku['parentProducts'], sku['products'] = pick_product_keys(sku['elasticsearch_result'])
    # extend in place: `data = data + temp` re-copies the whole list per file
    data.extend(temp)


# Turn the list of SKU dicts into a DataFrame (one row per SKU).
raw_df = pd.DataFrame(data)


# Columns kept for the similarity model.
# Other columns considered earlier but not used here: 'category_level_0',
# 'category_level_1', 'attributes', 'description', 'sku_id'.
features = ['products', 'parentProducts', 'brand', 'price_hint', 'title', 'category_id']
# feature name -> column position; the distance functions use it to index raw rows
fts = {name: position for position, name in enumerate(features)}
df = raw_df.loc[:, features].copy()


In [3]:
# pre-process data
# convert to float
df['price_hint'] = df['price_hint'].astype(float)
# fill missing prices with the median price.
# The result is assigned back to the column rather than calling
# `df.price_hint.fillna(..., inplace=True)`: that form mutates an intermediate
# Series, which pandas warns about and which leaves `df` unchanged when
# Copy-on-Write is enabled.
df['price_hint'] = df['price_hint'].fillna(df['price_hint'].median())

In [4]:

# tokenize and stem function for feature extraction
# Built once at definition time instead of constructing a new stemmer (and
# re-resolving the regex) on every call; the function runs once per document.
_STEMMER = SnowballStemmer("english")
_HAS_LETTER = re.compile('[a-zA-Z]')


def tokenize_and_stem(text):
    """Split `text` into word tokens and return their English Snowball stems.

    The text is first split into sentences, then each sentence into words
    (both via nltk). Tokens containing no ASCII letter (pure numbers,
    punctuation) are dropped before stemming.

    Parameters
    ----------
    text : str
        Raw text, e.g. a product title.

    Returns
    -------
    list of str
        Stems of the retained tokens, in their original order.
    """
    tokens = (word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent))
    return [_STEMMER.stem(token) for token in tokens if _HAS_LETTER.search(token)]

# title processing
def title_process(df, n_components=300, random_state=0):
    """Embed the 'title' column as dense, row-normalised LSA vectors.

    Steps: TF-IDF over stemmed tokens -> truncated SVD (latent semantic
    analysis) -> per-row normalisation with sklearn's `Normalizer`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'title' column of strings.
    n_components : int, default 300
        Number of SVD components to keep. An earlier experiment (since removed)
        swept this over range(200, 1001, 50) while tracking
        `explained_variance_ratio_.sum()`.
    random_state : int or None, default 0
        Seed for the randomized SVD solver. The original code used None, which
        made the embedding (and every distance built on it) differ between
        runs; pass None to restore that behaviour.

    Returns
    -------
    numpy.ndarray
        One row per row of `df`, `n_components` columns.
    """
    # min_df=1 keeps every term that occurs at all, which is what the former
    # integer min_df=0 meant; newer scikit-learn releases reject the integer 0.
    # token_pattern=None silences the "token_pattern will not be used" warning
    # that is emitted when a custom tokenizer is supplied.
    # NOTE(review): stop_words='english' is matched against *stemmed* tokens,
    # so some stop words may slip through -- confirm this is acceptable.
    tf = TfidfVectorizer(analyzer='word', min_df=1, max_df=0.9,
                         tokenizer=tokenize_and_stem, token_pattern=None,
                         stop_words='english')
    tfidf_matrix = tf.fit_transform(df['title'])

    # Latent semantic analysis and re-normalization (dimension reduction)
    svd = TruncatedSVD(n_components=n_components, algorithm='randomized', n_iter=5,
                       random_state=random_state, tol=0.0)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    return lsa.fit_transform(tfidf_matrix)

# Reduced title vectors, one row per row of `df`.
title_mat = title_process(df)
# Working matrix for the distance functions: the raw feature columns of `df`
# come first, followed by the title vector columns.
mat = np.hstack((df.values, title_mat))


In [5]:
def prod_process(a, b):
    """Distance between two rows based on their product / parent-product keys.

    `a` and `b` are rows indexed through `fts`. Returned values:
      0.8  either row has no product
      0    identical products
      0.1  one row's product equals the other row's parent product
      0.3  one row's parent product, or product, is contained in the other
           row's product (`in` containment test)
      1    none of the above
    """
    prod_a, prod_b = a[fts['products']], b[fts['products']]
    parent_a, parent_b = a[fts['parentProducts']], b[fts['parentProducts']]

    # either product is missing
    if pd.isnull(prod_a) or pd.isnull(prod_b):
        return 0.8
    # same product
    if prod_a == prod_b:
        return 0
    # product of one matches the parent product of the other
    if prod_a == parent_b or prod_b == parent_a:
        return 0.1
    # a parent product, or a product, is part of the other row's product
    if (
        (pd.notnull(parent_a) and parent_a in prod_b)
        or (pd.notnull(parent_b) and parent_b in prod_a)
        or prod_a in prod_b
        or prod_b in prod_a
    ):
        return 0.3
    return 1


# price processing
def price_process(a, b):
    """Relative price difference |pa - pb| / (pa + pb) between two rows.

    `a` and `b` are rows indexed through `fts`; their 'price_hint' entries
    are used. When the two prices sum to zero the function returns 0.0 instead
    of dividing by zero.
    NOTE(review): assumes prices are non-negative, so a zero sum means both
    prices are 0 -- confirm against the data.
    """
    price_a = a[fts['price_hint']]
    price_b = b[fts['price_hint']]
    total = price_a + price_b
    if total == 0:
        # both prices are zero: identical prices, no distance
        return 0.0
    return np.abs(price_a - price_b) / total


# brand processing
def brand_process(a, b):
    """Return 0 if the 'brand' entries of rows `a` and `b` compare equal, else 1."""
    brand_col = fts['brand']
    same_brand = a[brand_col] == b[brand_col]
    return 0 if same_brand else 1


# calculate weighted distance
def mixed_dist(a, b, prod_wt=0.5, brand_wt=0.2, title_wt=0.2, price_wt=0.1):
    """Weighted sum of product, brand, title and price distances between two rows.

    Parameters
    ----------
    a, b : 1-D array-like
        Rows laid out like `mat`: the raw feature columns (indexed via `fts`)
        followed by the title vector.
    prod_wt, brand_wt, title_wt, price_wt : float
        Weights of the individual distances (defaults sum to 1).

    Returns
    -------
    float
        prod_wt*prod + brand_wt*brand + title_wt*title + price_wt*price.
    """
    # The title vector starts right after the feature columns. Deriving the
    # offset from `fts` (instead of the literal 6) keeps it correct if the
    # feature list changes.
    title_start = len(fts)
    # dot product of the two title vectors mapped as (1 - dot) / 2
    title_dist = (1 - np.dot(a[title_start:], b[title_start:])) * 0.5
    prod_dist = prod_process(a, b)
    price_dist = price_process(a, b)
    brand_dist = brand_process(a, b)

    weights = [prod_wt, brand_wt, title_wt, price_wt]
    components = [prod_dist, brand_dist, title_dist, price_dist]
    return np.dot(weights, components)


# calculate title distance only
def title_only(a, b):
    """Title-only distance between two rows.

    Entries from position 6 onward of `a` and `b` are taken as the title
    vectors; the result is (1 - dot(vec_a, vec_b)) * 0.5.
    """
    vec_a, vec_b = a[6:], b[6:]
    similarity = np.dot(vec_a, vec_b)
    return (1 - similarity) * 0.5

In [6]:
def query(id, k, dist=mixed_dist, data=mat):
    """Return the rows of `df` closest to row `id`, nearest first.

    Parameters
    ----------
    id : int
        Positional row index of the query item in `mat`.
    k : int
        Number of neighbours wanted. Up to k + 1 rows are returned; when `id`
        is among the rows of `data`, its own row is typically one of them.
    dist : callable, default mixed_dist
        Called as dist(row, b=query_row); must return a scalar distance.
    data : 2-D array, default mat
        Rows to search. Positions in `data` are used to index `df`, so `data`
        must be row-aligned with `df`.
        NOTE(review): the query vector is always read from the global `mat`,
        not from `data` -- confirm this is intended when `data` is overridden.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df`, sorted by ascending distance.
    """
    dist_mat = np.apply_along_axis(dist, axis=1, arr=data, b=mat[id, :])
    # argpartition needs kth < len(dist_mat). Clamping kth lets requests with
    # k + 1 >= number of rows return every row instead of raising ValueError.
    kth = min(k, len(dist_mat) - 1)
    idx = np.argpartition(dist_mat, kth)[:k + 1]
    # argpartition does not order the selected entries; sort them by distance
    top_idx = idx[np.argsort(dist_mat[idx])]
    return df.iloc[top_idx, :]

In [10]:
# Nearest neighbours of row 100000 under the weighted mixed distance.
query(100000, k=10, dist=mixed_dist)

Unnamed: 0,products,parentProducts,brand,price_hint,title,category_id
100000,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Wing Lounge Chair,18000069
105778,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Lounge Wing Chair,18000069
105062,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Wing Lounge Chair,18000069
136110,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Wing Lounge Chair,18000096
137491,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Lounge Wing Chair,18000096
101315,chair,,Kardiel,795.0,Kardiel Cantilever Modern Lounge Chair,18000069
104983,chair,,Kardiel,590.0,Kardiel Modern Easy Lounge Chair,18000069
102114,chair,,Kardiel,1095.0,Kardiel Retropolitan Modern Lounge Chair,18000069
101511,chair,,Kardiel,529.0,Kardiel Tripod Plywood Modern Lounge Chair,18000069
137782,chair,,Kardiel,795.0,Kardiel PK20 Style Cantilever Lounge Modern Ch...,18000096


In [11]:
# Same query using the title distance only, for comparison.
query(100000, k=10, dist=title_only)

Unnamed: 0,products,parentProducts,brand,price_hint,title,category_id
100000,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Wing Lounge Chair,18000069
105062,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Wing Lounge Chair,18000069
137491,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Lounge Wing Chair,18000096
136110,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Wing Lounge Chair,18000096
105778,chair,,Kardiel,795.0,Kardiel Retropolitan Modern Lounge Wing Chair,18000069
102114,chair,,Kardiel,1095.0,Kardiel Retropolitan Modern Lounge Chair,18000069
101315,chair,,Kardiel,795.0,Kardiel Cantilever Modern Lounge Chair,18000069
101511,chair,,Kardiel,529.0,Kardiel Tripod Plywood Modern Lounge Chair,18000069
104983,chair,,Kardiel,590.0,Kardiel Modern Easy Lounge Chair,18000069
102742,chair,,Kardiel,1195.0,Kardiel PK24 Modern Chaise Lounge Chair,18000069
