In [337]:
!pip -q install gensim

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [0]:
# Download the training spreadsheet into the working directory as "training data.xlsx"
# (quiet mode; the quoted name contains a space).
!wget -qO "training data.xlsx" https://cdn.skillenza.com/files/6a5fa354-63f4-4075-ae9f-ed47b60c41c7/Train-Data.xlsx

In [0]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.decomposition import TruncatedSVD
from itertools import chain
from collections import Counter
import regex as re
import pickle

In [0]:
# Load the downloaded spreadsheet; `t` is the product table every later cell reads
# (columns used below: "Grammage", "Product Description", "Final Price").
t = pd.read_excel("training data.xlsx")

In [0]:
def _grammage_to_int(value, missing=-99999):
    """Return the digits found in ``value`` as an int, or ``missing`` if there are none.

    Every non-digit character is stripped before parsing, so the unit is ignored
    and a decimal point is dropped as well ("2.5 KG" becomes 25).
    NOTE(review): confirm that dropping the decimal point is intended.
    """
    # Raw string: "\D" in a plain string literal is an invalid escape sequence.
    digits = re.sub(r"\D", "", str(value))
    return int(digits) if digits else missing


# Integer pack size per product; -99999 marks rows whose Grammage has no digits.
t["wt"] = np.asarray([_grammage_to_int(x) for x in t["Grammage"].values], dtype=int)

In [0]:
# Persist the table (including the derived "wt" column) so it can be reloaded without the spreadsheet.
t.to_pickle("train.pkl")

In [0]:
def preproc(s):
    """Tokenise ``s`` with gensim's ``simple_preprocess`` and drop unit/filler words.

    Parameters
    ----------
    s : str
        A product description or a user query.

    Returns
    -------
    list of str
        The tokens produced by ``simple_preprocess`` that are not in the
        stop list below (units, price words and comparison words).
    """
    stopwords = {'gm', 'ml', 'kg', 'with', 'for', 'the', 'rs', 'of', 'under',
                 'less', 'more', 'than', 'lower', 'greater'}
    # `simple_preprocess` is imported by name at the top of the notebook; the
    # `gensim` package itself is never imported, so `gensim.utils...` would
    # raise NameError on a fresh kernel.
    return [x for x in simple_preprocess(s) if x not in stopwords]

In [0]:
# Tokenise every product description once; `dlist[i]` holds the tokens of row i of `t`.
desc = t["Product Description"].tolist()
dlist = [preproc(text) for text in desc]

# Cache the tokenised corpus on disk.
with open('dlist.pkl', 'wb') as fh:
    pickle.dump(dlist, fh)

In [0]:
# Train 50-dimensional word vectors on the tokenised descriptions and save them to disk.
# min_count=1 keeps every token that appears in `dlist` in the vocabulary.
# NOTE(review): `size` and `iter` look like the gensim 3.x parameter names
# (gensim 4 is believed to call them `vector_size` and `epochs`) — confirm the installed version.
# NOTE(review): no seed is set and 8 workers are used, so vectors may differ between runs.
model = Word2Vec(dlist, size=50, window=15, min_count=1, workers=8, iter=50)
model.save("word2vec.model")

In [0]:
# Corpus-wide token counts; used as the word-frequency table when weighting embeddings.
frequencies = Counter(token for doc in dlist for token in doc)

# Cache the counts next to the other artefacts.
with open('freq.pkl', 'wb') as fh:
    pickle.dump(frequencies, fh)

In [0]:

def run_sif(query, sentences2, model, freqs={}, a=0.001):
    """Score every sentence against ``query`` with frequency-weighted embeddings.

    Each token is weighted by ``a / (a + freq / total_freq)``, so frequent
    tokens count less. The query embedding is additionally pulled towards each
    sentence by ``15 * n`` times that sentence's embedding, where ``n`` is the
    fraction of in-vocabulary query tokens that also occur in the sentence.
    No common-component removal is performed.

    Parameters
    ----------
    query : list of str
        Tokenised query.
    sentences2 : list of list of str
        Tokenised candidate sentences.
    model : object
        Trained word-vector model; ``model.wv`` must support ``token in wv``,
        ``wv[token]`` and expose ``vector_size``.
    freqs : mapping of str to int, optional
        Token counts. It is only read, never mutated. When it is empty every
        token gets weight 1.
    a : float, optional
        Smoothing constant of the weighting scheme.

    Returns
    -------
    numpy.ndarray or None
        One cosine similarity per sentence, or ``None`` when no query token is
        in the vocabulary. Sentences without any in-vocabulary token (or with
        a zero-norm embedding) score 0.0.
    """
    wv = model.wv
    dim = wv.vector_size
    total_freq = sum(freqs.values())

    def sif_weight(token):
        # Without frequency information fall back to uniform weights instead
        # of dividing by zero.
        if not total_freq:
            return 1.0
        return a / (a + freqs.get(token, 0) / total_freq)

    tokens1 = [token for token in query if token in wv]
    if not tokens1:
        return None
    query_tokens = set(tokens1)
    query_vec = np.average([wv[token] for token in tokens1], axis=0,
                           weights=[sif_weight(token) for token in tokens1])

    # One (initially identical) copy of the query embedding per sentence; each
    # row is then boosted separately by that sentence's token overlap.
    embedding1 = np.zeros((len(sentences2), dim)) + query_vec
    embedding2 = np.zeros((len(sentences2), dim))

    for i, sent2 in enumerate(sentences2):
        tokens2 = [token for token in sent2 if token in wv]
        if not tokens2:
            # Nothing to average; the row stays zero and scores 0.0 below.
            continue
        n = len(query_tokens & set(tokens2)) / len(tokens1)
        embedding2[i] = np.average([wv[token] for token in tokens2], axis=0,
                                   weights=[sif_weight(token) for token in tokens2])
        embedding1[i] += 15 * n * embedding2[i]

    # Row-wise cosine similarity, leaving 0.0 wherever a norm is zero.
    dots = np.einsum('ij,ij->i', embedding1, embedding2)
    norms = np.linalg.norm(embedding1, axis=1) * np.linalg.norm(embedding2, axis=1)
    sims = np.zeros(len(sentences2))
    np.divide(dots, norms, out=sims, where=norms > 0)

    return sims

In [0]:
def wt(q):
    """Extract the pack size mentioned in a tokenised query.

    Scans ``q`` for unit tokens and reads the token immediately before each
    one as a number.

    Parameters
    ----------
    q : list of str
        Query tokens, with numbers already split from the text around them.

    Returns
    -------
    int or None
        The truncated value of the last number that precedes a unit token, or
        ``None`` when no such number parses.
    """
    units = ('gm', 'ml', 'kg', 'g', 'l', 'lt', 'ltr', 'pcs', 'xgm')
    w = None
    for i, x in enumerate(q):
        # A unit in first position has no preceding token to read.
        if i == 0 or x not in units:
            continue
        try:
            w = int(float(q[i - 1]))
        except (TypeError, ValueError, OverflowError):
            # Preceding token is not a finite number; keep any earlier match.
            pass
    return w

In [0]:
def cost(q):
    """Extract a price constraint from a tokenised query.

    Recognised patterns (``N`` is a number token):

    * ``less <word> N`` / ``lower <word> N`` / ``under N``  -> operator 1
    * ``N rs`` (or ``rs N`` when ``rs`` is the first token) -> operator 2
    * ``more <word> N`` / ``greater <word> N``              -> operator 3

    Scanning stops at the first pattern that yields a non-zero price.

    Parameters
    ----------
    q : list of str
        Query tokens, with numbers already split from the text around them.

    Returns
    -------
    tuple
        ``(operator, price)``; both are ``None`` when nothing was recognised.
    """
    # keyword -> (offset of the number relative to the keyword, operator code)
    rules = {
        'less': (2, 1), 'lower': (2, 1),
        'under': (1, 1),
        'more': (2, 3), 'greater': (2, 3),
    }

    def parse_int(token):
        # Best-effort numeric parse; None when the token is not a finite number.
        try:
            return int(float(token))
        except (TypeError, ValueError, OverflowError):
            return None

    more = c = None
    for i, x in enumerate(q):
        if x in rules:
            offset, op = rules[x]
            pos = i + offset
        elif x == 'rs':
            # The amount normally precedes "rs"; if "rs" comes first, look after it.
            pos = i - 1 if i else i + 1
            op = 2
        else:
            continue

        if pos < len(q):
            value = parse_int(q[pos])
            if value is not None:
                c, more = value, op
        if c:
            break
    return more, c

In [0]:
def run(q, boost=[], b=1, n=10):
    """Score every product in ``t`` against the free-text query ``q``.

    The query is scanned for a pack size (``wt``) and a price constraint
    (``cost``), then embedded and compared with every product description by
    ``run_sif``. Products that match the price or pack-size constraint receive
    a small additive bonus on top of their similarity.

    Parameters
    ----------
    q : str
        Free-text query, e.g. ``"powder 250 gm 150 rs"``.
    boost : list of str, optional
        Extra terms appended to the query ``4 * int(b)`` times to increase
        their influence on the embedding. The list is only read, never mutated.
    b : int or bool, optional
        Multiplier for the boost repetition; ``0``/``False`` disables it.
    n : int, optional
        Not used by this function; kept for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        A copy of ``t`` with an added ``"scores"`` column (unsorted).
    """
    score_bonus = 0.005     # added once per satisfied constraint
    price_tolerance = 10    # half-width of the "around N rs" band

    # Split numbers from adjacent text ("250gm" -> "250 gm") before parsing.
    qcheck = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", q.lower()).strip().split()
    grammage = wt(qcheck)
    op, price = cost(qcheck)

    q += 4*int(b)*(' ' + ' '.join(boost))
    scores = run_sif(preproc(q), dlist, freqs=frequencies, model=model)
    df = t.copy()
    # run_sif returns None when no query token is in the vocabulary; use a
    # numeric baseline so the bonuses below still apply instead of raising on
    # a column of None.
    df["scores"] = 0.0 if scores is None else scores

    #price
    if price:
        if op == 1:
            df.loc[df["Final Price"] < price, "scores"] += score_bonus
        elif op == 2:
            in_band = df["Final Price"].between(price - price_tolerance, price + price_tolerance)
            df.loc[in_band, "scores"] += score_bonus
        elif op == 3:
            df.loc[df["Final Price"] > price, "scores"] += score_bonus

    #grammage
    if grammage:
        df.loc[df["wt"] == grammage, "scores"] += score_bonus

    return df

In [355]:
# Example query: score all products, boosting the term "ayghd", then show the 10 best matches.
df=run("powder 250 gm 150 rs", boost=["ayghd"])
df.sort_values("scores", ascending=False)[["Product Description", "Grammage", "Final Price", "scores"]].head(10)

Unnamed: 0,Product Description,Grammage,Final Price,scores
2185,AYGHD 500 GM PLPCH DETERGENT POWDER AB OR ADHI...,500 GM,152,1.004234
3545,TXICJ 250 GM PLPCH SUPER ACTIVE PLUS DETERGENT...,250 GM,154,1.003893
5199,EDLUQ 80 GM PLPCH SAFEDI MAGIC DETERGENT POWDE...,80 GM,140,1.002644
4580,KSNGC 250 GM PLPCH SHAKTI DETERGENT POWDER SAF...,250 GM,12,1.001559
1500,VGMWB 13 GM PLPCH DET POWDER WD ADVANCE FORMULA,13 GM,148,1.0012
7950,VVMOD 80 GM PLPCH WASHING POWDER WOOSH KARE KH...,80 GM,150,1.000845
3652,ULBCY 500 GM PLPCH EXCEL DETERGENT POWDER MORE...,500 GM,140,1.000675
6984,UEQMD 100 GM PLPCH SUPER WASHING POWDER,100 GM,145,1.000543
709,FAOJA 180 GM PLPCH SUPER WHITE WASHING POWDER,180 GM,154,1.000516
3284,FQNKJ 250 GM PLPCH SUPER WHITE EXTRA POWER WAS...,250 GM,411,1.000287


In [357]:
# Measure the latency of a single end-to-end query.
%timeit run("powder 250 gm 150 rs", boost=["ayghd"])

1 loop, best of 3: 575 ms per loop
