### Item Search Algorithm
This algorithm preprocesses the item data and uses TF-IDF to vectorize the item data. Then, it uses the cosine similarity along with fuzzy search to find the most similar items to the search query.

This data then feed to user recommendation algorithm to find the most similar users to the search query. 

Note: Choosen Dataset doesn't have any other information about the items except the item name, description and price. So, the algorithm only uses these features to find the most similar items.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
from thefuzz import fuzz

In [42]:
# item model for recommend system
df = pd.read_csv('ecommerce.csv',index_col=0)
# check if product id is unique
# take only distinct product id
# find all columns
df.head(5)

Unnamed: 0_level_0,ProductTitle,Image,Price,cluster
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,https://m.media-amazon.com/images/I/31UISB90sY...,32999,159
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,https://m.media-amazon.com/images/I/51JFb7FctD...,46490,159
2,LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...,https://m.media-amazon.com/images/I/51JFb7FctD...,34490,159
3,LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...,https://m.media-amazon.com/images/I/51JFb7FctD...,37990,159
4,Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...,https://m.media-amazon.com/images/I/41lrtqXPiW...,34490,159


In [40]:
# split image and take one
# for x in df.index:
#     df.loc[x, 'Image'] = str(df.loc[x, 'images']).split('~')[0]

In [29]:
# text search based model using clustering techniques
df1 =df.loc[:,['ProductTitle','Image','Price']] # only select required   columns
df1 = df1.dropna()
df1.head(5)


Unnamed: 0,ProductTitle,Image,Price
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,https://m.media-amazon.com/images/I/31UISB90sY...,"₹32,999"
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,https://m.media-amazon.com/images/I/51JFb7FctD...,"₹46,490"
2,LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...,https://m.media-amazon.com/images/I/51JFb7FctD...,"₹34,490"
3,LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...,https://m.media-amazon.com/images/I/51JFb7FctD...,"₹37,990"
4,Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...,https://m.media-amazon.com/images/I/41lrtqXPiW...,"₹34,490"


In [31]:
# remove rupee & , symbol from price column and make it int
df1['Price'] = df1['Price'].str.replace('₹', '').str.replace(',', '')
# round off and remove invalid values
# remove rows containg non nmeric price value
df1 = df1[df1['Price'].notna()]
df1 = df1[df1['Price'].str.isnumeric()]
df1['Price'] = df1['Price'].astype(float).round(0).astype(int)

In [30]:
# get all item with product id 0
df1[df1.index==0]

Unnamed: 0,ProductTitle,Image,Price
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,https://m.media-amazon.com/images/I/31UISB90sY...,"₹32,999"


### Tokenize both title and description

In [43]:
tfd = TfidfVectorizer(stop_words='english')
# concatenate the two columns description and product title
X = tfd.fit_transform((df1['ProductTitle']))
k = 300
model = KMeans(n_clusters=k, init='k-means++', max_iter=100)
model.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


### Predict

In [44]:
# prediction utils
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfd.get_feature_names_out()
def get_cluster(id):
    res =[]
    for ind in order_centroids[id, :10]:
        res.append(terms[ind])
    return res
def search(term):
    x = tfd.transform(term.split())
    # print(x,term.split())
    y = model.predict(x)
    return y[0]

def similar_terms(term):
    cluster = search(term)
    return get_cluster(cluster)
def fuzzy_search(term,related_items):
    # sort by partial ratio
    related_items = sorted(related_items,key=lambda x: fuzz.partial_ratio(term,x),reverse=True)
    return related_items
def get_recommendation(term):
    cluster = search(term)
    return fuzzy_search(term,df1[df1['cluster'] == cluster]['ProductTitle'].values.tolist())
    

In [62]:
# add new column to the dataframe to store the predicted cluster
df['cluster'] = model.labels_

In [56]:
# for i in range(k): print(get_cluster(i))
get_recommendation("shoes")[:10]

["Asian shoes Men's Mesh Bullet 13 Navy Blue Rede Range Running Shoes-9 UK",
 "Asian shoes Men's Mesh Bullet 13 Navy Blue Rede Range Running Shoes-7 UK",
 "Asian shoes Men's Mesh Bullet 13 Navy Blue Rede Range Running Shoes-8 UK",
 'FORTIVA smart shoes|black shoes|sports shoe for men|running shoes|loafers|sneakers',
 "Asian shoes Men's Mesh Bullet 13 Navy Blue Rede Range Running Shoes-6 UK",
 "Asian shoes Men's Mesh Bullet 13 Navy Blue Rede Range Running Shoes-10 UK",
 "URBAN SHOES Men's Running Stylish Casual shoes Black - 9 UK",
 "running shoes for men's.",
 'BRUTON Shoes for Exclusive Trendy, Casual, Sports Shoes for Men, Running Shoes for Men (Black, Red)',
 "Shoefly Men's (1244-9310) Casual Sports Running Shoes"]

In [65]:
# save process data for later use
# rename index as ProductId and save as csv
df.to_csv('data.csv', index_label='ProductId')


### Save for later use

In [54]:
# save df1,tfdf, kmeans model to pickle file
import pickle
with open('tfdf.pkl', 'wb') as f:
    pickle.dump(tfd, f, pickle.HIGHEST_PROTOCOL)
with open('kmeans.pkl', 'wb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
