In [1]:
import pandas as pd

In [2]:
articles = pd.read_csv('database.csv')

In [3]:
articles.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."


In [42]:
# Taking only 10000 items :-
# articles = articles[:10000]

In [4]:
# Combine all info from descriptive columns to a single column separated by space :-
cols = ['product_name', 'product_category_tree', 'description', 'brand']

articles['combined_cols'] = articles[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [5]:
articles = articles[['uniq_id', 'combined_cols']]

In [29]:
articles.head()

Unnamed: 0,uniq_id,combined_cols
0,c2d766ca982eca8304150849735ffef9,"Alisha Solid Women's Cycling Shorts [""Clothing..."
1,7f7036a6d550aaa89d34c77bd39a5e48,"FabHomeDecor Fabric Double Sofa Bed [""Furnitur..."
2,f449ec65dcbc041b6ae5e6a32717d01b,"AW Bellies [""Footwear >> Women's Footwear >> B..."
3,0973b37acd0c664e3de26e97e5571454,"Alisha Solid Women's Cycling Shorts [""Clothing..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,"Sicons All Purpose Arnica Dog Shampoo [""Pet Su..."


In [47]:
# Download the stopwords corpus and Porter stemming algorithm :-

# import nltk

# nltk.download('stopwords')
# nltk.download('punkt')

In [30]:
# Data cleaning :-
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def text_process(desc):
    articles['combined_cols'].fillna(value='', inplace=True) # Fill the null values with empty string
    # Remove punctuation :-
    noPunc = [c for c in desc if c.isalnum() or c.isspace()]
    noPunc = ''.join(noPunc)
    noPunc = noPunc.split()
    # Remove stopwords :-
    stopword = stopwords.words('english')
    desc_stopwords = [word.lower() for word in noPunc if word.lower() not in stopword]
    # Replace words with their respective stems :-
    stemmer = PorterStemmer()
    desc_cleaned = [stemmer.stem(word) for word in desc_stopwords]
    return desc_cleaned

In [31]:
# Vectorizing the data :-
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer=text_process)
tfidf_matrix = tfidf.fit_transform(articles['combined_cols'])

In [32]:
# Recommender system :-
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [33]:
# Map the article_id to their indices :-
indices = pd.Series(articles.index, index=articles['uniq_id']).drop_duplicates()

In [34]:
# Method to predict similar articles :-
def recommendations(uniq_id):
    i = indices[uniq_id] # Index of the articles that match the given article
    sim_scores = list(enumerate(cos_sim[i])) # Similarity scores of all articles w.r.t. to the given article
    sim_scores.sort(key=lambda x: x[1], reverse=True) # Sort the similarity scores in descending order
    # Get the scores of the 10 most similar articles
    sim_scores = sim_scores[:10]
    # Get the article indices
    article_indices = [score[0] for score in sim_scores]
    return articles['uniq_id'].iloc[article_indices].values

In [35]:
# Get the recommendations for a sample product :-
predict_recom = recommendations("c2d766ca982eca8304150849735ffef9")
print(predict_recom)

['c2d766ca982eca8304150849735ffef9' 'd95b0456a0350bc42f2393c6e84b0f09'
 '9aacdecceb404c74abddc513fd2756a8' 'ce5a6818f7707e2cb61fdcdbba61f5ad'
 '0973b37acd0c664e3de26e97e5571454' '4044c0ac52c1ee4b28777417651faf42'
 'ea98a65ad1e1b8688eddf89fbc7b3e27' '181136677467016d6986b77f70d54558'
 '39d784dd3fb323768028008b2f5ff3e4' 'c383c66d5d32e4f7f59a371fd8952132']


In [36]:
# Save the recommender as a file :-
import pickle

# Save the similarity matrix to a file using pickle :-
with open('recommender.pkl', 'wb') as file:
    pickle.dump(cos_sim, file)

In [37]:
# Method for showing search results with a given description of the product :-
def search_result(desc):
    search_tfidf = tfidf.transform([desc])
    cos_sim = cosine_similarity(search_tfidf, tfidf_matrix)
    sim_scores = list(enumerate(cos_sim[0]))
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:50]
    article_indices = [score[0] for score in sim_scores]
    return articles['uniq_id'].iloc[article_indices].values

In [40]:
# Sample search :-
search_result("Track pants for men")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  articles['combined_cols'].fillna(value='', inplace=True) # Fill the null values with empty string


array(['0370897d8e2647883151ad6948f7769b',
       '0c42c972eca4fbdc7604d4c1c676c0b5',
       '4afcbd2c8a4009fa63eb542c4b60ea35',
       'c885d8407b867f55ef204b8bf9385140',
       'a2575f3a6712823fb0dd0e3fca02c0e8',
       '5e3e203afc9c3058f3b9d9ef9c1b70cd',
       'ced189e5250a1458bd6edca43d67da0a',
       'ce2ec7c6546921f076099a61d88c6446',
       '38c30b0d78d08f62b1824560b25fd142',
       '47d830af6eef7053a542c643b9086f8b',
       'e2e25e25abfce0b5c107613c031a25fd',
       'e81b9e7f47cd0c05aeac4cff864f84e7',
       '48702749935b80974896c9b864d0e87e',
       '0754b91420bcbc9290e474432f7d2d69',
       '3d6c1d9ba9a5d7f8dbf06c43f1068f77',
       'fa8393fa724d47b46b33528697dacf0c',
       '4a0f012a89720462d21dcc86bc336c5a',
       'd14bc32eb8430ce98fe306d753845828',
       '739015759c91f3861e9a46d60b5e09cc',
       '65d7bdc0f3aabf3fa08b6d44badcf152',
       '2146f219121cb781b5e84cfca5383432',
       '92ec6c477bbdbb210708f4ef54f98fa8',
       '0bb61c94868e866fb432a7608e87460d',
       '417

In [41]:
# Save both the TfidfVectorizer and tfidf_matrix as files for searching products :-
with open('tfidf.pkl', 'wb') as file:
    pickle.dump(tfidf, file)
with open('tfidf_matrix.pkl', 'wb') as file:
    pickle.dump(tfidf_matrix, file)