For a business without any user-item purchase history, a search engine based recommendation system can be designed for users. The product recommendations can be based on textual clustering analysis given in product description.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [2]:
product_descriptions = pd.read_csv('data.csv',encoding='unicode_escape')
product_descriptions.shape

(541909, 8)

In [3]:
product_descriptions = product_descriptions.dropna()

In [4]:
product_descriptions1 = product_descriptions.head(500)

In [5]:
product_descriptions1["Description"].head(10)

0     WHITE HANGING HEART T-LIGHT HOLDER
1                    WHITE METAL LANTERN
2         CREAM CUPID HEARTS COAT HANGER
3    KNITTED UNION FLAG HOT WATER BOTTLE
4         RED WOOLLY HOTTIE WHITE HEART.
5           SET 7 BABUSHKA NESTING BOXES
6      GLASS STAR FROSTED T-LIGHT HOLDER
7                 HAND WARMER UNION JACK
8              HAND WARMER RED POLKA DOT
9          ASSORTED COLOUR BIRD ORNAMENT
Name: Description, dtype: object

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

In [7]:
tfv_matrix = tfv.fit_transform(product_descriptions1['Description'])


In [8]:
tfv_matrix

<500x457 sparse matrix of type '<class 'numpy.float64'>'
	with 3113 stored elements in Compressed Sparse Row format>

In [9]:
tfv_matrix.shape

(500, 457)

In [10]:
from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [11]:
sig[0]

array([0.76251161, 0.76164574, 0.76159416, 0.76159416, 0.76166941,
       0.76159416, 0.76187959, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76166747,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76163629,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76251161,
       0.76164574, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76162537, 0.76162206, 0.76162852,
       0.76163361, 0.76159416, 0.76166941, 0.76159416, 0.76187959,
       0.76159416, 0.76251161, 0.76164574, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76162

In [12]:
indices = pd.Series(product_descriptions1.index, index=product_descriptions1['Description']).drop_duplicates()

In [13]:
indices

Description
WHITE HANGING HEART T-LIGHT HOLDER       0
WHITE METAL LANTERN                      1
CREAM CUPID HEARTS COAT HANGER           2
KNITTED UNION FLAG HOT WATER BOTTLE      3
RED WOOLLY HOTTIE WHITE HEART.           4
                                      ... 
RED HEART LUGGAGE TAG                  495
RED GLASS TASSLE BAG CHARM             496
CLEAR ACRYLIC FACETED BANGLE           497
5 STRAND GLASS NECKLACE CRYSTAL        498
DOORMAT UNION JACK GUNS AND ROSES      499
Length: 500, dtype: int64

In [30]:
def give_rec(title, sig=sig):
    idx = indices[title]
    
    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))
    
    # Sort the products
    sig_scores = sorted(sig_scores, key=lambda x: x[1].all(), reverse=True)
    
    # Scores of the 10 most similar products
    sig_scores = sig_scores[1:10]
    
    # Products indices
    product_indices = [i[0] for i in sig_scores]
    
    return product_descriptions1['Description'].iloc[product_indices]

In [32]:
give_rec('RED GLASS TASSLE BAG CHARM')

1                    WHITE METAL LANTERN
2         CREAM CUPID HEARTS COAT HANGER
3    KNITTED UNION FLAG HOT WATER BOTTLE
4         RED WOOLLY HOTTIE WHITE HEART.
5           SET 7 BABUSHKA NESTING BOXES
6      GLASS STAR FROSTED T-LIGHT HOLDER
7                 HAND WARMER UNION JACK
8              HAND WARMER RED POLKA DOT
9          ASSORTED COLOUR BIRD ORNAMENT
Name: Description, dtype: object

In [None]:
# product_descriptions = pd.read_csv('data.csv',encoding='unicode_escape')
# product_descriptions.shape

In [None]:
product_descriptions = product_descriptions.dropna()

In [None]:
product_descriptions1 = product_descriptions.head(500)

In [None]:
product_descriptions1["Description"].head(10)

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
X1 = vectorizer.fit_transform(product_descriptions1["Description"])
X1

In [None]:
# X=X1

# kmeans = KMeans(n_clusters = 10, init = 'k-means++')
# y_kmeans = kmeans.fit_predict(X)
# plt.plot(y_kmeans, ".")
# plt.show()

In [None]:
# def print_cluster(i):
#     print("Cluster %d:" % i),
#     for ind in order_centroids[i, :10]:
#         print(' %s' % terms[ind]),
#     print

In [None]:
# true_k = 10

# model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
# model.fit(X1)

# print("Top terms per cluster:")
# order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# terms = vectorizer.get_feature_names()
# for i in range(true_k):
#     print_cluster(i)

In [None]:
# def show_recommendations(product):
#     Y = vectorizer.transform([product])
# #     prediction = model.predict(Y)
#     print_cluster(prediction[0])

In [None]:
# show_recommendations("phone")