In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pylab as plt

In [2]:
nike_df = pd.read_csv("dataset/nike_2020_04_13.csv")
adidas_df = pd.read_csv("dataset/Adidas final.csv", delimiter=";")

In [8]:
useful_cols = ["Product Name", "Sale Price", "Brand", "Description","Product ID"]
my_nike  = nike_df[useful_cols]
my_adidas = adidas_df[useful_cols]

In [9]:
def checkNull(df, features):
    for col in features:
        print(col + " " + str(df[col].isnull().sum()))
        
def fillNull(df, features):
    for feature in useful_cols:
        my_nike[feature] = my_nike[feature].fillna('')
        
def combined_features(row, combine_cols):
    res = ""
    for feature in combine_cols:
        res += str(row[feature]) + " "
    return res

In [5]:
checkNull(my_nike, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 3


In [6]:
checkNull(my_adidas, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 0


In [7]:
fillNull(my_nike, useful_cols)
fillNull(my_adidas, useful_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike[feature] = my_nike[feature].fillna('')


In [8]:
my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)


In [9]:
my_products = pd.concat([my_nike, my_adidas], ignore_index=True)

In [10]:
cv = CountVectorizer(stop_words='english')
count_matrix = cv.fit_transform(my_products["combined_features"])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [11]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(my_products["combined_features"])
len(vectorizer.get_feature_names())

4750

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vectorizer = TfidfVectorizer(stop_words='english')
tf_vectorizer.fit(my_products["combined_features"])
vector_spaces = tf_vectorizer.transform(my_products["combined_features"])

In [13]:
vector_spaces.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
len(vector_spaces.toarray())

3268

In [15]:
tf_cosine_sim = cosine_similarity(vector_spaces)
tf_products = list(enumerate(tf_cosine_sim[1]))
tf_sorted_similar_products = sorted(tf_products, key=lambda x:x[1], reverse=True)

In [16]:
tf_sorted_similar_products[1:9]

[(172, 0.7771855298198772),
 (316, 0.7535082902268035),
 (18, 0.7474575021641137),
 (433, 0.7467330852711282),
 (553, 0.6577354095527589),
 (11, 0.6216601325226891),
 (0, 0.5487546748561601),
 (143, 0.548036485587108)]

In [17]:
cosine_sim = cosine_similarity(count_matrix)

In [18]:
cosine_sim

array([[1.        , 0.63305416, 0.40657856, ..., 0.04767313, 0.02635231,
        0.02076137],
       [0.63305416, 1.        , 0.42276002, ..., 0.02624319, 0.        ,
        0.        ],
       [0.40657856, 0.42276002, 1.        , ..., 0.06460957, 0.02380952,
        0.01875806],
       ...,
       [0.04767313, 0.02624319, 0.06460957, ..., 1.        , 0.25125945,
        0.29692784],
       [0.02635231, 0.        , 0.02380952, ..., 0.25125945, 1.        ,
        0.21884405],
       [0.02076137, 0.        , 0.01875806, ..., 0.29692784, 0.21884405,
        1.        ]])

In [19]:
similar_shoes = list(enumerate(cosine_sim[1]))

In [20]:
sorted_similar_shoes = sorted(similar_shoes, key=lambda x:x[1], reverse=True)

In [21]:
sorted_similar_shoes[0:9]

[(1, 1.0),
 (172, 0.8484848484848485),
 (18, 0.8379305815963923),
 (316, 0.8189346173511123),
 (433, 0.8189346173511123),
 (553, 0.7526178090063818),
 (11, 0.6777389936698861),
 (143, 0.6524726973924443),
 (36, 0.6411188743387697)]

In [22]:
my_products.iloc[1]

Product Name                                      Nike Air Force 1 '07
Sale Price                                                        7495
Brand                                                             Nike
Description          The legend lives on in the Nike Air Force 1 '0...
combined_features    Nike Air Force 1 '07 7495 Nike The legend live...
Name: 1, dtype: object

In [23]:
my_products.iloc[172]

Product Name                                Nike Air Force 1 Low Retro
Sale Price                                                       10995
Brand                                                             Nike
Description          The legend lives on in the Nike Air Force 1 '0...
combined_features    Nike Air Force 1 Low Retro 10995 Nike The lege...
Name: 172, dtype: object

In [24]:
my_products.iloc[36]

Product Name                                  Nike Air Force 1 '07 LV8
Sale Price                                                        7595
Brand                                                             Nike
Description          The Nike Air Force 1 '07 LV8 incorporates the ...
combined_features    Nike Air Force 1 '07 LV8 7595 Nike The Nike Ai...
Name: 36, dtype: object

In [25]:
my_products.iloc[0]

Product Name                            Nike Air Force 1 '07 Essential
Sale Price                                                        7495
Brand                                                             Nike
Description          Let your shoe game shimmer in the Nike Air For...
combined_features    Nike Air Force 1 '07 Essential 7495 Nike Let y...
Name: 0, dtype: object

In [26]:
my_products.iloc[257]

Product Name                                            Jordan Max 200
Sale Price                                                       10495
Brand                                                             Nike
Description          With design elements inspired by the Air Jorda...
combined_features    Jordan Max 200 10495 Nike With design elements...
Name: 257, dtype: object