In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pylab as plt

In [39]:
nike_df = pd.read_csv("dataset/nike_2020_04_13.csv")
adidas_df = pd.read_csv("dataset/Adidas final.csv", delimiter=";")

In [40]:
useful_cols = ["Product Name", "Sale Price", "Brand", "Description"]
my_nike  = nike_df[useful_cols]
my_adidas = adidas_df[useful_cols]

In [41]:
def checkNull(df, features):
    for col in features:
        print(col + " " + str(df[col].isnull().sum()))
        
def fillNull(df, features):
    for feature in useful_cols:
        my_nike[feature] = my_nike[feature].fillna('')
        
def combined_features(row):
    res = ""
    for feature in useful_cols:
        res += str(row[feature]) + " "
    return res

In [42]:
checkNull(my_nike, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 3


In [43]:
checkNull(my_adidas, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 0


In [44]:
fillNull(my_nike, useful_cols)
fillNull(my_adidas, useful_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike[feature] = my_nike[feature].fillna('')


In [45]:
my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)


In [46]:
my_products = pd.concat([my_nike, my_adidas], ignore_index=True)

In [47]:
cv = CountVectorizer(stop_words='english')
count_matrix = cv.fit_transform(my_products["combined_features"])
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [49]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(my_products["combined_features"])
len(vectorizer.get_feature_names())

4750

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vectorizer = TfidfVectorizer(stop_words='english')
tf_vectorizer.fit(my_products["combined_features"])
vector_spaces = tf_vectorizer.transform(my_products["combined_features"])

In [53]:
vector_spaces.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [56]:
len(count_matrix.toarray())

3268

In [57]:
len(vector_spaces.toarray())

3268

In [58]:
tf_cosine_sim = cosine_similarity(vector_spaces)
tf_products = list(enumerate(tf_cosine_sim[1]))
tf_sorted_similar_movies = sorted(tf_products, key=lambda x:x[1], reverse=True)

In [64]:
tf_sorted_similar_movies[1:9]

[(172, 0.7771855298198772),
 (316, 0.7535082902268035),
 (18, 0.7474575021641137),
 (433, 0.7467330852711282),
 (553, 0.6577354095527589),
 (11, 0.6216601325226891),
 (0, 0.5487546748561601),
 (143, 0.548036485587108)]

In [22]:
cosine_sim = cosine_similarity(count_matrix)

In [23]:
cosine_sim

array([[1.        , 0.62589878, 0.40369868, ..., 0.21630814, 0.13795154,
        0.19002385],
       [0.62589878, 1.        , 0.39186207, ..., 0.2119221 , 0.10916311,
        0.17184995],
       [0.40369868, 0.39186207, 1.        , ..., 0.22080322, 0.09857281,
        0.21983576],
       ...,
       [0.21630814, 0.2119221 , 0.22080322, ..., 1.        , 0.34582476,
        0.42068384],
       [0.13795154, 0.10916311, 0.09857281, ..., 0.34582476, 1.        ,
        0.3176117 ],
       [0.19002385, 0.17184995, 0.21983576, ..., 0.42068384, 0.3176117 ,
        1.        ]])

In [24]:
similar_movies = list(enumerate(cosine_sim[1]))

In [28]:
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

In [65]:
sorted_similar_movies[1:9]

[(172, 0.8571817324793919),
 (316, 0.8369171530598073),
 (433, 0.8369171530598073),
 (18, 0.8296455196719188),
 (553, 0.7614553930883753),
 (11, 0.720069696319024),
 (267, 0.6902628054688186),
 (143, 0.6866171722688431)]

In [67]:
my_products.iloc[1]

Product Name                                      Nike Air Force 1 '07
Sale Price                                                        7495
Brand                                                             Nike
Description          The legend lives on in the Nike Air Force 1 '0...
combined_features    Nike Air Force 1 '07 7495 Nike The legend live...
Name: 1, dtype: object

In [68]:
my_products.iloc[433]

Product Name                                  Nike Air Force 1 Mid '07
Sale Price                                                        6317
Brand                                                             Nike
Description          The legend lives on in the Nike Air Force 1 Mi...
combined_features    Nike Air Force 1 Mid '07 6317 Nike The legend ...
Name: 433, dtype: object

In [69]:
my_products.iloc[18]

Product Name                                      Nike Air Force 1 '07
Sale Price                                                        7995
Brand                                                             Nike
Description          The legend lives on in the Nike Air Force 1 '0...
combined_features    Nike Air Force 1 '07 7995 Nike The legend live...
Name: 18, dtype: object

In [70]:
my_products.iloc[0]

Product Name                            Nike Air Force 1 '07 Essential
Sale Price                                                        7495
Brand                                                             Nike
Description          Let your shoe game shimmer in the Nike Air For...
combined_features    Nike Air Force 1 '07 Essential 7495 Nike Let y...
Name: 0, dtype: object

In [71]:
my_products.iloc[257]

Product Name                                            Jordan Max 200
Sale Price                                                       10495
Brand                                                             Nike
Description          With design elements inspired by the Air Jorda...
combined_features    Jordan Max 200 10495 Nike With design elements...
Name: 257, dtype: object