In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pylab as plt

In [2]:
nike_df = pd.read_csv("dataset/nike_2020_04_13.csv")
adidas_df = pd.read_csv("dataset/Adidas final.csv", delimiter=";")

In [3]:
useful_cols = ["Product Name", "Sale Price", "Brand", "Description","Product ID"]
my_nike  = nike_df[useful_cols]
my_adidas = adidas_df[useful_cols]

In [4]:
def checkNull(df, features):
    for col in features:
        print(col + " " + str(df[col].isnull().sum()))
        
def fillNull(df, features):
    for feature in useful_cols:
        my_nike[feature] = my_nike[feature].fillna('')
        
def combined_features(row):
    combine_cols = ["Product Name", "Sale Price", "Brand", "Description"]
    res = ""
    for feature in combine_cols:
        res += str(row[feature]) + " "
    return res

In [5]:
checkNull(my_nike, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 3
Product ID 0


In [6]:
checkNull(my_adidas, useful_cols)

Product Name 0
Sale Price 0
Brand 0
Description 0
Product ID 0


In [7]:
fillNull(my_nike, useful_cols)
fillNull(my_adidas, useful_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike[feature] = my_nike[feature].fillna('')


In [8]:
my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_nike["combined_features"] = my_nike.apply(combined_features, axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_adidas["combined_features"] = my_adidas.apply(combined_features, axis =1)


In [9]:
my_products = pd.concat([my_nike, my_adidas], ignore_index=True)

In [10]:
my_products

Unnamed: 0,Product Name,Sale Price,Brand,Description,Product ID,combined_features
0,Nike Air Force 1 '07 Essential,7495,Nike,Let your shoe game shimmer in the Nike Air For...,CJ1646-600,Nike Air Force 1 '07 Essential 7495 Nike Let y...
1,Nike Air Force 1 '07,7495,Nike,The legend lives on in the Nike Air Force 1 '0...,CT4328-101,Nike Air Force 1 '07 7495 Nike The legend live...
2,Nike Air Force 1 Sage Low LX,9995,Nike,"Taking both height and craft to new levels, th...",CI3482-200,Nike Air Force 1 Sage Low LX 9995 Nike Taking ...
3,Nike Air Max Dia SE,9995,Nike,"Designed for a woman's foot, the Nike Air Max ...",CD0479-200,Nike Air Max Dia SE 9995 Nike Designed for a w...
4,Nike Air Max Verona,9995,Nike,Pass on the good vibes in the Nike Air Max Ver...,CZ6156-101,Nike Air Max Verona 9995 Nike Pass on the good...
...,...,...,...,...,...,...
3263,Men's adidas Adipower Vector 20 Shoes,10999,SPORT PERFORMANCE,It's just you and the batsman. As you steam to...,EF3503,Men's adidas Adipower Vector 20 Shoes 10999 SP...
3264,Men's adidas Cricket Howzat Shoes,6999,SPORT PERFORMANCE,"Batting, fielding or bowling, these Howzat Spi...",EF3505,Men's adidas Cricket Howzat Shoes 6999 SPORT P...
3265,Men's Cricket Cri Hase Shoes,3999,SPORT PERFORMANCE,Thiese shoes are a great choice for the amatue...,CM6008,Men's Cricket Cri Hase Shoes 3999 SPORT PERFOR...
3266,Unisex adidas Outdoor Terrex Daroga Water Shoes,7999,SPORT PERFORMANCE,These shoes fit easily into a travel bag so yo...,BC0980,Unisex adidas Outdoor Terrex Daroga Water Shoe...


In [43]:
my_products = my_products.drop_duplicates()

In [53]:
my_products[my_products['Product ID'].duplicated()]

Unnamed: 0,Product Name,Sale Price,Brand,Description,Product ID,combined_features


In [44]:
my_products

Unnamed: 0,Product Name,Sale Price,Brand,Description,Product ID,combined_features
0,Nike Air Force 1 '07 Essential,7495,Nike,Let your shoe game shimmer in the Nike Air For...,CJ1646-600,Nike Air Force 1 '07 Essential 7495 Nike Let y...
1,Nike Air Force 1 '07,7495,Nike,The legend lives on in the Nike Air Force 1 '0...,CT4328-101,Nike Air Force 1 '07 7495 Nike The legend live...
2,Nike Air Force 1 Sage Low LX,9995,Nike,"Taking both height and craft to new levels, th...",CI3482-200,Nike Air Force 1 Sage Low LX 9995 Nike Taking ...
3,Nike Air Max Dia SE,9995,Nike,"Designed for a woman's foot, the Nike Air Max ...",CD0479-200,Nike Air Max Dia SE 9995 Nike Designed for a w...
4,Nike Air Max Verona,9995,Nike,Pass on the good vibes in the Nike Air Max Ver...,CZ6156-101,Nike Air Max Verona 9995 Nike Pass on the good...
...,...,...,...,...,...,...
3263,Men's adidas Adipower Vector 20 Shoes,10999,SPORT PERFORMANCE,It's just you and the batsman. As you steam to...,EF3503,Men's adidas Adipower Vector 20 Shoes 10999 SP...
3264,Men's adidas Cricket Howzat Shoes,6999,SPORT PERFORMANCE,"Batting, fielding or bowling, these Howzat Spi...",EF3505,Men's adidas Cricket Howzat Shoes 6999 SPORT P...
3265,Men's Cricket Cri Hase Shoes,3999,SPORT PERFORMANCE,Thiese shoes are a great choice for the amatue...,CM6008,Men's Cricket Cri Hase Shoes 3999 SPORT PERFOR...
3266,Unisex adidas Outdoor Terrex Daroga Water Shoes,7999,SPORT PERFORMANCE,These shoes fit easily into a travel bag so yo...,BC0980,Unisex adidas Outdoor Terrex Daroga Water Shoe...


In [45]:
cv = CountVectorizer(stop_words='english')
count_matrix = cv.fit_transform(my_products["combined_features"])
print("Count Matrix:", count_matrix.toarray())
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(my_products["combined_features"])
len(vectorizer.get_feature_names())
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vectorizer = TfidfVectorizer(stop_words='english')
tf_vectorizer.fit(my_products["combined_features"])
vector_spaces = tf_vectorizer.transform(my_products["combined_features"])

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [46]:
tf_cosine_sim = cosine_similarity(vector_spaces)

In [47]:
def produce_rank():
    res = []
    idx = 0
    for csl in tf_cosine_sim:
        single = [my_products.iloc[idx]["Product ID"]]
        ssl = list(enumerate(csl))
        sss = sorted(ssl, key=lambda x:x[1], reverse=True)
        count = 0
        for a in sss:
            if(a[0] != idx and count < 10):
                count += 1
                single.append(my_products.iloc[a[0]]["Product ID"])
            if(count == 10):
                break
        res.append(single)
        idx += 1
    return res

In [48]:
lst = produce_rank()

In [49]:
ans = pd.DataFrame(lst, columns =["productId", "s_1", "s_2", "s_3", "s_4", "s_5", "s_6", "s_7", "s_8", "s_9", "s_10"])

In [50]:
ans

Unnamed: 0,productId,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10
0,CJ1646-600,CT4328-101,CQ0492-001,AA0287-002,AO2132-401,315115-112,315123-111,366731-100,CI3446-001,CJ1379-001,BQ4424-700
1,CT4328-101,CQ0492-001,315123-111,CJ1379-001,366731-100,AA0287-002,315115-112,CJ1646-600,CD0887-201,CI3446-001,CD0888-002
2,CI3482-200,AR5339-002,CJ1642-002,CQ0492-001,BV1712-001,AV4417-002,CT1020-001,CJ0625-700,BQ3611-100,CK4126-001,CD4366-002
3,CD0479-200,CI3898-200,CJ0636-100,AQ4312-107,BQ9665-301,CI1214-004,AR7410-603,CQ2503-900,CD6615-100,CD0132-001,CI3709-001
4,CZ6156-101,CI9842-500,CK7200-800,CQ6639-001,AQ0927-100,CN8490-100,AH6789-023,CD0132-001,CD6615-100,CI3709-001,CI3868-001
...,...,...,...,...,...,...,...,...,...,...,...
3174,EF3503,EF3504,EF3505,EG0964,EG0953,EF1664,310805-137,FV6056,FV6058,FV6057,EG0963
3175,EF3505,EF3504,EF3509,EF3503,844127-108,CM6007,CM6008,EG0963,EG0964,EG0953,AQ8528
3176,CM6008,CM6007,EF3509,CL7591,CL7590,CL7588,CL7589,CM0009,CM0010,CM0008,CJ0183
3177,BC0980,CM7531,EG1713,CK1088,CJ0180,CJ0177,CK1086,CL9990,BC0973,EG6208,CM7593


In [52]:
ans.to_csv("no_duplicate_10_most_recommend.csv", index=False)