In [49]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

df = pd.read_csv("../data/sample_product.csv")

In [50]:
df.head()

Unnamed: 0,Uniq Id,Product Name,Brand Name,Asin,Category,Upc Ean Code,List Price,Selling Price,Quantity,Model Number,...,Product Url,Stock,Product Details,Dimensions,Color,Ingredients,Direction To Use,Is Amazon Seller,Size Quantity Variant,Product Description
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",,,Sports & Outdoors | Outdoor Recreation | Skate...,,,$237.68,,,...,https://www.amazon.com/DB-Longboards-CoreFlex-...,,,,,,,Y,,
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",,,Toys & Games | Learning & Education | Science ...,,,$99.95,,55324.0,...,https://www.amazon.com/Electronic-Circuits-Cla...,,,,,,,Y,,
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,,,Toys & Games | Arts & Crafts | Craft Kits,,,$34.99,,,...,https://www.amazon.com/3Doodler-Plastic-Innova...,,,,,,,Y,,
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,,,Toys & Games | Hobbies | Models & Model Kits |...,,,$28.91,,142.0,...,https://www.amazon.com/Guillow-Airplane-Design...,,,,,,,Y,,
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,,,Toys & Games | Puzzles | Jigsaw Puzzles,,,$17.49,,62151.0,...,https://www.amazon.com/Woodstock-Collage-500-p...,,,,,,,Y,,


In [51]:
df.shape  # 10,002 row, 28 column

(10002, 28)

In [52]:
df.isnull().sum()

Uniq Id                      0
Product Name                 0
Brand Name               10002
Asin                     10002
Category                   830
Upc Ean Code              9968
List Price               10002
Selling Price              107
Quantity                 10002
Model Number              1770
About Product              273
Product Specification     1632
Technical Details          790
Shipping Weight           1138
Product Dimensions        9523
Image                        0
Variants                  7524
Sku                      10002
Product Url                  0
Stock                    10002
Product Details          10002
Dimensions               10002
Color                    10002
Ingredients              10002
Direction To Use         10002
Is Amazon Seller             0
Size Quantity Variant    10002
Product Description      10002
dtype: int64

In [53]:
df.columns

Index(['Uniq Id', 'Product Name', 'Brand Name', 'Asin', 'Category',
       'Upc Ean Code', 'List Price', 'Selling Price', 'Quantity',
       'Model Number', 'About Product', 'Product Specification',
       'Technical Details', 'Shipping Weight', 'Product Dimensions', 'Image',
       'Variants', 'Sku', 'Product Url', 'Stock', 'Product Details',
       'Dimensions', 'Color', 'Ingredients', 'Direction To Use',
       'Is Amazon Seller', 'Size Quantity Variant', 'Product Description'],
      dtype='object')

In [54]:
df[df['Product Name'].str.contains('umbrella', case=False, na=False)]['Product Name']

138     Dark Horse Deluxe The Umbrella Academy Playing...
1589    Babalu Kids Umbrella - Childrens 18 Inch Rainy...
5521    KidKraft Wooden Outdoor Children's Patio Set w...
Name: Product Name, dtype: object

In [55]:
df.iloc[138]

Uniq Id                                   6bbd68078c73c7e597720418d5347070
Product Name             Dark Horse Deluxe The Umbrella Academy Playing...
Brand Name                                                             NaN
Asin                                                                   NaN
Category                 Toys & Games | Games & Accessories | Card Game...
Upc Ean Code                                                           NaN
List Price                                                             NaN
Selling Price                                                        $4.99
Quantity                                                               NaN
Model Number                                                     DEC180427
About Product            Celebrate the return of the hit series BY Gera...
Product Specification    ProductDimensions:2.6x0.7x3.6inches|ItemWeight...
Technical Details        Go to your orders and start the return Select ...
Shipping Weight          

In [56]:
features = ['Product Name', 'Category', 'About Product', 'Product Specification', 'Technical Details']
df[features].isnull().sum()

Product Name                0
Category                  830
About Product             273
Product Specification    1632
Technical Details         790
dtype: int64

In [57]:

for feature in features:
    df[feature] = df[feature].fillna("")  # เพิ่มส่วนที่เป็น missing ด้วย  ""

In [58]:
df[features].isnull().sum()

Product Name             0
Category                 0
About Product            0
Product Specification    0
Technical Details        0
dtype: int64

In [59]:
df['combined_features'] = df['Product Name'] + ' ' + df['About Product'] + ' ' + df['Category'] + ' ' + df[
    'Product Specification'] + ' ' + df['Technical Details']

df.combined_features.head()

0    DB Longboards CoreFlex Crossbow 41" Bamboo Fib...
1    Electronic Snap Circuits Mini Kits Classpack, ...
2    3Doodler Create Flexy 3D Printing Filament Ref...
3    Guillow Airplane Design Studio with Travel Cas...
4    Woodstock- Collage 500 pc Puzzle Puzzle has 50...
Name: combined_features, dtype: object

In [60]:
df.head()

Unnamed: 0,Uniq Id,Product Name,Brand Name,Asin,Category,Upc Ean Code,List Price,Selling Price,Quantity,Model Number,...,Stock,Product Details,Dimensions,Color,Ingredients,Direction To Use,Is Amazon Seller,Size Quantity Variant,Product Description,combined_features
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",,,Sports & Outdoors | Outdoor Recreation | Skate...,,,$237.68,,,...,,,,,,,Y,,,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib..."
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",,,Toys & Games | Learning & Education | Science ...,,,$99.95,,55324.0,...,,,,,,,Y,,,"Electronic Snap Circuits Mini Kits Classpack, ..."
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,,,Toys & Games | Arts & Crafts | Craft Kits,,,$34.99,,,...,,,,,,,Y,,,3Doodler Create Flexy 3D Printing Filament Ref...
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,,,Toys & Games | Hobbies | Models & Model Kits |...,,,$28.91,,142.0,...,,,,,,,Y,,,Guillow Airplane Design Studio with Travel Cas...
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,,,Toys & Games | Puzzles | Jigsaw Puzzles,,,$17.49,,62151.0,...,,,,,,,Y,,,Woodstock- Collage 500 pc Puzzle Puzzle has 50...


In [61]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df.combined_features)

In [62]:
len(cv.get_feature_names())



53004

In [63]:
count_matrix.shape  # จำนวนแถว, colume

(10002, 53004)

In [64]:
print(cv.get_feature_names()[0:10])
print(cv.get_feature_names()[-10:])
print(count_matrix.toarray()[:10, 10:30])

['00', '000', '0000', '000000', '00000080', '00001', '00002', '00006', '00007', '00009']
['été', 'être', 'ñtoys', 'ømm', 'μsec', 'оne', 'оnе', 'расk', 'ﬂip', 'ﬂoat']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [65]:
A_sparse = sparse.csr_matrix(count_matrix)
sparse.save_npz("amazon_model.npz", A_sparse)

In [66]:
A_sparse = sparse.load_npz("amazon_model.npz")

In [67]:
user_choice = 'b2bbae612611f0093cf7342369db62fa'
ref_index = df[df['Uniq Id'].str.contains(user_choice, case=False)].index[0]

print('user choice ref_index = {}'.format(ref_index))

user choice ref_index = 3851


In [68]:
df.iloc[ref_index]

Uniq Id                                   b2bbae612611f0093cf7342369db62fa
Product Name             Michael Jackson Performance Accessory Kit (Adult)
Brand Name                                                             NaN
Asin                                                                   NaN
Category                 Clothing, Shoes & Jewelry | Costumes & Accesso...
Upc Ean Code                                                           NaN
List Price                                                             NaN
Selling Price                                                       $30.95
Quantity                                                               NaN
Model Number                                                           NaN
About Product            Synthetic | 0.7" high | 14" wide | Convenient ...
Product Specification                                                     
Technical Details        show up to 2 reviews by default Includes: Wig,...
Shipping Weight          

In [69]:
cosine_sim[ref_index]

array([0.28673772, 0.31610984, 0.32391883, ..., 0.2222372 , 0.05974863,
       0.21116915])

In [70]:
similar_products = list(enumerate(cosine_sim[ref_index]))
similar_products

[(0, 0.2867377175841095),
 (1, 0.31610983658293396),
 (2, 0.3239188292789932),
 (3, 0.30558380829703397),
 (4, 0.1865669088428978),
 (5, 0.35400109174105043),
 (6, 0.4301978576914552),
 (7, 0.03575421226192905),
 (8, 0.28066800206574133),
 (9, 0.2272657812634997),
 (10, 0.3485604296731738),
 (11, 0.3577266936923235),
 (12, 0.14494698828703267),
 (13, 0.3416154398707725),
 (14, 0.30087973968232024),
 (15, 0.24694095001014588),
 (16, 0.26585946291649803),
 (17, 0.30800847650991187),
 (18, 0.30747501866955845),
 (19, 0.3130810117903309),
 (20, 0.2792031731573867),
 (21, 0.16709876030230614),
 (22, 0.34808245858009995),
 (23, 0.19068913206362156),
 (24, 0.24543794791647114),
 (25, 0.10947447024610965),
 (26, 0.30875749944522946),
 (27, 0.2899774044474399),
 (28, 0.21591915092209962),
 (29, 0.17796743476240465),
 (30, 0.3238054091832553),
 (31, 0.34931601213223107),
 (32, 0.3467049861057149),
 (33, 0.3112611666346145),
 (34, 0.4110351401856283),
 (35, 0.15512207476128345),
 (36, 0.298260077

In [71]:
sorted_similar_products = sorted(similar_products, key=lambda x: x[1], reverse=True)[1:]

sorted_similar_products[:8]

[(9252, 0.49193285614091176),
 (9679, 0.4880597052943128),
 (7008, 0.48776230455235364),
 (469, 0.4864395818496832),
 (5778, 0.4814781246500955),
 (2115, 0.47660237490506585),
 (5807, 0.4764381941182063),
 (5789, 0.47626099013987444)]

In [72]:
print('Recommend products for [' + user_choice + ']')
print('------------------------------------')
for i, element in enumerate(sorted_similar_products):
    similar_product_id = element[0]
    print(similar_product_id)
    similar_product_title = df['Product Name'].iloc[similar_product_id]
    similar_product_url = df['Product Url'].iloc[similar_product_id]
    s_score = element[1]
    print('{:30} , {:30} -> {:.3f}'.format(similar_product_title, similar_product_url, s_score))

    if i > 6:
        break

Recommend products for [b2bbae612611f0093cf7342369db62fa]
------------------------------------
9252
Indiana Jones - Indiana Jones Hat and Whip Set Child , https://www.amazon.com/Indiana-Jones-Childs-Hat-Whip/dp/B001EDF3OU -> 0.492
9679
Rubies Star Wars Clone Wars Child's Deluxe Cad Bane Costume and Mask, Small , https://www.amazon.com/Rubies-Clone-Childs-Deluxe-Costume/dp/B003KN3O2U -> 0.488
7008
Forum Great Detective Costume Accessory Kit , https://www.amazon.com/Forum-Great-Detective-Costume-Accessory/dp/B003VLVRQ6 -> 0.488
469
Lord of The Rings Bilbo Baggins Sword , https://www.amazon.com/The-Hobbit-Sting-Lightup-Sword/dp/B00BG34DK8 -> 0.486
5778
Rubie's Marvel Classic Child's American Dream Metallic Costume, Small , https://www.amazon.com/Rubies-Classic-American-Metallic-Costume/dp/B00HA4ZCK2 -> 0.481
2115
Rubie's Costume Co. Women's Suicide Squad Harley Costume Jewelry Set , https://www.amazon.com/Rubies-Costume-Womens-Suicide-Jewelry/dp/B01BFFIQ86 -> 0.477
5807
Rubie's Costume Co