# Project: Recommender System

***Nguyen Thi Tuong Vy***

## III. Content-based Filtering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from underthesea import word_tokenize, pos_tag, sent_tokenize
from gensim import corpora, models, similarities
import jieba
import re
import warnings
warnings.filterwarnings('ignore')

**Loading the data**

In [2]:
products = pd.read_csv('data/ProductNew.csv')
reviews = pd.read_csv('data/ReviewNew.zip')

In [3]:
products.head()

Unnamed: 0,item_id,name,description,rating,price,list_price,brand,group,url,image
0,48102821,Tai nghe Bluetooth Inpods 12 - Cảm biến vân ta...,THÔNG TIN CHI TIẾT\nDung lượng pin 300\nThời g...,4.0,77000,300000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-inpods-12-cam-bien-...,https://salt.tikicdn.com/cache/280x280/ts/prod...
1,52333193,Tai nghe bluetooth không dây F9 True wireless ...,THÔNG TIN CHI TIẾT\nDung lượng pin 2000mah\nTh...,4.5,132000,750000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-khong-day-f9-true-w...,https://salt.tikicdn.com/cache/280x280/ts/prod...
2,299461,Chuột Không Dây Logitech M331 Silent - Hàng Ch...,THÔNG TIN CHI TIẾT\nThương hiệu Logitech\nĐộ p...,4.8,299000,399000,Logitech,Thiết Bị Số - Phụ Kiện Số/Phụ kiện máy tính và...,https://chuot-khong-day-logitech-m331-silent-p...,https://salt.tikicdn.com/cache/280x280/media/c...
3,57440329,Loa Bluetooth 5.0 Kiêm Đồng Hồ Báo Thức - [[ 2...,THÔNG TIN CHI TIẾT\nThương hiệu Acome\nXuất xứ...,4.7,149000,350000,Acome,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://loa-bluetooth-5-0-kiem-dong-ho-bao-thu...,https://salt.tikicdn.com/cache/280x280/ts/prod...
4,38458616,Tai Nghe Bluetooth Apple AirPods Pro True Wire...,THÔNG TIN CHI TIẾT\nThương hiệu Apple\nXuất xứ...,4.8,5090000,8500000,Apple,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-apple-airpods-pro-t...,https://salt.tikicdn.com/cache/280x280/ts/prod...


In [4]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4373 entries, 0 to 4372
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   item_id      4373 non-null   int64  
 1   name         4373 non-null   object 
 2   description  4370 non-null   object 
 3   rating       4373 non-null   float64
 4   price        4373 non-null   int64  
 5   list_price   4373 non-null   int64  
 6   brand        4373 non-null   object 
 7   group        4373 non-null   object 
 8   url          4373 non-null   object 
 9   image        4373 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 341.8+ KB


In [5]:
products.shape

(4373, 10)

In [6]:
products = products[products['name'].notnull()]
products['name_description'] = products['name'] + products['description']
products = products[products['name_description'].notnull()]
products['name_description_pre'] = products['name_description'].apply(lambda x: word_tokenize(x, format='text'))

In [7]:
products.shape

(4370, 12)

In [8]:
products.head()

Unnamed: 0,item_id,name,description,rating,price,list_price,brand,group,url,image,name_description,name_description_pre
0,48102821,Tai nghe Bluetooth Inpods 12 - Cảm biến vân ta...,THÔNG TIN CHI TIẾT\nDung lượng pin 300\nThời g...,4.0,77000,300000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-inpods-12-cam-bien-...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Tai nghe Bluetooth Inpods 12 - Cảm biến vân ta...,Tai_nghe Bluetooth_Inpods 12 - Cảm_biến vân ta...
1,52333193,Tai nghe bluetooth không dây F9 True wireless ...,THÔNG TIN CHI TIẾT\nDung lượng pin 2000mah\nTh...,4.5,132000,750000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-khong-day-f9-true-w...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Tai nghe bluetooth không dây F9 True wireless ...,Tai_nghe bluetooth không dây F9_True wireless ...
2,299461,Chuột Không Dây Logitech M331 Silent - Hàng Ch...,THÔNG TIN CHI TIẾT\nThương hiệu Logitech\nĐộ p...,4.8,299000,399000,Logitech,Thiết Bị Số - Phụ Kiện Số/Phụ kiện máy tính và...,https://chuot-khong-day-logitech-m331-silent-p...,https://salt.tikicdn.com/cache/280x280/media/c...,Chuột Không Dây Logitech M331 Silent - Hàng Ch...,Chuột Không Dây Logitech_M331_Silent - Hàng Ch...
3,57440329,Loa Bluetooth 5.0 Kiêm Đồng Hồ Báo Thức - [[ 2...,THÔNG TIN CHI TIẾT\nThương hiệu Acome\nXuất xứ...,4.7,149000,350000,Acome,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://loa-bluetooth-5-0-kiem-dong-ho-bao-thu...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Loa Bluetooth 5.0 Kiêm Đồng Hồ Báo Thức - [[ 2...,Loa Bluetooth 5.0 Kiêm Đồng_Hồ Báo_Thức - [ [ ...
4,38458616,Tai Nghe Bluetooth Apple AirPods Pro True Wire...,THÔNG TIN CHI TIẾT\nThương hiệu Apple\nXuất xứ...,4.8,5090000,8500000,Apple,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-apple-airpods-pro-t...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Tai Nghe Bluetooth Apple AirPods Pro True Wire...,Tai_Nghe Bluetooth_Apple_AirPods_Pro_True_Wire...


In [9]:
products = products.reset_index()

**Stop Words**

In [10]:
STOP_WORD_FILE = 'data/vietnamese-stopwords.txt'

In [11]:
with open(STOP_WORD_FILE, 'r', encoding='utf-8') as file:
    stop_words = file.read()
    
stop_words = stop_words.split('\n')

**TF-IDF**

In [12]:
tf = TfidfVectorizer(analyzer='word', min_df=0, stop_words=stop_words)

In [13]:
tfidf_matrix = tf.fit_transform(products['name_description_pre'])

## III.1. Cosine Similarity

In [14]:
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_similarities

array([[1.        , 0.40075346, 0.08112324, ..., 0.01782164, 0.01366667,
        0.06182976],
       [0.40075346, 1.        , 0.08310518, ..., 0.01635142, 0.0362413 ,
        0.09042972],
       [0.08112324, 0.08310518, 1.        , ..., 0.03441329, 0.03081951,
        0.06934047],
       ...,
       [0.01782164, 0.01635142, 0.03441329, ..., 1.        , 0.00868335,
        0.03151153],
       [0.01366667, 0.0362413 , 0.03081951, ..., 0.00868335, 1.        ,
        0.05662914],
       [0.06182976, 0.09042972, 0.06934047, ..., 0.03151153, 0.05662914,
        1.        ]])

In [15]:
cosine_similarities.shape

(4370, 4370)

**The most related products**

In [16]:
results = {}

for idx, row in products.iterrows():    
    similar_indices = cosine_similarities[idx].argsort()[-10:-1]
    similar_items = [(cosine_similarities[idx][i]) for i in similar_indices]
    similar_items = [(cosine_similarities[idx][i], products['item_id'][i]) for i in similar_indices]
    print(similar_items[0:])
    results[idx] = similar_items[0:]

[(0.37963189598354974, 26348659), (0.3843128095067755, 52889826), (0.39197111830614545, 72928043), (0.40075345942249213, 52333193), (0.4010595902845202, 50319688), (0.40906108816405334, 70772235), (0.4149413835537337, 48273751), (0.433537681425906, 22413470), (0.46072473304957806, 56365197)]
[(0.5933460329281932, 46695756), (0.6274916942345825, 56365197), (0.6421462163115421, 41574489), (0.6661236985220453, 22413470), (0.7018195236420965, 48273751), (0.8556366522956007, 72928043), (0.8652193001996021, 26348659), (0.8780732288086202, 35373097), (0.926884543360688, 52889826)]
[(0.4479962299998251, 469404), (0.45343774278491955, 9830444), (0.46270591410232675, 899654), (0.4914847877567686, 203734), (0.500120998754173, 54665), (0.5626031681651118, 389145), (0.5676573974577104, 74267277), (0.5920786097725307, 56930154), (0.7382484486019627, 299431)]
[(0.21106202610247043, 71505389), (0.21698011312630838, 11239836), (0.2192848415596796, 47994357), (0.22534018999445943, 51884751), (0.23624844

[(0.3636572801464468, 56365197), (0.36587429327029797, 72928043), (0.37421271811988044, 27977454), (0.3748286601722087, 26348659), (0.38119871745392087, 52889826), (0.3846288045658593, 35373097), (0.3949469852283929, 52333193), (0.3973211790574792, 22413470), (0.3981891090331983, 70772235)]
[(0.13845052937346367, 74285829), (0.13918509463158082, 73314682), (0.14250379039475697, 35726089), (0.14260885999499984, 35726908), (0.14340544430545546, 59050694), (0.14547675122741063, 41574489), (0.14571601028771275, 76134137), (0.15805205232481698, 25770121), (0.15822704355060954, 59418162)]
[(0.41975225979294606, 2566051), (0.4255519554544116, 54818426), (0.4303878829773076, 25402524), (0.44495965837448964, 32021463), (0.4905626853622518, 2546979), (0.49255554594880435, 24489730), (0.6309781012708499, 34520643), (0.8263395509174422, 40691443), (0.8265528705409981, 4539349)]
[(0.386967206081753, 72308191), (0.4020805477872828, 72308418), (0.4136576190616765, 487532), (0.44457843830763843, 56922

[(0.09612250801421149, 415060), (0.0975641396891723, 71051342), (0.09846685403734294, 37878694), (0.10097373545311063, 21439813), (0.10411557609650572, 556260), (0.10547197046379675, 415071), (0.1064885503563135, 54663477), (0.13103075875145162, 5950457), (0.16240259748011304, 48056733)]
[(0.23842272643872897, 25062191), (0.23962115708491313, 55445629), (0.24000827772179575, 75520677), (0.2505603250989168, 34557311), (0.25543075266466975, 66940778), (0.26943101192333546, 23503622), (0.3036280870145982, 22153584), (0.3169966923395109, 34574630), (0.3431632403234647, 40737659)]
[(0.20076566664398607, 8112536), (0.2012859473675993, 17038778), (0.20248664365003738, 2054629), (0.2359901259357998, 51510460), (0.30495584369597895, 5844567), (0.3111169105381465, 14078676), (0.4557575881503108, 2054469), (0.5610231903824934, 5987217), (0.8504746968557707, 5986499)]
[(0.5155230224056039, 29950661), (0.5344589612619497, 62266000), (0.5484575127127752, 13119283), (0.5548841343891633, 10860173), (0

[(0.29649806225069253, 74955104), (0.30298336802397724, 52539829), (0.30702872325092284, 72796676), (0.317019027524997, 24932961), (0.32532163163150707, 55445629), (0.3378478521729395, 71291990), (0.3524334340051781, 54628125), (0.36713861198296355, 74572230), (0.42295833569186986, 52945747)]
[(0.22926523349727082, 950452), (0.2424430522831261, 19767845), (0.24821276964143638, 32038170), (0.2526527397621551, 41502230), (0.25332305256617477, 25404882), (0.2591301284076703, 25501830), (0.26643425925533104, 55035537), (0.29055202436631566, 70116493), (0.3001093691488425, 73612703)]
[(0.30578315122685856, 8159799), (0.3087387573739817, 8023300), (0.3107214210006346, 65395332), (0.3116612974636467, 7383363), (0.3183262186559691, 37503894), (0.33479350381299877, 57524517), (0.34042101435212074, 24989503), (0.4293187093201904, 31547651), (0.4841757123243555, 12716267)]
[(0.1486748539460071, 540865), (0.1499621570343338, 615977), (0.15368651206174333, 55873620), (0.15402018653968302, 41446843)

[(0.12020330170668865, 7089529), (0.12347439097279643, 4755095), (0.13058746841278304, 4727085), (0.13145879090440735, 4727019), (0.18689836546509273, 10001479), (0.2255638726107571, 21731970), (0.369804999272344, 4754815), (0.5594962547145326, 847472), (0.6160294167494462, 54852821)]
[(0.5292128337887267, 68455358), (0.5382598581448661, 5866569), (0.5414363745335329, 5866557), (0.5550347314730342, 66117818), (0.6379967332362713, 44801520), (0.681290852428251, 45075149), (0.8505183860577511, 5899427), (0.8625873965300337, 5899501), (0.8663936272995036, 5899493)]
[(0.1941360860411847, 73476077), (0.19638961646662037, 75759411), (0.20169758169140115, 58666606), (0.21332341456604806, 48483836), (0.21763230432316544, 58849381), (0.22220407153162408, 68259526), (0.22992081159043912, 5839337), (0.24746417242779303, 78872402), (0.2907738383643853, 2184775)]
[(0.23058588000668753, 56250119), (0.2310609883409137, 28754687), (0.24152311712186783, 7252155), (0.24398811972124765, 17589287), (0.245

[(0.24920097038551017, 55731368), (0.24980138558407158, 19774958), (0.24984889482530348, 19775193), (0.25537053162173906, 14966475), (0.27123905974290885, 56498902), (0.2836493425732691, 16963971), (0.29857643308056053, 55467693), (0.48897229247470875, 8021522), (0.5250180699520367, 4092401)]
[(0.325145284924691, 15669042), (0.3260351260270472, 6921779), (0.3398338080966629, 54353418), (0.5010360642757754, 53444660), (0.5046640435620007, 14039112), (0.5546807615865414, 33115720), (0.6211057005693749, 57811160), (0.6233821453388699, 15669028), (0.6620237067140229, 48430140)]
[(0.38564097928187185, 484372), (0.4016697435227747, 711323), (0.4185278605307015, 1602285), (0.42390677358874373, 711112), (0.4555212577342322, 642648), (0.5025785630455262, 1760987), (0.5027424073188111, 21022062), (0.5387315999427664, 1498487), (0.8820324513410714, 2680853)]
[(0.28853313357091337, 74983716), (0.29478317857244496, 71209699), (0.30009129734659756, 27910586), (0.3013206919941036, 72836349), (0.31238

[(0.28189770766475075, 73640620), (0.39980230057420446, 7708661), (0.5188294983579083, 10108419), (0.5312928075832855, 10126962), (0.9387496551526161, 7325021), (0.9415511294490463, 7327987), (0.9420051942815001, 7362171), (0.9488060131211509, 7252601), (0.9551424077598819, 7237635)]
[(0.319888552358723, 9996258), (0.3211317174519138, 49863949), (0.3379302879287137, 72045167), (0.3483492262533649, 37530700), (0.35080374986009893, 9982809), (0.43573021102100734, 71043508), (0.4785323070422253, 71045829), (0.503032046777731, 19374087), (0.8042796090947186, 69625196)]
[(0.18736598278517003, 63147488), (0.18821947345422754, 21614756), (0.19142667261632842, 44579109), (0.19169491355945117, 17589287), (0.19281999706215802, 66385255), (0.20024603975567176, 56498902), (0.22753257810082195, 61280865), (0.7047200047611297, 68222196), (0.7051949871153367, 68200288)]
[(0.3564370885824897, 53997901), (0.37444342714937345, 25273762), (0.3790981597748489, 21921714), (0.40280445438024465, 31685803), (

[(0.27243316538889284, 459152), (0.27366349018415054, 1582161), (0.2838255503342349, 459151), (0.2982151259005004, 54439144), (0.2994537604937255, 416978), (0.31919433041831496, 595607), (0.32632400295456415, 416990), (0.3350224224424739, 56248606), (0.3511648868383702, 74547151)]
[(0.39651129229110865, 547563), (0.3982024790339324, 20904905), (0.3993039899100431, 3209613), (0.428731983943057, 20412302), (0.43954659181123523, 546664), (0.46000369065326296, 754249), (0.5955825404089263, 46645696), (0.9167318253065114, 54021947), (0.9404647722922574, 17967481)]
[(0.29063250396673795, 595607), (0.29337414639443066, 48509229), (0.2973728120102054, 459151), (0.3014869607013823, 20111924), (0.31808489802140977, 459077), (0.32333144619761217, 416985), (0.33642960566463737, 19606295), (0.3467901429429807, 459084), (0.3905757628467933, 416613)]
[(0.25530020260997055, 3614507), (0.2594803633627292, 52033195), (0.27088942764497204, 51599592), (0.27254883675833935, 1582161), (0.2824941492546735, 4

[(0.33789590482311216, 465219), (0.34764191478016254, 51599592), (0.3553423152213532, 2052377), (0.35579832405625816, 8851126), (0.36003669213839556, 56697301), (0.3878114390947106, 51200545), (0.398093805792235, 21526737), (0.4602762635560322, 9850897), (0.4810440339131961, 14301144)]
[(0.33280137165320733, 51615378), (0.333499595599562, 51987865), (0.3368217569661293, 14469139), (0.3410912768413837, 457194), (0.3480510588186397, 491431), (0.35857542510655027, 491442), (0.37348172398910134, 10021395), (0.9597286480337347, 746639), (0.9660772039152493, 746663)]
[(0.08635410406314455, 33793341), (0.08958802817688284, 33732222), (0.08962195037931232, 518679), (0.09178937636477168, 518633), (0.09438905418888531, 38518662), (0.09947415260542572, 33748673), (0.15925248464055441, 46757834), (0.28512814744742204, 68250770), (0.9752894854731851, 66730368)]
[(0.35564367537935176, 4747467), (0.36021163168352444, 4210949), (0.3756995352679644, 57316596), (0.3994861156451881, 529902), (0.402648996

[(0.37075754477191475, 8105301), (0.3707612783129222, 50604092), (0.3726649037948873, 44224953), (0.3773369985422275, 30857999), (0.3939344899112584, 63483847), (0.4156816247821481, 11199435), (0.43736068272625683, 17954263), (0.450559059957211, 4747467), (0.4597918128171799, 49368888)]
[(0.3181633335800049, 8851126), (0.33377062858552486, 52033195), (0.3536597127466187, 18003803), (0.36611643211429373, 12011510), (0.3732720796620315, 56697301), (0.3831957692827381, 51200545), (0.39887152520741825, 21526737), (0.40111751981353666, 1582161), (0.40538552098548997, 465219)]
[(0.33324856767986283, 51615378), (0.3339477298524426, 51987865), (0.33727435531528255, 14469139), (0.3415496122832503, 457194), (0.3485187463458988, 491431), (0.35905725456707027, 491442), (0.37398358352822164, 10021395), (0.9597286480337347, 746648), (0.9673753532734913, 746663)]
[(0.23292793293698444, 77105849), (0.2384438853237728, 1610105), (0.24094666544902218, 76297472), (0.253865465624015, 1907615), (0.25759723

[(0.25744901056413316, 35527589), (0.2623849555099318, 73102935), (0.26322205039702784, 73103291), (0.2635015680600153, 73103412), (0.26585859460304456, 73103094), (0.27666057801113686, 72957781), (0.2790907131377615, 40348221), (0.2924448306040782, 2640471), (0.3013399085183219, 40350394)]
[(0.15711203930096682, 11251735), (0.1656372273020177, 11251743), (0.16706093867277932, 11376644), (0.17848788985964437, 4065607), (0.18001217295450048, 41239825), (0.19861858759697856, 4048605), (0.20176713765754664, 20711020), (0.206627887584421, 53642202), (0.21085838084817776, 11415485)]
[(0.28606695936175264, 2640471), (0.421227279438964, 40343230), (0.44178271464856506, 40348221), (0.4481652510623393, 40350394), (0.8946484287485932, 72957781), (0.9127652813316947, 77756018), (0.985248557994317, 73102935), (0.987452382490429, 73103412), (0.9936289104188878, 73103094)]
[(0.3421707308998105, 13328288), (0.34422491768181096, 47993877), (0.34704267492463586, 54697472), (0.34939419907384683, 8822674

[(0.37529946804303144, 16563297), (0.3965590995683124, 35580965), (0.45839511117890497, 45125717), (0.47701568383569826, 46607008), (0.5015561900893102, 45125720), (0.5145028064290091, 46627072), (0.5196843693685845, 19362591), (0.5418818971328989, 74348608), (0.8719804369761824, 58629879)]
[(0.5906018788424546, 50685581), (0.5910677191408521, 50425707), (0.5993625817303421, 50421562), (0.6093889153246262, 50685658), (0.6212343547388839, 68655745), (0.6409083645917854, 50592897), (0.6497409184805516, 50702921), (0.6669012317664129, 50692303), (0.6726736758674011, 50421561)]
[(0.24988774400298835, 13943889), (0.25533083083842134, 14525848), (0.25544081999564394, 14534789), (0.340094291321346, 46179293), (0.35709855537043944, 57751841), (0.4429095375406166, 20833539), (0.4780186417729091, 58086409), (0.5277522364201929, 72954000), (0.9140427448735174, 52864995)]
[(0.7147320643372302, 16966887), (0.734543779633322, 32931450), (0.7354621232193606, 23556604), (0.7369707115109664, 17908150),

[(0.25965614029656575, 8326109), (0.25987397996380224, 12370066), (0.2628253070863113, 64319112), (0.2629739784675989, 33307083), (0.2640593930876875, 12041791), (0.2833782364183933, 73125075), (0.2939614956229584, 37000912), (0.3009931365483773, 20475644), (0.3059319854541752, 37000902)]
[(0.7567169747277122, 23556576), (0.7760735643065111, 52702602), (0.7794403577234122, 1513667), (0.7795055714441746, 52774785), (0.7809440709382678, 1675793), (0.7924734886138215, 52774775), (0.816870642917556, 21440464), (0.8208234602079413, 486701), (0.9974474274268156, 36996478)]
[(0.37127727087762036, 11897820), (0.3882713930203186, 42192494), (0.3915914999996826, 51518382), (0.3916138027695961, 56015185), (0.3995581594392291, 46691906), (0.44607749043000267, 46209324), (0.4513548295403288, 56090374), (0.4613867532601842, 56720370), (0.6999912197439316, 51368046)]
[(0.632157315423381, 4440545), (0.6322895423704418, 4441053), (0.6401056152790614, 54406059), (0.6415611183255897, 54406065), (0.644566

[(0.5154508232588085, 4296745), (0.5296271792837367, 44065922), (0.6433398814369343, 822916), (0.64404813809465, 813200), (0.7838380856401703, 823106), (0.7846056076960931, 813202), (0.9292617598554861, 813210), (0.9292617598554861, 823532), (0.9925065730682858, 813212)]
[(0.45245234085296715, 52433186), (0.45782230235304083, 16398913), (0.4852119367815553, 52432443), (0.5193829720946855, 460145), (0.5217041159567025, 4794489), (0.5383157462227615, 50497876), (0.5397139628895757, 52433106), (0.6309098339294936, 54784316), (0.8276507527149087, 54750222)]
[(0.1762103752709422, 53727656), (0.1817080938251333, 48490928), (0.18702389075008002, 71105466), (0.1891739908286037, 32205118), (0.19202377696713818, 22383687), (0.19373250736751207, 53115576), (0.20834727269492748, 14035750), (0.21647680994446483, 56540134), (0.2165093955303349, 60228865)]
[(0.27061788725013697, 22383687), (0.2878638853232849, 57642927), (0.2887249943594622, 32205118), (0.2897726195268199, 71105466), (0.3557996247306

In [17]:
results[0]

[(0.37963189598354974, 26348659),
 (0.3843128095067755, 52889826),
 (0.39197111830614545, 72928043),
 (0.40075345942249213, 52333193),
 (0.4010595902845202, 50319688),
 (0.40906108816405334, 70772235),
 (0.4149413835537337, 48273751),
 (0.433537681425906, 22413470),
 (0.46072473304957806, 56365197)]

**Getting the information of one product**

In [18]:
def item(id):
    return products.loc[products['item_id'] == id]['name'].to_list()[0].split('-')[0]

**Getting the recommended products**

In [19]:
def recommend(item_id, n):
    print('There are ', str(n) + ' recommended products similar to ' + item(item_id))
    print('\n')
    recs = results[item_id][:n]

In [20]:
# recommend(56365197, 5)

## III.2. Gensim

**Tokenizing (split) the sentences into words**

In [21]:
products_gs = [[text for text in x.split()] for x in products.name_description_pre]

In [22]:
len(products_gs)

4370

In [23]:
products_gs[:1]

[['Tai_nghe',
  'Bluetooth_Inpods',
  '12',
  '-',
  'Cảm_biến',
  'vân',
  'tay',
  ',',
  'chống',
  'nước',
  ',',
  'màu_sắc',
  'đa_dạng',
  '-',
  '5',
  'màu_sắc',
  'lựa',
  'chọnTHÔNG',
  'TIN',
  'CHI_TIẾT',
  'Dung_lượng',
  'pin',
  '300',
  'Thời_gian',
  'pin',
  '-',
  'Thời_gian',
  'nghe',
  'nhạc',
  'liên_tục',
  'từ',
  '2.5',
  '-',
  '4',
  'h',
  '-',
  'Thời_gian',
  'sạc',
  'đầy',
  'chỉ',
  'khoảng',
  '60',
  'p',
  '-',
  'Thời_gian',
  'chờ',
  'lên',
  'tới',
  '140',
  'giờ',
  'Bluetooth',
  '5',
  'Thương_hiệu',
  'OEM',
  'Xuất_xứ',
  'thương_hiệu',
  'Trung_Quốc',
  'Độ',
  'nhạy_cảm_biến',
  'vân',
  'tay',
  'Model',
  'i12',
  'Loại',
  'Jack',
  'cắm',
  'USB_Cable',
  'Trọng_lượng',
  '300',
  'g',
  'Thời_gian',
  'sử_dụng',
  '-',
  'Thời_gian',
  'nghe',
  'nhạc',
  'liên_tục',
  'từ',
  '2.5',
  '-',
  '4',
  'h',
  'SKU',
  '4096608751631',
  'MÔ_TẢ',
  'SẢN_PHẨM',
  'INPOD',
  '12',
  'là',
  'phiên_bản',
  'nâng_cấp',
  'mới',
  'nhất',
 

**Removing some special elements in texts**

In [24]:
products_gs_re = [[re.sub('[0-9]+','', e) for e in text] for text in products_gs]
products_gs_re = [[t.lower() for t in text if not t in ['', ' ', ',', '.', '...', '-',':', ';', '?', '%', '(', ')',
                                                        '+', '/', 'p', 'g', 'ml', 'h', 'v', 'm']] for text in  products_gs_re]
products_gs_re = [[t for t in text if not t in stop_words] for text in products_gs_re]

In [25]:
products_gs_re[:1]

[['tai_nghe',
  'bluetooth_inpods',
  'cảm_biến',
  'vân',
  'chống',
  'màu_sắc',
  'đa_dạng',
  'màu_sắc',
  'lựa',
  'chọnthông',
  'chi_tiết',
  'dung_lượng',
  'pin',
  'pin',
  'nhạc',
  'liên_tục',
  'sạc',
  'chờ',
  'bluetooth',
  'thương_hiệu',
  'oem',
  'xuất_xứ',
  'thương_hiệu',
  'trung_quốc',
  'độ',
  'nhạy_cảm_biến',
  'vân',
  'model',
  'jack',
  'cắm',
  'usb_cable',
  'trọng_lượng',
  'nhạc',
  'liên_tục',
  'sku',
  'mô_tả',
  'sản_phẩm',
  'inpod',
  'phiên_bản',
  'nâng_cấp',
  'tai_nghe',
  'bluetooth',
  'thiết_kế',
  'tỉ_lệ',
  'chuẩn',
  'tai',
  'airpod',
  'hãng',
  'lược_bỏ',
  'nút',
  'bấm',
  'thân',
  'tai',
  'thay',
  'nút',
  'cảm_ứng',
  'dễ_dàng',
  'thuận_tiện',
  'thao_tác',
  'nhạc',
  'dễ_dàng',
  'chạm',
  'bluetooth',
  'kết_nối',
  'vô_cùng',
  'ổn_định',
  'bluetooth',
  'kết_nối',
  'vô_cùng',
  'ổn_định',
  'tai',
  'kết_nối',
  'dock',
  'sạc',
  'chất',
  'âm',
  'thời_lượng',
  'pin',
  'cải_thiện',
  'tối_ưu',
  'dock',
  'sạc',
  

**Obtaining the number of features based on dictionary: Use corpora.Dictionary**

In [26]:
dictionary = corpora.Dictionary(products_gs_re)

**List of features in dictionary**

In [27]:
dictionary.token2id

{'airpod': 0,
 'apple': 1,
 'bao_gồm': 2,
 'bluetooth': 3,
 'bluetooth_inpods': 4,
 'bấm': 5,
 'chi_phí': 6,
 'chi_tiết': 7,
 'chuẩn': 8,
 'chạm': 9,
 'chất': 10,
 'chọnthông': 11,
 'chống': 12,
 'chờ': 13,
 'cải_thiện': 14,
 'cảm_biến': 15,
 'cảm_ứng': 16,
 'cắm': 17,
 'cồng_kềnh': 18,
 'dock': 19,
 'dung_lượng': 20,
 'dễ_dàng': 21,
 'giao': 22,
 'giá': 23,
 'hiện_hành': 24,
 'huawei': 25,
 'hàng': 26,
 'hãng': 27,
 'inpod': 28,
 'jack': 29,
 'kết_nối': 30,
 'lenovo': 31,
 'liên_tục': 32,
 'luật': 33,
 'lược_bỏ': 34,
 'lựa': 35,
 'model': 36,
 'màu_sắc': 37,
 'mô_tả': 38,
 'nhạc': 39,
 'nhạy_cảm_biến': 40,
 'nâng_cấp': 41,
 'nút': 42,
 'oem': 43,
 'oppo': 44,
 'phiên_bản': 45,
 'phát_sinh': 46,
 'phí': 47,
 'phương_thức': 48,
 'phụ_phí': 49,
 'pin': 50,
 'samsung': 51,
 'sku': 52,
 'sạc': 53,
 'sản_phẩm': 54,
 'tablet': 55,
 'tai': 56,
 'tai_nghe': 57,
 'thao_tác': 58,
 'thay': 59,
 'thiết_bị': 60,
 'thiết_kế': 61,
 'thuận_tiện': 62,
 'thuế': 63,
 'thân': 64,
 'thương_hiệu': 65,
 'thờ

**Number of features (word) in dictionary**

In [28]:
feature_cnt = len(dictionary.token2id)
feature_cnt

37209

**Obtaining corpus based on dictionary (dense matrix)**

In [29]:
corpus = [dictionary.doc2bow(text) for text in products_gs_re]

In [30]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 5),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 2),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 2),
 (20, 1),
 (21, 2),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 2),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 3),
 (31, 1),
 (32, 3),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 2),
 (38, 1),
 (39, 4),
 (40, 1),
 (41, 1),
 (42, 2),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 4),
 (51, 1),
 (52, 1),
 (53, 5),
 (54, 3),
 (55, 1),
 (56, 3),
 (57, 3),
 (58, 1),
 (59, 1),
 (60, 2),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 2),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 2),
 (78, 2),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 2)]

**Using TF-IDF Model to process corpus, obtaining index**

In [31]:
tfidf = models.TfidfModel(corpus)

In [32]:
# Tính toán sự tương tự trong ma trận thưa thớt
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt)

**Getting the information of one product**

In [33]:
product_selection = products.head(1)

In [34]:
product_selection

Unnamed: 0,index,item_id,name,description,rating,price,list_price,brand,group,url,image,name_description,name_description_pre
0,0,48102821,Tai nghe Bluetooth Inpods 12 - Cảm biến vân ta...,THÔNG TIN CHI TIẾT\nDung lượng pin 300\nThời g...,4.0,77000,300000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-inpods-12-cam-bien-...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Tai nghe Bluetooth Inpods 12 - Cảm biến vân ta...,Tai_nghe Bluetooth_Inpods 12 - Cảm_biến vân ta...


**Getting the recommended products**

In [35]:
presentPro = product_selection['name_description_pre'].to_string(index=False)

In [36]:
presentPro

' Tai_nghe Bluetooth_Inpods 12 - Cảm_biến vân ta...'

In [37]:
presentPro = presentPro.lower().split()

In [38]:
kwVec = dictionary.doc2bow(presentPro)

In [39]:
kwVec

[(4, 1), (15, 1), (57, 1), (77, 1)]

In [40]:
sim = index[tfidf[kwVec]]

In [41]:
# Print result
for i in range(len(sim)):
    # Vì lấy mẫu đầu tiên để xem nên bỏ qua mẫu đầu tiên
    if i!=0:
        print('keyword is similar to doc_index %d: %.2f' %(i, sim[i]))

keyword is similar to doc_index 1: 0.12
keyword is similar to doc_index 2: 0.01
keyword is similar to doc_index 3: 0.00
keyword is similar to doc_index 4: 0.10
keyword is similar to doc_index 5: 0.09
keyword is similar to doc_index 6: 0.09
keyword is similar to doc_index 7: 0.00
keyword is similar to doc_index 8: 0.07
keyword is similar to doc_index 9: 0.00
keyword is similar to doc_index 10: 0.01
keyword is similar to doc_index 11: 0.00
keyword is similar to doc_index 12: 0.00
keyword is similar to doc_index 13: 0.04
keyword is similar to doc_index 14: 0.00
keyword is similar to doc_index 15: 0.00
keyword is similar to doc_index 16: 0.00
keyword is similar to doc_index 17: 0.00
keyword is similar to doc_index 18: 0.06
keyword is similar to doc_index 19: 0.02
keyword is similar to doc_index 20: 0.01
keyword is similar to doc_index 21: 0.00
keyword is similar to doc_index 22: 0.00
keyword is similar to doc_index 23: 0.15
keyword is similar to doc_index 24: 0.08
keyword is similar to doc

keyword is similar to doc_index 2363: 0.00
keyword is similar to doc_index 2364: 0.00
keyword is similar to doc_index 2365: 0.00
keyword is similar to doc_index 2366: 0.00
keyword is similar to doc_index 2367: 0.00
keyword is similar to doc_index 2368: 0.00
keyword is similar to doc_index 2369: 0.00
keyword is similar to doc_index 2370: 0.00
keyword is similar to doc_index 2371: 0.00
keyword is similar to doc_index 2372: 0.00
keyword is similar to doc_index 2373: 0.00
keyword is similar to doc_index 2374: 0.00
keyword is similar to doc_index 2375: 0.00
keyword is similar to doc_index 2376: 0.00
keyword is similar to doc_index 2377: 0.00
keyword is similar to doc_index 2378: 0.00
keyword is similar to doc_index 2379: 0.00
keyword is similar to doc_index 2380: 0.00
keyword is similar to doc_index 2381: 0.00
keyword is similar to doc_index 2382: 0.00
keyword is similar to doc_index 2383: 0.00
keyword is similar to doc_index 2384: 0.00
keyword is similar to doc_index 2385: 0.00
keyword is 

**Kết luận:**
Top 5 sản phẩm có liên quan đến sản phẩm đang xem được đề xuất là

In [59]:
product_familier = products[1:6]

In [60]:
product_familier

Unnamed: 0,index,item_id,name,description,rating,price,list_price,brand,group,url,image,name_description,name_description_pre
1,1,52333193,Tai nghe bluetooth không dây F9 True wireless ...,THÔNG TIN CHI TIẾT\nDung lượng pin 2000mah\nTh...,4.5,132000,750000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-khong-day-f9-true-w...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Tai nghe bluetooth không dây F9 True wireless ...,Tai_nghe bluetooth không dây F9_True wireless ...
2,2,299461,Chuột Không Dây Logitech M331 Silent - Hàng Ch...,THÔNG TIN CHI TIẾT\nThương hiệu Logitech\nĐộ p...,4.8,299000,399000,Logitech,Thiết Bị Số - Phụ Kiện Số/Phụ kiện máy tính và...,https://chuot-khong-day-logitech-m331-silent-p...,https://salt.tikicdn.com/cache/280x280/media/c...,Chuột Không Dây Logitech M331 Silent - Hàng Ch...,Chuột Không Dây Logitech_M331_Silent - Hàng Ch...
3,3,57440329,Loa Bluetooth 5.0 Kiêm Đồng Hồ Báo Thức - [[ 2...,THÔNG TIN CHI TIẾT\nThương hiệu Acome\nXuất xứ...,4.7,149000,350000,Acome,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://loa-bluetooth-5-0-kiem-dong-ho-bao-thu...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Loa Bluetooth 5.0 Kiêm Đồng Hồ Báo Thức - [[ 2...,Loa Bluetooth 5.0 Kiêm Đồng_Hồ Báo_Thức - [ [ ...
4,4,38458616,Tai Nghe Bluetooth Apple AirPods Pro True Wire...,THÔNG TIN CHI TIẾT\nThương hiệu Apple\nXuất xứ...,4.8,5090000,8500000,Apple,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-apple-airpods-pro-t...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Tai Nghe Bluetooth Apple AirPods Pro True Wire...,Tai_Nghe Bluetooth_Apple_AirPods_Pro_True_Wire...
5,5,12567795,Tai Nghe Bluetooth Nhét Tai Apple AirPods 2 Tr...,THÔNG TIN CHI TIẾT\nThương hiệu Apple\nXuất xứ...,4.8,3490000,5490000,Apple,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-nhet-tai-apple-airp...,https://salt.tikicdn.com/cache/280x280/ts/prod...,Tai Nghe Bluetooth Nhét Tai Apple AirPods 2 Tr...,Tai_Nghe Bluetooth Nhét Tai_Apple_AirPods 2 Tr...
