In [38]:
import pandas as pd
import json
from typing import List, Dict
from collections import defaultdict

# **1 Load data**

In [2]:
def parse(path):
    with open(path, 'r') as f:
        for line in f:
            yield json.loads(line)

def getDF(path):
    df = [d for d in parse(path)]
    return pd.DataFrame(df)

In [5]:
file_path = './data/All_Beauty.json'
df_all_beauty = getDF(file_path)

In [6]:
df_all_beauty

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"02 19, 2015",A1V6B6TNIC10QE,0143026860,theodore j bigham,great,One Star,1424304000,,,
1,4.0,True,"12 18, 2014",A2F5GHSXFQ0W6J,0143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...,1418860800,,,
2,4.0,True,"08 10, 2014",A1572GUYS7DGSR,0143026860,David G,"This book was very informative, covering all a...",Worth the Read,1407628800,,,
3,5.0,True,"03 11, 2013",A1PSGLFK1NSVO,0143026860,TamB,I am already a baseball fan and knew a bit abo...,Good Read,1362960000,,,
4,5.0,True,"12 25, 2011",A6IKXKZMTKGSC,0143026860,shoecanary,This was a good story of the Black leagues. I ...,"More than facts, a good story read!",1324771200,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...
371340,1.0,True,"07 20, 2017",A202DCI7TV1022,B01HJEGTYK,Sam,It was awful. It was super frizzy and I tried ...,It was super frizzy and I tried to comb it and...,1500508800,,,
371341,5.0,True,"03 16, 2017",A3FSOR5IJOFIBE,B01HJEGTYK,TYW,I was skeptical about buying this. Worried it...,Awesome,1489622400,34,,
371342,5.0,True,"03 1, 2017",A1B5DK6CTP2P24,B01HJEGTYK,Norma Jennings,Makes me look good fast.,Five Stars,1488326400,46,,
371343,2.0,True,"02 21, 2017",A23OUYS5IRMJS9,B01HJEGTYK,Lee,Way lighter than photo\nNot mix blend of color...,Ok but color way off and volume as well,1487635200,,,


In [7]:
file_path1 = './data/AMAZON_FASHION.json'
file_path2 = './data/Appliances.json'

df_amazon_fashion = getDF(file_path1)
df_appliances = getDF(file_path2)

In [16]:
df1_selected = df_all_beauty[['reviewerID', 'asin', 'overall', 'unixReviewTime']]
df2_selected = df_amazon_fashion[['reviewerID', 'asin', 'overall', 'unixReviewTime']]
df3_selected = df_appliances[['reviewerID', 'asin', 'overall', 'unixReviewTime']]

df_combined = pd.concat([df1_selected, df2_selected, df3_selected], axis=0)

df_combined

Unnamed: 0,reviewerID,asin,overall,unixReviewTime
0,A1V6B6TNIC10QE,0143026860,1.0,1424304000
1,A2F5GHSXFQ0W6J,0143026860,4.0,1418860800
2,A1572GUYS7DGSR,0143026860,4.0,1407628800
3,A1PSGLFK1NSVO,0143026860,5.0,1362960000
4,A6IKXKZMTKGSC,0143026860,5.0,1324771200
...,...,...,...,...
602772,A24A9P4F2SLTK5,B01HJH2PY0,5.0,1502323200
602773,A2JCB4KHBWEELW,B01HJHHEA0,2.0,1533081600
602774,A1LDYYVTLPP2Z5,B01HJHHEA0,5.0,1523577600
602775,AP1M5O06IOYZ7,B01HJH92JQ,1.0,1521763200


In [17]:
df_combined.to_csv('df_combined.csv', index=False)

In [2]:
df_combined = pd.read_csv('df_combined.csv')

# **2 Transform data**

cleaning the dataset from duplicates, missing overall will be set to 0, changing the unix time format to UTC datetime format reformatting the ID to simple unique integer values and finally, transforming of the shape of the dataset to 2D matrix (RM)

In [3]:
# Kiểm tra các giá trị bị thiếu
missing_summary = df_combined.isnull().sum()
print("Missing values per column:")
print(missing_summary[missing_summary > 0])

# Kiểm tra trùng lặp toàn bộ dòng
duplicates = df_combined.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Kiểm tra trùng lặp cho từng cột
duplicate_columns = {}
for col in df_combined.columns:
    duplicate_columns[col] = df_combined[col].duplicated().sum()

print("\nDuplicate values per column:")
print(pd.Series(duplicate_columns).sort_values(ascending=False))


Missing values per column:
Series([], dtype: int64)

Number of duplicate rows: 28561

Duplicate values per column:
overall           1857753
unixReviewTime    1852343
asin              1608731
reviewerID         340802
dtype: int64


In [4]:
df_combined = df_combined.drop_duplicates()
duplicates = df_combined.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")


Number of duplicate rows: 0


In [3]:
# Duplicate reviews by same user for the same product
# Chuyển đổi cột unixReviewTime từ Unix timestamp sang datetime theo UTC
df_combined['reviewTime'] = pd.to_datetime(df_combined['unixReviewTime'], unit='s', utc=True)
duplicate_reviews = df_combined[df_combined.duplicated(subset=['reviewerID', 'asin'], keep=False)]
duplicate_reviews

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,reviewTime
2352,AD80MGOY5CJZ4,1620213982,5.0,1451692800,2016-01-02 00:00:00+00:00
4017,AD80MGOY5CJZ4,1620213982,5.0,1422662400,2015-01-31 00:00:00+00:00
6904,ACTVXNBEPLW2S,B000052YAN,4.0,1422144000,2015-01-25 00:00:00+00:00
6905,ACTVXNBEPLW2S,B000052YAN,4.0,1422144000,2015-01-25 00:00:00+00:00
6941,A2CTM1BYAXTYLX,B0000530HU,5.0,1243987200,2009-06-03 00:00:00+00:00
...,...,...,...,...,...
1830861,AAXZWNM0SGJ6V,B00W0WXHCO,4.0,1531699200,2018-07-16 00:00:00+00:00
1832245,A33E3AB96IZHE9,B00X9H5S62,3.0,1445212800,2015-10-19 00:00:00+00:00
1832246,A33E3AB96IZHE9,B00X9H5S62,2.0,1439942400,2015-08-19 00:00:00+00:00
1841744,A1HT6VX64S9NE8,B015HUNWQG,5.0,1517097600,2018-01-28 00:00:00+00:00


In [5]:
# Sắp xếp DataFrame theo cột 'time' giảm dần
df_combined = df_combined.sort_values(by=['reviewerID', 'asin', 'reviewTime'], ascending=[True, True, False])

# Giữ lại dòng đầu tiên (gần đây nhất) cho mỗi cặp reviewerID và asin
df_combined = df_combined.drop_duplicates(subset=['reviewerID', 'asin'], keep='first')

In [7]:
df_preprocessed = df_combined.drop(columns=['unixReviewTime', 'reviewTime'])
# Tạo cột mã hóa cho `reviewerID` và `asin`
df_preprocessed['userID'] = 'u' + (df_combined['reviewerID'].astype('category').cat.codes + 1).astype(str)
df_preprocessed['itemID'] = 'i' + (df_combined['asin'].astype('category').cat.codes + 1).astype(str)

df_preprocessed

Unnamed: 0,reviewerID,asin,overall,userID,itemID
1593537,A0001528BGUBOEVR6T5U,B00MVVITWC,5.0,u1,i91564
1382775,A00032921HLX2KJJVXRS,B0045LLC7K,5.0,u2,i17567
568898,A0007604Q2582KFW7N4B,B00L8J2RF8,5.0,u3,i81636
1785338,A00086729ZDSXGG2E481,B00E1IUTOY,1.0,u4,i48836
1708824,A0009408W4B7B4DKF0XN,B01DO0ZR50,1.0,u5,i219479
...,...,...,...,...,...
250877,AZZZ5UJWUVCYZ,B01FNJ9MOW,5.0,u1516952,i234681
1463413,AZZZGPFIX5NEY,B0097C43BO,1.0,u1516953,i33551
508737,AZZZMCJO078D2,B00EONSCKO,2.0,u1516954,i52115
1006582,AZZZU2YUCMUUW,B00WQEPQ20,5.0,u1516955,i132125


In [8]:
df_preprocessed.to_csv('df_preprocessed.csv', index=False)

In [6]:
df_preprocessed = pd.read_csv('df_preprocessed.csv')

In [8]:
print(df_preprocessed.isnull().sum())

# Kiểm tra trùng lặp toàn bộ dòng
duplicates = df_preprocessed.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Kiểm tra trùng lặp cho từng cột
duplicate_columns = {}
for col in df_preprocessed.columns:
    duplicate_columns[col] = df_preprocessed[col].duplicated().sum()

print("\nDuplicate values per column:")
print(pd.Series(duplicate_columns).sort_values(ascending=False))

print(df_preprocessed.isnull().sum())

reviewerID    0
asin          0
overall       0
userID        0
itemID        0
dtype: int64

Number of duplicate rows: 0

Duplicate values per column:
overall       1827565
asin          1578543
itemID        1578543
reviewerID     310614
userID         310614
dtype: int64
reviewerID    0
asin          0
overall       0
userID        0
itemID        0
dtype: int64


In [13]:
print(df_preprocessed['userID'].nunique())
print(df_preprocessed['itemID'].nunique()) 

1516956
249027


Excluded the users who rated less than 20 items and ignored the items that appeared less than 40 times in the data set. It was decided to remove the low-rated users from the data set since they do not have a stored ranking that could be used in the evaluation.

In [17]:
# Lọc người dùng đã đánh giá ít nhất 20 sản phẩm
user_rating_count = df_preprocessed.groupby('userID')['itemID'].count()
users_to_keep = user_rating_count[user_rating_count >= 20].index

# Lọc sản phẩm được đánh giá ít nhất 40 lần
item_rating_count = df_preprocessed.groupby('itemID')['userID'].count()
items_to_keep = item_rating_count[item_rating_count >= 40].index

# Lọc lại dataframe theo người dùng và sản phẩm thỏa mãn điều kiện
filtered_df = df_preprocessed[df_preprocessed['userID'].isin(users_to_keep) & df_preprocessed['itemID'].isin(items_to_keep)]

# Kiểm tra lại số lượng người dùng và sản phẩm sau khi lọc
print(f"Number of users after filtering: {filtered_df['userID'].nunique()}")
print(f"Number of items after filtering: {filtered_df['itemID'].nunique()}")

# Hiển thị 5 dòng dữ liệu sau khi lọc
print(filtered_df.head())


Number of users after filtering: 59
Number of items after filtering: 575
           reviewerID        asin  overall  userID  itemID
42078  A13391AZAFJ67K  B001DHLGOS      5.0  u35049   i9283
42079  A13391AZAFJ67K  B00470A1S0      5.0  u35049  i17667
42080  A13391AZAFJ67K  B004ULZC8U      5.0  u35049  i19868
42081  A13391AZAFJ67K  B004XLDDNI      5.0  u35049  i20606
42082  A13391AZAFJ67K  B004XLDE5A      5.0  u35049  i20607


In [18]:
# Chuyển đổi thành ma trận 2D
rating_matrix = filtered_df.pivot_table(index='userID', columns='itemID', values='overall', fill_value=0)

rating_matrix

itemID,i10006,i100213,i100843,i101109,i10118,i101350,i101667,i102276,i102635,i10309,...,i97406,i97770,i97929,i97991,i98360,i98819,i99121,i99429,i9955,i9978
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u1011559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u1014919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
u1015689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u1016707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u1045492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u1097009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u1104521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u1110482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u1134159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
u1159375,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **3 Similarity calculation**

In [7]:
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import jaccard_score
from scipy.stats import pearsonr, spearmanr, kendalltau

In [8]:
def calculate_similarity(matrix, metric='pearson'):
    users = matrix.index
    n_users = len(users)
    similarity_matrix = pd.DataFrame(np.zeros((n_users, n_users)), index=users, columns=users)
    
    for i, user1 in enumerate(users):
        for j, user2 in enumerate(users):
            if i >= j:  # Avoid duplicate calculations
                continue
                
            ratings1 = matrix.loc[user1].values
            ratings2 = matrix.loc[user2].values
            
            # Compute similarity based on specified metric
            if metric in ['pearson', 'kendall', 'spearman']:
                mask = ~np.isnan(ratings1) & ~np.isnan(ratings2)
                if np.sum(mask) > 1:  # At least 2 ratings in common
                    if metric == 'pearson':
                        similarity = pearsonr(ratings1[mask], ratings2[mask])[0]
                    elif metric == 'kendall':
                        similarity = kendalltau(ratings1[mask], ratings2[mask])[0]
                    elif metric == 'spearman':
                        similarity = spearmanr(ratings1[mask], ratings2[mask])[0]
                else:
                    similarity = 0  # No common items
                    
            elif metric == 'cosine':
                similarity = 1 - cosine(ratings1, ratings2)  # Cosine similarity
                
            elif metric == 'jaccard':
                bin1 = ratings1 > 0
                bin2 = ratings2 > 0
                similarity = jaccard_score(bin1, bin2)
            
            # Handle NaN values that might occur in correlation calculations
            if np.isnan(similarity):
                similarity = 0
                
            similarity_matrix.at[user1, user2] = similarity
            similarity_matrix.at[user2, user1] = similarity

    return similarity_matrix


In [27]:
# Calculate similarities
pearson_sim = calculate_similarity(rating_matrix, metric='pearson')
cosine_sim = calculate_similarity(rating_matrix, metric='cosine')
jaccard_sim = calculate_similarity(rating_matrix, metric='jaccard')
spearman_sim = calculate_similarity(rating_matrix, metric='spearman')
kendall_sim = calculate_similarity(rating_matrix, metric='kendall')

In [20]:
print("\nSpearman Similarity Matrix:")
spearman_sim


Spearmanr Similarity Matrix:


userID,u1011559,u1014919,u1015689,u1016707,u1045492,u1097009,u1104521,u1110482,u1134159,u1159375,...,u670495,u682761,u716921,u823368,u830550,u883851,u945886,u981325,u987311,u992609
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u1011559,0.0,0.086154,-0.020388,0.056969,-0.013673,-0.006093,-0.023061,-0.021313,-0.018408,-0.013673,...,-0.018409,-0.013673,-0.024697,-0.02548,-0.014991,-0.026987,-0.013673,-0.025478,-0.043556,-0.031123
u1014919,0.086154,0.0,-0.016588,-0.018763,-0.011125,-0.004958,-0.018764,-0.017341,-0.014978,-0.011125,...,-0.014978,-0.011125,-0.020095,-0.020732,-0.012197,-0.021957,-0.011125,-0.02073,-0.035438,-0.025323
u1015689,-0.020388,-0.016588,0.0,-0.022061,-0.01308,-0.005829,-0.022061,0.06858,-0.01761,-0.01308,...,0.084672,-0.01308,0.051682,-0.024375,-0.014341,0.116236,-0.01308,-0.024373,0.051015,0.032588
u1016707,0.056969,-0.018763,-0.022061,0.0,0.106517,-0.006593,-0.024954,-0.023062,-0.019919,-0.014795,...,-0.019919,-0.014795,-0.026724,-0.027571,-0.016221,-0.029201,-0.014795,-0.027568,-0.047129,-0.033677
u1045492,-0.013673,-0.011125,-0.01308,0.106517,0.0,-0.003909,-0.014795,-0.013673,-0.01181,-0.008772,...,-0.01181,-0.008772,-0.015844,-0.016347,-0.009617,-0.017313,-0.008772,-0.016345,-0.027943,-0.019967
u1097009,-0.006093,-0.004958,-0.005829,-0.006593,-0.003909,0.0,-0.006594,-0.006094,-0.005263,-0.003909,...,-0.005263,-0.003909,-0.007061,-0.007285,-0.004286,-0.007716,-0.003909,-0.007284,-0.012453,-0.008898
u1104521,-0.023061,-0.018764,-0.022061,-0.024954,-0.014795,-0.006594,0.0,-0.023062,-0.019919,-0.014795,...,0.071142,-0.014795,-0.026724,-0.027572,-0.016221,0.034032,-0.014795,-0.027569,-0.005873,-0.033678
u1110482,-0.021313,-0.017341,0.06858,-0.023062,-0.013673,-0.006094,-0.023062,0.0,-0.018409,-0.013673,...,0.175913,-0.013673,-0.024698,0.116537,-0.014991,0.176131,-0.013673,-0.025479,0.000927,-0.031124
u1134159,-0.018408,-0.014978,-0.01761,-0.019919,-0.01181,-0.005263,-0.019919,-0.018409,0.0,-0.01181,...,-0.0159,-0.01181,-0.021332,-0.022009,-0.012948,-0.02331,-0.01181,-0.022006,-0.037621,-0.026882
u1159375,-0.013673,-0.011125,-0.01308,-0.014795,-0.008772,-0.003909,-0.014795,-0.013673,-0.01181,0.0,...,-0.01181,-0.008772,-0.015845,-0.016347,-0.009617,-0.017313,-0.008772,0.093455,-0.027943,-0.019967


In [None]:
print("\nKendalltau Similarity Matrix:")
kendall_sim


Kendalltau Similarity Matrix:


userID,u1011559,u1014919,u1015689,u1016707,u1045492,u1097009,u1104521,u1110482,u1134159,u1159375,...,u670495,u682761,u716921,u823368,u830550,u883851,u945886,u981325,u987311,u992609
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u1011559,0.0,0.08573,-0.020326,0.056676,-0.013615,-0.006075,-0.022973,-0.021231,-0.018306,-0.013622,...,-0.018353,-0.013615,-0.024585,-0.025382,-0.014919,-0.026905,-0.013622,-0.025259,-0.043423,-0.031003
u1014919,0.08573,0.0,-0.016556,-0.018688,-0.011089,-0.004948,-0.018712,-0.017293,-0.014911,-0.011095,...,-0.014949,-0.011089,-0.020025,-0.020674,-0.012152,-0.021915,-0.011095,-0.020574,-0.035369,-0.025252
u1015689,-0.020326,-0.016556,0.0,-0.022016,-0.013064,-0.005829,-0.022043,0.068525,-0.017566,-0.013071,...,0.084672,-0.013064,0.051604,-0.024356,-0.014316,0.116236,-0.013071,-0.024237,0.051015,0.032561
u1016707,0.056676,-0.018688,-0.022016,0.0,0.106174,-0.00658,-0.024883,-0.022996,-0.019828,-0.014754,...,-0.019879,-0.014746,-0.026629,-0.027492,-0.01616,-0.029141,-0.014754,-0.027359,-0.047033,-0.03358
u1045492,-0.013615,-0.011089,-0.013064,0.106174,0.0,-0.003904,-0.014765,-0.013646,-0.011766,-0.008755,...,-0.011796,-0.00875,-0.015801,-0.016314,-0.009589,-0.017292,-0.008755,-0.016235,-0.027909,-0.019926
u1097009,-0.006075,-0.004948,-0.005829,-0.00658,-0.003904,0.0,-0.006588,-0.006089,-0.00525,-0.003906,...,-0.005263,-0.003904,-0.00705,-0.007279,-0.004279,-0.007716,-0.003906,-0.007244,-0.012453,-0.008891
u1104521,-0.022973,-0.018712,-0.022043,-0.024883,-0.014765,-0.006588,0.0,-0.023025,-0.019853,-0.014773,...,0.071085,-0.014765,-0.026662,-0.027527,-0.01618,0.034005,-0.014773,-0.027394,-0.005869,-0.033623
u1110482,-0.021231,-0.017293,0.068525,-0.022996,-0.013646,-0.006089,-0.023025,0.0,-0.018348,-0.013653,...,0.175773,-0.013646,-0.024641,0.116353,-0.014953,0.175991,-0.013653,-0.025317,0.000926,-0.031074
u1134159,-0.018306,-0.014911,-0.017566,-0.019828,-0.011766,-0.00525,-0.019853,-0.018348,0.0,-0.011772,...,-0.015861,-0.011766,-0.021246,-0.021936,-0.012893,-0.023251,-0.011772,-0.021829,-0.037527,-0.026793
u1159375,-0.013622,-0.011095,-0.013071,-0.014754,-0.008755,-0.003906,-0.014773,-0.013653,-0.011772,0.0,...,-0.011802,-0.008755,-0.01581,-0.016322,-0.009594,-0.017301,-0.00876,0.092872,-0.027924,-0.019937


# **4 OCA Clustering**

In [34]:
def OCA(SM):
    """Thực hiện thuật toán Ordered Clustering."""
    clusters = {}

    # Duyệt qua các cặp user trong ma trận tương đồng
    for i, user_i in enumerate(SM.index):
        for j, user_j in enumerate(SM.columns):
            if i >= j:
                continue

            similarity = SM.iloc[i, j]
            
            if similarity > 0:
                # Tạo cluster mới nếu chưa có, key là giá trị tương đồng cụ thể
                if similarity not in clusters:
                     clusters[similarity] = []
                
                # Thêm user vào cluster, đảm bảo thứ tự dựa trên độ tương đồng
                if not clusters[similarity]:
                    clusters[similarity].extend([user_i,user_j])
                else:
                    user_i_index = -1
                    user_j_index = -1
                    
                    if user_i in clusters[similarity]:
                       user_i_index = clusters[similarity].index(user_i)
                    if user_j in clusters[similarity]:
                      user_j_index = clusters[similarity].index(user_j)
                                  
                    if user_i_index == -1 and user_j_index == -1:
                        inserted = False
                        for k in range(len(clusters[similarity])):
                            user_k = clusters[similarity][k]
                            sim_ik = SM.iloc[i,SM.index.get_loc(user_k)]
                            
                            if sim_ik < similarity:
                                clusters[similarity].insert(k,user_i)
                                inserted = True
                                break
                                
                        if not inserted:
                            clusters[similarity].append(user_i)
                          
                        inserted = False
                        for k in range(len(clusters[similarity])):
                            user_k = clusters[similarity][k]
                            sim_jk = SM.iloc[j,SM.index.get_loc(user_k)]
                            
                            if sim_jk < similarity:
                                clusters[similarity].insert(k,user_j)
                                inserted = True
                                break
                                
                        if not inserted:
                            clusters[similarity].append(user_j)
                    elif user_i_index > -1 and user_j_index == -1:
                        inserted = False
                        for k in range(len(clusters[similarity])):
                            user_k = clusters[similarity][k]
                            sim_jk = SM.iloc[j,SM.index.get_loc(user_k)]
                            
                            if sim_jk < similarity:
                                clusters[similarity].insert(k,user_j)
                                inserted = True
                                break
                                
                        if not inserted:
                            clusters[similarity].append(user_j)
                    elif user_i_index == -1 and user_j_index > -1:
                       inserted = False
                       for k in range(len(clusters[similarity])):
                           user_k = clusters[similarity][k]
                           sim_ik = SM.iloc[i,SM.index.get_loc(user_k)]
                            
                           if sim_ik < similarity:
                               clusters[similarity].insert(k,user_i)
                               inserted = True
                               break
                       if not inserted:
                           clusters[similarity].append(user_i)

    # Sắp xếp các cluster theo độ tương đồng giảm dần
    sorted_clusters = sorted(clusters.items(), key=lambda item: item, reverse=True)
    sorted_clusters_dict = {k: v for k, v in sorted_clusters}
    return sorted_clusters_dict

In [45]:
clusters_pearson = OCA(pearson_sim)

num_clusters = len(clusters_pearson)
print(f"Số cụm đã tạo ra: {num_clusters}")

print("\nCác cụm người dùng (theo thứ tự giảm dần độ tương đồng):")
for k, cluster in clusters_pearson.items():
    print(f"Độ tương đồng: {k}, Cụm: {cluster}")

Số cụm đã tạo ra: 220

Các cụm người dùng (theo thứ tự giảm dần độ tương đồng):
Độ tương đồng: 0.4403080542740387, Cụm: ['u1187806', 'u35049']
Độ tương đồng: 0.35786866437618337, Cụm: ['u35049', 'u883851']
Độ tương đồng: 0.24334919389133394, Cụm: ['u132202', 'u35049']
Độ tương đồng: 0.22613143482548237, Cụm: ['u1199509', 'u945886']
Độ tương đồng: 0.19749263889433236, Cụm: ['u1187806', 'u883851']
Độ tương đồng: 0.1934379437069131, Cụm: ['u883851', 'u987311']
Độ tương đồng: 0.19025437003762324, Cụm: ['u154233', 'u35049']
Độ tương đồng: 0.18477605837877126, Cụm: ['u132202', 'u440628']
Độ tương đồng: 0.18419821339053546, Cụm: ['u1292601', 'u154233']
Độ tương đồng: 0.16654864128438626, Cụm: ['u132202', 'u883851']
Độ tương đồng: 0.1468737573974622, Cụm: ['u328220', 'u987311']
Độ tương đồng: 0.14240124753190408, Cụm: ['u1045492', 'u546007']
Độ tương đồng: 0.14146900045669542, Cụm: ['u1325383', 'u883851']
Độ tương đồng: 0.1414690004566953, Cụm: ['u118059', 'u1325383']
Độ tương đồng: 0.14005294

In [52]:
clusters_cosine = OCA(cosine_sim)

num_clusters = len(clusters_cosine)
print(f"Số cụm đã tạo ra: {num_clusters}")

print("\nCác cụm người dùng (theo thứ tự giảm dần độ tương đồng):")
for k, cluster in clusters_cosine.items():
    print(f"Độ tương đồng: {k}, Cụm: {cluster}")

Số cụm đã tạo ra: 212

Các cụm người dùng (theo thứ tự giảm dần độ tương đồng):
Độ tương đồng: 0.4573295603800236, Cụm: ['u1187806', 'u35049']
Độ tương đồng: 0.37851664930511264, Cụm: ['u35049', 'u883851']
Độ tương đồng: 0.2625251128917314, Cụm: ['u132202', 'u35049']
Độ tương đồng: 0.2342460684935892, Cụm: ['u883851', 'u987311']
Độ tương đồng: 0.23211917272131477, Cụm: ['u1199509', 'u945886']
Độ tương đồng: 0.22256595362986287, Cụm: ['u1187806', 'u883851']
Độ tương đồng: 0.21677749238102995, Cụm: ['u1292601', 'u154233']
Độ tương đồng: 0.21629522817434998, Cụm: ['u154233', 'u35049']
Độ tương đồng: 0.2104883631139307, Cụm: ['u132202', 'u440628']
Độ tương đồng: 0.188280238907794, Cụm: ['u132202', 'u883851']
Độ tương đồng: 0.17887112908199843, Cụm: ['u328220', 'u987311']
Độ tương đồng: 0.17688728441930623, Cụm: ['u1187806', 'u987311']
Độ tương đồng: 0.17342199390482405, Cụm: ['u154233', 'u440628']
Độ tương đồng: 0.16692446522239712, Cụm: ['u1187806', 'u154233']
Độ tương đồng: 0.16269784336

In [None]:
clusters_jaccard = OCA(jaccard_sim)

num_clusters = len(clusters_jaccard)
print(f"Số cụm đã tạo ra: {num_clusters}")

print("\nCác cụm người dùng (theo thứ tự giảm dần độ tương đồng):")
for k, cluster in clusters_jaccard.items():
    print(f"Độ tương đồng: {k}, Cụm: {cluster}")

Số cụm đã tạo ra: 83

Các cụm người dùng (theo thứ tự giảm dần độ tương đồng):
Độ tương đồng: 0.2962962962962963, Cụm: ['u1187806', 'u35049']
Độ tương đồng: 0.23333333333333334, Cụm: ['u35049', 'u883851']
Độ tương đồng: 0.15384615384615385, Cụm: ['u132202', 'u35049']
Độ tương đồng: 0.125, Cụm: ['u1199509', 'u945886', 'u1187806', 'u883851']
Độ tương đồng: 0.12121212121212122, Cụm: ['u154233', 'u35049']
Độ tương đồng: 0.11904761904761904, Cụm: ['u1292601', 'u154233']
Độ tương đồng: 0.11864406779661017, Cụm: ['u883851', 'u987311']
Độ tương đồng: 0.1111111111111111, Cụm: ['u1110482', 'u35049']
Độ tương đồng: 0.10810810810810811, Cụm: ['u132202', 'u440628']
Độ tương đồng: 0.10714285714285714, Cụm: ['u132202', 'u1110482', 'u883851']
Độ tương đồng: 0.10526315789473684, Cụm: ['u1110482', 'u670495']
Độ tương đồng: 0.1, Cụm: ['u1325383', 'u532564', 'u1159375', 'u1161925']
Độ tương đồng: 0.09090909090909091, Cụm: ['u440628', 'u1345718', 'u427120', 'u1187806', 'u154233', 'u1110482', 'u132202']
Độ 

In [50]:
clusters_spearman = OCA(spearman_sim)

num_clusters = len(clusters_spearman)
print(f"Số cụm đã tạo ra: {num_clusters}")

print("\nCác cụm người dùng (theo thứ tự giảm dần độ tương đồng):")
for k, cluster in clusters_spearman.items():
    print(f"Độ tương đồng: {k}, Cụm: {cluster}")

Số cụm đã tạo ra: 205

Các cụm người dùng (theo thứ tự giảm dần độ tương đồng):
Độ tương đồng: 0.4403080542740385, Cụm: ['u1187806', 'u35049']
Độ tương đồng: 0.35786866437618337, Cụm: ['u35049', 'u883851']
Độ tương đồng: 0.252238456476337, Cụm: ['u132202', 'u35049']
Độ tương đồng: 0.2179222938548723, Cụm: ['u1199509', 'u945886']
Độ tương đồng: 0.19749263889433258, Cụm: ['u1187806', 'u883851']
Độ tương đồng: 0.19343794370691325, Cụm: ['u883851', 'u987311']
Độ tương đồng: 0.1902543700376234, Cụm: ['u154233', 'u35049']
Độ tương đồng: 0.18870730352295842, Cụm: ['u132202', 'u440628']
Độ tương đồng: 0.1841982133905353, Cụm: ['u1292601', 'u154233']
Độ tương đồng: 0.18225321999735333, Cụm: ['u1110482', 'u35049']
Độ tương đồng: 0.17613129817417597, Cụm: ['u132202', 'u1110482', 'u883851']
Độ tương đồng: 0.17591303768214872, Cụm: ['u1110482', 'u670495']
Độ tương đồng: 0.17375190434647322, Cụm: ['u1159375', 'u1161925']
Độ tương đồng: 0.1658967881540931, Cụm: ['u1325383', 'u532564']
Độ tương đồng: 

In [51]:
clusters_kendall = OCA(kendall_sim)

num_clusters = len(clusters_kendall)
print(f"Số cụm đã tạo ra: {num_clusters}")

print("\nCác cụm người dùng (theo thứ tự giảm dần độ tương đồng):")
for k, cluster in clusters_kendall.items():
    print(f"Độ tương đồng: {k}, Cụm: {cluster}")

Số cụm đã tạo ra: 206

Các cụm người dùng (theo thứ tự giảm dần độ tương đồng):
Độ tương đồng: 0.4403080542740385, Cụm: ['u1187806', 'u35049']
Độ tương đồng: 0.3578686643761834, Cụm: ['u35049', 'u883851']
Độ tương đồng: 0.2520376437309494, Cụm: ['u132202', 'u35049']
Độ tương đồng: 0.21777085558985376, Cụm: ['u1199509', 'u945886']
Độ tương đồng: 0.19749263889433258, Cụm: ['u1187806', 'u883851']
Độ tương đồng: 0.19343794370691328, Cụm: ['u883851', 'u987311']
Độ tương đồng: 0.19025437003762344, Cụm: ['u154233', 'u35049']
Độ tương đồng: 0.1882493314717624, Cụm: ['u132202', 'u440628']
Độ tương đồng: 0.1841982133905353, Cụm: ['u1292601', 'u154233']
Độ tương đồng: 0.18210812408305593, Cụm: ['u1110482', 'u35049']
Độ tương đồng: 0.17599107606043032, Cụm: ['u132202', 'u1110482', 'u883851']
Độ tương đồng: 0.17577298933052182, Cụm: ['u1110482', 'u670495']
Độ tương đồng: 0.17330665896285696, Cụm: ['u1159375', 'u1161925']
Độ tương đồng: 0.16550823137835974, Cụm: ['u1325383', 'u532564']
Độ tương đồng