In [1]:
import pandas as pd
import json

# **1 Load data**

In [2]:
def parse(path):
    with open(path, 'r') as f:
        for line in f:
            yield json.loads(line)

def getDF(path):
    df = [d for d in parse(path)]
    return pd.DataFrame(df)

In [5]:
file_path = './data/All_Beauty.json'
df_all_beauty = getDF(file_path)

In [6]:
df_all_beauty

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"02 19, 2015",A1V6B6TNIC10QE,0143026860,theodore j bigham,great,One Star,1424304000,,,
1,4.0,True,"12 18, 2014",A2F5GHSXFQ0W6J,0143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...,1418860800,,,
2,4.0,True,"08 10, 2014",A1572GUYS7DGSR,0143026860,David G,"This book was very informative, covering all a...",Worth the Read,1407628800,,,
3,5.0,True,"03 11, 2013",A1PSGLFK1NSVO,0143026860,TamB,I am already a baseball fan and knew a bit abo...,Good Read,1362960000,,,
4,5.0,True,"12 25, 2011",A6IKXKZMTKGSC,0143026860,shoecanary,This was a good story of the Black leagues. I ...,"More than facts, a good story read!",1324771200,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...
371340,1.0,True,"07 20, 2017",A202DCI7TV1022,B01HJEGTYK,Sam,It was awful. It was super frizzy and I tried ...,It was super frizzy and I tried to comb it and...,1500508800,,,
371341,5.0,True,"03 16, 2017",A3FSOR5IJOFIBE,B01HJEGTYK,TYW,I was skeptical about buying this. Worried it...,Awesome,1489622400,34,,
371342,5.0,True,"03 1, 2017",A1B5DK6CTP2P24,B01HJEGTYK,Norma Jennings,Makes me look good fast.,Five Stars,1488326400,46,,
371343,2.0,True,"02 21, 2017",A23OUYS5IRMJS9,B01HJEGTYK,Lee,Way lighter than photo\nNot mix blend of color...,Ok but color way off and volume as well,1487635200,,,


In [7]:
file_path1 = './data/AMAZON_FASHION.json'
file_path2 = './data/Appliances.json'

df_amazon_fashion = getDF(file_path1)
df_appliances = getDF(file_path2)

In [16]:
df1_selected = df_all_beauty[['reviewerID', 'asin', 'overall', 'unixReviewTime']]
df2_selected = df_amazon_fashion[['reviewerID', 'asin', 'overall', 'unixReviewTime']]
df3_selected = df_appliances[['reviewerID', 'asin', 'overall', 'unixReviewTime']]

df_combined = pd.concat([df1_selected, df2_selected, df3_selected], axis=0)

df_combined

Unnamed: 0,reviewerID,asin,overall,unixReviewTime
0,A1V6B6TNIC10QE,0143026860,1.0,1424304000
1,A2F5GHSXFQ0W6J,0143026860,4.0,1418860800
2,A1572GUYS7DGSR,0143026860,4.0,1407628800
3,A1PSGLFK1NSVO,0143026860,5.0,1362960000
4,A6IKXKZMTKGSC,0143026860,5.0,1324771200
...,...,...,...,...
602772,A24A9P4F2SLTK5,B01HJH2PY0,5.0,1502323200
602773,A2JCB4KHBWEELW,B01HJHHEA0,2.0,1533081600
602774,A1LDYYVTLPP2Z5,B01HJHHEA0,5.0,1523577600
602775,AP1M5O06IOYZ7,B01HJH92JQ,1.0,1521763200


In [17]:
df_combined.to_csv('df_combined.csv', index=False)

In [2]:
df_combined = pd.read_csv('df_combined.csv')

# **2 Transform data**

cleaning the dataset from duplicates, missing overall will be set to 0, changing the unix time format to UTC datetime format reformatting the ID to simple unique integer values and finally, transforming of the shape of the dataset to 2D matrix (RM)

In [14]:
# Kiểm tra các giá trị bị thiếu
missing_summary = df_combined.isnull().sum()
print("Missing values per column:")
print(missing_summary[missing_summary > 0])

# Kiểm tra trùng lặp toàn bộ dòng
duplicates = df_combined.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Kiểm tra trùng lặp cho từng cột
duplicate_columns = {}
for col in df_combined.columns:
    duplicate_columns[col] = df_combined[col].duplicated().sum()

print("\nDuplicate values per column:")
print(pd.Series(duplicate_columns).sort_values(ascending=False))


Missing values per column:
Series([], dtype: int64)

Number of duplicate rows: 0

Duplicate values per column:
overall           1827565
unixReviewTime    1822156
reviewTime        1822156
asin              1578543
reviewerID         310614
dtype: int64


In [15]:
df_combined = df_combined.drop_duplicates()
duplicates = df_combined.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")


Number of duplicate rows: 0


In [16]:
# Duplicate reviews by same user for the same product
# Chuyển đổi cột unixReviewTime từ Unix timestamp sang datetime theo UTC
df_combined['reviewTime'] = pd.to_datetime(df_combined['unixReviewTime'], unit='s', utc=True)
duplicate_reviews = df_combined[df_combined.duplicated(subset=['reviewerID', 'asin'], keep=False)]
duplicate_reviews

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,reviewTime


In [17]:
# Sắp xếp DataFrame theo cột 'time' giảm dần
df_combined = df_combined.sort_values(by=['reviewerID', 'asin', 'reviewTime'], ascending=[True, True, False])

# Giữ lại dòng đầu tiên (gần đây nhất) cho mỗi cặp reviewerID và asin
df_combined = df_combined.drop_duplicates(subset=['reviewerID', 'asin'], keep='first')

In [18]:
df_preprocessed = df_combined.drop(columns=['unixReviewTime', 'reviewTime'])
# Tạo cột mã hóa cho `reviewerID` và `asin`
df_preprocessed['userID'] = 'u' + (df_combined['reviewerID'].astype('category').cat.codes + 1).astype(str)
df_preprocessed['itemID'] = 'i' + (df_combined['asin'].astype('category').cat.codes + 1).astype(str)

df_preprocessed

KeyboardInterrupt: 

In [8]:
df_preprocessed.to_csv('df_preprocessed.csv', index=False)

In [19]:
df_preprocessed = pd.read_csv('df_preprocessed.csv')

In [None]:
print(df_preprocessed.isnull().sum())

# Kiểm tra trùng lặp toàn bộ dòng
duplicates = df_preprocessed.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Kiểm tra trùng lặp cho từng cột
duplicate_columns = {}
for col in df_preprocessed.columns:
    duplicate_columns[col] = df_preprocessed[col].duplicated().sum()

print("\nDuplicate values per column:")
print(pd.Series(duplicate_columns).sort_values(ascending=False))

print(df_preprocessed.isnull().sum())

reviewerID    0
asin          0
overall       0
userID        0
itemID        0
dtype: int64

Number of duplicate rows: 0

Duplicate values per column:
overall       1827565
asin          1578543
itemID        1578543
reviewerID     310614
userID         310614
dtype: int64
reviewerID    0
asin          0
overall       0
userID        0
itemID        0
dtype: int64


In [13]:
print(df_preprocessed['userID'].nunique())
print(df_preprocessed['itemID'].nunique()) 

1516956
249027


Excluded the users who rated less than 20 items and ignored the items that appeared less than 40 times in the data set. It was decided to remove the low-rated users from the data set since they do not have a stored ranking that could be used in the evaluation.

In [20]:
# Lọc người dùng đã đánh giá ít nhất 20 sản phẩm
user_rating_count = df_preprocessed.groupby('userID')['itemID'].count()
users_to_keep = user_rating_count[user_rating_count >= 20].index

# Lọc sản phẩm được đánh giá ít nhất 40 lần
item_rating_count = df_preprocessed.groupby('itemID')['userID'].count()
items_to_keep = item_rating_count[item_rating_count >= 40].index

# Lọc lại dataframe theo người dùng và sản phẩm thỏa mãn điều kiện
filtered_df = df_preprocessed[df_preprocessed['userID'].isin(users_to_keep) & df_preprocessed['itemID'].isin(items_to_keep)]

# Kiểm tra lại số lượng người dùng và sản phẩm sau khi lọc
print(f"Number of users after filtering: {filtered_df['userID'].nunique()}")
print(f"Number of items after filtering: {filtered_df['itemID'].nunique()}")

# Hiển thị 5 dòng dữ liệu sau khi lọc
print(filtered_df.head())


Number of users after filtering: 59
Number of items after filtering: 575
           reviewerID        asin  overall  userID  itemID
42078  A13391AZAFJ67K  B001DHLGOS      5.0  u35049   i9283
42079  A13391AZAFJ67K  B00470A1S0      5.0  u35049  i17667
42080  A13391AZAFJ67K  B004ULZC8U      5.0  u35049  i19868
42081  A13391AZAFJ67K  B004XLDDNI      5.0  u35049  i20606
42082  A13391AZAFJ67K  B004XLDE5A      5.0  u35049  i20607


In [24]:
# Chuyển đổi thành ma trận 2D
rating_matrix = filtered_df.pivot_table(index='userID', columns='itemID', values='overall', fill_value=0)

print(rating_matrix)

itemID    i10006  i100213  i100843  i101109  i10118  i101350  i101667  \
userID                                                                  
u1011559     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1014919     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1015689     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1016707     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1045492     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1097009     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1104521     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1110482     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1134159     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u1159375     0.0      5.0      0.0      0.0     0.0      0.0      0.0   
u1161925     0.0      0.0      0.0      0.0     0.0      0.0      0.0   
u118059      0.0      0.0      0.0      0.0     0.0

# **3 Similarity calculation**

In [25]:
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import jaccard_score
from scipy.stats import pearsonr

In [26]:
# Function to calculate pairwise similarities
def calculate_similarity(matrix, metric='pearson'):
    users = matrix.index
    n_users = len(users)
    similarity_matrix = pd.DataFrame(np.zeros((n_users, n_users)), index=users, columns=users)

    for i, user1 in enumerate(users):
        for j, user2 in enumerate(users):
            if i >= j:  # Avoid duplicate calculations
                continue

            ratings1 = matrix.loc[user1].values
            ratings2 = matrix.loc[user2].values

            if metric == 'pearson':
                mask = ~np.isnan(ratings1) & ~np.isnan(ratings2)
                if np.sum(mask) > 1:  # At least 2 ratings in common
                    similarity = pearsonr(ratings1[mask], ratings2[mask])[0]
                else:
                    similarity = 0  # No common items
            elif metric == 'cosine':
                similarity = 1 - cosine(ratings1, ratings2)  # Cosine similarity
            elif metric == 'jaccard':
                bin1 = ratings1 > 0
                bin2 = ratings2 > 0
                similarity = jaccard_score(bin1, bin2)

            similarity_matrix.at[user1, user2] = similarity
            similarity_matrix.at[user2, user1] = similarity

    return similarity_matrix

In [27]:
# Calculate similarities
pearson_sim = calculate_similarity(rating_matrix, metric='pearson')
cosine_sim = calculate_similarity(rating_matrix, metric='cosine')
jaccard_sim = calculate_similarity(rating_matrix, metric='jaccard')

# Display results
print("Pearson Similarity Matrix:")
print(pearson_sim)
print("\nCosine Similarity Matrix:")
print(cosine_sim)
print("\nJaccard Similarity Matrix:")
print(jaccard_sim)

Pearson Similarity Matrix:
userID    u1011559  u1014919  u1015689  u1016707  u1045492  u1097009  \
userID                                                                 
u1011559  0.000000  0.090270 -0.019913  0.070150 -0.013137 -0.005952   
u1014919  0.090270  0.000000 -0.016062 -0.018101 -0.010596 -0.004800   
u1015689 -0.019913 -0.016062  0.000000 -0.021979 -0.012867 -0.005829   
u1016707  0.070150 -0.018101 -0.021979  0.000000  0.070347 -0.006569   
u1045492 -0.013137 -0.010596 -0.012867  0.070347  0.000000 -0.003846   
u1097009 -0.005952 -0.004800 -0.005829 -0.006569 -0.003846  0.000000   
u1104521 -0.022494 -0.018143 -0.022031 -0.024827 -0.014534 -0.006584   
u1110482 -0.020245 -0.016329  0.072715 -0.022345 -0.013082 -0.005926   
u1134159 -0.017580 -0.014180 -0.017219 -0.019404 -0.011360 -0.005146   
u1159375 -0.012880 -0.010389 -0.012615 -0.014216 -0.008323 -0.003770   
u1161925 -0.012928 -0.010428 -0.012662 -0.014269 -0.008354 -0.003784   
u118059  -0.026359 -0.021261  0.11623

# **4 OCA Clustering**

In [28]:
def OCA(SM):
    # Initialize clusters
    clusters = {}
    
    # Iterate through the similarity matrix entries
    for i, user_i in enumerate(SM.index):
        for j, user_j in enumerate(SM.columns):
            if i >= j:  # Avoid processing duplicate pairs
                continue
            
            similarity = SM.iloc[i, j]
            if similarity > 0:
                # Find the cluster with the highest similarity value
                cluster_key = max(clusters.keys(), default=0)
                if similarity > cluster_key:
                    cluster_key = similarity

                # Check if the cluster already exists
                if cluster_key not in clusters:
                    clusters[cluster_key] = set()
                
                # Insert users i and j into the cluster
                clusters[cluster_key].add(user_i)
                clusters[cluster_key].add(user_j)

    # Sort clusters by similarity value (descending order)
    sorted_clusters = sorted(clusters.items(), key=lambda x: -x[0])
    sorted_clusters_dict = {k: list(v) for k, v in sorted_clusters}

    return sorted_clusters_dict

In [29]:
person_clusters = OCA(pearson_sim)

# Display clusters
print("Clusters:")
for similarity, cluster in person_clusters.items():
    print(f"Similarity {similarity}: {cluster}")

Clusters:
Similarity 0.4403080542740387: ['u440628', 'u411873', 'u1494365', 'u1199509', 'u1377503', 'u546007', 'u716921', 'u883851', 'u328220', 'u499896', 'u992609', 'u474481', 'u532564', 'u190479', 'u585110', 'u132202', 'u1292601', 'u154233', 'u600724', 'u1325383', 'u427120', 'u1354630', 'u441580', 'u1284868', 'u987311', 'u624874', 'u488771', 'u263608', 'u1187806', 'u670495', 'u1345718', 'u945886', 'u35049', 'u1279400', 'u1448588', 'u823368']
Similarity 0.14240124753190408: ['u981325', 'u440628', 'u546007', 'u1377503', 'u883851', 'u328220', 'u499896', 'u474481', 'u190479', 'u532564', 'u585110', 'u1110482', 'u132202', 'u1292601', 'u154233', 'u600724', 'u1325383', 'u1104521', 'u1159375', 'u441580', 'u118059', 'u987311', 'u1161925', 'u624874', 'u1045492', 'u1187806', 'u670495', 'u35049', 'u823368', 'u1134159']
Similarity 0.1327798923699352: ['u1016707', 'u589649']
Similarity 0.12070831948714167: ['u1492149', 'u440628', 'u992609', 'u1045492', 'u1348676', 'u670495', 'u474481', 'u1016707', 

In [30]:
cosine_clusters = OCA(cosine_sim)

# Display clusters
print("Clusters:")
for similarity, cluster in cosine_clusters.items():
    print(f"Similarity {similarity}: {cluster}")

Clusters:
Similarity 0.4573295603800236: ['u440628', 'u411873', 'u1494365', 'u1199509', 'u1377503', 'u546007', 'u716921', 'u883851', 'u328220', 'u499896', 'u992609', 'u474481', 'u532564', 'u190479', 'u585110', 'u132202', 'u1292601', 'u154233', 'u600724', 'u1325383', 'u427120', 'u1354630', 'u441580', 'u1284868', 'u987311', 'u624874', 'u488771', 'u263608', 'u1187806', 'u670495', 'u1345718', 'u945886', 'u35049', 'u1279400', 'u1448588', 'u823368']
Similarity 0.16692446522239712: ['u1187806', 'u154233']
Similarity 0.16269784336399207: ['u1377503', 'u1187806']
Similarity 0.16141634707382535: ['u440628', 'u263608', 'u1187806', 'u670495', 'u1377503', 'u118059', 'u987311', 'u823368', 'u132202', 'u883851', 'u1292601', 'u600724', 'u1325383']
Similarity 0.15606369730577052: ['u981325', 'u440628', 'u883851', 'u499896', 'u474481', 'u532564', 'u585110', 'u1110482', 'u1292601', 'u600724', 'u1159375', 'u441580', 'u118059', 'u987311', 'u1161925', 'u1187806', 'u670495', 'u35049', 'u823368', 'u1134159']
S

In [31]:
jaccard_clusters = OCA(jaccard_sim)

# Display clusters
print("Clusters:")
for similarity, cluster in jaccard_clusters.items():
    print(f"Similarity {similarity}: {cluster}")

Clusters:
Similarity 0.2962962962962963: ['u440628', 'u411873', 'u1494365', 'u1199509', 'u1377503', 'u546007', 'u716921', 'u883851', 'u328220', 'u499896', 'u992609', 'u474481', 'u532564', 'u190479', 'u585110', 'u132202', 'u1292601', 'u154233', 'u600724', 'u1325383', 'u427120', 'u1354630', 'u441580', 'u1284868', 'u987311', 'u624874', 'u488771', 'u263608', 'u1187806', 'u670495', 'u1345718', 'u945886', 'u35049', 'u1279400', 'u1448588', 'u823368']
Similarity 0.1111111111111111: ['u981325', 'u440628', 'u1377503', 'u883851', 'u499896', 'u474481', 'u532564', 'u585110', 'u1110482', 'u132202', 'u1292601', 'u154233', 'u600724', 'u1325383', 'u1159375', 'u441580', 'u118059', 'u987311', 'u1161925', 'u263608', 'u1187806', 'u670495', 'u35049', 'u823368', 'u1134159']
Similarity 0.09090909090909091: ['u190479', 'u1377503', 'u1110482', 'u132202', 'u154233']
Similarity 0.08: ['u499896', 'u1104521', 'u1187806', 'u670495', 'u474481', 'u441580', 'u118059', 'u987311', 'u1110482', 'u883851', 'u1292601', 'u624