In [30]:
import pandas as pd
import json
from typing import List, Dict
from collections import defaultdict
import os
from tqdm import tqdm

# **1 Load data**

In [2]:
def parse(path):
    with open(path, 'r') as f:
        for line in f:
            yield json.loads(line)

def getDF(path):
    df = [d for d in parse(path)]
    return pd.DataFrame(df)

In [5]:
file_path = './data/All_Beauty.json'
df_all_beauty = getDF(file_path)

In [6]:
df_all_beauty

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"02 19, 2015",A1V6B6TNIC10QE,0143026860,theodore j bigham,great,One Star,1424304000,,,
1,4.0,True,"12 18, 2014",A2F5GHSXFQ0W6J,0143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...,1418860800,,,
2,4.0,True,"08 10, 2014",A1572GUYS7DGSR,0143026860,David G,"This book was very informative, covering all a...",Worth the Read,1407628800,,,
3,5.0,True,"03 11, 2013",A1PSGLFK1NSVO,0143026860,TamB,I am already a baseball fan and knew a bit abo...,Good Read,1362960000,,,
4,5.0,True,"12 25, 2011",A6IKXKZMTKGSC,0143026860,shoecanary,This was a good story of the Black leagues. I ...,"More than facts, a good story read!",1324771200,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...
371340,1.0,True,"07 20, 2017",A202DCI7TV1022,B01HJEGTYK,Sam,It was awful. It was super frizzy and I tried ...,It was super frizzy and I tried to comb it and...,1500508800,,,
371341,5.0,True,"03 16, 2017",A3FSOR5IJOFIBE,B01HJEGTYK,TYW,I was skeptical about buying this. Worried it...,Awesome,1489622400,34,,
371342,5.0,True,"03 1, 2017",A1B5DK6CTP2P24,B01HJEGTYK,Norma Jennings,Makes me look good fast.,Five Stars,1488326400,46,,
371343,2.0,True,"02 21, 2017",A23OUYS5IRMJS9,B01HJEGTYK,Lee,Way lighter than photo\nNot mix blend of color...,Ok but color way off and volume as well,1487635200,,,


In [7]:
file_path1 = './data/AMAZON_FASHION.json'
file_path2 = './data/Appliances.json'

df_amazon_fashion = getDF(file_path1)
df_appliances = getDF(file_path2)

In [16]:
df1_selected = df_all_beauty[['reviewerID', 'asin', 'overall', 'unixReviewTime']]
df2_selected = df_amazon_fashion[['reviewerID', 'asin', 'overall', 'unixReviewTime']]
df3_selected = df_appliances[['reviewerID', 'asin', 'overall', 'unixReviewTime']]

df_combined = pd.concat([df1_selected, df2_selected, df3_selected], axis=0)

df_combined

Unnamed: 0,reviewerID,asin,overall,unixReviewTime
0,A1V6B6TNIC10QE,0143026860,1.0,1424304000
1,A2F5GHSXFQ0W6J,0143026860,4.0,1418860800
2,A1572GUYS7DGSR,0143026860,4.0,1407628800
3,A1PSGLFK1NSVO,0143026860,5.0,1362960000
4,A6IKXKZMTKGSC,0143026860,5.0,1324771200
...,...,...,...,...
602772,A24A9P4F2SLTK5,B01HJH2PY0,5.0,1502323200
602773,A2JCB4KHBWEELW,B01HJHHEA0,2.0,1533081600
602774,A1LDYYVTLPP2Z5,B01HJHHEA0,5.0,1523577600
602775,AP1M5O06IOYZ7,B01HJH92JQ,1.0,1521763200


In [17]:
df_combined.to_csv('df_combined.csv', index=False)

In [2]:
df_combined = pd.read_csv('df_combined.csv')

# **2 Transform data**

cleaning the dataset from duplicates, missing overall will be set to 0, changing the unix time format to UTC datetime format reformatting the ID to simple unique integer values and finally, transforming of the shape of the dataset to 2D matrix (RM)

In [3]:
# Kiểm tra các giá trị bị thiếu
missing_summary = df_combined.isnull().sum()
print("Missing values per column:")
print(missing_summary[missing_summary > 0])

# Kiểm tra trùng lặp toàn bộ dòng
duplicates = df_combined.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Kiểm tra trùng lặp cho từng cột
duplicate_columns = {}
for col in df_combined.columns:
    duplicate_columns[col] = df_combined[col].duplicated().sum()

print("\nDuplicate values per column:")
print(pd.Series(duplicate_columns).sort_values(ascending=False))


Missing values per column:
Series([], dtype: int64)

Number of duplicate rows: 28561

Duplicate values per column:
overall           1857753
unixReviewTime    1852343
asin              1608731
reviewerID         340802
dtype: int64


In [4]:
df_combined = df_combined.drop_duplicates()
duplicates = df_combined.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")


Number of duplicate rows: 0


In [3]:
# Duplicate reviews by same user for the same product
# Chuyển đổi cột unixReviewTime từ Unix timestamp sang datetime theo UTC
df_combined['reviewTime'] = pd.to_datetime(df_combined['unixReviewTime'], unit='s', utc=True)
duplicate_reviews = df_combined[df_combined.duplicated(subset=['reviewerID', 'asin'], keep=False)]
duplicate_reviews

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,reviewTime
2352,AD80MGOY5CJZ4,1620213982,5.0,1451692800,2016-01-02 00:00:00+00:00
4017,AD80MGOY5CJZ4,1620213982,5.0,1422662400,2015-01-31 00:00:00+00:00
6904,ACTVXNBEPLW2S,B000052YAN,4.0,1422144000,2015-01-25 00:00:00+00:00
6905,ACTVXNBEPLW2S,B000052YAN,4.0,1422144000,2015-01-25 00:00:00+00:00
6941,A2CTM1BYAXTYLX,B0000530HU,5.0,1243987200,2009-06-03 00:00:00+00:00
...,...,...,...,...,...
1830861,AAXZWNM0SGJ6V,B00W0WXHCO,4.0,1531699200,2018-07-16 00:00:00+00:00
1832245,A33E3AB96IZHE9,B00X9H5S62,3.0,1445212800,2015-10-19 00:00:00+00:00
1832246,A33E3AB96IZHE9,B00X9H5S62,2.0,1439942400,2015-08-19 00:00:00+00:00
1841744,A1HT6VX64S9NE8,B015HUNWQG,5.0,1517097600,2018-01-28 00:00:00+00:00


In [5]:
# Sắp xếp DataFrame theo cột 'time' giảm dần
df_combined = df_combined.sort_values(by=['reviewerID', 'asin', 'reviewTime'], ascending=[True, True, False])

# Giữ lại dòng đầu tiên (gần đây nhất) cho mỗi cặp reviewerID và asin
df_combined = df_combined.drop_duplicates(subset=['reviewerID', 'asin'], keep='first')

In [7]:
df_preprocessed = df_combined.drop(columns=['unixReviewTime', 'reviewTime'])
# Tạo cột mã hóa cho `reviewerID` và `asin`
df_preprocessed['userID'] = 'u' + (df_combined['reviewerID'].astype('category').cat.codes + 1).astype(str)
df_preprocessed['itemID'] = 'i' + (df_combined['asin'].astype('category').cat.codes + 1).astype(str)

df_preprocessed

Unnamed: 0,reviewerID,asin,overall,userID,itemID
1593537,A0001528BGUBOEVR6T5U,B00MVVITWC,5.0,u1,i91564
1382775,A00032921HLX2KJJVXRS,B0045LLC7K,5.0,u2,i17567
568898,A0007604Q2582KFW7N4B,B00L8J2RF8,5.0,u3,i81636
1785338,A00086729ZDSXGG2E481,B00E1IUTOY,1.0,u4,i48836
1708824,A0009408W4B7B4DKF0XN,B01DO0ZR50,1.0,u5,i219479
...,...,...,...,...,...
250877,AZZZ5UJWUVCYZ,B01FNJ9MOW,5.0,u1516952,i234681
1463413,AZZZGPFIX5NEY,B0097C43BO,1.0,u1516953,i33551
508737,AZZZMCJO078D2,B00EONSCKO,2.0,u1516954,i52115
1006582,AZZZU2YUCMUUW,B00WQEPQ20,5.0,u1516955,i132125


In [8]:
df_preprocessed.to_csv('df_preprocessed.csv', index=False)

In [3]:
df_preprocessed = pd.read_csv('df_preprocessed.csv')

In [8]:
print(df_preprocessed.isnull().sum())

# Kiểm tra trùng lặp toàn bộ dòng
duplicates = df_preprocessed.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Kiểm tra trùng lặp cho từng cột
duplicate_columns = {}
for col in df_preprocessed.columns:
    duplicate_columns[col] = df_preprocessed[col].duplicated().sum()

print("\nDuplicate values per column:")
print(pd.Series(duplicate_columns).sort_values(ascending=False))

print(df_preprocessed.isnull().sum())

reviewerID    0
asin          0
overall       0
userID        0
itemID        0
dtype: int64

Number of duplicate rows: 0

Duplicate values per column:
overall       1827565
asin          1578543
itemID        1578543
reviewerID     310614
userID         310614
dtype: int64
reviewerID    0
asin          0
overall       0
userID        0
itemID        0
dtype: int64


In [13]:
print(df_preprocessed['userID'].nunique())
print(df_preprocessed['itemID'].nunique()) 

1516956
249027


## Sovle large scale of input data problem

In [39]:
from scipy.sparse import lil_matrix, csr_matrix, save_npz

In [6]:
def create_sparse_rating_matrix(df):
    """
    Tạo ma trận đánh giá thưa từ DataFrame.

    Args:
        df (pd.DataFrame): DataFrame chứa dữ liệu userID, itemID, overall.

    Returns:
        scipy.sparse.csr_matrix: Ma trận đánh giá thưa.
        dict: Ánh xạ từ user id gốc sang index.
        dict: Ánh xạ từ item id gốc sang index.
    """
    # 1. Tạo mapping
    unique_users = df['userID'].unique()
    unique_items = df['itemID'].unique()
    user_mapping = {user_id: index for index, user_id in enumerate(unique_users)}
    item_mapping = {item_id: index for index, item_id in enumerate(unique_items)}

    df['user_index'] = df['userID'].map(user_mapping)
    df['item_index'] = df['itemID'].map(item_mapping)

    # 2. Tạo sparse matrix
    num_users = len(unique_users)
    num_items = len(unique_items)
    ratings = df['overall'].values
    row_indices = df['user_index'].values
    col_indices = df['item_index'].values
    rating_matrix = csr_matrix((ratings, (row_indices, col_indices)), shape=(num_users, num_items))

    return rating_matrix, user_mapping, item_mapping

In [7]:
rating_matrix, user_mapping, item_mapping = create_sparse_rating_matrix(df_preprocessed)

# **3 Similarity calculation**

In [15]:
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy.stats import pearsonr, spearmanr, kendalltau

In [40]:
def save_sparse_matrix(filename, matrix):
    """Helper function to save sparse matrix"""
    csr_matrix = matrix.tocsr()
    save_npz(filename, csr_matrix)

def load_sparse_matrix(filename):
    """Helper function to load sparse matrix"""
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                     shape=loader['shape'])

def calculate_similarity_batch(rating_matrix, batch_size=1000, similarity_metric='cosine', save_path=None):
    """
    Calculates user similarity matrix using batch processing to handle large datasets.
    
    Args:
        rating_matrix (csr_matrix): Sparse user-item rating matrix
        batch_size (int): Size of batches for processing
        similarity_metric (str): Similarity metric ('cosine', 'pearson', 'jaccard', 'spearman', 'kendall')
        save_path (str): Optional path to save intermediate results
        
    Returns:
        scipy.sparse.lil_matrix: Sparse similarity matrix
    """
    num_users = rating_matrix.shape[0]
    similarity_matrix = lil_matrix((num_users, num_users))
    
    # Create save directory if needed
    if save_path and not os.path.exists(save_path):
        os.makedirs(save_path)
    
    # Calculate number of batches
    num_batches = (num_users + batch_size - 1) // batch_size
    
    # Process batches with progress bar
    for i in tqdm(range(0, num_users, batch_size)):
        batch_end_i = min(i + batch_size, num_users)
        batch_i = rating_matrix[i:batch_end_i]
        
        for j in range(i, num_users, batch_size):
            batch_end_j = min(j + batch_size, num_users)
            batch_j = rating_matrix[j:batch_end_j]
            
            # Calculate similarities based on metric
            if similarity_metric == 'cosine':
                sim_batch = cosine_similarity(batch_i, batch_j)
            else:
                sim_batch = np.zeros((batch_end_i - i, batch_end_j - j))
                
                for bi in range(batch_end_i - i):
                    for bj in range(batch_end_j - j):
                        user1_ratings = batch_i[bi].toarray()[0]
                        user2_ratings = batch_j[bj].toarray()[0]
                        
                        # Get common rated items
                        mask = (user1_ratings != 0) & (user2_ratings != 0)
                        ratings1 = user1_ratings[mask]
                        ratings2 = user2_ratings[mask]
                        
                        if len(ratings1) > 1:
                            if similarity_metric == 'pearson':
                                sim = pearsonr(ratings1, ratings2)[0]
                            elif similarity_metric == 'spearman':
                                sim = spearmanr(ratings1, ratings2)[0]
                            elif similarity_metric == 'kendall':
                                sim = kendalltau(ratings1, ratings2)[0]
                            elif similarity_metric == 'jaccard':
                                set1 = set(np.nonzero(user1_ratings)[0])
                                set2 = set(np.nonzero(user2_ratings)[0])
                                sim = len(set1.intersection(set2)) / len(set1.union(set2)) if set1 or set2 else 0
                            
                            sim_batch[bi, bj] = 0 if np.isnan(sim) else sim
            
            # Update similarity matrix
            similarity_matrix[i:batch_end_i, j:batch_end_j] = sim_batch
            
            # Mirror the matrix for symmetry (if not on diagonal)
            if i != j:
                similarity_matrix[j:batch_end_j, i:batch_end_i] = sim_batch.T
            
            # Save intermediate results if path provided
            if save_path:
                temp_file = os.path.join(save_path, f'sim_batch_{i}_{j}.npz')
                save_sparse_matrix(temp_file, similarity_matrix)
    
    return similarity_matrix

def save_sparse_matrix(filename, matrix):
    """Helper function to save sparse matrix"""
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)

def load_sparse_matrix(filename):
    """Helper function to load sparse matrix"""
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                     shape=loader['shape'])

In [31]:
BATCH_SIZE = 1000
SAVE_PATH = './similarity_matrices'  # Optional: create this directory first
    
# Create save directory if needed
if SAVE_PATH and not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

In [41]:
pearson_sim = calculate_similarity_batch(
    rating_matrix,
    batch_size=100,
    similarity_metric='pearson',
    save_path='./similarity_matrices'
)

user_ids = [f'u{i+1}' for i in range(rating_matrix.shape[0])]
pearson_sim_df = pd.DataFrame(
    pearson_sim.toarray(),
    index=user_ids,
    columns=user_ids
)

  0%|          | 0/15170 [00:41<?, ?it/s]


AttributeError: 'lil_matrix' object has no attribute 'indices'

# **4 OCA Clustering**

In [34]:
def OCA(SM):
    """Thực hiện thuật toán Ordered Clustering."""
    clusters = {}

    # Duyệt qua các cặp user trong ma trận tương đồng
    for i, user_i in enumerate(SM.index):
        for j, user_j in enumerate(SM.columns):
            if i >= j:
                continue

            similarity = SM.iloc[i, j]
            
            if similarity > 0:
                # Tạo cluster mới nếu chưa có, key là giá trị tương đồng cụ thể
                if similarity not in clusters:
                     clusters[similarity] = []
                
                # Thêm user vào cluster, đảm bảo thứ tự dựa trên độ tương đồng
                if not clusters[similarity]:
                    clusters[similarity].extend([user_i,user_j])
                else:
                    user_i_index = -1
                    user_j_index = -1
                    
                    if user_i in clusters[similarity]:
                       user_i_index = clusters[similarity].index(user_i)
                    if user_j in clusters[similarity]:
                      user_j_index = clusters[similarity].index(user_j)
                                  
                    if user_i_index == -1 and user_j_index == -1:
                        inserted = False
                        for k in range(len(clusters[similarity])):
                            user_k = clusters[similarity][k]
                            sim_ik = SM.iloc[i,SM.index.get_loc(user_k)]
                            
                            if sim_ik < similarity:
                                clusters[similarity].insert(k,user_i)
                                inserted = True
                                break
                                
                        if not inserted:
                            clusters[similarity].append(user_i)
                          
                        inserted = False
                        for k in range(len(clusters[similarity])):
                            user_k = clusters[similarity][k]
                            sim_jk = SM.iloc[j,SM.index.get_loc(user_k)]
                            
                            if sim_jk < similarity:
                                clusters[similarity].insert(k,user_j)
                                inserted = True
                                break
                                
                        if not inserted:
                            clusters[similarity].append(user_j)
                    elif user_i_index > -1 and user_j_index == -1:
                        inserted = False
                        for k in range(len(clusters[similarity])):
                            user_k = clusters[similarity][k]
                            sim_jk = SM.iloc[j,SM.index.get_loc(user_k)]
                            
                            if sim_jk < similarity:
                                clusters[similarity].insert(k,user_j)
                                inserted = True
                                break
                                
                        if not inserted:
                            clusters[similarity].append(user_j)
                    elif user_i_index == -1 and user_j_index > -1:
                       inserted = False
                       for k in range(len(clusters[similarity])):
                           user_k = clusters[similarity][k]
                           sim_ik = SM.iloc[i,SM.index.get_loc(user_k)]
                            
                           if sim_ik < similarity:
                               clusters[similarity].insert(k,user_i)
                               inserted = True
                               break
                       if not inserted:
                           clusters[similarity].append(user_i)

    # Sắp xếp các cluster theo độ tương đồng giảm dần
    sorted_clusters = sorted(clusters.items(), key=lambda item: item, reverse=True)
    sorted_clusters_dict = {k: v for k, v in sorted_clusters}
    return sorted_clusters_dict