In [1]:
# local에 padnas, numpy, matplotlib, surprise, sklearn 설치필요
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import seaborn as sns

# 1. Amazon Data

**1-1. Used Data**

*   https://amazon-reviews-2023.github.io/






In [2]:
categories = [
    "Electronics",
    "Movies_TV",
    "Sports_Outdoors",
    "Video_Games",
    "Software",
    "Fashion",
    "Grocery_Gourmet_Food",
    "Car",
    "Beauty",
    "Music"
    ]

# Colaborative Filtering

전체 상품에 대한 CF

In [3]:
def preprocess_review(review_path, min_user_cnt, min_review_cnt):
    try:
        review_df = pd.read_json(review_path, lines=True)
        filtered_review_df = review_df[["parent_asin", "rating", "user_id", "title", "category", "product_title"]]
        # filtered_review_df = filtered_review_df[filtered_review_df["category"] == category]
        distinct_review_df = filtered_review_df.drop_duplicates(subset=["user_id", "parent_asin"])
        
        review_counts = distinct_review_df["parent_asin"].value_counts()
        user_counts = distinct_review_df['user_id'].value_counts()
        
        result_df = distinct_review_df[(
            (distinct_review_df['user_id'].isin(user_counts[user_counts >= min_user_cnt].index)) &
            (distinct_review_df['parent_asin'].isin(review_counts[review_counts >= min_review_cnt].index))
        )]
        
        # # 리뷰 개수가 많은 순으로 정렬
        # sorted_parent_asin = result_df["parent_asin"].value_counts().index
        # result_df = result_df.set_index("parent_asin").loc[sorted_parent_asin].reset_index()
        
        return result_df
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [99]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def knn_predict(item_sim_df, train_matrix, k=5):
    item_ids = train_matrix.index
    user_ids = train_matrix.columns
    predictions = np.zeros(train_matrix.shape)
    
    for item_idx, item in enumerate(item_ids):
        for user_idx, user in enumerate(user_ids):
            user_ratings = train_matrix.iloc[:, user_idx].values  # 해당 사용자의 모든 평점
            relevant_items = item_sim_df.iloc[item_idx].values  # 현재 아이템과 다른 아이템 간 유사도
            
            # K개의 가장 유사한 이웃 선택
            neighbors_idx = np.argsort(relevant_items)[-k:]
            neighbors_sim = relevant_items[neighbors_idx]
            neighbors_ratings = user_ratings[neighbors_idx]
            
            # 가중합 계산
            weighted_sum = np.dot(neighbors_sim, neighbors_ratings)
            sim_sum = np.abs(neighbors_sim).sum()
            
            if sim_sum > 0:
                predictions[item_idx, user_idx] = weighted_sum / sim_sum
            else:
                # 유사도가 없는 경우 행 평균값으로 대체
                row_mean = train_matrix.iloc[item_idx, :].mean()
                predictions[item_idx, user_idx] = row_mean
    
    return predictions

def knn_based_cf(df, org_df, k=5):
    # 사용자-아이템 행렬 생성
    item_user_matrix = df.pivot(index='parent_asin', columns='user_id', values='rating').fillna(0)
    item_ids = item_user_matrix.index
    user_ids = item_user_matrix.columns
    
    if df.empty:
        return "Empty","Empty"
    
    # 훈련-테스트 데이터 분할
    trainset, testset = train_test_split(df, test_size=0.2, random_state=42)
    
    # 훈련 데이터로 사용자-아이템 행렬 생성
    train_matrix = trainset.pivot(index='parent_asin', columns='user_id', values='rating').reindex(index=item_ids, columns=user_ids).fillna(0)
    test_matrix = testset.pivot(index='parent_asin', columns='user_id', values='rating').reindex(index=item_ids, columns=user_ids).fillna(0)
    
    # 아이템 기반 코사인 유사도 계산
    item_sim_matrix = cosine_similarity(train_matrix)
    np.fill_diagonal(item_sim_matrix, 0)  # 자신과의 유사도는 0으로 설정
    item_sim_df = pd.DataFrame(item_sim_matrix, index=item_ids, columns=item_ids)
    
    # 예측값 생성
    predictions = knn_predict(item_sim_df, train_matrix, k)
    
    # 테스트 세트에서 RMSE 계산
    test_actual = test_matrix.values[test_matrix > 0]
    test_predicted = predictions[test_matrix > 0]
    rmse = sqrt(mean_squared_error(test_actual, test_predicted))
    
    # 커버리지 계산
    total_len = len(org_df['parent_asin'].unique())
    non_zero_predictions = np.sum(predictions > 0)
    coverage_value = non_zero_predictions / total_len
    return rmse, coverage_value

# Memory based CF
아이템-사용자 행렬에서 아이템 끼리의 코사인 유사도를 구한 후 K개의 근접한 이웃을 뽑아 평점 예측을 하였다. 성능 지표로는 정확성을 위한 테스트 셋과 예측으로 만들어진 행렬간의 RMSE와 다양성을 위한 전체 상품 수 중 몇 가지를 추천하는지 그 비율을 나타내는 Coverage 2가지를 채택하였다.

In [94]:
# 실행
review_path = os.path.abspath("./Data/Amazon/Review/combined_data2.json")
review_df = pd.read_json(review_path, lines=True)
raw_df = review_df[["parent_asin", "rating", "user_id", "title", "category", "product_title"]]
raw_df = raw_df.drop_duplicates(subset=["user_id", "parent_asin"])
review_df = preprocess_review(review_path,8,8)
 
# 아이템 기반 CF 실행
rmse,coverage_value = knn_based_cf(review_df,raw_df)
print("KNN based")
print(f"RMSE: {rmse:.3f}")
print(f"Coverage: {coverage_value:.2%}")

KNN based
RMSE: 4.144
Coverage: 33.59%


하지만 처음 시도한 결과는 좋지 않았다. RMSE가 3.975라는 만족스럽지 못한 수치가 나왔다. 이는 샘플로 입력된 데이터가 편향된 것으로 보였고, 이를 조사해보았다. 작성한 리뷰가 많은 사용자 N명 또는 리뷰가 많았던 상품 N개를 골라 데이터를 조정하여 테스트해보았다. N을 20~50 범위에서 Gridsearch를 이용하여 가장 좋은 성능일 때의 파라미터를 찾았다. 그 결과, 인기 사용자는 25명, 인기 상품은 45개 일때 각각 RMSE가 3.647, 3.743이 나왔다. 하지만 N의 값이 커질수록 coverage는 감소하며 상품 추천의 다양성은 줄어들었다.

In [95]:
def find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50)):
    best_rmse = float('inf')
    best_params = {'x': None, 'rmse': None, 'coverage': None}
    x_values = range(x_range[0], x_range[1] + 1)

    for x in x_values:
        # 상위 x명의 유저와 각 유저당 y개의 리뷰를 필터링
        top_user_id = (
            review_df['user_id']
            .value_counts()
            .head(x)
            .index
        )
        filtered_df = (
            review_df[review_df['user_id'].isin(top_user_id)]
            .groupby('user_id')
            .apply(lambda group: group.head(y))
            .reset_index(drop=True)
        )

        rmse, coverage_value = knn_based_cf(filtered_df, raw_df)
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = {
                'x': x,
                'rmse': rmse,
                'coverage': coverage_value
            }

    return best_params

# 사용 예제
best_params = find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50))

print("\nBest Hyperparameters:")
print(f"x: {best_params['x']}")
print(f"RMSE: {best_params['rmse']:.3f}")
print(f"Coverage: {best_params['coverage']:.2%}")


  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lam


Best Hyperparameters:
x: 25
RMSE: 3.647
Coverage: 1.56%


In [96]:
def find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50)):
    best_rmse = float('inf')
    best_params = {'x': None, 'rmse': None, 'coverage': None}
    
    x_values = range(x_range[0], x_range[1] + 1)

    for x in x_values:
        # 상위 N개의 상품과 각 상품의 y개를 필터링
        top_item_id = (
            review_df['parent_asin']
            .value_counts()
            .head(x)
            .index
        )
        filtered_df = (
            review_df[review_df['parent_asin'].isin(top_item_id)]
            .groupby('parent_asin')
            .apply(lambda group: group.head(y))
            .reset_index(drop=True)
        )

        rmse, coverage_value = knn_based_cf(filtered_df, raw_df)
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = {
                'x': x,
                'rmse': rmse,
                'coverage': coverage_value
            }

    return best_params

# 사용 예제
best_params = find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50))

print("\nBest Hyperparameters:")
print(f"x: {best_params['x']}")
print(f"RMSE: {best_params['rmse']:.3f}")
print(f"Coverage: {best_params['coverage']:.2%}")


  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lam


Best Hyperparameters:
x: 45
RMSE: 3.743
Coverage: 5.82%


특정 카테고리에서의 CF

In [102]:
def CategoryBasedCF(df,org_df,categories):
    for targetCategory in categories:
        print(targetCategory+"'s performance")
        filtered_org_df = org_df[org_df["category"] == targetCategory]
        filtered_df = df[df["category"] == targetCategory]
        rmse, coverage_value = knn_based_cf(filtered_df,filtered_org_df)
        if rmse == "Empty" or coverage_value == "Empty":
            print("Data is empty")
        else:
            print("KNN based")
            print(f"RMSE: {rmse:.3f}")
            print(f"Coverage: {coverage_value:.2%}")
        print("\n")

In [None]:
CategoryBasedCF(review_df,raw_df,categories)

실제 시나리오

1. 카테고리 입력
2. 해당 카테고리 인기 상품 5개 추출
3. 5개 상품에 대하여 각각 추천하여 유사도 높은 순으로 정렬

In [104]:
# 입력한 카테고리에서 인기 상품을 추천하는 함수
def CategoryBasedRecommend(review_df, targetCategory, top_n=5):
    
    filtered_df = review_df[review_df["category"] == targetCategory]
    
    if filtered_df.empty:
        print(f"카테고리 '{targetCategory}'에 해당하는 데이터가 없습니다.")
        return pd.DataFrame()

    popular_items = (
        filtered_df.groupby(["parent_asin", "category","product_title"])
        .agg(avg_rating=("rating", "mean"), num_reviews=("user_id", "count"))
        .reset_index()
        .sort_values(by=["num_reviews", "avg_rating"], ascending=[False, False])
    )
    top_items = popular_items.head(top_n)
    return top_items

In [116]:
def recommend_related_items(df, target_item, top_n=5):
    # 사용자-아이템 행렬 생성
    item_user_matrix = df.pivot(index='parent_asin', columns='user_id', values='rating').fillna(0)
    
    # 아이템 간 코사인 유사도 계산
    item_sim_matrix = cosine_similarity(item_user_matrix)
    item_sim_df = pd.DataFrame(item_sim_matrix, index=item_user_matrix.index, columns=item_user_matrix.index)
    
    # 대상 아이템이 존재하지 않으면 예외 처리
    if target_item not in item_sim_df.index:
        raise ValueError(f"Target item '{target_item}' not found in the dataset.")
    
    # 대상 아이템과 다른 아이템 간 유사도 정렬
    related_items = item_sim_df.loc[target_item].sort_values(ascending=False)
    
    # 상위 N개 아이템 추천 (자신 제외)
    recommendations = [
        {"parent_asin": item, "similarity": similarity}
        for item, similarity in related_items.items() if item != target_item
    ][:top_n]
    
    recommendations_df = pd.DataFrame(recommendations)
    recommendations_df = recommendations_df.merge(df[["parent_asin", "product_title", "category"]].drop_duplicates(), 
                                                   on="parent_asin", 
                                                   how="left")
    return recommendations_df

In [123]:
input_category = "Video_Games"
top_items = CategoryBasedRecommend(review_df,input_category)

In [122]:
recommend_result = []
for _, row in top_items.iterrows():
    target_item = row["parent_asin"]
    target_category = row["category"]
    result = recommend_related_items(review_df, target_item)

    for _, recommend_row in result.iterrows():
        recommended_asin = recommend_row["parent_asin"]
        if recommended_asin not in [item["parent_asin"] for item in recommend_result]:
            recommend_result.append({
                "parent_asin": recommended_asin,
                "product_title": recommend_row["product_title"],
                "category": recommend_row["category"],
                "similarity": recommend_row["similarity"]
            })

sorted_recommendations = sorted(recommend_result, key=lambda x: x["similarity"], reverse=True)[:5]

print("Input Category : ",input_category,"\n")

print("Top 5 Recommended Items:")
for item in sorted_recommendations:
    print(f"Parent ASIN: {item['parent_asin']}")
    print(f"Product Title: {item['product_title']}")
    print(f"Category: {item['category']}")
    print(f"Similarity: {item['similarity']:.3f}")
    print("-" * 30)


Input Category :  Video_Games 

Top 5 Recommended Items:
Parent ASIN: B00992CF6W
Product Title: Minecraft
Category: Software
Similarity: 0.303
------------------------------
Parent ASIN: B01GW3H3U8
Product Title: Xbox Wireless Controller – White
Category: Video_Games
Similarity: 0.298
------------------------------
Parent ASIN: B00KCTER3U
Product Title: Retinol Face Moisturizer Cream Natural Facial moisturizing Cream 1.76 OZ with Ortho-Hydroxybenzoic Acid,VE for Hydrating,Repairing
Category: Beauty
Similarity: 0.243
------------------------------
Parent ASIN: B01ARIX8ZS
Product Title: Coloring Apps for Adults Premium
Category: Software
Similarity: 0.242
------------------------------
Parent ASIN: B00BFWY2KG
Product Title: DisneyNOW – Episodes & Live TV
Category: Software
Similarity: 0.228
------------------------------
