<a href="https://colab.research.google.com/github/tpgus2603/DataMining/blob/main/RecommendSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd ./drive/MyDrive/json_csv_files/amazon_review2

/content/drive/MyDrive/json_csv_files/amazon_review2


In [3]:
# 라이브러리 설치 (필요시)
!pip install pandas scikit-learn

# 라이브러리 임포트
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



시나리오
1.   트위터에서 추출한 사용자의 프로필이 Movies_TV, Grocery_Gourmet_Food,Eletronics 라고 가정
2.   해당 사용자에게 content based와 colaborative filtering을 이용하여 각각 추천

# Content based

In [72]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


# 상품 데이터 파일 경로
products_file_path = './combined_data2.json'  # 실제 경로로 변경
products_df = pd.read_json(products_file_path, lines=True)

# 원하는 카테고리 목록
desired_categories = ['Movies_TV', 'Grocery_Gourmet_Food', 'Electronics']

# 카테고리 필터링
filtered_products_df = products_df[products_df['category'].isin(desired_categories)].copy()
filtered_products_df = filtered_products_df[['rating', 'product_title', 'category', 'parent_asin']]
print("Filtered Products DataFrame:")
print(filtered_products_df.head())

# user_reviews 데이터 로드
user_reviews_file_path = './user_reviews.json'  # 실제 경로로 변경
user_reviews_df = pd.read_json(user_reviews_file_path, lines=True)
user_reviews_df = user_reviews_df[['product_title', 'category', 'parent_asin']]
user_reviews_df = user_reviews_df[user_reviews_df['category'].isin(desired_categories)].copy()
print("User Reviews DataFrame:")
print(user_reviews_df.head())

# test_reviews 데이터 로드
test_reviews_file_path = './test_reviews.json'  # 실제 경로로 변경
test_reviews_df = pd.read_json(test_reviews_file_path, lines=True)
test_reviews_df = test_reviews_df[['product_title', 'category', 'parent_asin']]
test_reviews_df = test_reviews_df[test_reviews_df['category'].isin(desired_categories)].copy()
print("Test Reviews DataFrame:")
print(test_reviews_df.head())

def combine_product_title(df):
    return df['product_title'].fillna('')

# test_reviews를 Train/Val로 분할 (7:3)
train_reviews, val_reviews = train_test_split(test_reviews_df, test_size=0.3, random_state=42)

# 카테고리별로 TF-IDF를 계산하기 위해 각 카테고리별로 반복 수행
rating_threshold = 4.5  # 평점 임계값을 4.5 이상으로 설정

Filtered Products DataFrame:
       rating    product_title   category parent_asin
29669       5      Sneaky Pete  Movies_TV  B013488XFS
29670       5  Creative Galaxy  Movies_TV  B00CB6VTDS
29671       3             None  Movies_TV  B096Z8Z3R6
29672       4             None  Movies_TV  B09M14D9FZ
29673       5             None  Movies_TV  B001H1SVZC
User Reviews DataFrame:
                                      product_title   category parent_asin
0  Abbott & Costello: Universal Pictures Collection  Movies_TV  B00IJUIY3S
1                                        The Intern  Movies_TV  B015SKC7KW
2                                        Funny Farm  Movies_TV  B00901SNW2
3                                    Monsters, Inc.  Movies_TV  B00BHU9CCO
4                                          Die Hard  Movies_TV  B009EEQO08
Test Reviews DataFrame:
                     product_title   category parent_asin
0                  Sleeping Beauty  Movies_TV  B07WNYJV6C
1                  Big Little Lie

평점이 좋은 상품중에서 유사도를 측정해서 아이템을 추천하는 방식과 순수하게 제품 제목  유사도만으로 아이템을 추천하는 방식 두가지를 사용했습니다



In [73]:
def get_recommendations_based_on_high_rating(user_reviews_df, high_rating_products_df, tfidf_matrix, asin_to_index, category_df, top_n=5):
    recommendations = []
    reviewed_asins = set(user_reviews_df['parent_asin'])
    for _, review in user_reviews_df.iterrows():
        asin = review['parent_asin']
        if asin not in asin_to_index:
            continue
        idx = asin_to_index[asin]
        cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
        similar_indices = cosine_sim.argsort()[::-1][1:top_n+1]

        similar_products_all = category_df.iloc[similar_indices]
        similar_products = similar_products_all[similar_products_all['rating'] >= rating_threshold]

        recommendations.append(similar_products)

    if recommendations:
        recommendations_df = pd.concat(recommendations).drop_duplicates()
        recommendations_df = recommendations_df[~recommendations_df['parent_asin'].isin(reviewed_asins)]
        recommendations_df = recommendations_df.head(top_n)
        return recommendations_df
    else:
        return pd.DataFrame()

def get_recommendations_based_on_similarity(user_tfidf, tfidf_matrix, category_df, user_reviews_df, asin_to_index, top_n=5):
    cosine_sim_user = cosine_similarity(user_tfidf, tfidf_matrix).flatten()
    user_asins = user_reviews_df['parent_asin'].tolist()
    user_indices = [asin_to_index[asin] for asin in user_asins if asin in asin_to_index]
    sorted_indices = cosine_sim_user.argsort()[::-1]
    sorted_indices = [idx for idx in sorted_indices if idx not in user_indices]
    top_indices = sorted_indices[:top_n]
    recommended_products = category_df.iloc[top_indices]
    return recommended_products.head(top_n)

def precision_at_k(recommended_asins, val_asins, k=5):
    recommended_top_k = recommended_asins[:k]
    relevant = [asin for asin in recommended_top_k if asin in val_asins]
    precision = len(relevant) / k
    return precision

def recall_at_k(recommended_asins, val_asins, k=5):
    recommended_top_k = recommended_asins[:k]
    relevant = [asin for asin in recommended_top_k if asin in val_asins]
    recall = len(relevant) / len(val_asins) if len(val_asins) > 0 else 0
    return recall



In [74]:

top_n = 3

# 평점 데이터 타입 일관성 확인 및 변환
filtered_products_df['rating'] = filtered_products_df['rating'].astype(float)
for target_category in desired_categories:
    print("\n" + "="*50)
    print(f"Category: {target_category}")
    print("="*50)

    category_df = filtered_products_df[filtered_products_df['category'] == target_category].copy()
    category_df['combined_text'] = combine_product_title(category_df)

    category_train_reviews = train_reviews[train_reviews['category'] == target_category].copy()
    category_val_reviews = val_reviews[val_reviews['category'] == target_category].copy()

    category_train_reviews['combined_text'] = combine_product_title(category_train_reviews)
    category_val_reviews['combined_text'] = combine_product_title(category_val_reviews)

    tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
    tfidf_matrix = tfidf.fit_transform(category_df['combined_text'])

    category_df = category_df.reset_index(drop=True)
    category_df['index'] = category_df.index
    asin_to_index = pd.Series(category_df.index, index=category_df['parent_asin']).to_dict()

    print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

    high_rating_products_df = category_df[category_df['rating'] >= rating_threshold].copy()
    print(f"Number of products with rating >= {rating_threshold} in {target_category}: {len(high_rating_products_df)}")

    if len(category_train_reviews) > 0:
        user_query = ' '.join(category_train_reviews['combined_text'].tolist())
        user_tfidf = tfidf.transform([user_query])
    else:
        user_query = ''
        user_tfidf = tfidf.transform([user_query])

    val_asins = category_val_reviews['parent_asin'].unique().tolist()

    # 유사도 기반 추천
    recommended_products_similarity = get_recommendations_based_on_similarity(
        user_tfidf, tfidf_matrix, category_df, category_train_reviews, asin_to_index, top_n=top_n
    )
    recommended_asins_similarity = recommended_products_similarity['parent_asin'].tolist()
    precision_sim = precision_at_k(recommended_asins_similarity, val_asins, k=top_n)
    recall_sim = recall_at_k(recommended_asins_similarity, val_asins, k=top_n)

    print("순수 유사도 기반 추천 성능:")
    print(f"Precision@{top_n}: {precision_sim}, Recall@{top_n}: {recall_sim}")

    # 평점 기반 추천
    recommended_products_rating = get_recommendations_based_on_high_rating(
        category_train_reviews, high_rating_products_df, tfidf_matrix, asin_to_index, category_df, top_n=top_n
    )
    if not recommended_products_rating.empty:
        recommended_asins_rating = recommended_products_rating['parent_asin'].tolist()
        precision_rating = precision_at_k(recommended_asins_rating, val_asins, k=top_n)
        recall_rating = recall_at_k(recommended_asins_rating, val_asins, k=top_n)
        print("평점 + 유사도 기반 추천 성능:")
        print(f"Precision@{top_n}: {precision_rating}, Recall@{top_n}: {recall_rating}")
    else:
        print("평점 + 유사도 기반 추천 결과 없음 (해당 카테고리에서 rating >= 4.5 상품 부재)")


Category: Movies_TV
TF-IDF Matrix Shape: (9998, 5030)
Number of products with rating >= 4.5 in Movies_TV: 6378
순수 유사도 기반 추천 성능:
Precision@3: 0.0, Recall@3: 0.0
평점 + 유사도 기반 추천 성능:
Precision@3: 0.0, Recall@3: 0.0

Category: Grocery_Gourmet_Food
TF-IDF Matrix Shape: (4862, 5753)
Number of products with rating >= 4.5 in Grocery_Gourmet_Food: 3187
순수 유사도 기반 추천 성능:
Precision@3: 0.0, Recall@3: 0.0
평점 + 유사도 기반 추천 성능:
Precision@3: 0.6666666666666666, Recall@3: 0.03225806451612903

Category: Electronics
TF-IDF Matrix Shape: (5471, 10000)
Number of products with rating >= 4.5 in Electronics: 3581
순수 유사도 기반 추천 성능:
Precision@3: 0.0, Recall@3: 0.0
평점 + 유사도 기반 추천 성능:
Precision@3: 0.0, Recall@3: 0.0


In [75]:
# 사용자 리뷰 데이터에 대해서는 성능 평가 없이 추천만 수행( 실제 시나리오)
print("\n" + "="*50)
print("User Review 기반 추천 (성능평가 없음)")
print("="*50)

# 특정사용자 데이터
user_reviews_df['combined_text'] = combine_product_title(user_reviews_df)

for target_category in desired_categories:
    print("\nCategory:", target_category)
    category_df = filtered_products_df[filtered_products_df['category'] == target_category].copy()
    category_df['combined_text'] = combine_product_title(category_df)

    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(category_df['combined_text'])

    category_df = category_df.reset_index(drop=True)
    category_df['index'] = category_df.index
    asin_to_index = pd.Series(category_df.index, index=category_df['parent_asin']).to_dict()

    high_rating_products_df = category_df[category_df['rating'] >= rating_threshold].copy()

    user_query_all = ' '.join(user_reviews_df['combined_text'].tolist())
    user_tfidf_all = tfidf.transform([user_query_all])

    # 평점 기반 추천 (user_review)
    recommended_products_rating_user = get_recommendations_based_on_high_rating(
        user_reviews_df, high_rating_products_df, tfidf_matrix, asin_to_index, category_df, top_n=3
    )
    print("User Review 기반 평점 + 유사도 추천 상품:")
    if not recommended_products_rating_user.empty:
        print(recommended_products_rating_user[['parent_asin', 'product_title', 'category']])
    else:
        print("추천 없음")

    # 유사도 기반 추천 (user_review)
    recommended_products_similarity_user = get_recommendations_based_on_similarity(
        user_tfidf_all, tfidf_matrix, category_df, user_reviews_df, asin_to_index, top_n=3
    )
    print("User Review 기반 순수 유사도 추천 상품:")
    print(recommended_products_similarity_user[['parent_asin', 'product_title', 'category']])


User Review 기반 추천 (성능평가 없음)

Category: Movies_TV
User Review 기반 평점 + 유사도 추천 상품:
     parent_asin                                      product_title   category
1934  B00ZR3W3M8  Abbott and Costello Meet the Monsters Collecti...  Movies_TV
1417  B00TF7KYXC  Clint Eastwood: The Universal Pictures 7-Movie...  Movies_TV
3330  B008CFZQQS                                    Girls: Season 1  Movies_TV
User Review 기반 순수 유사도 추천 상품:
     parent_asin             product_title   category
6410  B000UAE7H2  A Christmas Memory [DVD]  Movies_TV
3182  B00A8MGLAI               Cloud Atlas  Movies_TV
9185  B00A8MGIZG               Cloud Atlas  Movies_TV

Category: Grocery_Gourmet_Food
User Review 기반 평점 + 유사도 추천 상품:
     parent_asin                                      product_title  \
822   B000ED7M6I  Bob's Red Mill Organic Brown Rice Farina Cream...   
4681  B00CM36GC4  Veetee Basmati Rice - 2 Minute Rice Microwavab...   
824   B00A839U7I  Annabelle's Big Hunk Minis, 0.425 oz Bars in a...   

          

애플리케이션 시나리오에서 수집할 수 있는 데이터는 사용자가 선호하는 상품의 이름, 카테고리 정도 뿐입니다 따라서  콜드스타트 문제로인해 테스트셋에서 실제로 정확한 추천을 거의 못하는 모습을 볼 수 있습니다. 만약 완전한 사용자 리뷰 데이터를 사용할 수 있을때
title , text를 이용할 수 있을때는 그보다 나은 성능이 측정됩니다


In [77]:

#사용자 리뷰데이터를 완전히 사용 가능한다는가정

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# 상품 데이터 파일 경로
products_file_path = './combined_data2.json'  # 실제 경로로 변경
products_df = pd.read_json(products_file_path, lines=True)

# 원하는 카테고리 목록
desired_categories = ['Movies_TV', 'Grocery_Gourmet_Food','Electronics']

# 카테고리 필터링
filtered_products_df = products_df[products_df['category'].isin(desired_categories)].copy()
filtered_products_df = filtered_products_df[['rating', 'title', 'text', 'helpful_vote',
                                             'verified_purchase', 'product_title', 'category', 'parent_asin']]
print("Filtered Products DataFrame:")
print(filtered_products_df.head())

# user_reviews 데이터 로드
user_reviews_file_path = './user_reviews.json'  # 실제 경로로 변경
user_reviews_df = pd.read_json(user_reviews_file_path, lines=True)
user_reviews_df = user_reviews_df[['title', 'text', 'helpful_vote', 'verified_purchase', 'product_title', 'category', 'parent_asin']]
user_reviews_df = user_reviews_df[user_reviews_df['category'].isin(desired_categories)].copy()
print("User Reviews DataFrame:")
print(user_reviews_df.head())

# test_reviews 데이터 로드
test_reviews_file_path = './test_reviews.json'  # 실제 경로로 변경
test_reviews_df = pd.read_json(test_reviews_file_path, lines=True)
test_reviews_df = test_reviews_df[['title', 'text', 'helpful_vote', 'verified_purchase', 'product_title', 'category', 'parent_asin']]
test_reviews_df = test_reviews_df[test_reviews_df['category'].isin(desired_categories)].copy()
print("Test Reviews DataFrame:")
print(test_reviews_df.head())

def combine_text(df):
    return df['title'].fillna('') + ' ' + df['text'].fillna('') + ' ' + df['product_title'].fillna('')

# test_reviews를 Train/Val로 분할 (7:3)
train_reviews, val_reviews = train_test_split(test_reviews_df, test_size=0.3, random_state=42)

# 카테고리별로 TF-IDF를 계산하기 위해 각 카테고리별로 반복 수행
rating_threshold = 4.5  # 평점 임계값을 4.5 이상으로 설정

def get_recommendations_based_on_high_rating(user_reviews_df, high_rating_products_df, tfidf_matrix, asin_to_index, category_df, top_n=5):
    recommendations = []
    reviewed_asins = set(user_reviews_df['parent_asin'])
    for _, review in user_reviews_df.iterrows():
        asin = review['parent_asin']
        if asin not in asin_to_index:
            continue
        idx = asin_to_index[asin]
        cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
        similar_indices = cosine_sim.argsort()[::-1][1:top_n+1]

        similar_products_all = category_df.iloc[similar_indices]
        # 평점 비교 조건을 >=로 변경
        similar_products = similar_products_all[similar_products_all['rating'] >= rating_threshold]

        recommendations.append(similar_products)

    if recommendations:
        recommendations_df = pd.concat(recommendations).drop_duplicates()
        recommendations_df = recommendations_df[~recommendations_df['parent_asin'].isin(reviewed_asins)]
        recommendations_df = recommendations_df.head(top_n)
        return recommendations_df
    else:
        return pd.DataFrame()

def get_recommendations_based_on_similarity(user_tfidf, tfidf_matrix, category_df, user_reviews_df, asin_to_index, top_n=5):
    cosine_sim_user = cosine_similarity(user_tfidf, tfidf_matrix).flatten()
    user_asins = user_reviews_df['parent_asin'].tolist()
    user_indices = [asin_to_index[asin] for asin in user_asins if asin in asin_to_index]
    sorted_indices = cosine_sim_user.argsort()[::-1]
    sorted_indices = [idx for idx in sorted_indices if idx not in user_indices]
    top_indices = sorted_indices[:top_n]
    recommended_products = category_df.iloc[top_indices]
    return recommended_products.head(top_n)

def precision_at_k(recommended_asins, val_asins, k=5):
    recommended_top_k = recommended_asins[:k]
    relevant = [asin for asin in recommended_top_k if asin in val_asins]
    precision = len(relevant) / k
    return precision

def recall_at_k(recommended_asins, val_asins, k=5):
    recommended_top_k = recommended_asins[:k]
    relevant = [asin for asin in recommended_top_k if asin in val_asins]
    recall = len(relevant) / len(val_asins) if len(val_asins) > 0 else 0
    return recall

top_n = 3  # 추천 개수를 5으로 설정

# 평점 데이터 타입 일관성 확인 및 변환
filtered_products_df['rating'] = filtered_products_df['rating'].astype(float)

for target_category in desired_categories:
    print("\n" + "="*50)
    print(f"Category: {target_category}")
    print("="*50)

    category_df = filtered_products_df[filtered_products_df['category'] == target_category].copy()
    category_df['combined_text'] = combine_text(category_df)

    category_train_reviews = train_reviews[train_reviews['category'] == target_category].copy()
    category_val_reviews = val_reviews[val_reviews['category'] == target_category].copy()

    category_train_reviews['combined_text'] = combine_text(category_train_reviews)
    category_val_reviews['combined_text'] = combine_text(category_val_reviews)

    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(category_df['combined_text'])

    category_df = category_df.reset_index(drop=True)
    category_df['index'] = category_df.index
    asin_to_index = pd.Series(category_df.index, index=category_df['parent_asin']).to_dict()

    print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

    high_rating_products_df = category_df[category_df['rating'] >= rating_threshold].copy()
    print(f"Number of products with rating >= {rating_threshold} in {target_category}: {len(high_rating_products_df)}")

    if len(category_train_reviews) > 0:
        user_query = ' '.join(category_train_reviews['combined_text'].tolist())
        user_tfidf = tfidf.transform([user_query])
    else:
        user_query = ''
        user_tfidf = tfidf.transform([user_query])

    val_asins = category_val_reviews['parent_asin'].unique().tolist()

    # 유사도 기반 추천
    recommended_products_similarity = get_recommendations_based_on_similarity(
        user_tfidf, tfidf_matrix, category_df, category_train_reviews, asin_to_index, top_n=top_n
    )
    recommended_asins_similarity = recommended_products_similarity['parent_asin'].tolist()
    precision_sim = precision_at_k(recommended_asins_similarity, val_asins, k=top_n)
    recall_sim = recall_at_k(recommended_asins_similarity, val_asins, k=top_n)

    print("순수 유사도 기반 추천 성능:")
    print(f"Precision@{top_n}: {precision_sim}, Recall@{top_n}: {recall_sim}")

    # 평점 기반 추천
    recommended_products_rating = get_recommendations_based_on_high_rating(
        category_train_reviews, high_rating_products_df, tfidf_matrix, asin_to_index, category_df, top_n=top_n
    )
    if not recommended_products_rating.empty:
        recommended_asins_rating = recommended_products_rating['parent_asin'].tolist()
        precision_rating = precision_at_k(recommended_asins_rating, val_asins, k=top_n)
        recall_rating = recall_at_k(recommended_asins_rating, val_asins, k=top_n)
        print("평점 + 유사도 기반 추천 성능:")
        print(f"Precision@{top_n}: {precision_rating}, Recall@{top_n}: {recall_rating}")
    else:
        print("평점 + 유사도 기반 추천 결과 없음 (해당 카테고리에서 rating >= 4.5 상품 부재)")

# 사용자 리뷰 데이터에 대해서는 성능 평가 없이 추천만 수행
print("\n" + "="*50)
print("User Review 기반 추천 (성능평가 없음)")
print("="*50)

# user_reviews_df 전체를 하나의 사용자로 가정 (필요하다면 특정 사용자만 선택 가능)
user_reviews_df['combined_text'] = combine_text(user_reviews_df)

for target_category in desired_categories:
    print("\nCategory:", target_category)
    category_df = filtered_products_df[filtered_products_df['category'] == target_category].copy()
    category_df['combined_text'] = combine_text(category_df)

    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(category_df['combined_text'])

    category_df = category_df.reset_index(drop=True)
    category_df['index'] = category_df.index
    asin_to_index = pd.Series(category_df.index, index=category_df['parent_asin']).to_dict()

    high_rating_products_df = category_df[category_df['rating'] >= rating_threshold].copy()

    user_query_all = ' '.join(user_reviews_df['combined_text'].tolist())
    user_tfidf_all = tfidf.transform([user_query_all])

    # 평점 기반 추천 (user_review)
    recommended_products_rating_user = get_recommendations_based_on_high_rating(
        user_reviews_df, high_rating_products_df, tfidf_matrix, asin_to_index, category_df, top_n=5
    )
    print("User Review 기반 평점 + 유사도 추천 상품:")
    if not recommended_products_rating_user.empty:
        print(recommended_products_rating_user[['parent_asin', 'product_title', 'category']])
    else:
        print("추천 없음")

    # 유사도 기반 추천 (user_review)
    recommended_products_similarity_user = get_recommendations_based_on_similarity(
        user_tfidf_all, tfidf_matrix, category_df, user_reviews_df, asin_to_index, top_n=5
    )
    print("User Review 기반 순수 유사도 추천 상품:")
    print(recommended_products_similarity_user[['parent_asin', 'product_title', 'category']])


Filtered Products DataFrame:
       rating                                              title  \
29669       5                                         Five Stars   
29670       5                                         Five Stars   
29671       3                       Some decent moments...but...   
29672       4  Decent Depiction of Lower-Functioning Autism, ...   
29673       5                                    What Love Is...   

                                                    text  helpful_vote  \
29669           Amazon, please buy the show! I'm hooked!             0   
29670                         My Kiddos LOVE this show!!             0   
29671  Annabella Sciorra did her character justice wi...             0   
29672  ...there should be more of a range of characte...             1   
29673  ...isn't always how you expect it to be, but w...             0   

       verified_purchase    product_title   category parent_asin  
29669                  1      Sneaky Pete  Movies_

# Colaborative Filtering

In [None]:
# local에 padnas, numpy, matplotlib, surprise, sklearn 설치필요
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

import seaborn as sns

데이터 전처리

In [None]:
def preprocess_review(review_path, min_user_cnt, min_review_cnt):
    review_df = pd.read_json(review_path, lines=True)
    filtered_review_df = review_df[["parent_asin", "rating", "user_id", "title", "category", "product_title"]]
    # filtered_review_df = filtered_review_df[filtered_review_df["category"] == category]
    distinct_review_df = filtered_review_df.drop_duplicates(subset=["user_id", "parent_asin"])

    review_counts = distinct_review_df["parent_asin"].value_counts()
    user_counts = distinct_review_df['user_id'].value_counts()

    result_df = distinct_review_df[(
        (distinct_review_df['user_id'].isin(user_counts[user_counts >= min_user_cnt].index)) &
        (distinct_review_df['parent_asin'].isin(review_counts[review_counts >= min_review_cnt].index))
    )]

    # # 리뷰 개수가 많은 순으로 정렬
    # sorted_parent_asin = result_df["parent_asin"].value_counts().index
    # result_df = result_df.set_index("parent_asin").loc[sorted_parent_asin].reset_index()

    return result_df

In [None]:
def knn_predict(item_sim_df, train_matrix, k=5):
    item_ids = train_matrix.index
    user_ids = train_matrix.columns
    predictions = np.zeros(train_matrix.shape)

    for item_idx, item in enumerate(item_ids):
        for user_idx, user in enumerate(user_ids):
            user_ratings = train_matrix.iloc[:, user_idx].values  # 해당 사용자의 모든 평점
            relevant_items = item_sim_df.iloc[item_idx].values  # 현재 아이템과 다른 아이템 간 유사도

            # K개의 가장 유사한 이웃 선택
            neighbors_idx = np.argsort(relevant_items)[-k:]
            neighbors_sim = relevant_items[neighbors_idx]
            neighbors_ratings = user_ratings[neighbors_idx]

            # 가중합 계산
            weighted_sum = np.dot(neighbors_sim, neighbors_ratings)
            sim_sum = np.abs(neighbors_sim).sum()

            if sim_sum > 0:
                predictions[item_idx, user_idx] = weighted_sum / sim_sum
            else:
                # 유사도가 없는 경우 행 평균값으로 대체
                row_mean = train_matrix.iloc[item_idx, :].mean()
                predictions[item_idx, user_idx] = row_mean

    return predictions

def knn_based_cf(df, org_df, k=5):
    # 사용자-아이템 행렬 생성
    item_user_matrix = df.pivot(index='parent_asin', columns='user_id', values='rating').fillna(0)
    item_ids = item_user_matrix.index
    user_ids = item_user_matrix.columns

    if df.empty:
        return "Empty","Empty"

    # 훈련-테스트 데이터 분할
    trainset, testset = train_test_split(df, test_size=0.2, random_state=42)

    # 훈련 데이터로 사용자-아이템 행렬 생성
    train_matrix = trainset.pivot(index='parent_asin', columns='user_id', values='rating').reindex(index=item_ids, columns=user_ids).fillna(0)
    test_matrix = testset.pivot(index='parent_asin', columns='user_id', values='rating').reindex(index=item_ids, columns=user_ids).fillna(0)

    # 아이템 기반 코사인 유사도 계산
    item_sim_matrix = cosine_similarity(train_matrix)
    np.fill_diagonal(item_sim_matrix, 0)  # 자신과의 유사도는 0으로 설정
    item_sim_df = pd.DataFrame(item_sim_matrix, index=item_ids, columns=item_ids)

    # 예측값 생성
    predictions = knn_predict(item_sim_df, train_matrix, k)

    # 테스트 세트에서 RMSE 계산
    test_actual = test_matrix.values[test_matrix > 0]
    test_predicted = predictions[test_matrix > 0]
    rmse = sqrt(mean_squared_error(test_actual, test_predicted))

    # 커버리지 계산
    total_len = len(org_df['parent_asin'].unique())
    non_zero_predictions = np.sum(predictions > 0)
    coverage_value = non_zero_predictions / total_len
    return rmse, coverage_value

데이터를 전처리 할 때 파라미터로 상품별 최소 리뷰 개수와 사용자별 최소 작성 리뷰 개수를 전달해 데이터를 필터링하였다. 이는 2가지 이유가 있는데, 첫번째는 데이터의 희소성을 줄여 성능 향상을 도모하기 위해서이고, 두번째는 연산 속도를 높이기 위함이었다. 설정한 값은 각각 8로 하였다.

In [None]:
review_path = './combined_data2.json'
review_df = pd.read_json(review_path, lines=True)
raw_df = review_df[["parent_asin", "rating", "user_id", "title", "category", "product_title"]]
raw_df = raw_df.drop_duplicates(subset=["user_id", "parent_asin"])
review_df = preprocess_review(review_path,8,8)

아이템-사용자 행렬에서 아이템 끼리의 코사인 유사도를 구한 후 K개의 근접한 이웃을 뽑아 평점 예측을 하였다. 성능 지표로는 정확성을 위한 테스트 셋과 예측으로 만들어진 행렬간의 RMSE와 다양성을 위한 전체 상품 수 중 몇 가지를 추천하는지 그 비율을 나타내는 Coverage 2가지를 채택하였다.

하지만 처음 시도한 결과는 좋지 않았다. RMSE가 4.144와 Coverage가 33% 라는 만족스럽지 못한 수치가 나왔다. 이는 샘플로 입력된 데이터가 편향된 것으로 간주하여 이를 조사하였다.

In [None]:
# 아이템 기반 CF 실행
rmse,coverage_value = knn_based_cf(review_df,raw_df)
print("KNN based")
print(f"RMSE: {rmse:.3f}")
print(f"Coverage: {coverage_value:.2%}")

KNN based
RMSE: 4.144
Coverage: 33.59%


 작성한 리뷰가 많은 사용자 N명 또는 리뷰가 많았던 상품 N개를 골라 데이터를 조정하여 테스트해보았다. N을 20~50 범위에서 Gridsearch를 이용하여 가장 좋은 성능일 때의 파라미터를 찾았다. 그 결과, 인기 사용자는 25명, 인기 상품은 45개 일때 각각 RMSE가 3.647, 3.743이 나왔다. 하지만 N의 값이 커질수록 coverage는 감소하며 상품 추천의 다양성은 줄어들었다.

In [None]:
def find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50)):
    best_rmse = float('inf')
    best_params = {'x': None, 'rmse': None, 'coverage': None}
    x_values = range(x_range[0], x_range[1] + 1)

    for x in x_values:
        # 상위 x명의 유저와 각 유저당 y개의 리뷰를 필터링
        top_user_id = (
            review_df['user_id']
            .value_counts()
            .head(x)
            .index
        )
        filtered_df = (
            review_df[review_df['user_id'].isin(top_user_id)]
            .groupby('user_id')
            .apply(lambda group: group.head(y))
            .reset_index(drop=True)
        )

        rmse, coverage_value = knn_based_cf(filtered_df, raw_df)
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = {
                'x': x,
                'rmse': rmse,
                'coverage': coverage_value
            }

    return best_params

# 사용 예제
best_params = find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50))

print("\nBest Hyperparameters:")
print(f"x: {best_params['x']}")
print(f"RMSE: {best_params['rmse']:.3f}")
print(f"Coverage: {best_params['coverage']:.2%}")

  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lam


Best Hyperparameters:
x: 25
RMSE: 3.647
Coverage: 1.56%


In [None]:
def find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50)):
    best_rmse = float('inf')
    best_params = {'x': None, 'rmse': None, 'coverage': None}

    x_values = range(x_range[0], x_range[1] + 1)

    for x in x_values:
        # 상위 N개의 상품과 각 상품의 y개를 필터링
        top_item_id = (
            review_df['parent_asin']
            .value_counts()
            .head(x)
            .index
        )
        filtered_df = (
            review_df[review_df['parent_asin'].isin(top_item_id)]
            .groupby('parent_asin')
            .apply(lambda group: group.head(y))
            .reset_index(drop=True)
        )

        rmse, coverage_value = knn_based_cf(filtered_df, raw_df)
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = {
                'x': x,
                'rmse': rmse,
                'coverage': coverage_value
            }

    return best_params

# 사용 예제
best_params = find_best_hyperparameter_x(review_df, raw_df, y=50, x_range=(20, 50))

print("\nBest Hyperparameters:")
print(f"x: {best_params['x']}")
print(f"RMSE: {best_params['rmse']:.3f}")
print(f"Coverage: {best_params['coverage']:.2%}")


  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lambda group: group.head(y))
  .apply(lam


Best Hyperparameters:
x: 45
RMSE: 3.743
Coverage: 5.82%


**surprise 라이브러리 이용**

GridSearch로도 RMSE에서 좋은 성능을 얻지 못하자 자체 구현한 알고리즘의 최적화 부분에서 문제가 있을 거라는 가설을 세웠다. 그래서 surprise 라이브러리를 사용하여 다시 진행해보았다.

In [None]:
!pip install surprise
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

def prepare_data(df):
    reader = Reader(rating_scale=(1, 5))  # 평점 범위 지정
    data = Dataset.load_from_df(df[["user_id", "parent_asin", "rating"]], reader)
    return data

def coverage(predictions, total_len):
    recommended_items = set([pred.iid for pred in predictions])
    return len(recommended_items) / total_len

def knn_based_cf_surprise(df,org_df):
    data = prepare_data(df)
    trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

    # 코사인 유사도를 사용한 아이템 기반 CF 모델 설정
    sim_options = {
        "name": "cosine",
        "user_based": False,
    }
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)

    predictions = algo.test(testset)
    # RMSE 계산
    rmse = accuracy.rmse(predictions)

    # Coverage 계산
    total_len = len(org_df['parent_asin'].unique())
    coverage_value = coverage(predictions, total_len)

    return rmse,coverage_value,trainset



surprise 라이브러리를 사용하니 RMSE가 1.253으로 줄었으나 coverage가 0.32%로 매우 낮아졌다. 이는 처음에 데이터 전처리를 할 때 파라미터(상품별 최소 리뷰 개수, 사용자별 최소 작성 리뷰 개수)를 각각 8개로 설정해 필터링했는데 surprise 라이브러리를 사용하니 다양성이 크게 훼손되었다. 하지만 RMSE 성능이 개선되었고 연산속도가 빨라져 더 큰 데이터를 처리할 수 있다고 판단, 이에 데이터 전처리 과정에서 파라미터를 2,2로 설정하여 성능을 다시 측정하였다. 그 결과 RMSE가 1.266, Coverage가 3.98%라는 좋은 성능을 얻을 수 있었다.

In [None]:
rmse,coverage_value,trainset = knn_based_cf_surprise(review_df,raw_df)
print("KNN based")
print(f"RMSE: {rmse:.3f}")
print(f"Coverage: {coverage_value:.2%}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.2527
KNN based
RMSE: 1.253
Coverage: 0.32%


In [None]:
new_review_df = preprocess_review(review_path,2,2)
rmse,coverage_value,trainset = knn_based_cf_surprise(new_review_df,raw_df)
print("KNN based")
print(f"RMSE: {rmse:.3f}")
print(f"Coverage: {coverage_value:.2%}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.2659
KNN based
RMSE: 1.266
Coverage: 3.98%


여기서 성능 개선을 위하여 KNN에 기반한 Memory based 방식의 CF가 아닌, SVD에 기반한 Model based 방식의 CF를 적용해보았다. 그 결과 RMSE가 1.170으로 개선된 것을 확인할 수 있었다. 이는 데이터의 희소성을 해소했기 때문으로 예상된다.

In [None]:
from surprise import SVD

def svd_based_cf(df,org_df):
    data = prepare_data(df)
    trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

    algo = SVD(n_factors=10, random_state=42)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # RMSE 계산
    rmse = accuracy.rmse(predictions)

    # Coverage 계산
    total_len = len(org_df['parent_asin'].unique())
    coverage_value = coverage(predictions, total_len)
    return rmse, coverage_value

rmse,coverage_value = svd_based_cf(new_review_df,raw_df)
print("SVD based")
print(f"RMSE: {rmse:.3f}")
print(f"Coverage: {coverage_value:.2%}")

RMSE: 1.1696
SVD based
RMSE: 1.170
Coverage: 3.98%


실제 시나리오

1. 카테고리 입력
2. 해당 카테고리 인기 상품 5개 추출
3. 5개 상품에 대하여 각각 추천하여 유사도 높은 순으로 정렬

In [None]:
# 입력한 카테고리에서 인기 상품을 추천하는 함수
def CategoryBasedRecommend(review_df, targetCategory, top_n=5):

    filtered_df = review_df[review_df["category"] == targetCategory]

    if filtered_df.empty:
        print(f"카테고리 '{targetCategory}'에 해당하는 데이터가 없습니다.")
        return pd.DataFrame()

    popular_items = (
        filtered_df.groupby(["parent_asin", "category","product_title"])
        .agg(avg_rating=("rating", "mean"), num_reviews=("user_id", "count"))
        .reset_index()
        .sort_values(by=["num_reviews", "avg_rating"], ascending=[False, False])
    )
    top_items = popular_items.head(top_n)
    return top_items

In [None]:
# 특정 상품과 관련된 추천 5개 생성
def recommend_related_items(df, target_item, top_n=5):
    data = prepare_data(df)
    trainset = data.build_full_trainset()
    algo = SVD(n_factors=50, random_state=42)
    algo.fit(trainset)
    item_factors = algo.qi
    item_ids = trainset._raw2inner_id_items.keys()

    item_factors_df = pd.DataFrame(item_factors, index=item_ids)

    if target_item not in item_factors_df.index:
        raise ValueError(f"Target item {target_item} not found in training data.")
    target_vector = item_factors_df.loc[target_item].values.reshape(1, -1)

    # 코사인 유사도 계산
    similarities = cosine_similarity(target_vector, item_factors_df.values).flatten()

    # 유사도를 데이터프레임으로 변환
    similarity_df = pd.DataFrame({
        "parent_asin": item_factors_df.index,
        "similarity": similarities
    })

    # 대상 아이템 제외 및 상위 N개 추천
    top_related_items = similarity_df[similarity_df["parent_asin"] != target_item] \
        .sort_values(by="similarity", ascending=False) \
        .head(top_n)

    # 추천 결과에 상품 정보 병합
    related_items_df = top_related_items.merge(
        df[["parent_asin", "product_title", "category"]].drop_duplicates(), on="parent_asin", how="left"
    )

    return related_items_df

In [None]:
for input_category in desired_categories:
  top_items = CategoryBasedRecommend(new_review_df,input_category)
  recommend_result = []
  for _, row in top_items.iterrows():
      target_item = row["parent_asin"]
      target_category = row["category"]
      result = recommend_related_items(new_review_df, target_item)
      for _, recommend_row in result.iterrows():
          recommended_asin = recommend_row["parent_asin"]
          if recommended_asin not in [item["parent_asin"] for item in recommend_result]:
              recommend_result.append({
                  "parent_asin": recommended_asin,
                  "product_title": recommend_row["product_title"],
                  "category": recommend_row["category"],
                  "similarity": recommend_row["similarity"]
              })

  sorted_recommendations = sorted(recommend_result, key=lambda x: x["similarity"], reverse=True)[:5]

  print("Input Category : ",input_category,"\n")

  print("Top 5 Recommended Items:")
  for item in sorted_recommendations:
      print(f"Parent ASIN: {item['parent_asin']}")
      print(f"Product Title: {item['product_title']}")
      print(f"Category: {item['category']}")
      print(f"Similarity: {item['similarity']:.3f}")
      print("-" * 30)


Input Category :  Movies_TV 

Top 5 Recommended Items:
Parent ASIN: B089RLLT5C
Product Title: Purple Hair Mask for Blonde with Keratin & Jojoba Oil - Platinum & Silver Hair - Instantly Eliminate Brassiness & Yellows - Made in USA - Hair Toner - Bleached & Highlighted Hair - Sulfate Free - 8 oz
Category: Beauty
Similarity: 0.497
------------------------------
Parent ASIN: B07PPFZQ6B
Product Title: Jade Roller and Gua Sha Tools Set - Anti Aging Rose Quartz Roller Massager - 100% Real Natural Jade Roller for Face, Eye, Neck - Beauty Jade Facial Roller for Slimming & Firming
Category: Beauty
Similarity: 0.486
------------------------------
Parent ASIN: B016R1QS7Y
Product Title: Pentatonix Deluxe Edition CD w/3 BONUS Tracks 2015 TARGET EXCLUSIVE
Category: Music
Similarity: 0.472
------------------------------
Parent ASIN: B01MY2EXYY
Product Title: Waiting for Stars
Category: Music
Similarity: 0.470
------------------------------
Parent ASIN: B017E884HQ
Product Title: Exfoliating Honey Bath 