In [11]:
import pandas as pd
from scipy.sparse import coo_matrix

# 1. 데이터 로드 (예제: Feather 파일)
df = pd.read_feather("../dataset/rating_groupby_category_feather/rating_Books.feather")

# 2. 사용자 & 아이템 리스트 생성
users = sorted(df["UserID"].unique())
items = sorted(df["products"].unique())
items

[' Thomas & Friends - Find That Freight - Publications International',
 '"Good to Great" and the Social Sectors - Jim Collins',
 "'Tis - Frank McCourt",
 '1,001 Nights Without Sex: The Curse of the Single Girl - Suzanne Schlosberg',
 '10 Reasons Not To Fall In Love - Linda Green',
 '10-Minute Tarot - Skye Alexander',
 '100 Best Loved Nursery Rhymes',
 '100 Exercises to Get You into Drama School: Improve Your Acting and Audition Skills - Jona Howl & John Rowe',
 '100 Great Brits: A Rhyming History from Bede to Beckham - James Muirden',
 '100 Great Houseplants - John Evans',
 '100 Questions & Answers About HIV and AIDS - Joel E. Gallant M.D.',
 '100 Things You Should Know About Ancient Rome - Fiona MacDonald',
 '100 Years 100 Songs',
 '100% Gleek - Evie Parker',
 '1000 Activities - Roger Priddy',
 '1000 Stickers - Roger Priddy',
 '1000 Tattoos (Taschen) - Henk Schiffmacher',
 '1000 Things for Kids to Do in the Holidays - Time Out Guides',
 '1000 Things to Do in Britain - Time Out Guides'

In [12]:
# 3. 사용자-아이템 인덱스 매핑
user_idx = {u: i for i, u in enumerate(users)}
item_idx = {p: j for j, p in enumerate(items)}

In [35]:
# 4. 사용자-아이템 희소 행렬 생성
row = df["UserID"].map(user_idx)
col = df["products"].map(item_idx)
data = df["rating"]  # 평점 or 구매 횟수 데이터
R_sparse = coo_matrix((data, (row, col)), shape=(len(users), len(items)))

# R_size에 따른 num_factors 초기화 함수
def set_num_factors(R_size):
    # 조건을 하나의 딕셔너리로 처리하여 코드 간결화
    factor_map = {
        (1000, 10000): 2,
        (10000, 100000): 4,
        (100000, 1000000): 8,
        (1000000, float('inf')): 8
    }
    
    # 범위에 맞는 num_factors 반환
    for (lower, upper), factors in factor_map.items():
        if lower <= R_size < upper:
            return factors
    return 1  # 기본값

R_size = len(users)* len(items)
num_factors = set_num_factors(R_size)
num_factors

8

In [17]:
# 5. ALS 모델 학습 함수
def train_als(R_sparse, num_factors=16, reg=0.1, iterations=20):
    model = implicit.als.AlternatingLeastSquares(
        factors=num_factors, 
        regularization=reg, 
        iterations=iterations
    )
    model.fit(R_sparse)
    return model

# 6. ALS 실행
als_model = train_als(R_sparse)

# 7. 결과 출력 (사용자 벡터)
print("\n사용자 특성 행렬 (P):")
print(als_model.user_factors)



  0%|          | 0/20 [00:00<?, ?it/s]


사용자 특성 행렬 (P):
[[ 0.00078298 -0.0662047   0.06383559 ... -0.33744746  0.12162353
  -0.24095948]
 [-0.10577349 -0.10910149  0.1907902  ...  0.04268162 -0.09490505
  -0.02862601]
 [ 0.11834679 -0.03750421 -0.15524653 ... -0.18150489 -0.11755951
  -0.0773875 ]
 ...
 [ 0.0091384   0.15632494 -0.07553526 ... -0.09711138  0.13651402
   0.12677976]
 [ 0.09180298 -0.08266521  0.06039119 ... -0.20544204 -0.11315019
   0.08800449]
 [-0.10995341  0.3805399   0.09362763 ... -0.12319755  0.18635148
   0.12600109]]


In [None]:
user_factors_array = als_model.user_factors.to_numpy()
user_factors_list = user_factors_array.tolist()

# 7. 사용자 벡터를 DataFrame으로 변환 (GPU 호환)
user_vectors = pd.DataFrame({
    "UserID": users, 
    "Vector": user_factors_list 
})

user_vectors.to_feather("../output/ouput.feather")

## data 확인

In [None]:
import pandas as pd
import numpy as np
import glob

# 디렉토리 내 모든 feather 파일 경로 가져오기
feathers = glob.glob("../dataset/rating_groupby_category_feather/*.feather")

# 각 CSV 파일에 대해 작업 수행
for file in feathers:
    df = pd.read_feather(file)
    
    # 2. 사용자와 아이템(상품) 리스트 생성
    users = sorted(df["UserID"].unique())
    
    # 'products' 컬럼에서 None 값을 제거한 후 아이템 목록 생성
    items = sorted(df["products"].dropna().unique())

    # 3. 사용자-아이템 행렬(R) 생성 (binary or frequency-based)
    user_idx = {u: i for i, u in enumerate(users)}
    item_idx = {p: j for j, p in enumerate(items)}

    # 사용자-아이템 행렬의 크기 출력 (쉼표 추가)
    matrix_size = len(user_idx) * len(item_idx)
    file = file.split('/')[-1]
    print(f"File: {file} | R size: {matrix_size:,}")


File: rating_Adult Products.feather | R size: 2,548
File: rating_Entertainment.feather | R size: 6,055,374
File: rating_Family.feather | R size: 8,607,128
File: rating_Telecommunications.feather | R size: 4,712,708
File: rating_Finance.feather | R size: 910,780
File: rating_Books.feather | R size: 50,923,080
File: rating_Education & Careers.feather | R size: 741,300
File: rating_Household Appliances.feather | R size: 8,067,064
File: rating_Food & Drink.feather | R size: 26,395,827
File: rating_Travel.feather | R size: 53,864,346
File: rating_Internet.feather | R size: 15,620,220
File: rating_Beauty.feather | R size: 37,734,240
File: rating_Musical Instruments & Equipment.feather | R size: 106,265
File: rating_Cars & Motorcycles.feather | R size: 3,158,100
File: rating_Computers.feather | R size: 11,507,200
File: rating_DVDs.feather | R size: 60,478,201
File: rating_Office Equipment.feather | R size: 104,794
File: rating_Sports & Outdoors.feather | R size: 819,698
File: rating_Ciao Caf_

In [6]:
import pandas as pd

# 1. 데이터 로드 (예제: Feather 파일)
df = pd.read_feather("../output/UR_output/rating_Books_UR.feather")
df

Unnamed: 0,UserID,Vector
0,24.0,"[0.043156661093235016, 0.14980490505695343, -0..."
1,27.0,"[0.02001309022307396, -0.06039724498987198, -0..."
2,45.0,"[0.09603305160999298, 0.01229463703930378, 0.0..."
3,136.0,"[-1.7821576598362268e-14, 2.3313987256045644e-..."
4,213.0,"[-0.010745213367044926, -0.22765904664993286, ..."
...,...,...
3715,6893624.0,"[-0.0011913853231817484, -0.035554587841033936..."
3716,6894262.0,"[-0.025968607515096664, -0.004517001565545797,..."
3717,6895524.0,"[0.0077126966789364815, -0.03846563398838043, ..."
3718,6896104.0,"[0.06838995963335037, -0.010646789334714413, -..."
