### data 확인 및 인덱스 추가

In [13]:
import pandas as pd
import numpy as np

# Load the DVD ratings table from its Feather file and preview the first five rows.
df = pd.read_feather("../dataset/rating_groupby_category_feather/rating_DVDs.feather")
df.head(5)

Unnamed: 0,index,UserID,products,Categories of the products,rating,helpfulness,time,content of review,userid:helpfulness voting
0,1392,5247778.0,Mary Poppins (DVD),DVDs,50.0,very helpful,22.08.2004,"In 1964 I was 10, a village girl who rarel...","6224941:3, 5309969:3, 5309969:3, 5270615:3, 55..."
1,1663,5647565.0,The Box (DVD),DVDs,30.0,very helpful,06.11.2010,Norma and Arthur Lewis are an ordinary mid...,"5719918:4, 6690494:4, 6690494:4, 5466331:4, 54..."
2,1664,5647565.0,Doomed To Die (DVD),DVDs,30.0,very helpful,30.10.2010,Shipping magnate Cyrus Wentworth is devast...,"5552133:4, 5552133:4, 5719918:4, 5633146:4, 55..."
3,1665,5647565.0,Haunting The (DVD),DVDs,50.0,very helpful,29.10.2010,"Hill House has a ghostly past, full of vio...","5719918:4, 5633146:4, 5534578:4, 6023030:4, 60..."
4,1666,5647565.0,The Invisible Ghost (DVD),DVDs,30.0,very helpful,27.10.2010,Dr Charles Kessler is a mild-mannered man ...,"5719918:4, 5633146:4, 6023030:4, 6023030:4, 55..."


In [None]:


# NOTE(review): the output recorded for this cell is KeyError: 'embedding';
# presumably the cell that creates the 'embedding' column has to run first — confirm.
embedding_dim = df['embedding'].iloc[0].shape  # shape of the first row's 'embedding' value
print("Embedding dimension:", embedding_dim)

# Strip null bytes from an embedding and keep track of how many were removed.
def remove_null_bytes(x):
    """Return a null-byte-free string copy of *x* and the number of null bytes found.

    Every element of *x* is converted to ``str`` once; counting and removal are
    then done on plain Python strings, element by element, so the function
    works for arrays of any shape (and for plain sequences), not only for
    1-D ndarrays.

    Args:
        x: Array-like of values; each element is stringified before cleaning.

    Returns:
        A ``(cleaned_x, null_byte_count)`` tuple. ``cleaned_x`` is a NumPy
        string array with the same shape as *x* in which every ``'\\x00'`` has
        been removed. ``null_byte_count`` is the total number of ``'\\x00'``
        characters found across all elements, as an ``int``.
    """
    strings = np.asarray(x).astype(str)  # single conversion, reused below
    # NOTE: NumPy fixed-width strings drop trailing NULs when read back, so
    # only NULs that are followed by other characters are visible here.
    flat = strings.ravel().tolist()

    null_byte_count = sum(item.count('\x00') for item in flat)
    cleaned = [item.replace('\x00', '') for item in flat]

    cleaned_x = np.array(cleaned, dtype=str).reshape(strings.shape)
    return cleaned_x, null_byte_count

# Clean every embedding; each call yields a (cleaned_embedding, null_byte_count) pair.
cleaned_pairs = df['embedding'].apply(remove_null_bytes)
cleaned_embeddings, null_counts = zip(*cleaned_pairs)
df['embedding'] = cleaned_embeddings
df['null_byte_count'] = null_counts

# Show the first rows of the two affected columns.
preview = df[['embedding', 'null_byte_count']].head()
print(preview)

df.sample(6)

# Report the shape of the first cleaned embedding.
first_embedding = df['embedding'].iloc[0]
embedding_dim = first_embedding.shape
print("Embedding dimension:", embedding_dim)


KeyError: 'embedding'

### Doc2Vec으로 임베딩 생성

In [10]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Tokenise each review and tag the resulting document with its 'index' value (as a string).
tagged_data = []
for _, review in df.iterrows():
    tokens = word_tokenize(review['content of review'])
    tagged_data.append(TaggedDocument(words=tokens, tags=[str(review['index'])]))

# Train the Doc2Vec model on the tagged reviews.
model = Doc2Vec(
    tagged_data,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=20,
)


# Look up the learned vector of a review through its string tag.
def _lookup_vector(review_idx):
    return model.dv[str(review_idx)]


df['embedding'] = df['index'].apply(_lookup_vector)


KeyError: 'content of review'

In [None]:

# Normalise UserID to a string and keep only the part before the first '.'
# (the IDs are shown as floats such as 5247778.0 in the table preview).
df['UserID'] = df['UserID'].astype(str)
df['UserID'] = df['UserID'].apply(lambda x: x.split('.')[0])

# Convert the date (DD.MM.YYYY) to a datetime, then to an integer number of seconds.
df['time'] = pd.to_datetime(df['time'], format='%d.%m.%Y')
df['time'] = df['time'].astype('int64') // 10**9  # integer timestamp -> seconds

# Column labels are case-sensitive: the column is 'UserID' (as used above),
# so selecting 'userid' raised KeyError.
final_df = df[['UserID', 'time', 'embedding']]
final_df.to_feather("../dataset/test")


## LSTM 코드

In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import ast

# Load the ratings table (a Feather file).
df = pd.read_feather("../dataset/rating_for_UI_feather/rating_Adult Products.feather")


# Turn a string-encoded embedding into an array after dropping null bytes;
# values that are not strings are passed through unchanged.
def _parse_embedding(raw):
    if not isinstance(raw, str):
        return raw
    # NOTE(review): eval() executes whatever expression the string contains.
    # Only safe while the dataset is trusted; consider ast.literal_eval.
    return np.array(eval(raw.replace('\x00', '')))


df['embedding'] = df['embedding'].apply(_parse_embedding)

# Order the rows by user first, then by time within each user.
df = df.sort_values(['UserID', 'time'])

# For each user, build an input sequence (all embeddings but the last) and a
# target sequence (all embeddings but the first), i.e. the inputs shifted by one step.
user_sequences = []
user_targets = []

for _, user_rows in df.groupby('UserID'):
    embeddings = np.stack(user_rows['embedding'].values)  # stack the user's vectors into one array
    if len(embeddings) <= 1:
        # A single embedding cannot form an (input, next-step target) pair.
        continue
    user_sequences.append(embeddings[:-1])
    user_targets.append(embeddings[1:])

class LSTMPreferenceModel(nn.Module):
    """LSTM followed by a linear layer that maps each hidden state back to embedding size."""

    def __init__(self, embedding_dim, hidden_dim, num_layers):
        """Create the LSTM (batch-first) and the output projection.

        Args:
            embedding_dim: Size of each input vector and of each predicted vector.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        # Projects every LSTM output step from hidden_dim back to embedding_dim.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=embedding_dim)

    def forward(self, x):
        """Run *x* through the LSTM and project every time step with the linear layer."""
        hidden_states, _final_state = self.lstm(x)
        return self.fc(hidden_states)

# Hyperparameters.
embedding_dim = 100  # size of each embedding vector
hidden_dim = 128  # size of the LSTM hidden state
num_layers = 2  # number of stacked LSTM layers
learning_rate = 1e-3
num_epochs = 10

# Model, loss (mean squared error between predicted and actual vectors) and optimiser.
model = LSTMPreferenceModel(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one optimiser step per user sequence.
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, expected in zip(user_sequences, user_targets):
        # unsqueeze(0) adds a leading batch dimension of size 1.
        batch_inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        batch_expected = torch.tensor(expected, dtype=torch.float32).unsqueeze(0)

        optimizer.zero_grad()
        predicted = model(batch_inputs)
        loss = criterion(predicted, batch_expected)
        loss.backward()
        optimizer.step()

        # The reported value is the sum of the per-sequence losses for the epoch.
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

def predict_future_preference(user_id, past_embeddings):
    """Run the module-level ``model`` on one sequence of embeddings.

    Args:
        user_id: Not used inside the function.
        past_embeddings: Sequence of embedding vectors; converted to a
            float32 tensor and given a leading batch dimension of size 1.

    Returns:
        The model output as a NumPy array with the batch dimension removed.
    """
    model.eval()  # switch the model to evaluation mode
    batch = torch.tensor(past_embeddings, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():  # no gradient tracking during inference
        prediction = model(batch)
    return prediction.squeeze(0).numpy()

# Predict a preference vector for every user that has a sequence in user_sequences.
#
# user_sequences was built from df.groupby('UserID') and only contains users
# with more than one row, so it must be paired with exactly those user IDs in
# the same (sorted groupby) order. Pairing it with df['UserID'].unique()
# shifts every prediction after the first single-row user onto the wrong ID.
review_counts = df.groupby('UserID').size()
eligible_user_ids = review_counts[review_counts > 1].index
if len(eligible_user_ids) != len(user_sequences):
    raise ValueError(
        "user_sequences does not match the users in df with more than one row; "
        "rebuild user_sequences from the current df before predicting."
    )

user_predictions = {}
for user_id, user_sequence in zip(eligible_user_ids, user_sequences):
    # NOTE(review): user_sequence is the training input (the user's last
    # embedding was dropped), so these are the 5 most recent *input* embeddings.
    # Confirm whether the newest embedding should be included here.
    past_embeddings = user_sequence[-5:]
    predicted_preference = predict_future_preference(user_id, past_embeddings)
    # Keep only the output of the last time step as the user's predicted vector.
    user_predictions[user_id] = predicted_preference[-1]

# One row per user: the ID and its predicted preference vector.
user_predictions_df = pd.DataFrame(
    list(user_predictions.items()),
    columns=['UserID', 'Predicted Preference'],
)

# Save the predictions in Feather format.
user_predictions_df.to_feather("../dataset/predicted_user_preferences.feather")

print("Predicted preferences saved to 'predicted_user_preferences.feather'.")



Epoch 1, Loss: 4.7560
Epoch 2, Loss: 2.8489
Epoch 3, Loss: 1.7420
Epoch 4, Loss: 1.3038
Epoch 5, Loss: 1.1692
Epoch 6, Loss: 1.1045
Epoch 7, Loss: 1.0144
Epoch 8, Loss: 0.8410
Epoch 9, Loss: 0.6278
Epoch 10, Loss: 0.5099
Predicted preferences saved to 'predicted_user_preferences.feather'.


In [18]:
# Load a table of per-user predicted preference vectors and preview the first five rows.
# NOTE(review): this path differs from the one the prediction cell writes to
# ("../dataset/predicted_user_preferences.feather"); presumably the file was
# moved or renamed in between — confirm.
df = pd.read_feather("../output/UI_output/rating_Adult Products_UI.feather")
df.head(5)

Unnamed: 0,UserID,Predicted Preference
0,11996,"[-0.4516291, 0.15521911, -1.0964531, 0.1853739..."
1,2815,"[-0.3822423, 0.32363, -1.0623038, 0.4568331, 0..."
2,5001156,"[-0.4474397, 0.25937295, -1.1561599, 0.3420246..."
3,5002475,"[-0.46374485, 0.10646514, -1.0883391, 0.217411..."
4,5016738,"[-0.39016166, 0.431353, -1.1600732, 0.5669898,..."
