# In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity

# Load the input tables in one pass: the view log, the article_info table and
# the sample submission (used further down as the template for the output file).
view_log_train, article_info, submission = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/dacon/web/view_log.csv',
        '/content/drive/MyDrive/dacon/web/article_info.csv',
        '/content/drive/MyDrive/dacon/web/sample_submission.csv',
    ),
)

# Build the user x article matrix.
# Step 1: count the log rows for every (userID, articleID) pair.
pair_view_counts = view_log_train.groupby(['userID', 'articleID']).size()
# Step 2: pivot articleID into columns; pairs that never occur in the log get 0.0.
user_article_matrix = pair_view_counts.unstack(fill_value=0.0)

# 1. Matrix factorization (truncated SVD of the user x article matrix)
# svds only accepts floating-point input, so make the dtype explicit here
# instead of depending on how the matrix happened to be built.
users_article_sparse_matrix = csr_matrix(np.asarray(user_article_matrix, dtype=np.float64))

NUMBER_OF_FACTORS_MF = 600  # performance was best around 600

# svds requires 1 <= k < min(n_users, n_articles). Clamp the requested rank so
# a smaller dataset (e.g. a debugging sample) still factorizes instead of raising.
n_factors = min(NUMBER_OF_FACTORS_MF, min(users_article_sparse_matrix.shape) - 1)
if n_factors < NUMBER_OF_FACTORS_MF:
    print(f'NUMBER_OF_FACTORS_MF={NUMBER_OF_FACTORS_MF} is too large for a matrix of shape '
          f'{users_article_sparse_matrix.shape}; using k={n_factors}')

U, sigma, Vt = svds(users_article_sparse_matrix, k=n_factors)  # singular value decomposition
sigma = np.diag(sigma)

# Low-rank reconstruction: predicted score for every (user, article) pair.
all_user_predicted = np.dot(np.dot(U, sigma), Vt)

# Min-max scale over the whole matrix. A constant reconstruction has zero
# range; map it to all zeros rather than dividing by zero.
score_min = all_user_predicted.min()
score_range = all_user_predicted.max() - score_min
if score_range == 0:
    all_user_predicted_norm = np.zeros_like(all_user_predicted)
else:
    all_user_predicted_norm = (all_user_predicted - score_min) / score_range

# Transposed so that each user's score vector can be read as a column: cf_preds_df[user].
cf_preds_df = pd.DataFrame(
    all_user_predicted_norm,
    columns=user_article_matrix.columns,
    index=user_article_matrix.index,
).transpose()

# 2. Recommendation based on cosine similarity between rows (users) of the
#    user x article matrix
user_similarity = cosine_similarity(user_article_matrix)

# Recommendation scores: similarity-weighted sum of every user's row, divided
# by each user's total absolute similarity (kept as a column vector so the
# division broadcasts across articles).
similarity_row_totals = np.abs(user_similarity).sum(axis=1, keepdims=True)
user_predicted_scores = np.dot(user_similarity, user_article_matrix) / similarity_row_totals

# 3. Ensemble of the two models, followed by per-user top-N selection.
TOP_N_RECOMMENDATIONS = 5
# The smaller the cosine-similarity weight the better the result was;
# performance was best around 0.02.
COSINE_MODEL_WEIGHT = 0.02
MF_MODEL_WEIGHT = 0.98

# Loop-invariant: the article labels that score positions map back to.
article_ids = user_article_matrix.columns

recommendations = []

for idx, user in enumerate(user_article_matrix.index):

    ensemble_scores = (COSINE_MODEL_WEIGHT * (user_predicted_scores[idx])
                       + MF_MODEL_WEIGHT * (cf_preds_df[user].values))

    # Positions ordered by descending ensemble score.
    sorted_indices = ensemble_scores.argsort()[::-1]

    # Slice the positions first so only the top N labels are materialized,
    # instead of listing every article for every user and discarding the rest.
    top5recommend = list(article_ids[sorted_indices[:TOP_N_RECOMMENDATIONS]])

    recommendations.extend([user, article] for article in top5recommend)

# Build a DataFrame in the sample_submission.csv layout
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

# Fill the sample's rows by (userID, n-th row of that user) rather than by row
# position. Positional assignment keeps the sample's userID column, so any
# difference in user order or row count between the sample and the sorted
# matrix index would silently attach articles to the wrong users.
sample_keys = submission[['userID']].assign(
    _slot=submission.groupby('userID').cumcount()
)
recommendation_keys = top_recommendations.assign(
    _slot=top_recommendations.groupby('userID').cumcount()
)
# (userID, _slot) is unique on the right side, so the left merge preserves the
# sample's row order and row count.
aligned = sample_keys.merge(recommendation_keys, on=['userID', '_slot'], how='left')

# Rows without a matching recommendation are left as NaN; report them instead
# of letting them pass unnoticed.
n_missing = int(aligned['articleID'].isna().sum())
if n_missing:
    print(f'{n_missing} submission rows have no matching recommendation')

submission['articleID'] = aligned['articleID'].to_numpy()

submission.to_csv('baseline_submission.csv', index=False)