In [None]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd '/content/drive/MyDrive/브런치추천/'

/content/drive/MyDrive/브런치추천


In [None]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[K     |████████████████████████████████| 85.5 MB 1.1 MB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import pandas as pd
import numpy as np
import pickle

from tqdm import tqdm
from collections import defaultdict
import faiss

In [None]:
# Article metadata (later cells use its id, sentence, topic_num_*, title and keyword_list columns).
metadata = pd.read_json('processed_data/metadata_topic.json')

# Train / test interaction tables; paths are relative to the working directory.
train = pd.read_csv('processed_data/train.csv')
test = pd.read_csv('processed_data/test.csv')

In [None]:
# Build the composite article key "<writer_id>_<article_id>" on both splits.
# article_id is cast to str first so it can be concatenated with writer_id.
train = train.assign(
    article_id=lambda df: df['article_id'].astype(str),
    id=lambda df: df['writer_id'] + '_' + df['article_id'],
)

test = test.assign(
    article_id=lambda df: df['article_id'].astype(str),
    id=lambda df: df['writer_id'] + '_' + df['article_id'],
)

In [None]:
# Attach the selected metadata columns to every interaction row.
# how='right' keeps all train/test rows; rows without a metadata match get NaN.
tmp = metadata.loc[:, ['id', 'sentence', 'topic_num_20', 'topic_num_50', 'topic_num_100', 'title', 'keyword_list']]
train = tmp.merge(train, how='right')
test = tmp.merge(test, how='right')

In [None]:
# Drop rows containing any NaN, then keep only users that appear in both splits.
train, test = train.dropna(), test.dropna()
user_ids = set(train['user_id'].unique()).intersection(test['user_id'].unique())
train = train.loc[train['user_id'].isin(user_ids)]
test = test.loc[test['user_id'].isin(user_ids)]

In [None]:
# Inspect the merged and filtered training frame.
train

Unnamed: 0,id,sentence,topic_num_20,topic_num_50,topic_num_100,title,keyword_list,dt,hr,user_id,article_id,writer_id
0,@cosmos-j_387,"소년은 계속 웃을 수 있을까, 끝나지 않는 영화. 영화 영화이야기 영화리뷰 영화 <...",-1.0,-1.0,-1.0,"소년은 계속 웃을 수 있을까, 끝나지 않는 영화.","[영화, 영화이야기, 영화리뷰]",20190111,5,#ad379c552d279aa0133666cbca5ed0ad,387,@cosmos-j


In [None]:
# Peek at a single metadata row to see which columns are available.
metadata.head(1)

Unnamed: 0,magazine_id,user_id,title,keyword_list,display_url,sub_title,reg_ts,article_id,id,sentence,topic_num_50,topic_num_100,topic_num_20
0,28849,@elang8151,출간제의 받았습니다.,"[메일, 출판사]",https://brunch.co.kr/@elang8151/229,미안하지만 아직 학생입니다.,1539405177000,229,@elang8151_229,출간제의 받았습니다. 메일 출판사 미안하지만 아직 학생입니다.,-1,-1,-1


In [None]:
# Map raw string ids to dense integer indices, in order of first appearance:
# articles are enumerated over metadata, users over the training split.
item_id_dic = {raw_id: idx for idx, raw_id in enumerate(metadata['id'].unique())}
user_id_dic = {raw_id: idx for idx, raw_id in enumerate(train['user_id'].unique())}

In [None]:
# Replace raw ids in train with their integer indices (KeyError on an unknown id).
train['item_id_new'] = train['id'].map(item_id_dic.__getitem__)
train['user_id_new'] = train['user_id'].map(user_id_dic.__getitem__)

In [None]:
# Replace raw ids in test with their integer indices (KeyError on an unknown id).
test['item_id_new'] = test['id'].map(item_id_dic.__getitem__)
test['user_id_new'] = test['user_id'].map(user_id_dic.__getitem__)

In [None]:
# Series indexed by user_id_new whose values are arrays of the distinct item_id_new
# each user has in train; predict_by_user looks neighbours' articles up here.
docs_by_user = train.groupby('user_id_new')['item_id_new'].unique()

In [None]:
# Length of the recommendation list produced per user.
nums = 100
# Sample user for the step-by-step cell below: the second distinct user in train.
user_id = train['user_id_new'].unique()[1]

In [None]:
def predict_by_user(user_id, n_neighbors=100):
  """Recommend articles for one user via user-based cosine similarity and score them.

  Relies on notebook globals: `train`, `test` (frames with `user_id_new` and
  `item_id_new`), `docs_by_user` (user -> distinct train articles) and `nums`
  (length of the recommendation list).

  Steps:
    1. Build a user x article count matrix restricted to the articles this
       user has in train.
    2. L2-normalise the rows and search a FAISS inner-product index, i.e.
       cosine similarity, for the `n_neighbors` most similar users.
    3. Give every article of every neighbour the summed similarity of the
       neighbours that read it; the top `nums` articles are the prediction.
    4. Compare the prediction with the user's distinct test articles.

  NOTE(review): the queried user is part of the index, so they are normally
  their own nearest neighbour and already-read articles can be recommended.
  Left unchanged on purpose - confirm this is intended.

  Args:
    user_id: re-indexed user id (a value of `user_id_new`) present in train.
    n_neighbors: number of similar users to retrieve (default 100, the value
      that used to be hard-coded).

  Returns:
    DCG / IDCG of the top-`nums` list with binary relevance; 0.0 when the
    user has no test articles.

  Raises:
    KeyError: if `user_id` has no rows in train.
  """
  # Articles this user read in train; precomputed once instead of re-scanning train.
  docs = docs_by_user[user_id]
  seen = train[train['item_id_new'].isin(docs)].copy()
  seen['value'] = 1

  # Rows: users who read at least one of those articles. Columns: those articles.
  user_item = seen.pivot_table(values='value', index='user_id_new',
                               columns='item_id_new', aggfunc='sum').fillna(0)

  # FAISS needs C-contiguous float32 vectors and int64 ids; np.array always copies,
  # so the in-place normalisation below never touches the DataFrame.
  vectors = np.array(user_item.to_numpy(), dtype=np.float32, order='C')
  ids = user_item.index.to_numpy(dtype=np.int64)

  # Inner product over L2-normalised rows equals cosine similarity.
  flat_index = faiss.IndexFlatIP(vectors.shape[1])
  faiss.normalize_L2(vectors)
  index = faiss.IndexIDMap2(flat_index)
  index.add_with_ids(vectors, ids)

  emb = np.array(user_item.loc[user_id], dtype=np.float32).reshape(1, -1)
  faiss.normalize_L2(emb)
  cos_sim, user_reco = index.search(emb, n_neighbors)

  # Accumulate, per article, the similarity of every neighbour that read it.
  reco_weight = defaultdict(float)
  for cos, usr in zip(cos_sim[0], user_reco[0]):
    if usr == -1:  # FAISS pads with -1 when fewer than n_neighbors users exist
      continue
    for doc in docs_by_user[usr]:
      reco_weight[doc] += cos

  # Rank articles by accumulated score (stable for ties) and keep the top `nums`.
  pred = sorted(reco_weight, key=reco_weight.get, reverse=True)[:nums]

  # Distinct relevant articles: repeated reads in test must not inflate the ideal
  # DCG, because each predicted article can be credited only once.
  real_docs = set(test.loc[test['user_id_new'] == user_id, 'item_id_new'])
  if not real_docs:
    return 0.0

  # Ideal ranking places every relevant article first, capped at the list length.
  n_ideal = min(len(real_docs), nums)
  idcg = sum(1.0/np.log(i+1) for i in range(1, n_ideal+1))
  dcg = sum(1.0/np.log(rank+2) for rank, doc in enumerate(pred) if doc in real_docs)

  return dcg/idcg

In [None]:
# Step-by-step walk-through of the neighbour search for the single sample `user_id`.
# It repeats the first part of predict_by_user at top level (up to the accumulation
# of article weights) so the intermediate objects can be inspected.
# Note: this rebinds the global `tmp` and creates the globals `docs`, `user_item`,
# `vectors`, `ids`, `index`, `emb`, `cos_sim`, `user_reco` and `reco_weight`.

# Interactions restricted to the articles the sample user read in train.
docs = train[train['user_id_new']==user_id]['item_id_new'].unique()
tmp = train[train['item_id_new'].isin(docs)].copy()
tmp['value'] = 1

# User x article count matrix; missing pairs become 0.
user_item = tmp.pivot_table(values='value',index='user_id_new',columns=['item_id_new'],aggfunc=['sum'])
user_item = user_item.fillna(0)

vectors = user_item.values.tolist()
vectors = np.array(vectors,dtype=np.float32)
ids = np.array(user_item.index.tolist())

# Inner-product index over L2-normalised rows, i.e. cosine similarity, keyed by user id.
index = faiss.IndexFlatIP(vectors.shape[1])
faiss.normalize_L2(vectors)
index = faiss.IndexIDMap2(index)
index.add_with_ids(vectors,ids)

# Query with the sample user's own normalised row; retrieve up to 100 users.
emb = np.array(user_item.loc[user_id].tolist(),dtype=np.float32).reshape(1,-1)
faiss.normalize_L2(emb)
cos_sim, user_reco = index.search(emb,100)

cos_sim, user_reco = cos_sim[0], user_reco[0]

# Sum, per article, the similarity of every retrieved user who read it; -1 ids are skipped.
reco_weight = {}
for cos,usr in zip(cos_sim,user_reco):
  if usr != -1: 
    for doc in docs_by_user[usr]:
      if doc not in reco_weight : reco_weight[doc] = 0
      reco_weight[doc] += cos

In [None]:
# Score every user in train and average the per-user results.
user_ids = train['user_id_new'].unique()
scores = []
for user_id in tqdm(user_ids):
  scores.append(predict_by_user(user_id))
pred = sum(scores)/len(user_ids)

100%|██████████| 13134/13134 [17:26<00:00, 12.56it/s]


In [None]:
# Mean of the per-user dcg/idcg scores returned by predict_by_user.
print(pred)

0.14849482929862695
