In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [64]:
import numpy as np

- following하고 있는 구독자를 기준으로 -> 벡터 간 유사성을 계산해서 글 100개 뽑기
- 각 글 별로 토픽 + 클러스터 -> 벡터 간 유사성을 계산해서 글 100개 뽑기
- 글이 10개면 10x100x2 개의 결과가 나오는 것


---

- A -> 1 3 5 B -> 3 5 7 => 3 5가 추천되는 방향

---

- 예측 성능 기준: 

In [65]:
cd '/content/drive/MyDrive/브런치추천/'

/content/drive/MyDrive/브런치추천


In [None]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[K     |████████████████████████████████| 85.5 MB 90 kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import pandas as pd
import numpy as np
import pickle

from tqdm import tqdm
from collections import defaultdict
import faiss

In [None]:
nr_topics = 100
top_n_words = 10

In [None]:
metadata = pd.read_json('processed_data/metadata_topic.json'.format(nr_topics,top_n_words))

train = pd.read_csv('processed_data/train.csv')
test = pd.read_csv('processed_data/test.csv')

In [None]:
train['article_id'] = train['article_id'].astype(str)
train['id'] = train['writer_id'] + '_' + train['article_id']

test['article_id'] = test['article_id'].astype(str)
test['id'] = test['writer_id'] + '_' + test['article_id']

In [None]:
metadata = metadata.reset_index().rename(columns={'index':'faiss_index'})

In [None]:
tmp = metadata[['id','sentence','topic_num_20','topic_num_50','topic_num_100','title','faiss_index','keyword_list']]
train = pd.merge(tmp,train,how='right')
test = pd.merge(tmp,test,how='right')

In [None]:
train = train.dropna()
test = test.dropna()
user_ids = set(train['user_id'].unique()) & set(test['user_id'].unique())
train = train[train['user_id'].isin(user_ids)]
test = test[test['user_id'].isin(user_ids)]

In [74]:
print(train.shape)
print(test.shape)

(720486, 13)
(258777, 13)


In [71]:
train_org = pd.read_csv('processed_data/read_raw.csv')

In [78]:
22097882*0.3

6629364.6

In [None]:
embedding_kobart = pd.read_pickle("processed_data/sentence_embedding_kobart.pkl")
embedding_kobert = pd.read_pickle("processed_data/sentence_embedding_kobert.pkl")
embedding_ontology = pd.read_pickle("processed_data/document_embedding_ontology_final.pkl")

In [None]:
for key in embedding_ontology:
  embedding_ontology[key] = np.array([[embedding_ontology[key]]])

In [None]:
with open('processed_data/doc2vec.pkl','rb') as f:
  doc2vec = pickle.load(f)

In [None]:
with open('processed_data/word2vec_{}_{}.pkl'.format(20,top_n_words),'rb') as f:
  word2vec_20 = pickle.load(f)
with open('processed_data/word2vec_{}_{}.pkl'.format(100,top_n_words),'rb') as f:
  word2vec_100 = pickle.load(f)

In [None]:
class Reco:

  def __init__(self,
               embeddings:list,
               cluster_num = 10
               ):

    self.model_dic = {}
    self.vector_dic = {}
    self.reco_dict = {}
    self.col_name = 'topic_num_{}'.format(cluster_num)

    self.doc_to_index = {}
    self.embeddings = embeddings

  def get_dimension(self):
    
    self.d = 0

    for emb in self.embeddings:
      key = list(globals()[emb].keys())[0]
      self.d += globals()[emb][key][0].shape[0]
    
  def fit_by_topic_num(self,topic_num):
    
    try:
      self.d
    except:
      self.get_dimension()

    self.doc_to_index[topic_num] = {}

    tmp = metadata[metadata[self.col_name]==topic_num]
    faiss_indices = np.array(tmp['faiss_index'].tolist())
    ids = tmp['id'].tolist()
    vectors = []
    for i,j in enumerate(zip(ids,faiss_indices)):
      id,faiss_index = j
      vec = []
      for emb in self.embeddings:
        vec += globals()[emb][id][0].tolist()
      vectors.append(vec)

      self.doc_to_index[topic_num][faiss_index] = i

    vectors = np.array(vectors,dtype=np.float32)
    self.vector_dic[topic_num] = vectors

    index = faiss.IndexFlatIP(self.d)
    faiss.normalize_L2(vectors)
    index = faiss.IndexIDMap2(index)
    index.add_with_ids(vectors,faiss_indices)
    self.model_dic[topic_num] = index

  def fit(self):

    '''
    topic 별로 faiss 모델 생성
    self.model_dic[topic_num]으로 접근 가능
    '''
    
    print('topic 별로 faiss 모델 생성 중')
    for topic_num in tqdm(metadata[self.col_name].unique()):
      self.fit_by_topic_num(topic_num)

  def cossim_by_topic_num(self,topic_num,nums=100):

    index = self.model_dic[topic_num]
    embs = self.vector_dic[topic_num]
    faiss.normalize_L2(embs)
    cos_sim, reco_docs = index.search(embs,nums+1)
    self.reco_dict[topic_num] = {}
    self.reco_dict[topic_num]['cos_sim'] = cos_sim
    self.reco_dict[topic_num]['reco_docs'] = reco_docs

  def cosim_predict(self,nums=100):

    '''
    글 별로 유사도 높은 글과 cosine similarity 값을 reco_dict에 저장한다
    reco_dict[faiss_index]로 접근
    '''
    print('각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다')
    reco_dict = {}
    for topic_num in tqdm(metadata[self.col_name].unique()):
      self.cossim_by_topic_num(topic_num,nums+1)


  def reco_by_user(self,user_id,nums=100):
    '''
    이용자가 읽었던 글을 기반으로 추천하는 방식
    topic_distribution과 cosine_similary로 score 계산

    '''
    tmp = train[train['user_id']==user_id]
    faiss_indices = tmp['faiss_index'].tolist()
    topic_nums= tmp[self.col_name].tolist()

    topic_distribution = tmp[[self.col_name]].value_counts(normalize=True)
    reco_weight = {}

    for faiss_index,topic_num in zip(faiss_indices,topic_nums):
      faiss_index = int(faiss_index)
      topic_num = int(topic_num)
      res = self.reco_dict[topic_num]
      i = self.doc_to_index[topic_num][faiss_index]
      cos_sim, reco_docs = res['cos_sim'][i], res['reco_docs'][i]

      for cos,reco in zip(cos_sim,reco_docs):
        if reco not in reco_weight : reco_weight[reco] = 0
        reco_weight[reco] += topic_distribution[topic_num] * cos

    reco_weight = {k: v for k, v in sorted(reco_weight.items(), key=lambda item: item[1], reverse=True)} #score를 기준으로 rank 계산
    result = list(reco_weight.keys())[:nums]
    
    return reco_weight,result

  def ndcg(self,user_id):
    faiss_indices = test[test['user_id']==user_id]['faiss_index'].tolist()
    faiss_indices = list(map(int,faiss_indices))
    reco_weight,pred = self.reco_by_user(user_id)
    dcg = 0.0

    if len(faiss_indices) >= 100:
      idcg = sum((1.0/np.log(i+1) for i in range(1, 101)))

    else:
      idcg = sum((1.0/np.log(i+1) for i in range(1, len(faiss_indices)+1)))

    for i, r in enumerate(pred):
      if r in faiss_indices:
        dcg += 1.0/np.log(i+2)
    if idcg == 0: print(user_id)
    return dcg/idcg

  def predict(self):
    user_ids = train['user_id'].unique()
    pred = 0
    for user_id in tqdm(user_ids):
      pred += self.ndcg(user_id)
    pred = pred/len(user_ids)
    return pred

# bertopic 100

In [None]:
reco_model = Reco(['embedding_kobart','doc2vec'],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart & doc2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:14<00:00,  6.88it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [03:19<00:00,  1.98s/it]
100%|██████████| 13134/13134 [29:14<00:00,  7.49it/s]

embedding_kobart & doc2vec:  0.09688714183949838





In [None]:
reco_model = Reco(['embedding_kobart','doc2vec','word2vec_{}'.format(100)],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart & doc2vec & word2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:18<00:00,  5.56it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [04:10<00:00,  2.48s/it]
100%|██████████| 13134/13134 [29:22<00:00,  7.45it/s]

embedding_kobart & doc2vec & word2vec:  0.09742851867639328





In [None]:
reco_model = Reco(['embedding_kobart','doc2vec','embedding_ontology'.format(100)],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart & doc2vec & word2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:14<00:00,  6.79it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [03:14<00:00,  1.92s/it]
100%|██████████| 13134/13134 [28:55<00:00,  7.57it/s]

embedding_kobart & doc2vec & word2vec:  0.0969004225180334





In [None]:
reco_model = Reco(['embedding_kobart','doc2vec','word2vec_{}'.format(20)],20)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart & doc2vec & word2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 21/21 [00:18<00:00,  1.16it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 21/21 [07:40<00:00, 21.91s/it]
100%|██████████| 13134/13134 [28:48<00:00,  7.60it/s]

embedding_kobart & doc2vec & word2vec:  0.09888095542855443





In [None]:
reco_model = Reco(['embedding_kobert'],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobert: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:10<00:00,  9.77it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [02:16<00:00,  1.36s/it]
100%|██████████| 13134/13134 [28:45<00:00,  7.61it/s]

embedding_kobert:  0.08759236060809272





In [None]:
reco_model = Reco(['embedding_ontology'],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_ontology: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:00<00:00, 105.62it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [00:07<00:00, 13.76it/s]
100%|██████████| 13134/13134 [28:08<00:00,  7.78it/s]

embedding_ontology:  0.002572808291606912





In [None]:
reco_model = Reco(['embedding_kobart'],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:10<00:00,  9.50it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [02:21<00:00,  1.41s/it]
100%|██████████| 13134/13134 [28:49<00:00,  7.59it/s]

embedding_kobart:  0.09634569180922034





In [None]:
reco_model = Reco(['embedding_kobart'],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart & : ',pred)

In [None]:
reco_model = Reco(['word2vec_{}'.format(100)],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('word2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:05<00:00, 19.99it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [01:02<00:00,  1.62it/s]
100%|██████████| 13134/13134 [29:17<00:00,  7.47it/s]

word2vec:  0.0886399081481283





In [None]:
reco_model = Reco(['embedding_kobart','word2vec_{}'.format(100)],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:16<00:00,  6.11it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [03:05<00:00,  1.84s/it]
100%|██████████| 13134/13134 [29:30<00:00,  7.42it/s]

embedding_kobart:  0.09676431298333062





In [None]:
reco_model = Reco(['doc2vec_{}'.format(100)],100)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('doc2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:04<00:00, 20.56it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [00:58<00:00,  1.71it/s]
100%|██████████| 13134/13134 [28:45<00:00,  7.61it/s]

doc2vec:  0.08085969726531353





In [None]:
reco_model = Reco(['word2vec_{}'.format(100)],100)
reco_model.fit()
reco_model.cosim_predict(500)
pred = reco_model.predict()
print('word2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:05<00:00, 19.62it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [01:08<00:00,  1.47it/s]
100%|██████████| 13134/13134 [1:32:28<00:00,  2.37it/s]

word2vec:  0.08386352472576745





In [None]:
reco_model = Reco(['doc2vec_{}'.format(100)],100)
reco_model.fit()
reco_model.cosim_predict(500)
pred = reco_model.predict()
print('doc2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 101/101 [00:05<00:00, 19.38it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 101/101 [01:07<00:00,  1.50it/s]
100%|██████████| 13134/13134 [1:32:05<00:00,  2.38it/s]

doc2vec:  0.07610233057223863





# bertopic 20

In [None]:
reco_model = Reco(['embedding_kobert'],20)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobert: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 21/21 [00:12<00:00,  1.70it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 21/21 [04:35<00:00, 13.11s/it]
100%|██████████| 13134/13134 [28:33<00:00,  7.67it/s]

embedding_kobert:  0.0878454923735797





In [None]:
reco_model = Reco(['embedding_kobart'],20)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('embedding_kobart: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 21/21 [00:13<00:00,  1.60it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 21/21 [04:14<00:00, 12.14s/it]
100%|██████████| 13134/13134 [28:10<00:00,  7.77it/s]

embedding_kobart:  0.09749235046285651





In [None]:
reco_model = Reco(['word2vec_{}'.format(20)],20)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('word2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 21/21 [00:05<00:00,  3.94it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 21/21 [01:52<00:00,  5.35s/it]
100%|██████████| 13134/13134 [28:25<00:00,  7.70it/s]

word2vec:  0.08209391027407646





In [None]:
reco_model = Reco(['doc2vec_{}'.format(20)],20)
reco_model.fit()
reco_model.cosim_predict(100)
pred = reco_model.predict()
print('doc2vec: ',pred)

topic 별로 faiss 모델 생성 중


100%|██████████| 21/21 [00:05<00:00,  3.81it/s]


각 글 별로 유사도 높은 글을 추출해 reco_dict에 저장하는 중 입니다


100%|██████████| 21/21 [01:54<00:00,  5.46s/it]
100%|██████████| 13134/13134 [29:10<00:00,  7.50it/s]

doc2vec:  0.08008931907342182



