In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process



In [3]:
# Load the raw table from a cp949-encoded CSV and display it.
# Columns shown in the saved output: "Unnamed: 0", user_id, pattern_id, fave_date.
# NOTE(review): "Unnamed: 0" looks like a row index that was written into the CSV;
# reading with index_col=0 would presumably drop it — confirm before changing.
ratings = pd.read_csv("./user_data.csv", encoding='cp949')
ratings

Unnamed: 0,user_id,pattern_id,fave_date
0,Crocheturlay,720780,2017/02/19 15:08:49 -0500
1,cozykrisknits,393166,2014/12/25 14:31:01 -0500
2,cozykrisknits,447781,2014/12/25 14:29:06 -0500
3,cozykrisknits,117296,2013/01/07 22:42:44 -0500
4,cozykrisknits,224811,2013/01/07 22:39:07 -0500
...,...,...,...
3756709,alchemia,173758,2013/03/28 16:34:16 -0400
3756710,alchemia,195006,2013/03/28 16:34:07 -0400
3756711,alchemia,143595,2013/03/28 16:34:02 -0400
3756712,alchemia,181436,2013/03/28 16:33:52 -0400


In [4]:
# Number of rows per user_id in the raw ratings (Series indexed by user_id).
users = ratings.groupby('user_id')['user_id'].count()

In [5]:
# Number of rows per pattern_id in the raw ratings (Series indexed by pattern_id).
patterns = ratings.groupby('pattern_id')['pattern_id'].count()
print(patterns)

pattern_id
10        157
13         27
16        172
17        308
20        123
         ... 
774662      1
774666      1
774667      1
774670      1
774676      1
Name: pattern_id, Length: 410633, dtype: int64


In [6]:
# Keep only the "reliable" users.
# A user who has rated more than N patterns has looked at a variety of patterns,
# so we assume that user's ratings can be trusted.
# The comparison is strict: a user with exactly N ratings is dropped.
N = 10
# The Series' index and its single column are both named user_id, which pd.merge
# cannot work with, so the column is renamed to 'count'.
reliable_users = (
    users[users > N]
    .to_frame()
    .rename(columns={'user_id': 'count'})
)
print(reliable_users)

                  count
user_id                
A-Bear              293
A-Jar-Of-Bees        23
A-KN                615
A-Kelli              65
A-Ko-Cloudartowl   3699
...                 ...
cvilleknits          38
cvitt                24
cvivianay            44
cvjunebug           207
cvkasdan             16

[11926 rows x 1 columns]


In [7]:
# Inner-join the ratings with the reliable-user table selected above, so that only
# rows belonging to reliable users remain. Patterns that no reliable user rated
# therefore drop out of the dataset.
merge_ratings = pd.merge(ratings, reliable_users, on=['user_id'], how='inner')

In [8]:
# Display the user-filtered ratings; the extra 'count' column comes from reliable_users.
merge_ratings

Unnamed: 0,user_id,pattern_id,fave_date,count
0,crowgirl,390707,2017/03/25 11:08:12 -0400,91
1,crowgirl,570883,2016/09/03 19:07:34 -0400,91
2,crowgirl,574591,2016/09/03 19:06:12 -0400,91
3,crowgirl,687166,2016/09/03 19:04:59 -0400,91
4,crowgirl,639396,2016/09/03 19:04:00 -0400,91
...,...,...,...,...
3733885,alchemia,173758,2013/03/28 16:34:16 -0400,100
3733886,alchemia,195006,2013/03/28 16:34:07 -0400,100
3733887,alchemia,143595,2013/03/28 16:34:02 -0400,100
3733888,alchemia,181436,2013/03/28 16:33:52 -0400,100


In [9]:
# Sanity check for the inner join above.
# The number of rows in reliable_users before the join equals the number of
# user_id groups in the merged ratings (11926 in both saved outputs),
# so the join kept exactly the reliable users.
# NOTE: this rebinds `users`, replacing the counts computed from the raw ratings.
users = merge_ratings.groupby('user_id')['user_id'].count()
print(users)

user_id
A-Bear               293
A-Jar-Of-Bees         23
A-KN                 615
A-Kelli               65
A-Ko-Cloudartowl    3699
                    ... 
cvilleknits           38
cvitt                 24
cvivianay             44
cvjunebug            207
cvkasdan              16
Name: user_id, Length: 11926, dtype: int64


In [10]:
# Recount rows per pattern_id after the user filter (rebinds `patterns`).
patterns = merge_ratings.groupby('pattern_id')['pattern_id'].count()
print(patterns)
# However, tastes differ so much between users that roughly 410,000 distinct patterns
# remain even after this filtering. That is still too many to run MF on.

pattern_id
10        155
13         27
16        169
17        307
20        121
         ... 
774662      1
774666      1
774667      1
774670      1
774676      1
Name: pattern_id, Length: 409836, dtype: int64


In [11]:
# Now keep only the "reliable" patterns.
# A pattern that has been rated more than M times has been judged by many users,
# so we assume it matches a broad taste and is worth recommending to others.
# The comparison is strict: a pattern with exactly M ratings is dropped.
M = 10
# Rename the single column from pattern_id to 'count' so it no longer shares
# its name with the index.
reliable_patterns = (
    patterns[patterns > M]
    .to_frame()
    .rename(columns={'pattern_id': 'count'})
)
print(reliable_patterns)

            count
pattern_id       
10            155
13             27
16            169
17            307
20            121
...           ...
774297         33
774351         45
774352         13
774421         14
774443         33

[73401 rows x 1 columns]


In [12]:
# Inner-join the user-filtered ratings with the reliable-pattern table selected above,
# so that only rows whose pattern_id is a reliable pattern remain.
# NOTE(review): both inputs carry a 'count' column, so with pandas' default merge
# suffixes the result should contain count_x (per user) and count_y (per pattern) — confirm.
s_merge_ratings = pd.merge(merge_ratings, reliable_patterns, on=['pattern_id'], how='inner')

In [13]:
# Recount rows per user after the pattern filter (rebinds `users`).
# The number of users barely changes (11926 -> 11922 in the saved outputs), but it does
# shrink: users left with no rows after the inner join disappear, and the per-user
# counts of the remaining users go down.
users = s_merge_ratings.groupby('user_id')['user_id'].count()
print(users)

user_id
A-Bear               235
A-Jar-Of-Bees         18
A-KN                 399
A-Kelli               52
A-Ko-Cloudartowl    2639
                    ... 
cvilleknits           34
cvitt                 17
cvivianay             43
cvjunebug            120
cvkasdan              14
Name: user_id, Length: 11922, dtype: int64


In [14]:
# Sanity check for the pattern join above.
# The number of rows in reliable_patterns before the join equals the number of
# pattern_id groups in the merged ratings (73401 in both saved outputs),
# so the join kept exactly the reliable patterns.
patterns = s_merge_ratings.groupby('pattern_id')['pattern_id'].count()
print(patterns)

pattern_id
10        155
13         27
16        169
17        307
20        121
         ... 
774297     33
774351     45
774352     13
774421     14
774443     33
Name: pattern_id, Length: 73401, dtype: int64


In [24]:
# Build a small synthetic ratings matrix for trying out the MF models in this notebook.
# Each of the m*n entries is drawn independently from `responses` with weights `p`.
# The models treat 0 as "missing" (MatrixFactorization masks R != 0), and
# BinaryMF.train encodes -1 as label 0 and every other observed value as label 1.
responses = [-1, 0, 1]
p = np.array([1, 5, 1])  # Unnormalized sampling weights; 0 is five times as likely.
m = 5
n = 10

# Draw from a dedicated, seeded generator so that Restart & Run All reproduces the
# same matrix (and the same downstream predictions) without touching numpy's
# global random state.
rng = np.random.RandomState(777)
b_ratings = rng.choice(responses, size=m*n, p=p / p.sum()).reshape((m, n))

In [16]:
# Show the synthetic ratings matrix.
# NOTE(review): the saved output contains only 0/1 values although the generator cell
# draws from [-1, 0, 1]; the output looks stale — re-run to confirm.
print(b_ratings)

[[0 0 0 0 1 1 0 1 1 1]
 [0 0 0 1 0 0 1 0 1 1]
 [1 1 0 1 0 0 1 1 0 0]
 [0 0 1 1 0 1 0 0 0 0]
 [1 0 0 0 0 0 1 0 1 1]]


In [17]:
import logging
# Raise the "tensorflow" logger's level to ERROR before TensorFlow is imported,
# so lower-severity records on that logger are dropped.
logging.getLogger("tensorflow").setLevel(logging.ERROR)

import tensorflow as tf
# Record the TensorFlow version the notebook was run with.
print(tf.__version__)

2.0.0


In [18]:
class MatrixFactorization:
  """Base class for matrix factorization R ~ P Q^T trained by plain gradient descent.

  Entries of R equal to 0 are treated as missing: `self.mask` is True only where
  R != 0. Subclasses must implement `loss()`.

  Args:
    R: 2-D ratings matrix; anything `tf.convert_to_tensor` accepts as float32
      (numpy array, nested list, ...).
    k: Number of latent factors (columns of P and Q).
    lr: Gradient-descent step size.
    l2: Coefficient available to subclasses for L2 regularization.
    seed: Seed passed to the random-normal weight initializer.
    tol: `train()` stops early once the loss falls below this value.

  Raises:
    ValueError: If R is not 2-dimensional.
  """

  def __init__(self, R, k, lr=.0003, l2=.04, seed=777, tol=.001):
    self.R = tf.convert_to_tensor(R, dtype=tf.float32)
    # Read the shape from the converted tensor rather than from the raw argument,
    # so inputs without a `.shape` attribute (e.g. nested lists) also work.
    if self.R.shape.ndims != 2:
      raise ValueError(
          "R must be a 2-D matrix, got shape {}".format(self.R.shape))
    self.mask = tf.not_equal(self.R, 0)
    self.m, self.n = self.R.shape.as_list()
    self.k = k
    self.lr = lr
    self.l2 = l2
    self.tol = tol
    # Initialize trainable weights: P is (m, k), Q is (n, k).
    self.weight_init = tf.random_normal_initializer(seed=seed)
    self.P = tf.Variable(self.weight_init((self.m, self.k)))
    self.Q = tf.Variable(self.weight_init((self.n, self.k)))

  def loss(self):
    """Return the scalar training loss; must be overridden by subclasses."""
    raise NotImplementedError

  def grad_update(self):
    """Take one gradient-descent step on P and Q.

    The loss computed for this step is stored in `self.current_loss`.
    """
    with tf.GradientTape() as t:
      t.watch([self.P, self.Q])
      self.current_loss = self.loss()
    gP, gQ = t.gradient(self.current_loss, [self.P, self.Q])
    self.P.assign_sub(self.lr * gP)
    self.Q.assign_sub(self.lr * gQ)

  def train(self, n_epoch=5000):
    """Run up to `n_epoch` gradient steps, stopping early when loss < `self.tol`."""
    for epoch in range(n_epoch):
      self.grad_update()
      if self.current_loss < self.tol:
        break


class RealValueMF(MatrixFactorization):
  """Matrix factorization for real-valued ratings, trained with squared error."""
  # This is not an optimized implementation: it forms the full product of P and Q
  # transposed even though only the scores of the non-missing entries are needed.
  # The code is for educational purposes only.

  def loss(self):
    """Sum of squared errors over unmasked entries plus an L2 penalty on P and Q."""
    reconstruction = tf.matmul(self.P, self.Q, transpose_b=True)
    squared_error = (self.R - reconstruction)**2
    observed_error = tf.boolean_mask(squared_error, self.mask)
    penalty = tf.reduce_sum(self.P**2) + tf.reduce_sum(self.Q**2)
    return tf.reduce_sum(observed_error) + self.l2 * penalty

In [19]:
# Disabled example: the RealValueMF usage below is wrapped in a string literal, so it
# is never executed; the cell only echoes the string.
# NOTE(review): as written it would pass `ratings` (the DataFrame loaded from the CSV)
# to the model, and it uses a `mask` name that is not defined in the visible cells —
# fix both before re-enabling.
'''rvmf_model = RealValueMF(ratings, k=3)
rvmf_model.train()

predictions = tf.matmul(rvmf_model.P, rvmf_model.Q, transpose_b=True).numpy()
print(np.round(predictions * mask, 2))'''

'rvmf_model = RealValueMF(ratings, k=3)\nrvmf_model.train()\n\npredictions = tf.matmul(rvmf_model.P, rvmf_model.Q, transpose_b=True).numpy()\nprint(np.round(predictions * mask, 2))'

In [20]:
class BinaryMF(MatrixFactorization):
  """Matrix factorization for 1/-1 ratings, trained with sigmoid cross entropy."""

  def train(self, n_epoch=5000):
    """Build the 0/1 labels, then run exactly `n_epoch` gradient steps."""
    # Encode the unmasked ratings as binary targets: -1 becomes 0, anything else 1.
    observed = tf.boolean_mask(self.R, self.mask)
    is_positive = tf.not_equal(observed, -1)
    self.labels = tf.cast(is_positive, dtype=tf.float32)
    for _ in range(n_epoch):
      self.grad_update()

  # This is not an optimized implementation: it forms the full product of P and Q
  # transposed even though only the scores of the non-missing entries are needed.
  # The code is for educational purposes only.
  def loss(self):
    """Mean cross entropy over unmasked entries plus an L2 penalty on P and Q.

    Reads `self.labels`, which is set by `train()`.
    """
    scores = tf.matmul(self.P, self.Q, transpose_b=True)
    logits = tf.boolean_mask(scores, self.mask)
    per_entry = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=logits)
    penalty = tf.reduce_sum(self.P**2) + tf.reduce_sum(self.Q**2)
    return tf.reduce_mean(per_entry) + self.l2 * penalty

In [25]:
# The learning rate is raised a bit because log loss is on a very different scale
# than squared error; the L2 coefficient is lowered for the same reason.
bmf_model = BinaryMF(b_ratings, k=3, lr=.03, l2=.0001)
bmf_model.train()

# Sigmoid of the full score matrix: one predicted probability per cell.
b_predictions = tf.sigmoid(
    tf.matmul(bmf_model.P, bmf_model.Q, transpose_b=True)
).numpy()

# 1 where b_ratings is non-zero, 0 elsewhere, in b_ratings' own dtype.
b_mask = (b_ratings != 0).astype(b_ratings.dtype)

# Check the predictions on the training entries only.
print(np.round(b_predictions * b_mask, 2))

[[0.   1.   0.   0.   0.   0.01 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   1.   0.   0.99 0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.   0.   0.  ]]


In [26]:
# Sigmoid scores for every cell, including those that are 0 (treated as missing) in b_ratings.
print(np.round(b_predictions, 2))  # Prediction for all entries.

[[0.56 1.   0.5  0.98 0.33 0.01 0.51 0.48 0.45 0.48]
 [0.5  0.55 0.5  0.54 0.48 0.46 0.5  0.5  0.5  0.5 ]
 [0.52 1.   0.53 0.99 0.06 0.02 0.52 0.49 0.47 0.5 ]
 [0.52 0.22 0.47 0.16 1.   0.56 0.5  0.52 0.46 0.47]
 [0.52 0.24 0.47 0.17 1.   0.53 0.5  0.52 0.46 0.47]]
