In [1]:
import numpy as np
import pandas as pd
from scipy import spatial

### Euclidean Distance Similarity

두 vector 사이의 거리를 측정하여 거리가 가까울수록 유사도가 높음을 의미

In [None]:
# Sample vectors for the Euclidean-distance example:
# vector_2 is vector_1 reversed, vector_3 is vector_1 with every element shifted by +1.
vector_1 = np.array([1, 2, 3, 4, 5])
vector_2 = np.array([5, 4, 3, 2, 1])
vector_3 = np.array([2, 3, 4, 5, 6])

In [None]:
# Euclidean distance between vector_1 and vector_2, built up one step at a time.
# 1) element-wise difference
print(np.subtract(vector_1, vector_2))
# 2) squared differences
print(np.square(vector_1 - vector_2))
# 3) sum of the squared differences
print(np.sum(np.square(vector_1 - vector_2)))
# 4) square root of that sum = the distance
print(np.sqrt(np.sum(np.square(vector_1 - vector_2))))

In [None]:
# Same distance via np.linalg.norm of the difference vector,
# shown for the pairs (vector_1, vector_2) and (vector_1, vector_3).
np.linalg.norm(vector_1 - vector_2), np.linalg.norm(vector_1 - vector_3)

vector_1과 vector_3의 거리가 더 가까우므로 Euclidean Distance Similarity가 더 높다고 할 수 있다.

### Cosine Similarity

두 vector의 내적을 각 vector의 크기(norm)의 곱으로 나누어 cosine값을 구한다. cosine값이 클수록(1에 가까울수록) 두 vector 사이의 각도가 작으므로(방향성이 비슷함) 유사도가 크다고 할 수 있다.

In [None]:
# Sample vectors for the cosine-similarity example.
# vector_3 is redefined here with much larger values than in the Euclidean example.
vector_1 = np.array([1, 2, 3, 4, 5])
vector_2 = np.array([5, 4, 3, 2, 1])
vector_3 = np.array([11, 19, 28, 32, 47])

In [None]:
# Dot product computed two ways: summing the element-wise products, then np.dot.
print(sum(vector_1 * vector_2))
print(np.dot(vector_1, vector_2))

In [None]:
# Vector lengths, built up one step at a time.
# 1) element-wise squares of each vector
print(vector_1 ** 2, vector_2 ** 2)
# 2) sum of squares (squared length)
print(np.sum(vector_1 ** 2), np.sum(vector_2 ** 2))
# 3) square root of the sum of squares (length)
print(np.sqrt(np.sum(vector_1 ** 2)), np.sqrt(np.sum(vector_2 ** 2)))

In [None]:
# Cosine similarity by hand: dot product divided by the product of the two vector lengths.
print(np.dot(vector_1, vector_2) / (np.sqrt(sum(vector_1 * vector_1)) * np.sqrt(sum(vector_2 * vector_2))))

In [None]:
# spatial.distance.cosine returns the cosine *distance*, so similarity = 1 - distance.
# Shown for the pairs (vector_1, vector_2) and (vector_1, vector_3).
1 - spatial.distance.cosine(vector_1, vector_2), 1 - spatial.distance.cosine(vector_1, vector_3)

Cosine Similarity의 측면에서 vector_1과 vector_3의 유사도가 더 크다.

In [2]:
# sample data sets: labels for five article columns and four user rows
columns = ["article_{}".format(n) for n in range(1, 6)]
index = ["user_{}".format(n) for n in range(1, 5)]

In [3]:
# One row per user and one column per article (labels come from `index` / `columns`).
# A score of 0 is treated by the later cells as "no activity" on that article.
data = np.array([[5,3,0,0,2],
                 [2,0,0,1,4],
                 [0,0,4,3,1],
                 [4,0,4,5,0]])

sample_df = pd.DataFrame(data = data, index = index, columns = columns)

In [4]:
sample_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_1,5,3,0,0,2
user_2,2,0,0,1,4
user_3,0,0,4,3,1
user_4,4,0,4,5,0


In [None]:
# value가 0인 데이터는 활동이 없었다는 것을 의미하여 삭제.
# 사용자 기반 유사도 필터링

In [9]:
def cosine_similarity(vector_1, vector_2) :
    """Cosine similarity of two score vectors, using only the positions where vector_1 is non-zero.

    Parameters
    ----------
    vector_1, vector_2 : array-like (ndarray, pandas Series, list)
        Equal-length 1-D score vectors.

    Returns
    -------
    float
        1 - scipy cosine distance of the two masked vectors.

    Notes
    -----
    The mask comes from vector_1 only, so the result is not symmetric:
    cosine_similarity(a, b) may differ from cosine_similarity(b, a).
    No special handling is done when a masked vector is empty or all zeros;
    the result is whatever scipy returns for that input.
    """
    # Convert first: pandas Series no longer has .nonzero() (removed in pandas 1.0),
    # and similarity_matrix passes Series in. np.asarray is a no-op for ndarrays.
    vector_1, vector_2 = np.asarray(vector_1), np.asarray(vector_2)

    # Positions where vector_1 has a non-zero score
    idx = vector_1.nonzero()[0]

    return 1 - spatial.distance.cosine(vector_1[idx], vector_2[idx])

In [12]:
def similarity_matrix(sample_df, similarity_func) :
    """Build a square frame of pairwise similarities between the rows of sample_df.

    Entry (row i, column j) is similarity_func(row_i, row_j), where each row is
    passed as a pandas Series. Both axes of the result are labelled with
    sample_df.index.
    """
    labels = sample_df.index

    # Transpose so that iterating over columns yields one Series per original row.
    by_row = sample_df.T

    scores = [
        [similarity_func(left, right) for _, right in by_row.items()]
        for _, left in by_row.items()
    ]

    return pd.DataFrame(scores, columns = labels, index = labels)

In [13]:
# User-by-user similarity matrix: entry (row i, column j) is cosine_similarity(user_i, user_j).
# cosine_similarity masks by the first argument's non-zero entries only, so the matrix is not symmetric.
sm_df = similarity_matrix(sample_df, cosine_similarity)
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_1,1.0,0.652929,0.324443,0.811107
user_2,0.729397,1.0,0.483046,0.443039
user_3,0.196116,0.332956,1.0,0.949474
user_4,0.529813,0.770054,0.82121,1.0


In [None]:
# 추천할 대상을 정하고 그 대상에 맞는 유사도 행렬구하기

In [16]:
# Target user to recommend for, and how many of the most similar users to keep.
user, closer_count = 'user_1', 2

In [17]:
ms_df = sm_df.drop(user)
# Remove the target user's own row so they cannot be picked as their own neighbour.

In [18]:
# Rank the remaining users by the target user's column of the similarity matrix, highest first.
ms_df = ms_df.sort_values(user, ascending = False)

In [19]:
# Keep only the first `closer_count` rows, i.e. the most similar users.
ms_df = ms_df[:closer_count]

In [20]:
ms_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_2,0.729397,1.0,0.483046,0.443039
user_4,0.529813,0.770054,0.82121,1.0


In [22]:
sample_df.loc[ms_df.index]

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_2,2,0,0,1,4
user_4,4,0,4,5,0


In [23]:
# user데이터에대한 컨텐츠별 평균 구하기

In [24]:
# Per-article mean of the selected neighbours' scores.
# DataFrame.mean() replaces the manual accumulation loop (whose `sms_value` was never used)
# and mirrors what mean_score() does; .values keeps `mean` a NumPy array as before.
mean = sample_df.loc[ms_df.index].mean().values

In [25]:
# Two-row frame over the article columns:
# "user" holds the target user's own scores, "mean" holds the neighbours' per-article mean.
pred_df = pd.DataFrame(columns = sample_df.columns)
pred_df.loc["user"] = sample_df.loc[user]
pred_df.loc['mean'] = mean

In [26]:
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


In [None]:
# 평균 스코어 행렬

In [27]:
def mean_score(sample_df, sm_df, target, closer_count) :
    """Compare the target's scores with the mean scores of its most similar users.

    Parameters
    ----------
    sample_df : DataFrame of scores, one row per user.
    sm_df : square user-by-user similarity DataFrame.
    target : row/column label of the user to recommend for.
    closer_count : number of most similar users to average over.

    Returns
    -------
    DataFrame with sample_df's columns and two rows:
    'user' (the target's own scores) and 'mean' (column-wise mean of the neighbours).
    """
    # Rank every other user by the target's column of the similarity matrix.
    ranked = sm_df.drop(target).sort_values(target, ascending = False)
    neighbours = ranked[target][:closer_count].index

    neighbour_scores = sample_df.loc[neighbours]

    pred_df = pd.DataFrame(columns = sample_df.columns)
    pred_df.loc['user'] = sample_df.loc[target]
    pred_df.loc['mean'] = neighbour_scores.mean()

    return pred_df

In [28]:
# Target user and neighbour count passed to mean_score().
target, closer_count = 'user_1', 2

In [29]:
# Rebuild the user / mean comparison frame through the mean_score() helper.
pred_df = mean_score(sample_df, sm_df, target, closer_count)

In [30]:
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


In [31]:
#user가 읽지 않은 컨텐츠를 순서대로 나열, 추천기사 정렬 및 출력

In [32]:
# One row per article; keep the articles the target user scored 0,
# ordered by the neighbours' mean score, highest first.
recommend_df = (
    pred_df.T
    .loc[lambda scores: scores['user'] == 0]
    .sort_values('mean', ascending = False)
)

In [33]:
print(list(recommend_df.index))

['article_4', 'article_3']


In [35]:
recommend_df

Unnamed: 0,user,mean
article_4,0,3
article_3,0,2
