# Prepare data and modul

In [2]:
import pandas as pd
import numpy as np
import sklearn

In [3]:
ratings = pd.read_csv('./movie_data/ratings_small.csv')
ratings = ratings.drop('timestamp', axis=1)
ratings

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5


In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings, test_size=0.2, random_state=1932)

In [5]:
movieIds = np.unique(train['movieId'])
userIds = np.unique(train['userId'])
ratings = np.unique(train['rating'])

In [6]:
dfMovieUserTable = pd.pivot_table(train, columns='userId', index='movieId', fill_value=0.)
dfMovieUserTable

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,0.0,0,0.0,0,0.0,0.0,3,0.0,4,0,...,0,0.0,0.0,0,0,0,0,0,4,5.0
2,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,3,0,0,0,0,0,0.0
3,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,3,0,0,0,0,0,0.0
4,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,0,0,0,0,0,0,0.0
5,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,3,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161918,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,0,0,0,0,0,0,0.0
161944,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,0,0,0,0,0,0,0.0
162542,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,0,0,0,0,0,0,0.0
162672,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,...,0,0.0,0.0,0,0,0,0,0,0,0.0


# cosine similarity

In [None]:
def my_cosine_similarity(dataframe):
  """
  def cosine_similarity(X, Y=None, dense_output=True):
    X, Y = check_pairwise_arrays(X, Y)

    X_normalized = normalize(X, copy=True)
    if X is Y:
        Y_normalized = X_normalized
    else:
        Y_normalized = normalize(Y, copy=True)

    K = safe_sparse_dot(X_normalized, Y_normalized.T,
                        dense_output=dense_output)

    return K
  """
  I = normalize(dataframe, copy=True)
  norm = np.sqrt(np.sum(np.square(I), axis=1))
  return np.dot(I, I.T) / (norm * norm.T)

In [None]:
def cosine_similarity_vector(R_i, R_j):
  outer_join = pd.merge(R_i, R_j, how='outer', on='userId')
  I = normalize(outer_join['rating_x'].fillna(0))
  J = normalize(outer_join['rating_y'].fillna(0))
  I_norm = np.sqrt(np.sum(np.square(I)))
  J_norm = np.sqrt(np.sum(np.square(J)))
  return np.dot(I,J) / (I_norm * J_norm)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosineSimilarityMatrix = cosine_similarity(dfMovieUserTable)
cosineSimilarityMatrix

array([[1.        , 0.32944422, 0.27160746, ..., 0.        , 0.        ,
        0.06392494],
       [0.32944422, 1.        , 0.18739697, ..., 0.        , 0.        ,
        0.        ],
       [0.27160746, 0.18739697, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.06392494, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [None]:
cosineSimilarityMatrix = my_cosine_similarity(dfMovieUserTable)
cosineSimilarityMatrix


In [None]:
cosineSimilarityMatrix = pd.DataFrame(data=cosineSimilarityMatrix, index=dfMovieUserTable.index, columns=dfMovieUserTable.index)
cosineSimilarityMatrix

# correlation similarity

In [None]:
def corr_similarity(X):
  X = X.values
  mask = (X!=0.).astype(float)
  
  X_mean = np.sum(X, axis=1) / np.count_nonzero(X, axis=1)

  I = X-np.expand_dims(X_mean, axis=1)
  I = I*mask

  norm = np.sqrt(np.sum(np.square(I), axis=1))
  K = np.dot(I, I.T) / (norm * norm.T)
  K[np.isnan(K)] = 0
  return K

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

def corr_similarity_vector(R_i, R_j):
  inner_join = pd.merge(R_i, R_j, how='inner', on='userId')
  R_i = inner_join['rating_x']
  R_j = inner_join['rating_y']
  I = R_i-np.mean(R_i)
  J = R_j-np.mean(R_j)
  I_norm = np.sqrt(np.sum(np.square(I)))
  J_norm = np.sqrt(np.sum(np.square(J)))
  K = np.dot(I,J) / (I_norm * J_norm)
  return K


In [None]:
corr_similarity_vector(get_rating(movieId=1), get_rating(movieId=2))

0.4682789155833623

In [None]:
corrSimilarityMatrix = corr_similarity(dfMovieUserTable)
corrSimilarityMatrix

In [None]:
corrSimilarityMatrix = pd.DataFrame(data=corrSimilarityMatrix, index=dfMovieUserTable.index, columns=dfMovieUserTable.index)
corrSimilarityMatrix

# adjusted cosine similarity

In [None]:
def adjcos_similarity(X, R_u):
  X = X.values
  mask = (X!=0).astype(float)
  R_u_mean = np.sum(R_u, axis=0) / np.count_nonzero(R_u, axis=0)

  I = X-R_u_mean
  I = I*mask
  # isolate 구현이 이상했음
  # 분자는 어차피 0을 곱해주고 더해서 상호 평가되지않은 값들은 버려져서 괜찮은데
  # 분모에서 어떤 값을 버려야되는지 모름, 그래서 상호평가되지않은 값들도 norm 구하는데 추가되어서 계산이 이상해짐
  norm = np.sqrt(np.sum(np.square(I), axis=1))
  K = np.dot(I, I.T) / (norm * norm.T)
  K[np.isnan(K)] = 0
  return K

In [43]:
def get_user_means(dataframe):
  return np.sum(dataframe, axis=0)/np.count_nonzero(dataframe, axis=0)

In [44]:
print(get_user_means(dfMovieUserTable))

        userId
rating  1         2.500000
        2         3.461538
        3         3.606061
        4         4.358974
        5         3.929487
                    ...   
        667       3.693878
        668       3.800000
        669       3.300000
        670       3.680000
        671       3.886598
Length: 671, dtype: float64


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def adjcos_similarity_vector(R_i, R_j, R_u_mean):
  inner_join = pd.merge(R_i, R_j, how='inner', on='userId')
  R_i = inner_join['rating_x']
  R_j = inner_join['rating_y']
  I = R_i-np.mean(R_u)
  J = R_j-np.mean(R_u)
  I_norm = np.sqrt(np.sum(np.square(I)))
  J_norm = np.sqrt(np.sum(np.square(J)))
  K = np.dot(I,J) / (I_norm * J_norm)
  return K

In [None]:
adjcosSimilarityMatrix = adjcos_similarity(dfMovieUserTable, get_rating(userId=1)['rating'])
adjcosSimilarityMatrix

In [None]:
adjcosSimilarityMatrix = pd.DataFrame(data=adjcosSimilarityMatrix, index=dfMovieUserTable.index, columns=dfMovieUserTable.index)
adjcosSimilarityMatrix

# utils

In [None]:
def get_neighbor_items(targetItem, neighborNum, similarityMatrix):
  # targetItem이 없으면 어떻게하지?
  if not targetItem in similarityMatrix.index:
    return pd.DataFrame({'movieId':np.array([]), 'similarity':np.array([])})
    
  data = similarityMatrix[targetItem].sort_values(ascending=False)
  if neighborNum==0:
    data = data[1:]
  else:
    data = data[1:neighborNum+1]
  return pd.DataFrame({'movieId':data.index, 'similarity':data.values})

In [None]:
def get_k_neighbor_items(targetItem, K, similarityFunction):
  list(map(lambda id: {'movieId': id, 'similarity': similarityFunction(targetItem, id)}, movieIds))

In [None]:
len(np.unique(movieIds))

8417

In [None]:
len(movieIds)

80003

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

a = [3, 4, 0]
b = [2, 0, 5]
cosine_similarity(a, b)

In [9]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.1.tar.gz (212.3 MB)
[K     |████████████████████████████████| 212.3 MB 31.0 MB/s eta 0:00:01
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 16.7 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=b04a42095bb6e849f13c7f29c02f712d79f9e3654e1888317e49387e5fdbaac1
  Stored in directory: /Users/Zoo/Library/Caches/pip/wheels/43/47/42/bc413c760cf9d3f7b46ab7cd6590e8c47ebfd19a7386cd4a57
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [11]:
## pyspark를 불러옵니다.
from pyspark import SparkContext

sc = SparkContext(master="local", appName="first app")

In [31]:
# 너무 느린데
# matrix로 한꺼번에 안되나
# sqrt.sum.square 91.92306923866272
# norm 88.45666885375977
# 둘다 2시간 걸림


from collections import defaultdict
import time

start = time.time()

corr_sim_dict = defaultdict(dict)
for i in movieIds:
  for j in movieIds:
    if i==j:
      continue
    corr_sim_dict[i][j] = corr_similarity_vector(get_rating(movieId=i), get_rating(movieId=j))

end = time.time()
print(end - start)

  del sys.path[0]


88.45666885375977


In [9]:
def get_rating(userId=None, movieId=None):
  if movieId==None:
    return train.query(f'userId=={userId}').drop('userId', axis=1)
  elif userId==None:
    return train.query(f'movieId=={movieId}').drop('movieId', axis=1)

In [10]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

def get_regression_rating(R_i, R_N):
  X = np.concatenate((R_i['rating'].values, np.zeros(len(R_N['rating'].values))))
  y = np.concatenate((R_N['rating'].values, np.zeros(len(R_i['rating'].values))))
  line_fitter = LinearRegression()
  line_fitter.fit(X.reshape(-1,1), y)
  
  return line_fitter.predict([[np.sum(X) / np.count_nonzero(X < 1)]])

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def get_cos_similarity_matrix(dataframe):
  # cos_similarity_matrix = cosine_similarity(dataframe)
  cos_similarity_matrix = my_cosine_similarity(dataframe)
  cos_similarity_matrix = pd.DataFrame(data=cos_similarity_matrix, index=dataframe.index, columns=dataframe.index)
  return cos_similarity_matrix

def get_corr_similarity_matrix(dataframe):
  corr_similarity_matrix = corr_similarity(dataframe)
  corr_similarity_matrix = pd.DataFrame(data=corr_similarity_matrix, index=dataframe.index, columns=dataframe.index)
  return corr_similarity_matrix

def get_adjcos_similarity_matrix(dataframe, R_u):
  adjcos_similarity_matrix = adjcos_similarity(dataframe, R_u)
  adjcos_similarity_matrix = pd.DataFrame(data=adjcos_similarity_matrix, index=dataframe.index, columns=dataframe.index)  
  return adjcos_similarity_matrix


# prediction

In [None]:
def pred(u, i, neighborNum, similarityMatrix):
  movieRatings = get_rating(userId=u) # movieId, rating
  movieSimilarity = get_neighbor_items(i, neighborNum, similarityMatrix) # movieId, similarity

  inner_join = pd.merge(movieSimilarity, movieRatings, how='inner', on='movieId')

  s = inner_join['similarity']
  r = inner_join['rating']
  
  s_r = s*r
  abs_s = abs(s)
  if sum(abs_s)==0:
    return 3.
  return sum(s_r)/sum(abs_s)

In [None]:
pred(1, 1, 0, get_adjcos_similarity_matrix(dfMovieUserTable, get_rating(userId=1)['rating']))

In [None]:
def pred(u, i, neighborNum, similarityMatrix):
  movieRatings = get_rating(userId=u) # movieId, rating
  movieSimilarity = get_neighbor_items(i, neighborNum, similarityMatrix) # movieId, similarity

  inner_join = pd.merge(movieSimilarity, movieRatings, how='inner', on='movieId')

  s = inner_join['similarity']
  r = inner_join['rating']
  
  s_r = s*r
  abs_s = abs(s)
  if sum(abs_s)==0:
    return 3.
  return sum(s_r)/sum(abs_s)

In [None]:
def pred_adjcos(u, i, neighborNum):
  R_u = get_rating(userId=u) # movieId, rating
  # movieSimilarity = get_neighbor_items(i, neighborNum, similarityMatrix) # movieId, s_i,movieId
  R_i = get_rating(movieId=i)
  movieSimilarity = []
  for index, row in R_u.iterrows():
    R_j = get_rating(movieId=row["movieId"])
    # print(f'R_i: {R_i}, R_j: {R_j}, R_u: {R_u}')
    movieSimilarity.append({'movieId': row["movieId"], 'similarity': adjcos_similarity_vector(R_i, R_j, R_u)})
  movieSimilarity = pd.DataFrame(movieSimilarity)

  t1 = movieSimilarity.loc[movieSimilarity['movieId'].isin(R_u['movieId'].values)]
  s = t1['similarity'].values
  t2 = R_u.loc[R_u['movieId'].isin(t1['movieId'].values)]
  r = t2['rating'].values
  if len(s)==0:
    return 3.
  s_r = s*r
  abs_s = abs(s)
  return sum(s_r)/sum(abs_s)

In [None]:
def pred_reg(u, i, neighborNum, similarityMatrix):
  movieRatings = get_rating(userId=u) # movieId, rating
  movieSimilarity = get_neighbor_items(i, neighborNum, similarityMatrix) # movieId, s_i,movieId
  
  s = movieSimilarity['similarity'].values
  if len(s)==0:
    return 3.
  R_i = get_rating(movieId=i)
  r_reg = []
  for i in range(neighborNum):
    mostSimilarMovieId = movieSimilarity.loc[i]['movieId']
    R_N = get_rating(movieId=mostSimilarMovieId)
    regRating = get_regression_rating(R_i, R_N) # movieId, reg_rating
    r_reg.append(*regRating)

  s_r_reg = s*r_reg
  abs_s = abs(s)
  return sum(s_r_reg)/sum(abs_s)
  # return

In [None]:
pred_reg(1, 3, 10, 'adjusted_cosine')

evaluation

In [None]:
import math

similarityMatrix = []
similarityMatrix.append(get_cos_similarity_matrix(dfMovieUserTable))
similarityMatrix.append(get_corr_similarity_matrix(dfMovieUserTable))

mae = []
cnt = 0
for i in test.index:
  userId = test._get_value(i, 'userId')
  movieId = test._get_value(i, 'movieId')
  rating = test._get_value(i, 'rating')
  similarityMatrix.append(get_adjcos_similarity_matrix(dfMovieUserTable, get_rating(userId=userId)['rating']))
  row = []
  for matrix in similarityMatrix:
    p = pred(userId, movieId, 0, matrix)
    row.append(abs(p-rating))
  similarityMatrix.pop()
  if not math.isnan(row[0]):
    mae.append(row)
  cnt+=1
  if cnt==10:
    break
print(np.mean(mae, axis=0))



[0.77048764 1.55627745 1.33226396]


In [None]:
np.mean(list(filter(lambda v: v[0]==v[0], mae)), axis=0)

In [None]:
mae = []
for i in test.index:
  userId = test._get_value(i, 'userId')
  movieId = test._get_value(i, 'movieId')
  rating = test._get_value(i, 'rating')
  row = []
  for matrix in similarityMatrix:
    p = pred_reg(userId, movieId, 10, matrix)
    row.append(abs(p-rating))
  mae.append(row)
print(np.mean(mae, axis=0))

In [None]:
1 in similarityMatrix[0].index

In [None]:
similarityMatrix[0][1]