# Prepare data and module

In [None]:
import pandas as pd
import numpy as np
import sklearn
from collections import defaultdict

In [None]:
# Read the tab-separated rating file (no header row) and discard the
# timestamp column, which nothing below uses.
ratings = pd.read_csv(
  './movie_data/u.data',
  sep='\t',
  header=None,
  names=['userId','movieId','rating','timestamp'],
).drop(columns='timestamp')
ratings

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(
  ratings,
  test_size=0.2,
  random_state=1932,
)

In [None]:
# Sorted arrays of the distinct movie ids and user ids found in the training split.
movieIds, userIds = (np.unique(train[column]) for column in ('movieId', 'userId'))
print(f'movie 총 개수: {len(movieIds)}, user 총 수: {len(userIds)}')

In [None]:
# Lookup tables derived from the training split.
movieDict = defaultdict(dict) # [movieId][userId] = rating
userDict = defaultdict(dict) # [userId][movieId] = rating
# NOTE: the two mean tables keep the `dict` default factory because a later
# cell compares a missing entry against {}.
userRatingMeanDict = defaultdict(dict) # [userId] = mean(rating)
movieRatingMeanDict = defaultdict(dict) # [movieId] = mean(rating)
coRatedMovieDict = defaultdict(dict) # [movieId1][movieId2] = [[userId, rating1, rating2], ...]

# Walk the three columns in lockstep instead of going through the private
# DataFrame._get_value accessor once per cell.
for userId, movieId, rating in zip(train['userId'].to_numpy(),
                                   train['movieId'].to_numpy(),
                                   train['rating'].to_numpy()):
  movieDict[movieId][userId] = rating
  userDict[userId][movieId] = rating

# Mean rating given by each user.
for key, rated in userDict.items():
  userRatingMeanDict[key] = sum(rated.values())/len(rated)

# Mean rating received by each movie.
for key, rated in movieDict.items():
  movieRatingMeanDict[key] = sum(rated.values())/len(rated)

# For every ordered pair of distinct movies, collect the users who rated both
# together with their two ratings. Both orderings are stored because later
# cells look pairs up in either direction.
for item1, d1 in movieDict.items():
  for item2, d2 in movieDict.items():
    if item1 == item2:
      continue
    coRatedMovieDict[item1][item2] = [[x, d1[x], d2[x]] for x in d1 if x in d2]


# cosine similarity

In [None]:
def get_cosine_similarity(coRatedRi, coRatedRj):
  """Return the cosine similarity of two equally long rating vectors.

  Args:
    coRatedRi: ratings of the first item, one entry per co-rating user.
    coRatedRj: ratings of the second item, aligned with ``coRatedRi``.

  Returns:
    dot(Ri, Rj) / (|Ri| * |Rj|), or 0 when either vector has zero norm
    (including empty input) or the result is NaN.
  """
  I = np.asarray(coRatedRi, dtype=float)
  J = np.asarray(coRatedRj, dtype=float)
  denom = np.sqrt(np.sum(np.square(I))) * np.sqrt(np.sum(np.square(J)))
  # A zero norm would turn the division into 0/0; answer 0 directly instead
  # of producing NaN plus a RuntimeWarning.
  if denom == 0:
    return 0
  K = np.dot(I, J) / denom
  return 0 if np.isnan(K) else K


# Backward-compatible alias: this helper was originally published under the
# name get_correlation_similarity even though it computes cosine similarity.
get_correlation_similarity = get_cosine_similarity

In [None]:
# Item-item cosine similarity over the users who rated both movies.
# Each unordered pair is computed once (item1 < item2) and mirrored.
# Pairs whose accumulated products are zero get no entry at all.
cosineSimilarityDict = defaultdict(dict)
for item1 in movieDict:
  for item2 in movieDict:
    if item1 < item2:
      numer = denom1 = denom2 = 0.
      for userId, rating1, rating2 in coRatedMovieDict[item1][item2]:
        numer = numer + rating1*rating2
        denom1 = denom1 + rating1**2
        denom2 = denom2 + rating2**2
      if numer*denom1*denom2 != 0:
        similarity = numer / ((denom1**0.5) * (denom2**0.5))
        cosineSimilarityDict[item1][item2] = similarity
        cosineSimilarityDict[item2][item1] = similarity

In [None]:
# Spot check: cosine similarity between movies 1 and 2.
# Raises KeyError if that pair was skipped when the table was built.
cosineSimilarityDict[1][2]

# correlation similarity

In [None]:
def get_correlation_similarity(coRatedRi, coRatedRj):
  """Return the Pearson correlation of two equally long rating vectors.

  Each vector is centred on its own mean before the cosine of the centred
  vectors is taken.

  Args:
    coRatedRi: ratings of the first item, one entry per co-rating user.
    coRatedRj: ratings of the second item, aligned with ``coRatedRi``.

  Returns:
    The correlation, or 0 when both inputs are empty, when either centred
    vector has zero norm (e.g. constant ratings), or when the result is NaN.
  """
  R_i = np.asarray(coRatedRi, dtype=float)
  R_j = np.asarray(coRatedRj, dtype=float)
  # The mean of an empty array is NaN (with a RuntimeWarning); no overlap
  # simply means no measurable correlation.
  if R_i.size == 0 and R_j.size == 0:
    return 0
  I = R_i - np.mean(R_i)
  J = R_j - np.mean(R_j)
  denom = np.sqrt(np.sum(np.square(I))) * np.sqrt(np.sum(np.square(J)))
  # Zero variance on either side would make the division 0/0.
  if denom == 0:
    return 0
  K = np.dot(I, J) / denom
  return 0 if np.isnan(K) else K

In [None]:
# Item-item correlation similarity: each rating is centred on its movie's
# mean rating (from movieRatingMeanDict) before the cosine is taken over the
# co-rating users. Every pair gets an entry; degenerate pairs are stored as 0.
correlationSimilarityDict = defaultdict(dict)
for item1 in movieDict:
  for item2 in movieDict:
    if item1 < item2:
      numer = denom1 = denom2 = 0.
      for userId, rating1, rating2 in coRatedMovieDict[item1][item2]:
        dev1 = rating1-movieRatingMeanDict[item1]
        dev2 = rating2-movieRatingMeanDict[item2]
        numer = numer + dev1*dev2
        denom1 = denom1 + dev1**2
        denom2 = denom2 + dev2**2
      similarity = 0. if numer*denom1*denom2 == 0 else numer / ((denom1**0.5) * (denom2**0.5))
      correlationSimilarityDict[item1][item2] = similarity
      correlationSimilarityDict[item2][item1] = similarity

In [None]:
# Spot check: correlation similarity between movies 1 and 2.
correlationSimilarityDict[1][2]

# adjusted cosine similarity

In [None]:
def get_adjusted_cosine_similarity(coRatedRi, coRatedRj, RuMean):
  """Return the adjusted cosine similarity of two rating vectors.

  Both vectors are centred on ``RuMean`` before the cosine is taken.

  Args:
    coRatedRi: ratings of the first item, one entry per co-rating user.
    coRatedRj: ratings of the second item, aligned with ``coRatedRi``.
    RuMean: value(s) subtracted from both vectors; a scalar or a sequence
      aligned with the rating vectors.

  Returns:
    The similarity, or 0 when either centred vector has zero norm or the
    result is NaN.
  """
  R_i = np.asarray(coRatedRi, dtype=float)
  R_j = np.asarray(coRatedRj, dtype=float)
  offset = np.asarray(RuMean, dtype=float)
  I = R_i - offset
  J = R_j - offset
  denom = np.sqrt(np.sum(np.square(I))) * np.sqrt(np.sum(np.square(J)))
  # A zero norm would make the division 0/0 and return NaN; report 0 instead.
  if denom == 0:
    return 0
  K = np.dot(I, J) / denom
  return 0 if np.isnan(K) else K

In [None]:
# Item-item adjusted cosine similarity: each rating is centred on the rating
# user's mean (from userRatingMeanDict) before the cosine is taken over the
# co-rating users. Every pair gets an entry; degenerate pairs are stored as 0.
adjustedCosineSimilarityDict = defaultdict(dict)
for item1 in movieDict:
  for item2 in movieDict:
    if item1 < item2:
      numer = denom1 = denom2 = 0.
      for userId, rating1, rating2 in coRatedMovieDict[item1][item2]:
        dev1 = rating1-userRatingMeanDict[userId]
        dev2 = rating2-userRatingMeanDict[userId]
        numer = numer + dev1*dev2
        denom1 = denom1 + dev1**2
        denom2 = denom2 + dev2**2
      similarity = 0. if numer*denom1*denom2 == 0 else numer / ((denom1**0.5) * (denom2**0.5))
      adjustedCosineSimilarityDict[item1][item2] = similarity
      adjustedCosineSimilarityDict[item2][item1] = similarity

In [None]:
# Spot check: adjusted cosine similarity between movies 1 and 2.
adjustedCosineSimilarityDict[1][2]

# utils

In [None]:
# The position in this list identifies the similarity measure:
# 0 = cosine, 1 = correlation, 2 = adjusted cosine.
similarityDicts = [
  cosineSimilarityDict,
  correlationSimilarityDict,
  adjustedCosineSimilarityDict,
]

In [None]:
def get_k_neighbor_dict(similarityDict, K):
  """Keep the K most similar neighbours of every movie in ``movieIds``.

  Args:
    similarityDict: mapping movieId -> {neighbourId: similarity}.
    K: number of neighbours to keep per movie; 0 keeps all of them.

  Returns:
    defaultdict mapping each id in the module-level ``movieIds`` to a dict of
    its neighbours ordered by descending similarity. Movies without an entry
    in ``similarityDict`` map to an empty dict.
  """
  kSimilarityDict = defaultdict(dict)
  for i in movieIds:
    # .get() avoids inserting empty entries into a defaultdict input and also
    # lets plain dicts with missing movies through.
    ranked = sorted(similarityDict.get(i, {}).items(), key=lambda x: x[1], reverse=True)
    if K != 0:
      ranked = ranked[:K]
    kSimilarityDict[i] = dict(ranked)
  return kSimilarityDict

In [None]:
# One neighbour table per similarity measure; K = 0 keeps every neighbour.
kSimilarityDicts = list(map(lambda table: get_k_neighbor_dict(table, 0), similarityDicts))

In [None]:
# For each item i, find its most similar item N,
# take their co-rated entries coRatedMovieDict[i][N],
# and run a linear regression of N's ratings on i's ratings.
from sklearn.linear_model import LinearRegression

regRatingDict = defaultdict(dict) # [similarity index][movieId] = regression estimate
for i in movieIds:
  # enumerate keeps the slot index tied to the similarity measure even when
  # one measure is skipped for this movie.
  for j, ksd in enumerate(kSimilarityDicts):
    neighbors = ksd.get(i)
    # The regression is evaluated at item i's mean rating, so that mean must
    # exist; `in` does not insert placeholder entries into the defaultdict.
    if not neighbors or i not in movieRatingMeanDict:
      continue
    similarMovieId = next(iter(neighbors)) # neighbours are ordered best-first
    coRated = coRatedMovieDict[i].get(similarMovieId)
    if not coRated:
      continue
    users, Ri, Rn = np.array(coRated).T
    lineFitter = LinearRegression()
    lineFitter.fit(Ri.reshape(-1, 1), Rn)
    # predict() returns a one-element array; store the plain number.
    regRatingDict[j][i] = float(lineFitter.predict([[movieRatingMeanDict[i]]])[0])

In [None]:
from sklearn.linear_model import LinearRegression

def get_regression_rating(R_i, R_N):
  """Fit a line between two zero-padded rating columns and evaluate it once.

  Args:
    R_i: object whose ``['rating'].values`` gives the first rating array.
    R_N: object whose ``['rating'].values`` gives the second rating array.

  Returns:
    The array returned by ``LinearRegression.predict`` for a single query
    point.
  """
  ratings_i = R_i['rating'].values
  ratings_n = R_N['rating'].values
  # Each series is padded with as many zeros as the other one has entries,
  # so X and y end up with the same length.
  X = np.concatenate((ratings_i, np.zeros(len(ratings_n))))
  y = np.concatenate((ratings_n, np.zeros(len(ratings_i))))
  model = LinearRegression()
  model.fit(X.reshape(-1, 1), y)

  # NOTE(review): the divisor counts the entries of X below 1, not the
  # entries that hold ratings; confirm that this is the intended query point.
  query = np.sum(X) / np.count_nonzero(X < 1)
  return model.predict([[query]])

# prediction

In [None]:
def pred(u, i, kSimilarityDict):
  """Predict the rating user ``u`` would give movie ``i``.

  The prediction is the user's mean rating plus the similarity-weighted
  average of the user's mean-centred ratings on the neighbours of ``i``.

  Args:
    u: user id.
    i: movie id.
    kSimilarityDict: mapping movieId -> {neighbourId: similarity}.

  Returns:
    The prediction clamped to [1, 5]. When the user rated none of the
    neighbours (or all weights are zero) the baseline mean is returned
    unclamped.
  """
  # .get() keeps lookups of unknown ids from inserting empty entries.
  movieRatings = userDict.get(u, {}) # [movieId] = rating
  movieSimilarity = kSimilarityDict.get(i, {}) # [movieId] = similarity

  plus_mean = userRatingMeanDict.get(u)
  if plus_mean is None or isinstance(plus_mean, dict):
    # Unknown user: fall back to the average of the known user means.
    # Dict-valued entries are empty defaultdict placeholders, not means.
    knownMeans = [m for m in userRatingMeanDict.values() if not isinstance(m, dict)]
    plus_mean = sum(knownMeans)/len(knownMeans) if knownMeans else 0.

  weightedDeviation = 0.
  totalWeight = 0.
  for movieId, rating in movieRatings.items():
    if movieId in movieSimilarity:
      similarity = movieSimilarity[movieId]
      weightedDeviation += (rating-plus_mean)*similarity
      totalWeight += abs(similarity)

  if totalWeight == 0:
    return plus_mean
  p = weightedDeviation/totalWeight+plus_mean

  return min(max(p, 1.), 5.)

In [None]:
# Spot check: predicted rating of user 1 for movie 1 using the second
# neighbour table (index 1 of kSimilarityDicts).
pred(1, 1, kSimilarityDicts[1])

In [None]:
def pred_reg(u, i, kSimilarityDict, regRatingDict):
  """Predict the rating of user ``u`` for movie ``i`` using regression estimates.

  Works like the plain weighted-average prediction, except that for every
  neighbour present in ``regRatingDict`` the regression estimate replaces
  the user's own rating.

  Args:
    u: user id.
    i: movie id.
    kSimilarityDict: mapping movieId -> {neighbourId: similarity}.
    regRatingDict: mapping movieId -> regression estimate (a number or a
      one-element array).

  Returns:
    The prediction clamped to [1, 5]. When the user rated none of the
    neighbours (or all weights are zero) the baseline mean is returned
    unclamped.
  """
  # .get() keeps lookups of unknown ids from inserting empty entries.
  movieRatings = userDict.get(u, {}) # [movieId] = rating
  movieSimilarity = kSimilarityDict.get(i, {}) # [movieId] = similarity

  plus_mean = userRatingMeanDict.get(u)
  if plus_mean is None or isinstance(plus_mean, dict):
    # Unknown user: fall back to the average of the known user means.
    # Dict-valued entries are empty defaultdict placeholders, not means.
    knownMeans = [m for m in userRatingMeanDict.values() if not isinstance(m, dict)]
    plus_mean = sum(knownMeans)/len(knownMeans) if knownMeans else 0.

  weightedDeviation = 0.
  totalWeight = 0.
  for movieId, rating in movieRatings.items():
    if movieId not in movieSimilarity:
      continue
    if movieId in regRatingDict:
      # Estimates may arrive as one-element arrays; reduce them to a plain
      # float so the result stays a scalar.
      rating = float(np.asarray(regRatingDict[movieId], dtype=float).reshape(-1)[0])
    similarity = movieSimilarity[movieId]
    weightedDeviation += (rating-plus_mean)*similarity
    totalWeight += abs(similarity)

  if totalWeight == 0:
    return plus_mean
  p = weightedDeviation/totalWeight+plus_mean

  return min(max(p, 1.), 5.)

In [None]:
# Spot check: regression-based prediction of user 1 for movie 3, using the
# 10 nearest adjusted-cosine neighbours. pred_reg expects a neighbour table
# and a regression table, not a neighbour count and a method name.
pred_reg(1, 3, get_k_neighbor_dict(adjustedCosineSimilarityDict, 10), regRatingDict[2])

# evaluation

In [None]:
# Absolute prediction error of every held-out rating, one column per
# neighbour table in kSimilarityDicts.
mae = []
for i in test.index:
  userId = test.at[i, 'userId']
  movieId = test.at[i, 'movieId']
  rating = test.at[i, 'rating']
  row = []
  for dic in kSimilarityDicts:
    p = pred(userId, movieId, dic)
    row += [abs(p-rating)]
  mae += [row]

In [None]:
# Absolute error of the regression-based prediction for every held-out
# rating, one column per neighbour table in kSimilarityDicts.
mae_reg = []
for i in test.index:
  userId = test.at[i, 'userId']
  movieId = test.at[i, 'movieId']
  rating = test.at[i, 'rating']
  row = []
  # Pair each neighbour table with the regression table built from the same
  # similarity measure; the loop variable itself must be passed to pred_reg.
  for j, dic in enumerate(kSimilarityDicts):
    p = pred_reg(userId, movieId, dic, regRatingDict[j])
    # A prediction may come back as a one-element array; reduce it to a
    # float so every row has the same shape for np.mean.
    p = float(np.asarray(p, dtype=float).reshape(-1)[0])
    row.append(abs(p-rating))
  mae_reg.append(row)

In [None]:
# Mean absolute error per column of mae_reg (averaged over the rows).
np.mean(mae_reg, axis=0)

# visualization

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the mean absolute error per similarity measure (plain prediction).
plt.figure(figsize=(5, 5))
plt.bar(
  x=['cosine','corr','adj_cosine'],
  height=np.mean(mae, axis=0),
  width=0.3,
)
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the mean absolute error per similarity measure (regression-based prediction).
plt.figure(figsize=(5, 5))
plt.bar(
  x=['cosine','corr','adj_cosine'],
  height=np.mean(mae_reg, axis=0),
  width=0.3,
)
plt.show()