In [0]:
# google driveのファイルを使う時に使用
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# google check auth
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# google driveのファイル取得
downloaded = drive.CreateFile({'id':'1FbY8_qFo5_oHhK31_Zn5QDt0iIzwkqBt'})

# Download the file to a local disk as 'sample.csv'.
downloaded.GetContentFile('ml-100k.zip')

In [3]:
!unzip ml-100k.zip

Archive:  ml-100k.zip
   creating: sushi3-2016/
  inflating: sushi3-2016/README-en.txt  
  inflating: sushi3-2016/README-ja.txt  
  inflating: sushi3-2016/README-stat-ja.txt  
  inflating: sushi3-2016/sushi3.idata  
  inflating: sushi3-2016/sushi3.udata  
  inflating: sushi3-2016/sushi3a.5000.10.order  
  inflating: sushi3-2016/sushi3b.5000.10.order  
  inflating: sushi3-2016/sushi3b.5000.10.score  


In [0]:
import numpy as np

scores = np.loadtxt('sushi3-2016/sushi3b.5000.10.score', delimiter=' ')

## 相関係数の計算

In [0]:
def get_correlation_coefficents(scores, target_user_index):
  similarities = []
  target = scores[target_user_index]

  for i, score in enumerate(scores):
    # 共通の評価が少ない場合は除外
    indices = np.where(((target + 1) * (score + 1)) != 0)[0]
    if len(indices) < 3 or i == target_user_index:
        continue

    similarity = np.corrcoef(target[indices], score[indices])[0, 1]
    if np.isnan(similarity):
        continue

    similarities.append((i, similarity))

  return sorted(similarities, key=lambda s: s[1], reverse=True)

In [29]:
target_user_index = 0 # 0番目のユーザ
similarities = get_correlation_coefficents(scores, target_user_index)

  c /= stddev[:, None]
  c /= stddev[None, :]


## 予測

In [0]:
def predict(scores, similarities, target_user_index, target_item_index):
  target = scores[target_user_index]
  avg_target = np.mean(target[np.where(target >= 0)])

  numerator = 0.0
  denominator = 0.0
  k = 0

  for similarity in similarities:
    if k > 5 or similarity[1] <= 0.0:
      break

    score = scores[similarity[0]]
    if score[target_item_index] >= 0:
      denominator += similarity[1]
      numerator += similarity[1] * (score[target_item_index] - np.mean(score[np.where(score >= 0)]))
      k += 1

  return avg_target + (numerator / denominator) if denominator > 0 else -1

In [33]:
target_item_index = 0
target_user_index = 1

print('Predict score: {:.3f}'.format(predict(scores, similarities, target_user_index, target_item_index)))

Predict score: 1.761


In [0]:
def rank_items(scores, similarities, target_user_index):
  rankings = []
  ranking = scores[target_user_index]
  for i in range(100):
    if target[i] >= 0:
      continue

    rankings.append((i, predict(scores, similarities, target_user_index, i)))

  return sorted(rankings, key=lambda r:r[1], reverse=True)

In [37]:
target_user_index = 0

rank = rank_items(scores, similarities, target_user_index)
print('Ranking: {}'.format(rank))

Ranking: [(92, 2.85), (3, 2.8), (61, 2.7935924227841857), (0, 2.7609467700985615), (37, 2.7211344002240043), (58, 2.6919033932057106), (84, 2.6), (5, 2.5946657805828632), (75, 2.555719199760294), (53, 2.367637120239299), (10, 2.3660238723670055), (38, 2.3650101398862566), (91, 2.3000000000000003), (40, 2.2955602964441466), (19, 2.2760501211322266), (79, 2.2208255433546236), (9, 2.2146082646298866), (13, 2.157370476323419), (70, 2.1485331036951845), (2, 2.1), (47, 2.0565551731059193), (78, 1.9785348971392018), (65, 1.9479070809100525), (82, 1.9406893317388043), (88, 1.8867114232725926), (15, 1.846798191540819), (43, 1.7716437568938366), (26, 1.7645254789772584), (73, 1.7516268967471267), (34, 1.7431005911953052), (57, 1.7371937923981557), (4, 1.7000000000000002), (20, 1.655635066414566), (7, 1.6490878530385313), (51, 1.63820030388256), (42, 1.5787365846926897), (35, 1.4654437700001424), (44, 1.4385173305379528), (29, 1.4352200166533615), (99, 1.4220564424295503), (16, 1.3901887112777946