In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
from datetime import datetime
from heapq import heappush,heappop
from collections import Counter,defaultdict
import multiprocessing as mp

PATH="/home/yui/Documents/data/recommender/ratings_Beauty.csv"

func = lambda x:datetime.utcfromtimestamp(x).\
        strftime('%Y-%m-%d %H:%M:%S')
df=pd.read_csv(PATH)
df['Date']=df['Timestamp'].apply(func)
df.head(5)

Unnamed: 0,UserId,ProductId,Rating,Timestamp,Date
0,A39HTATAQ9V7YF,205616461,5.0,1369699200,2013-05-28 00:00:00
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200,2012-12-14 00:00:00
2,A1Z513UWSAAO0F,558925278,5.0,1404691200,2014-07-07 00:00:00
3,A1WMRR494NWEWV,733001998,4.0,1382572800,2013-10-24 00:00:00
4,A3IAAVS479H7M7,737104473,1.0,1274227200,2010-05-19 00:00:00


In [2]:
print("No. of Unique Users: ",df['UserId'].unique().shape)
print("No. of Unique Products: ",df["ProductId"].unique().shape)
print("Shape of dataframe: ",df.shape)

No. of Unique Users:  (1210271,)
No. of Unique Products:  (249274,)
Shape of dataframe:  (2023070, 5)


#### User-User Collaborative Filtering
Given the rating matrix, $i$ is the user, $j$ is the item,
$$
\mathbf{R} = \begin{pmatrix}
r_{00} & \cdots & r_{0j}\\
\vdots & \ddots & \vdots\\
r_{i0} & \cdots & r_{ij}\\
\end{pmatrix}
$$
Originally, the average rating for item j can be treated as suggestion criteria, 
$$s_j = \frac{1}{|\Omega_j|}\sum_{i\in\Omega_j}r_{ij}$$
where $\Omega_j$ set of all users who rated item $j$, $r_{ij}$ rating user $i$ gave item $j$.

Personalized score can be written as, 
$$s_{ij}=\frac{1}{|\Omega_j|}\sum_{i'\in\Omega_j }r_{i'j}$$

But some users have less high ratings than other users, it might be not be too fair to include in the same scale. The better measurement of rating should be average of the deviation between the average ratings of user $\bar{r}_i$ and the rating of the product $r_{ij}$.


$$\begin{align*}
\delta_{ij} &= r_{ij}-\bar{r}_i \\
\hat{\delta}_{ij} &= \frac{1}{|\Omega|}\sum_{i'\in \Omega_j} r_{i'j}-\bar{r}_{i'} \\
s_{ij} &= \bar{r}_i +\hat{\delta}_{ij}
\end{align*}$$

The algorithm is to predictive the empty cell $s_{ij}$ as $\hat{r}_{ij}$ so as to guess what user $i$ might rate item $j$. This becomes a regression problem, and mean-squared error is a good metric to estimate the model performance, 

$$\Delta = \frac{1}{\Omega}\sum_{i,j\in\Omega}(r_{ij}-\hat{r}_{ij})^2$$
where $\Omega$ set of pairs $(i,j)$ where user $i$ has rated item $j$.

The weight ratings can be used to suggest similar users with similar preferences to watch the same movie, but unsuggest different users to watch the same. 

$$s_{ij}=\frac{\sum_{i'\in\Omega_j} w_{ii'}r_{i'j}}{\sum_{i'\in\Omega_j }w_{ii'}}$$

where the weight should be great if both users are similar, be small if different.

Finally, the expected rating should be summarized as,

$$s_{ij}=\bar{r}_i + \frac{\sum_{i'\in\Omega_j} w_{ii'}(r_{i'j}-\bar{r}_{i'})}{\sum_{i'\in\Omega_j }w_{ii'}}$$

The weight can be calculated as pearson correlation coefficient, 

$$
\rho_{xy} = \frac{\sum^N_{i=1}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum^N_{i=1}(x_i-\bar{x})^2}\sqrt{\sum^N_{i=1}(y_i-\bar{y})^2}}
$$

$$
w_{ii'} = \frac{\sum_{j\in\Psi_{ii'}}(r_{ij}-\bar{r}_i)(r_{i'j}-\bar{r}_{i'})}{\sqrt{\sum_{j\in\Psi_{ii'}}(r_{ij}-\bar{r}_i)^2}\sqrt{\sum_{j\in\Psi_{ii'}}(r_{i'j}-\bar{r}_{i'})^2}}
$$

where $\Psi_i$ set of items that user $i$ has rated, $\Psi_{ii'}$ set of items both user $i$ and $i'$ have rated, i.e.$\Psi_{ii'}=\Psi_i\cap\Psi_{i'}$. This is equivalent to cosine similarity since $x$ and $y$ are deviations already.

$$\cos\theta = \frac{x^\top y}{|x||y|} = \frac{\sum^N_{i=1}x_iy_i}{\sqrt{\sum^N_{i=1}x_i^2}\sqrt{\sum^N_{i=1}y_i^2}}$$

In [3]:
u2id,id2u,p2id,id2p={},{},{},{}
u2p = defaultdict(dict)
for i in tqdm(range(df.shape[0])):
    user = df.iloc[i]['UserId']
    item = df.iloc[i]['ProductId']
    rating = df.iloc[i]['Rating']
    if user not in u2id:
        u2id[user]=i
        id2u[i]=user
    if item not in p2id:
        p2id[item]=i
        id2p[i]=item
    rateDict = u2p.get(u2id[user],{})
    rateDict[p2id[item]]=rating
    u2p[u2id[user]] = rateDict

100%|██████████| 2023070/2023070 [07:39<00:00, 4400.89it/s]


In [4]:
len(u2p)

1210271

In [5]:
set(u2p[0])

{0, 899062, 969481, 1499663}

In [6]:
def findMatchesItem(target):
    rated = u2p[target]
    p = set(rated)
    if len(p)>3: return 
    res = [] # user who has rated the same products
    for i in range(len(u2p)):
        if i==target:
            continue
        rated_ = u2p[i]
        common = p & set(rated_)
        if len(common)==len(p):
            res.append(i)
    if len(p)>1 and len(res)>0:
        print(target,res)

In [7]:
with mp.Pool(processes=12) as pool:
    with tqdm(total=len(u2p)) as pbar:
        for i, _ in enumerate(pool.imap_unordered(\
            findMatchesItem, range(0,len(u2p)))):
            pbar.update()
            if i==600:
                break

  0%|          | 144/1210271 [00:07<23:06:38, 14.55it/s]

143 [147]


  0%|          | 150/1210271 [00:08<18:18:27, 18.36it/s]

147 [143]


  0%|          | 211/1210271 [00:10<18:10:23, 18.50it/s]

213 [216]


  0%|          | 290/1210271 [00:14<22:00:49, 15.27it/s]

285 [283]


  0%|          | 367/1210271 [00:17<10:22:00, 32.42it/s]

355 [352, 367]


  0%|          | 445/1210271 [00:21<16:20:29, 20.57it/s]

437 [436, 438, 441]


  0%|          | 500/1210271 [00:24<19:11:26, 17.51it/s]

506 [510, 527, 529, 532]


  0%|          | 513/1210271 [00:24<15:51:49, 21.18it/s]

510 [506, 527, 529, 532]


  0%|          | 516/1210271 [00:25<20:09:13, 16.67it/s]

517 [498, 528]


  0%|          | 525/1210271 [00:25<18:13:12, 18.44it/s]

529 [506, 510, 527, 532]


  0%|          | 528/1210271 [00:25<19:38:37, 17.11it/s]

532 [506, 510, 527, 529]


  0%|          | 534/1210271 [00:25<16:58:00, 19.81it/s]

528 [498, 517]


  0%|          | 593/1210271 [00:28<13:51:50, 24.24it/s]

591 [606]


  0%|          | 621/1210271 [00:29<16:10:11, 20.78it/s]Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-8:
Process ForkPoolWorker-12:
Process ForkPoolWorker-2:
Process ForkPoolWorker-11:
Process ForkPoolWorker-5:
Process ForkPoolWorker-7:
Process ForkPoolWorker-9:
Process ForkPoolWorker-3:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yui/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yui/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/yui/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/ho

KeyboardInterrupt: 

In [8]:
for i in [529,506,510,527,532]:
    print(u2p[i])

{494: 5.0, 1080644: 5.0}
{494: 5.0, 1080644: 4.0}
{494: 5.0, 1080644: 5.0}
{494: 5.0, 1080644: 5.0, 1391035: 5.0, 1702927: 5.0}
{494: 5.0, 1080644: 5.0}


#### User-User Collaborative Filtering
Given the rating matrix, $i$ is the user, $j$ is the item,
$$
\mathbf{R} = \begin{pmatrix}
r_{00} & \cdots & r_{0j}\\
\vdots & \ddots & \vdots\\
r_{i0} & \cdots & r_{ij}\\
\end{pmatrix}
$$

The expected rating should be summarized as,

$$s_{ij}=\bar{r}_i + \frac{\sum_{i'\in\Omega_j} w_{ii'}(r_{i'j}-\bar{r}_{i'})}{\sum_{i'\in\Omega_j }w_{ii'}}$$

The weight can be calculated as pearson correlation coefficient, 

$$
w_{ii'} = \frac{\sum_{j\in\Psi_{ii'}}(r_{ij}-\bar{r}_i)(r_{i'j}-\bar{r}_{i'})}{\sqrt{\sum_{j\in\Psi_{ii'}}(r_{ij}-\bar{r}_i)^2}\sqrt{\sum_{j\in\Psi_{ii'}}(r_{i'j}-\bar{r}_{i'})^2}}
$$
where $\Psi_i$ set of items that user $i$ has rated, $\Psi_{ii'}$ set of items both user $i$ and $i'$ have rated, i.e.$\Psi_{ii'}=\Psi_i\cap\Psi_{i'}$. 

In [28]:
user = 529
# product = 1391035
product = 1391000
candidates = [506,510,527,532]
u2p[527][product]=4
setProduct = set(u2p[user])
up,down = 0,0
eps = np.finfo(float).eps
user_mean = np.array(list(u2p[user].values())).mean()
for candidate in candidates:
    if product not in u2p[candidate]:
        continue #skip
    common = setProduct & set(u2p[candidate])
    difu = np.array([u2p[user][c] \
            for c in common])-user_mean+eps
    candidate_mean = np.array(list(u2p[candidate].values())).mean()
    difu_ = np.array([u2p[candidate][c] \
            for c in common])-candidate_mean+np.sqrt((difu**2).sum())*np.sqrt((difu_**2).sum())+eps
    w = ((difu*difu_)).sum()/np.sqrt((difu**2).sum())/np.sqrt((difu_**2).sum())
    up += w*(u2p[candidate].get(product,4)-candidate_mean)
    down += w
s = user_mean + up/down
print(up,down,candidate_mean,user_mean,up/down)
print(user_mean,s)

-0.7999999999999998 1.0 4.8 5.0 -0.7999999999999998
5.0 4.2


#### Movie recommendation dataset

In [17]:
PATH="/home/yui/Documents/data/recommender/movieLens20M/rating.csv"
df = pd.read_csv(PATH)
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [18]:
print("Dataframe shape: ",df.shape)
print("Unique User: ",len(df["userId"].unique()))
print("Unique Movie: ",len(df["movieId"].unique()))

Dataframe shape:  (20000263, 4)
Unique User:  138493
Unique Movie:  26744
