In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds


In [2]:
# Load the raw ratings table from the CSV file next to the notebook.
df = pd.read_csv("Data.csv")

# Reshape into a grid with one row per reader_id and one column per book_id,
# holding book_rating in the cells (pivot_table's default aggregation, the
# mean, is applied if a reader/book pair occurs more than once).
# Reader/book pairs without a rating come out as NaN and are replaced by 0.
pt_df = df.pivot_table(values = 'book_rating', index = 'reader_id', columns = 'book_id').fillna(0)

In [4]:
pt_df.shape

(987, 838)

In [5]:
# Store the pivot table's values in SciPy's compressed sparse row format,
# which is the input handed to svds further down.
mat = csr_matrix(pt_df.values)

In [6]:
def generate_prediction_df(mat, pt_df, n_factors):
    '''
    Build a DataFrame of predicted ratings from a truncated singular value
    decomposition (SVD) of the input matrix.

    The matrix is factorized with scipy's svds keeping n_factors singular
    values, the factors are multiplied back together into a dense low-rank
    approximation, the result is rescaled with normalize() and finally
    labelled with the axes of pt_df.

    params:
        mat (CSR Matrix) : scipy csr matrix corresponding to the pivot table (pt_df).
                           Non-floating data (e.g. integer ratings) is cast to
                           float64, because svds only accepts floating point input.
        pt_df (DataFrame) : pandas dataframe which is a pivot table; its index and
                            columns are used to label the predictions
        n_factors (Integer) : Number of singular values and vectors to compute.
                              Must be 1 <= n_factors < min(mat.shape).

    returns:
        DataFrame : the normalized predictions, transposed so that the rows
                    follow pt_df.columns and the columns follow pt_df.index

    raises:
        ValueError : if n_factors is outside 1 <= n_factors < min(mat.shape)
    '''

    if not 1 <= n_factors < min(mat.shape):
        raise ValueError("Must be 1 <= n_factors < min(mat.shape)")

    # svds refuses integer/boolean matrices, so promote them up front instead
    # of failing inside the solver; floating input is passed through untouched
    if not np.issubdtype(mat.dtype, np.inexact):
        mat = mat.astype(np.float64)

    # matrix factorization: u is (n_rows x k), s holds the k singular values,
    # vt is (k x n_cols)
    u, s, vt = svds(mat, k = n_factors)

    # calculate pred ratings: scaling the columns of u by s is equivalent to
    # u @ diag(s) but avoids materializing the k x k diagonal matrix
    pred_ratings = np.dot(u * s, vt)
    pred_ratings = normalize(pred_ratings)

    # convert to df
    pred_df = pd.DataFrame(
        pred_ratings,
        columns = pt_df.columns,
        index = list(pt_df.index)
    ).transpose()

    return pred_df

In [8]:
def normalize(pred_ratings):
    '''
    Min-max scale the prediction ratings into the range [0, 1], using the
    global minimum and maximum of the whole input.

    params:
        pred_ratings (array-like) : The prediction ratings, e.g. a 2-D numpy
                                    array or a list of lists

    returns:
        ndarray : float array with the same shape as the input.
                  If every value is identical there is no range to scale by,
                  so zeros are returned rather than dividing by zero (which
                  would fill the result with NaN). An empty input comes back
                  as an empty float array.
    '''
    # accept plain nested lists as well as arrays
    ratings = np.asarray(pred_ratings)

    # min()/max() are undefined on an empty array; nothing to scale anyway
    if ratings.size == 0:
        return ratings.astype(float)

    lowest = ratings.min()
    spread = ratings.max() - lowest

    # constant input: avoid the 0 / 0 division
    if spread == 0:
        return np.zeros(ratings.shape, dtype = float)

    return (ratings - lowest) / spread

In [9]:
# Build the prediction frame from an SVD that keeps 10 singular values
pred_df = generate_prediction_df(mat, pt_df, 10)

In [10]:
pred_df

Unnamed: 0_level_0,2,21,43,85,121,129,138,169,200,253,...,29758,29772,29789,29802,29825,29906,29942,29964,29972,30000
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15
3,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15
21,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15
22,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15
23,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2977,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15
2979,3.645535e-15,3.684452e-15,3.652566e-15,3.644133e-15,3.644696e-15,3.645377e-15,3.645877e-15,3.645391e-15,3.644075e-15,3.644894e-15,...,3.645006e-15,3.643596e-15,3.650472e-15,3.645541e-15,3.645144e-15,3.645992e-15,3.645854e-15,3.644496e-15,3.645909e-15,3.645620e-15
2981,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15
2984,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,...,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15,3.645341e-15


In [11]:
# Repeat the factorization by hand, with the same k that was passed to
# generate_prediction_df, so each intermediate can be inspected cell by cell
u, s, v = svds(mat, k = 10)

In [13]:
u.shape

(987, 10)

In [14]:
s.shape

(10,)

In [15]:
s

array([13.45362405, 13.52079729, 13.60147051, 13.66849023, 14.14213562,
       14.14213562, 14.14213562, 14.1813409 , 14.3454562 , 15.65247584])

In [16]:
v.shape

(10, 838)

In [17]:
v

array([[ 1.49325594e-17, -6.68157430e-18, -2.28697926e-17, ...,
         1.50591342e-18,  2.03209986e-17, -2.19778691e-17],
       [ 8.41500267e-18,  6.33855518e-18, -1.38335279e-17, ...,
         4.68940432e-18,  9.88623437e-18,  2.13639947e-17],
       [ 7.86366541e-18, -8.70177233e-19,  1.47827415e-17, ...,
         4.61493461e-18, -1.32020049e-17, -3.00919274e-18],
       ...,
       [-3.91059531e-18, -3.04570928e-18, -4.51764105e-18, ...,
         1.09771262e-18,  3.96828478e-18, -9.02277369e-18],
       [-5.73289682e-18,  1.77702856e-18,  3.23412993e-18, ...,
        -1.56762600e-18, -5.99088342e-18,  5.33119300e-18],
       [ 4.21908913e-19,  4.81490546e-18,  1.05666158e-18, ...,
         8.43452363e-19, -4.76915096e-18,  1.34003896e-17]])

In [18]:
# Expand the 1-D vector of singular values into a square diagonal matrix.
# NOTE(review): this overwrites `s`, so the cell is not idempotent; running it
# a second time hands np.diag a 2-D array, which returns the 1-D diagonal again.
s = np.diag(s)

In [19]:
s

array([[13.45362405,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        , 13.52079729,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        , 13.60147051,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 13.66849023,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        , 14.14213562,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        14.14213562,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , 14.14213562,  0.        

In [20]:
s.shape

(10, 10)

In [25]:
# Multiply the three factors back together: (u . diag(s)) . v
pred_ratings = np.dot(np.dot(u, s), v) 

In [29]:
pred_ratings

array([[ 3.12463954e-34,  1.45516767e-34, -3.55498910e-34, ...,
         1.49751804e-34,  4.91270615e-34,  7.05092462e-34],
       [ 1.56566330e-33,  8.24378494e-33,  2.84947249e-32, ...,
        -1.08855607e-33, -3.20684402e-32,  1.58841674e-32],
       [ 9.91594488e-34, -1.02749497e-33,  6.85846022e-34, ...,
         2.28237121e-35, -5.20710226e-34, -2.74765247e-33],
       ...,
       [-9.70708820e-34,  2.88089782e-34,  1.89431280e-33, ...,
        -1.31457158e-34, -1.68114617e-33,  8.47215639e-34],
       [ 3.77631887e-35, -5.67938405e-34, -9.58420070e-35, ...,
         8.97541205e-35,  1.39817354e-34, -1.98818048e-33],
       [-5.73092720e-35,  9.81108019e-36,  4.34715104e-35, ...,
        -1.63471959e-35, -5.14334884e-35,  1.61422919e-35]])

In [27]:
pred_ratings.min()

-4.9902729068419935

In [28]:
pred_ratings.max()

10.000000000000034

In [30]:
# Numerator of the min-max formula used in normalize(): shift every entry by
# the global minimum
(pred_ratings - pred_ratings.min())

array([[4.99027291, 4.99027291, 4.99027291, ..., 4.99027291, 4.99027291,
        4.99027291],
       [4.99027291, 4.99027291, 4.99027291, ..., 4.99027291, 4.99027291,
        4.99027291],
       [4.99027291, 4.99027291, 4.99027291, ..., 4.99027291, 4.99027291,
        4.99027291],
       ...,
       [4.99027291, 4.99027291, 4.99027291, ..., 4.99027291, 4.99027291,
        4.99027291],
       [4.99027291, 4.99027291, 4.99027291, ..., 4.99027291, 4.99027291,
        4.99027291],
       [4.99027291, 4.99027291, 4.99027291, ..., 4.99027291, 4.99027291,
        4.99027291]])

In [34]:
# Hand check of a single entry, using rounded values copied from earlier outputs.
# NOTE(review): this does not reproduce normalize(). The minimum is negative
# (about -4.99), so subtracting it should ADD 4.990272, and the denominator
# should be max - min (about 14.99) rather than 10. That is why this result is
# negative while normalize() gives about 0.3329 for the same entry.
(3.12463954e-34 -4.990272) / 10

-0.4990272

In [35]:
# Rescale the reconstruction with normalize(); this replaces the raw
# pred_ratings array under the same name
pred_ratings = normalize(pred_ratings)

In [36]:
pred_ratings

array([[0.33290074, 0.33290074, 0.33290074, ..., 0.33290074, 0.33290074,
        0.33290074],
       [0.33290074, 0.33290074, 0.33290074, ..., 0.33290074, 0.33290074,
        0.33290074],
       [0.33290074, 0.33290074, 0.33290074, ..., 0.33290074, 0.33290074,
        0.33290074],
       ...,
       [0.33290074, 0.33290074, 0.33290074, ..., 0.33290074, 0.33290074,
        0.33290074],
       [0.33290074, 0.33290074, 0.33290074, ..., 0.33290074, 0.33290074,
        0.33290074],
       [0.33290074, 0.33290074, 0.33290074, ..., 0.33290074, 0.33290074,
        0.33290074]])

In [62]:
    # Label the normalized matrix with pt_df's axes, then transpose so that the
    # rows follow pt_df.columns and the columns follow pt_df.index.
    # NOTE(review): same statement as in generate_prediction_df; the leading
    # indentation looks like a leftover from pasting the function body.
    pred_df = pd.DataFrame(
        pred_ratings,
        columns = pt_df.columns,
        index = list(pt_df.index)
    ).transpose()

In [63]:
pred_df

Unnamed: 0_level_0,2,21,43,85,121,129,138,169,200,253,...,29758,29772,29789,29802,29825,29906,29942,29964,29972,30000
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
3,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
21,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
22,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
23,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2977,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
2979,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
2981,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901
2984,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,...,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901,0.332901


In [65]:
# Take the column at position 5 of the transposed frame (labelled 129 in the
# output): one normalized score per book_id
pred_df.iloc[:, 5]

book_id
1       0.332901
3       0.332901
21      0.332901
22      0.332901
23      0.332901
          ...   
2977    0.332901
2979    0.332901
2981    0.332901
2984    0.332901
2999    0.332901
Name: 129, Length: 838, dtype: float64

In [68]:
# Take the row at position 5, order its scores from highest to lowest and
# move the original labels into a regular "index" column
pred_df.iloc[5].sort_values(ascending = False).reset_index()

Unnamed: 0,index,25
0,2,0.332901
1,18732,0.332901
2,18803,0.332901
3,18897,0.332901
4,18938,0.332901
...,...,...
982,10116,0.332901
983,10171,0.332901
984,10173,0.332901
985,10214,0.332901
