# Normalization with TF-IDF

In [1]:
import numpy as np
import pandas as pd

from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Configure Bokeh so that show() renders plots inline in the notebook output.
output_notebook()

In [2]:
from bokeh.palettes import Category10

# Toy illustration of the weighting curves for x = 1..99:
# TF is modelled as 1/x and IDF as log(100) - log1p(x).
x = np.arange(1, 100)
tf = 1. / x
idf = np.log(100) - np.log1p(x)


p = figure(plot_width=600, plot_height=450)
colors = Category10[4]

# One line per curve; zip stops after the three curves, so the fourth
# palette colour stays unused exactly as before.
curves = [('TF', tf), ('IDF', idf), ('TF-IDF', tf * idf)]
for curve_color, (curve_label, curve_values) in zip(colors, curves):
    p.line(x=x, y=curve_values, color=curve_color, legend=curve_label)

show(p)

In [3]:
# Load the raw follow records and tag every row with a constant `count` of 1
# so that the pivot below has a value column to spread out.
# NOTE(review): presumably one row per (follower_id, member) pair — pivot()
# raises on duplicate pairs; confirm against the CSV.
ratings = pd.read_csv('../ratings.csv').assign(count=1)

# follower_id x member table: 1 where the pair exists in the CSV, 0 elsewhere.
rating_matrix = (
    ratings
    .pivot(index='follower_id', columns='member', values='count')
    .fillna(0)
)
rating_matrix.head(5)

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
758518,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
989241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3219851,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3546211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3957551,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Use sparse matrices to avoid large dense arrays

In [4]:
from scipy.sparse import coo_matrix

# Remember the pivot table's labels (columns = members, index = follower ids)
# so that unlabeled arrays can be wrapped back into a labelled DataFrame;
# to_dataframe reads these two module-level names.
item_index = rating_matrix.columns
user_index = rating_matrix.index

def to_sparse(rating_df):
    """Convert a ratings DataFrame into a SciPy COO sparse matrix.

    Only the cell values are passed on; the row and column labels of
    ``rating_df`` are not carried over.
    """
    dense_values = rating_df.values
    return coo_matrix(dense_values)

def to_dataframe(rating_matrix, index=None, columns=None):
    """Wrap a dense 2-D array of ratings in a labelled DataFrame.

    Parameters
    ----------
    rating_matrix : array-like
        Dense values, one row per user and one column per item.
    index : optional
        Row labels. Defaults to the notebook-level ``user_index``.
    columns : optional
        Column labels. Defaults to the notebook-level ``item_index``.

    Returns
    -------
    pandas.DataFrame
        ``rating_matrix`` labelled with ``index`` and ``columns``.
    """
    # Fall back to the labels captured from the pivot table, but let callers
    # supply their own so the helper also works for matrices of another shape.
    if index is None:
        index = user_index
    if columns is None:
        columns = item_index
    return pd.DataFrame(rating_matrix, index=index, columns=columns)

## Normalize with TF-IDF [user-wise]

In [35]:
def tfidf_weight(ratings):
    """Apply a row-wise TF-IDF weighting to a ratings matrix.

    The input is converted to COO format. Every stored entry ``v`` in row
    ``r`` becomes ``sqrt(v) * idf[r]`` where
    ``idf[r] = log(number of columns) - log1p(stored entries in row r)``.

    Returns the weighted matrix as a ``coo_matrix``.
    """
    weighted = coo_matrix(ratings)

    n_columns = float(weighted.shape[1])
    entries_per_row = np.bincount(weighted.row)
    row_idf = np.log(n_columns) - np.log1p(entries_per_row)

    # Look the idf up per stored entry via its row coordinate.
    weighted.data = row_idf[weighted.row] * np.sqrt(weighted.data)

    return weighted

# Weight the dense 0/1 follow table; the last expression shows the stored
# (non-zero) weights of the resulting COO matrix.
tfidf = tfidf_weight(rating_matrix.values)
tfidf.data

array([ 2.56494936,  2.56494936,  2.56494936, ...,  2.56494936,
        2.56494936,  2.56494936])

## Then normalize with the L2 norm [item-wise]

In [119]:
def normalize(ratings):
    """Scale every column of a ratings matrix to unit L2 norm.

    The input is converted to COO format and each stored entry is divided by
    the Euclidean norm of the stored entries in its column.

    Columns whose stored entries are all zero have norm 0; they are left at 0
    instead of being turned into NaN by a 0/0 division.

    Returns the normalized matrix as a ``coo_matrix``.
    """
    normalized = coo_matrix(ratings)

    # Sum of squares per column, accumulated through the column coordinates.
    col_norm = np.sqrt(np.bincount(normalized.col, weights=normalized.data ** 2))

    # Guard against 0/0: dividing a zero entry by 1 keeps it at exactly 0.
    col_norm[col_norm == 0] = 1.0

    normalized.data = normalized.data / col_norm[normalized.col]

    return normalized
    
# Apply the column-wise L2 normalization to the TF-IDF weights and show the
# stored values of the result.
normalized_rating = normalize(tfidf)
normalized_rating.data

array([ 0.00632774,  0.00632774,  0.00632774, ...,  0.0113651 ,
        0.02077625,  0.0113651 ])

In [120]:
# Inspect one user's normalized weights. Only that user's row is densified
# (CSR supports row slicing) instead of converting the whole sparse matrix
# to a dense one just to read a single row.
sample_user_id = 963714085018771456
sample_row = user_index.get_loc(sample_user_id)
pd.Series(
    normalized_rating.tocsr()[sample_row].toarray().ravel(),
    index=item_index,
    name=sample_user_id,
)

member
Can          0.000000
Cherprang    0.003773
Izurina      0.000000
Jaa          0.000000
Jan          0.008607
Jane         0.000000
Jennis       0.000000
Jib          0.000000
Kaew         0.000000
Kaimook      0.014727
Kate         0.000000
Korn         0.000000
Maysa        0.000000
Mind         0.000000
Miori        0.000000
Mobile       0.000000
Music        0.006656
Namneung     0.000000
Namsai       0.000000
Nink         0.000000
Noey         0.011003
Orn          0.009310
Piam         0.000000
Pun          0.005223
Satchan      0.000000
Tarwaan      0.000000
Name: 963714085018771456, dtype: float64

# Decompose the matrix

In [121]:
from scipy.sparse.linalg import svds 

# Truncated SVD keeping k=10 singular values: u has one row per user,
# s holds the 10 singular values and vt has one column per item.
u, s, vt = svds(normalized_rating, k=10)

In [122]:
user_id = 3229396254
target_idx = rating_matrix.index.get_loc(user_id)

# Rebuild this user's row of the low-rank approximation: (u_row * s) . vt
user_factors = u[target_idx] * s
predicted_scores = np.dot(user_factors, vt)
result = pd.Series(predicted_scores, index=item_index, name='predict')

# The user's original 0/1 row, renamed so both columns are distinguishable.
actual = rating_matrix.loc[user_id].rename('real_score')

# Side-by-side comparison of observed follows and predicted scores.
pd.concat([actual, result], axis=1)

Unnamed: 0_level_0,real_score,predict
member,Unnamed: 1_level_1,Unnamed: 2_level_1
Can,0.0,0.00176
Cherprang,0.0,0.00305
Izurina,0.0,-0.000808
Jaa,0.0,0.001101
Jan,0.0,0.005235
Jane,0.0,-5.7e-05
Jennis,0.0,0.002754
Jib,0.0,-0.000633
Kaew,1.0,0.005694
Kaimook,0.0,0.003064


In [118]:
# NOTE(review): stale duplicate of the cell executed as In [123]. Its execution
# count (118) precedes the normalize cell (In [119]), so the output shown for
# this cell does not reflect the current pipeline — re-run or remove it.
result.sort_values(ascending=False)[rating_matrix.loc[user_id] == 0]

member
Kaimook      0.250919
Namneung     0.234242
Mind         0.207827
Noey         0.170860
Kate         0.107692
Korn         0.090947
Namsai       0.087846
Jaa          0.071356
Orn          0.056653
Piam         0.026262
Mobile       0.025134
Jennis       0.024435
Can          0.022958
Jane         0.021228
Satchan      0.020133
Miori        0.015808
Maysa        0.008508
Pun          0.007249
Nink         0.005506
Jib          0.004033
Jan          0.003672
Izurina     -0.000838
Cherprang   -0.003589
Name: predict, dtype: float64

In [123]:
# Rank all members by predicted score (best first), then keep only the
# members whose observed score for this user is 0, i.e. not followed yet.
ranked = result.sort_values(ascending=False)
not_followed = rating_matrix.loc[user_id] == 0
ranked[not_followed]

member
Jan          0.005235
Mind         0.004701
Namneung     0.004640
Noey         0.004434
Pun          0.003686
Mobile       0.003110
Kaimook      0.003064
Cherprang    0.003050
Jennis       0.002754
Orn          0.002290
Can          0.001760
Jaa          0.001101
Namsai       0.001030
Kate         0.000665
Satchan      0.000480
Korn         0.000156
Miori        0.000050
Piam         0.000003
Jane        -0.000057
Maysa       -0.000276
Jib         -0.000633
Izurina     -0.000808
Nink        -0.000821
Name: predict, dtype: float64