# Jaccard Distance

A simple statistical measure of how similar two entities are, computed from unary (liked / not liked) data.

# Similarity Function (Item-Item)

\begin{equation*}
\text{Sim}(i,j) = \frac{\text{count}(U_i \cap U_j)}{\text{count}(U_i \cup U_j)} \\
\text{where}\,\, U_i \,\, \text{is the set of users who rated item } i
\end{equation*}

In [1]:
import pandas as pd
import numpy as np

%load_ext Cython

In [2]:
# Load the raw ratings and attach a constant `count` column so the pivot below
# has a value to place in every observed (follower_id, member) cell.
ratings = pd.read_csv('../ratings.csv').assign(count=1)

# Follower x member matrix: 1 where the pair appears in the data, 0 elsewhere.
rating_matrix = (
    ratings
    .pivot(index='follower_id', columns='member', values='count')
    .fillna(0)
)
rating_matrix.head(5)

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
758518,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
989241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3219851,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3546211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3957551,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
%%cython

import numpy as np

def union_count(matrix):
    """Count, for every pair of rows, the columns that are set in either row.

    Parameters
    ----------
    matrix : array-like of shape (row, col)
        Membership matrix. Any non-zero entry counts as "present", so boolean,
        integer and float inputs are all accepted.

    Returns
    -------
    numpy.ndarray of shape (row, row), dtype int64
        ``result[i, j]`` is the size of the union of row ``i`` and row ``j``.

    Raises
    ------
    ValueError
        If ``matrix`` is not two-dimensional.
    """
    present = np.asarray(matrix).astype(bool).astype(np.int64)
    if present.ndim != 2:
        raise ValueError("union_count expects a 2-D matrix")

    # Inclusion-exclusion: size(A or B) = size(A) + size(B) - size(A and B).
    # This replaces a per-row Python loop with one matrix product.
    row_sizes = present.sum(axis=1)
    shared = np.dot(present, present.T)
    return row_sizes[:, np.newaxis] + row_sizes[np.newaxis, :] - shared

def intersect_count(matrix):
    """Return the pairwise dot products of the rows of ``matrix``.

    For a 0/1 matrix, entry ``[i, j]`` of the result is the number of columns
    set in both row ``i`` and row ``j``.
    """
    transposed = matrix.T
    return np.dot(matrix, transposed)

In [4]:
# Transpose the follower x member matrix so each row is one member (item).
item_wise_ratings = rating_matrix.values.T
# Jaccard similarity: followers shared by both members divided by followers of
# either member. `union_count` receives the boolean mask of non-zero entries.
item_item_sim = intersect_count(item_wise_ratings) / union_count(item_wise_ratings > 0.)

In [5]:
# Label both axes with the member names so similarities can be looked up by name.
item_item_sim_df = pd.DataFrame(item_item_sim, index=list(rating_matrix.columns), columns=list(rating_matrix.columns))

In [6]:
# Display the member x member similarity table.
item_item_sim_df

Unnamed: 0,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
Can,1.0,0.012025,0.005447,0.03871,0.02534,0.041296,0.0231,0.020093,0.029671,0.026215,...,0.018608,0.044357,0.050942,0.025111,0.034953,0.021233,0.024174,0.017703,0.017241,0.038566
Cherprang,0.012025,1.0,0.072466,0.053663,0.208095,0.02146,0.107808,0.003732,0.198983,0.110808,...,0.340034,0.101567,0.06538,0.004147,0.121529,0.195471,0.016734,0.328152,0.00919,0.147631
Izurina,0.005447,0.072466,1.0,0.02513,0.062418,0.009977,0.03619,0.002949,0.057755,0.037874,...,0.078416,0.040719,0.026361,0.003124,0.040867,0.048926,0.011187,0.061126,0.005656,0.046111
Jaa,0.03871,0.053663,0.02513,1.0,0.116484,0.111588,0.102954,0.025256,0.13511,0.177139,...,0.100853,0.239917,0.262626,0.035736,0.153183,0.129278,0.070735,0.074453,0.032086,0.175917
Jan,0.02534,0.208095,0.062418,0.116484,1.0,0.044359,0.179616,0.009806,0.382843,0.208243,...,0.311918,0.233821,0.130021,0.012104,0.235511,0.218545,0.037124,0.260638,0.020406,0.275306
Jane,0.041296,0.02146,0.009977,0.111588,0.044359,1.0,0.039164,0.026286,0.046183,0.064572,...,0.036717,0.077593,0.103689,0.062714,0.055481,0.047495,0.055245,0.029728,0.014453,0.072185
Jennis,0.0231,0.107808,0.03619,0.102954,0.179616,0.039164,1.0,0.009874,0.192244,0.151003,...,0.165157,0.175847,0.112864,0.009147,0.164181,0.13618,0.033116,0.150511,0.016732,0.190746
Jib,0.020093,0.003732,0.002949,0.025256,0.009806,0.026286,0.009874,1.0,0.010226,0.017593,...,0.007382,0.019584,0.024281,0.055172,0.012518,0.00765,0.054958,0.005173,0.01636,0.014154
Kaew,0.029671,0.198983,0.057755,0.13511,0.382843,0.046183,0.192244,0.010226,1.0,0.228127,...,0.297229,0.27003,0.145492,0.012435,0.249126,0.253104,0.039187,0.257233,0.021566,0.332885
Kaimook,0.026215,0.110808,0.037874,0.177139,0.208243,0.064572,0.151003,0.017593,0.228127,1.0,...,0.20093,0.240333,0.191196,0.017709,0.197932,0.201931,0.04984,0.152439,0.022733,0.260764


In [7]:
# Members ranked by similarity to Jan, most similar first.
item_item_sim_df['Jan'].sort_values(ascending=False)

Jan          1.000000
Kaew         0.382843
Music        0.311918
Tarwaan      0.275306
Pun          0.260638
Mind         0.238944
Noey         0.235511
Namneung     0.233821
Mobile       0.219699
Orn          0.218545
Kaimook      0.208243
Cherprang    0.208095
Jennis       0.179616
Kate         0.139224
Korn         0.130512
Namsai       0.130021
Jaa          0.116484
Izurina      0.062418
Jane         0.044359
Piam         0.037124
Can          0.025340
Satchan      0.020406
Miori        0.017296
Maysa        0.013493
Nink         0.012104
Jib          0.009806
Name: Jan, dtype: float64

In [8]:
# %load ../utility/member_display.py
from operator import itemgetter

import requests

# Pixel width given to the highest-scoring avatar rendered by `show_rank`.
IMAGE_MAX_WIDTH = 150

# Fetch the member list. The timeout stops an unresponsive server from hanging
# the kernel, and raise_for_status() reports an HTTP error directly instead of
# letting it surface later as a confusing JSON/KeyError.
members = requests.get('https://www.api.bnk48.com/api/members', timeout=10)
members.raise_for_status()

# Capitalised slug -> avatar image URL, used to look avatars up by member name.
member_images = {
    element['slug'].capitalize(): element['avatar_image']
    for element in members.json()['members']
}

def show_rank(ranks, n=5):
    """Render the first ``n`` ranked members as a row of avatar ``<img>`` tags.

    Parameters
    ----------
    ranks : sequence of (name, score) pairs
        The first ``n`` entries are rendered in the order given. Each ``name``
        must be a key of ``member_images``.
    n : int, default 5
        Number of leading entries to render.

    Returns
    -------
    str
        Space-joined ``<img>`` tags. Each width is the score's share of the
        largest score shown, scaled to ``IMAGE_MAX_WIDTH`` pixels. Returns an
        empty string when there is nothing to render.

    Raises
    ------
    KeyError
        If a name has no entry in ``member_images``.
    """
    import html  # local import keeps this cell self-contained

    selected_items = ranks[:n]
    if len(selected_items) == 0:
        # max() would raise ValueError on an empty selection.
        return ''
    max_size = max(selected_items, key=itemgetter(1))[1]

    def img_tag(name, size):
        if max_size > 0:
            image_size = int(size / max_size * IMAGE_MAX_WIDTH)
        else:
            # No positive score to scale against: draw every avatar at full
            # width instead of dividing by zero.
            image_size = IMAGE_MAX_WIDTH
        # Names and URLs come from an external API, so escape them for HTML.
        src = html.escape(str(member_images[name]))
        label = html.escape(str(name))
        return f'''<img src="{src}" 
            alt="{label} = {size * 100}%" 
            style="width: {image_size}px; display: inline-block;"/>'''

    return ' '.join([img_tag(*member) for member in selected_items])


## Top 5 members most similar to Jan
{{ show_rank(list(item_item_sim_df['Jan'].sort_values(ascending=False).items())[1:], 5) }}

## Top 5 members most similar to Pun
{{ show_rank(list(item_item_sim_df['Pun'].sort_values(ascending=False).items())[1:], 5) }}

# Rating Function

\begin{equation*}
\text{S}(u,i) = \frac{\sum_{j \in N} Sim_{i,j} \, r_{uj}}{\sum_{j \in N}{|Sim_{i,j}|}} \quad \text{where } N \text{ is the set of items other than } i
\end{equation*}

In [9]:
def predict(similarity_matrix, oshi_mems, target):
    """Score how well ``target`` matches a fan who already likes ``oshi_mems``.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Square item-item similarity table labelled by member name on both axes.
    oshi_mems : iterable of str
        Names of the members the fan already likes (list, tuple or set).
    target : str
        Name of the member to score.

    Returns
    -------
    float
        Sum of the similarities between ``target`` and each liked member,
        divided by the total similarity between ``target`` and every other
        member (the self-similarity is subtracted from the denominator).
    """
    weight_vec = similarity_matrix[target]
    # pandas >= 2.0 rejects set indexers in `.loc`, so materialise a list.
    liked = list(oshi_mems)
    return weight_vec.loc[liked].sum() / (weight_vec.sum() - weight_vec.loc[target])

In [10]:
# Example: how well does Jan fit a fan who likes Kaew, Tarwaan and Music?
predict(item_item_sim_df, ['Kaew', 'Tarwaan', 'Music'], 'Jan')

0.25994841205425184

In [11]:
def recommend(similarity_matrix, oshi_mems, n=5):
    """Rank the members a fan does not like yet by predicted score.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Square item-item similarity table labelled by member name.
    oshi_mems : iterable of str
        Names of the members the fan already likes (list, tuple or set).
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of (str, float)
        Up to ``n`` ``(member, score)`` pairs, best score first. Members with
        equal scores keep the order of ``similarity_matrix.index``, so the
        result is the same on every run.
    """
    # Materialise once: this also accepts one-shot iterables and gives
    # `predict` a list rather than a set.
    liked = list(oshi_mems)
    liked_lookup = set(liked)

    # Walk the index instead of a set difference so candidate order (and so
    # tie-breaking in the stable sort below) does not depend on string hashing.
    other_members = [
        member for member in similarity_matrix.index
        if member not in liked_lookup
    ]
    scores = [
        (member, predict(similarity_matrix, liked, member))
        for member in other_members
    ]

    return sorted(scores, key=lambda x: x[1], reverse=True)[:n]

### Recommendation
What if I like `Cherprang`, `Kaew` and `Tarwaan`?

In [12]:
# Pass the liked members as a list: pandas >= 2.0 no longer accepts a set as a
# `.loc` indexer, and a list has a fixed order.
result = recommend(item_item_sim_df, ['Cherprang', 'Kaew', 'Tarwaan'], n=20)

{{show_rank(result, 5)}}

In [13]:
result

[('Pun', 0.25784995246681569),
 ('Music', 0.24404681507655376),
 ('Jan', 0.23212710533992617),
 ('Izurina', 0.21379371711862913),
 ('Orn', 0.20912386896672944),
 ('Jennis', 0.19011243821648793),
 ('Noey', 0.18633587706340463),
 ('Mobile', 0.18585268021995033),
 ('Kaimook', 0.17263695284628108),
 ('Namneung', 0.17068372855292507),
 ('Mind', 0.16889432956939865),
 ('Korn', 0.14374347129420731),
 ('Kate', 0.13445938247728878),
 ('Namsai', 0.1330790912826614),
 ('Jaa', 0.12209656580613931),
 ('Can', 0.11321727894762031),
 ('Jane', 0.1037021065756535),
 ('Satchan', 0.10330148630813915),
 ('Piam', 0.096366361739592801),
 ('Miori', 0.087380309654361499)]

# User-User

In [14]:
# Two hand-made followers used as test subjects below:
# "S" likes Kaew, Tarwaan and Cherprang; "H" likes Orn and Music.
augment_users = [
    {'follower_id': 'S', 'member': 'Kaew', 'count': 1},
    {'follower_id': 'S', 'member': 'Tarwaan', 'count': 1},
    {'follower_id': 'S', 'member': 'Cherprang', 'count': 1},
    {'follower_id': 'H', 'member': 'Orn', 'count': 1},
    {'follower_id': 'H', 'member': 'Music', 'count': 1}
]
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# drop_duplicates keeps the cell safe to re-run: without it a second run adds
# the same rows again and the pivot below fails on duplicate entries.
ratings = pd.concat(
    [ratings, pd.DataFrame(augment_users)],
    ignore_index=True,
).drop_duplicates(subset=['follower_id', 'member'])

# Rebuild the follower x member matrix with the synthetic followers included.
rating_matrix = ratings.pivot(index='follower_id', columns='member', values='count').fillna(0)

## Discard Tan-Oshi

Tan-oshi followers (those who follow exactly one member) give us too little information. If we keep them, they will only ever be paired with other tan-oshi followers of the same member, and we cannot suggest anyone else to them, so we discard them.

In [15]:
# NOTE: despite its name, this mask is True for the followers to KEEP, i.e.
# those whose row sum (number of followed members) is not exactly 1.
tan_oshis = rating_matrix.sum(axis=1) != 1.

In [16]:
# Keep only the rows selected by the mask. This overwrites `rating_matrix`, so
# re-running this cell without re-running the previous ones will not work.
rating_matrix = rating_matrix.loc[tan_oshis]
rating_matrix

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5202411,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8085222,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9366932,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9594042,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9990712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10650962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12156692,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12157412,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12791262,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12815372,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [17]:
# Rows are followers here, so the same Jaccard ratio gives user-user similarity.
user_wise_ratings = rating_matrix.values
# Members both followers like, divided by members either follower likes.
user_user_sim = intersect_count(user_wise_ratings) / union_count(user_wise_ratings > 0.)

In [18]:
# Label both axes with follower ids so neighbours can be looked up by id.
user_user_sim_df = pd.DataFrame(user_user_sim, index=rating_matrix.index, columns=rating_matrix.index)

## Too many pairs ~= too much noise

Instead of keeping every similarity, we can truncate the neighbours of each user: keeping them all makes the calculation slow, and the result may not be any better because the data are too noisy.

In a practical system, users with exactly the same taste should not be ignored, but we have too few items here, so such neighbours are not very useful unless I have more data. I decided to remove exact-same-taste neighbours to make the predictions more serendipitous.

In [19]:
MAX_PAIRS = 100       # keep at most this many neighbours per follower
MIN_MAXNITUDE = .4    # (sic) minimum similarity for a pair to be kept

# Series indexed by (follower, neighbour) holding the retained similarities.
user_sim_map = (
    user_user_sim_df
    # Blank out exact matches (similarity 1, which includes the diagonal) and
    # pairs below the threshold.
    .where((user_user_sim_df < 1.) & (user_user_sim_df >= MIN_MAXNITUDE))
    .stack()
    # Drop the blanked pairs explicitly: the new stack implementation in
    # pandas no longer removes NaN entries on its own.
    .dropna()
    .groupby(level=0, group_keys=False).nlargest(MAX_PAIRS)
)

In [20]:
user_sim_map

follower_id  follower_id
5202411      18354726       0.666667
             39188148       0.666667
             46504631       0.666667
             59814700       0.666667
             67076969       0.666667
             67316371       0.666667
             69541388       0.666667
             71263133       0.666667
             75992827       0.666667
             76885095       0.666667
             80781104       0.666667
             82938818       0.666667
             84514799       0.666667
             90356309       0.666667
             92921587       0.666667
             93150305       0.666667
             95614496       0.666667
             97937318       0.666667
             100162499      0.666667
             102903424      0.666667
             103508074      0.666667
             103810162      0.666667
             103894629      0.666667
             106975616      0.666667
             118759181      0.666667
             124405115      0.666667
             

### Predict & Recommend

In [21]:
def predict(rating_matrix, sim_map, user, member):
    """Predict ``user``'s affinity for ``member`` from similar followers.

    NOTE: this redefines the item-item ``predict`` from earlier in the
    notebook; after this cell runs, only this version is available.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        Follower x member matrix of 0/1 ratings.
    sim_map : pandas.Series
        Similarities indexed by a two-level (follower, neighbour) MultiIndex.
    user : hashable
        Follower id to predict for.
    member : str
        Member (column of ``rating_matrix``) to score.

    Returns
    -------
    float
        Similarity-weighted average of the neighbours' ratings of ``member``.
        ``0.0`` when ``user`` has no neighbours in ``sim_map`` or their total
        weight is zero, since there is then no evidence to score from.
    """
    try:
        neighbour_weight = sim_map.loc[user]
    except KeyError:
        # Followers with no retained neighbour are absent from the map.
        return 0.0

    # Single `.loc` call instead of chained indexing.
    neighbour_like = rating_matrix.loc[neighbour_weight.index, member]

    weighted_score = (neighbour_weight * neighbour_like).sum()
    total_weight = neighbour_weight.sum()
    if total_weight == 0:
        return 0.0

    return weighted_score / total_weight

In [22]:
# Example: predicted affinity of follower 51646836 for Cherprang.
predict(rating_matrix, user_sim_map, 51646836, 'Cherprang')

0.9614415322580651

In [23]:
# Synthetic follower "S" does not follow Music, so this is a real prediction.
predict(rating_matrix, user_sim_map, 'S', 'Music')

0.11785714285714298

In [24]:
# Sanity check: synthetic follower "H" already follows Music.
predict(rating_matrix, user_sim_map, 'H', 'Music')

1.0

In [25]:
def recommend(rating_matrix, sim_map, user):
    """Score every member ``user`` has not rated and return them best-first.

    Returns a list of ``(member, predicted_score)`` tuples sorted by score in
    descending order. Members with equal scores keep their column order.
    """
    own_row = rating_matrix.loc[user]
    candidates = own_row.loc[own_row.eq(0)].index

    scored = []
    for member in candidates:
        score = predict(rating_matrix, sim_map, user, member)
        scored.append((member, score))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

In [26]:
# Ranked predictions for synthetic follower "S" over members S does not follow.
s_recs = recommend(rating_matrix, user_sim_map, 'S')
s_recs

[('Music', 0.11785714285714298),
 ('Pun', 0.09642857142857153),
 ('Jan', 0.06428571428571435),
 ('Orn', 0.0428571428571429),
 ('Mobile', 0.032142857142857174),
 ('Izurina', 0.02142857142857145),
 ('Jennis', 0.02142857142857145),
 ('Noey', 0.02142857142857145),
 ('Can', 0.010714285714285725),
 ('Jaa', 0.0),
 ('Jane', 0.0),
 ('Jib', 0.0),
 ('Kaimook', 0.0),
 ('Kate', 0.0),
 ('Korn', 0.0),
 ('Maysa', 0.0),
 ('Mind', 0.0),
 ('Miori', 0.0),
 ('Namneung', 0.0),
 ('Namsai', 0.0),
 ('Nink', 0.0),
 ('Piam', 0.0),
 ('Satchan', 0.0)]

### Recommendations for Mr. S

{{ show_rank(s_recs, n=5) }}

In [27]:
# Ranked predictions for synthetic follower "H" over members H does not follow.
h_recs = recommend(rating_matrix, user_sim_map, 'H')
h_recs

[('Cherprang', 0.7300000000000003),
 ('Pun', 0.12000000000000015),
 ('Izurina', 0.04000000000000005),
 ('Kaew', 0.030000000000000037),
 ('Can', 0.010000000000000012),
 ('Jan', 0.010000000000000012),
 ('Jennis', 0.010000000000000012),
 ('Kaimook', 0.010000000000000012),
 ('Korn', 0.010000000000000012),
 ('Mobile', 0.010000000000000012),
 ('Namneung', 0.010000000000000012),
 ('Noey', 0.010000000000000012),
 ('Jaa', 0.0),
 ('Jane', 0.0),
 ('Jib', 0.0),
 ('Kate', 0.0),
 ('Maysa', 0.0),
 ('Mind', 0.0),
 ('Miori', 0.0),
 ('Namsai', 0.0),
 ('Nink', 0.0),
 ('Piam', 0.0),
 ('Satchan', 0.0),
 ('Tarwaan', 0.0)]

### Recommendations for Mr. H

{{ show_rank(h_recs, n=1) }}<br/>
{{ show_rank(h_recs[1:], n=4) }}