In [None]:
from pymongo import MongoClient
import pymongo

from tqdm.notebook import tqdm
import networkx as nx

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

import pickle

In [None]:
# Load the pre-built dataset bundle; later cells read its 'train' graph and 'users' mapping.
# NOTE(review): pickle.load can execute arbitrary code — only open files produced by this project.
with open('data/dataset.pickle', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [None]:
# Flatten the training graph into one [user, track, scrobbles] row per user-track edge.
g_train = dataset['train']
data = []
for u in tqdm(dataset['users'].values()):
    # The adjacency view yields (neighbour, edge-attribute dict) pairs directly, so the
    # edge view does not have to be rebuilt and indexed once per edge.
    for s, edge_attrs in g_train[u].items():
        data.append([u, s, edge_attrs['scrobbles']])

df = pd.DataFrame(data=data, columns=['user', 'track', 'scrobbles'])

  0%|          | 0/3307 [00:00<?, ?it/s]

In [None]:
# Summary statistics of the user/track/scrobbles table as built above (no clipping yet).
df.describe()

Unnamed: 0,user,track,scrobbles
count,2564908.0,2564908.0,2564908.0
mean,1464.116,111068.8,44.45133
std,961.642,77917.42,284.2347
min,0.0,3307.0,1.0
25%,624.0,14056.0,8.0
50%,1472.0,131266.0,21.0
75%,2227.0,171293.0,45.0
max,3306.0,255320.0,205689.0


In [None]:
def search_superior(data, percentage=0.01, dmin=None, dmax=None):
    """Find a threshold whose upper tail holds roughly ``percentage`` of the values.

    Bisects between ``dmin`` and ``dmax`` (midpoints are taken with floor
    division, so candidates are integer-spaced) looking for a threshold ``t``
    such that the share of entries with ``value >= t`` matches ``percentage``.
    If no candidate matches exactly, the closer of the two remaining bounds
    is returned.

    :param data: array-like of numeric values
    :param percentage: target fraction (between 0 and 1) of entries that should be ``>=`` the threshold
    :param dmin: lower bound of the search; defaults to the minimum of ``data``
    :param dmax: upper bound of the search; defaults to the maximum of ``data``
    :return: the selected threshold
    :raises ValueError: if ``data`` is empty
    """
    data = np.asarray(data)
    total = data.size
    if total == 0:
        raise ValueError('search_superior() requires a non-empty data array')
    if dmin is None:
        dmin = np.min(data)
    if dmax is None:
        dmax = np.max(data)

    def tail_share(threshold):
        # Fraction of entries at or above the threshold
        return np.sum(data >= threshold) / total

    mid = (dmax + dmin) // 2
    while dmin != mid and mid != dmax:
        p = tail_share(mid)
        if p > percentage:
            # Tail still too large: move the lower bound up
            dmin = mid
        elif p < percentage:
            # Tail too small: move the upper bound down
            dmax = mid
        else:
            return mid
        mid = (dmax + dmin) // 2
    # Bounds are now adjacent (or equal). Compare them with the same ">=" tail
    # used during the search so each candidate is judged by its own share.
    if abs(tail_share(dmin) - percentage) < abs(tail_share(dmax) - percentage):
        return dmin
    return dmax

# Show the scrobble thresholds whose upper tail holds about 1%, 2% and 3% of the edges.
for tail_share in (0.01, 0.02, 0.03):
    print(search_superior(df['scrobbles'].values, percentage=tail_share))

# Cap the scrobble counts at the 3% threshold.
# NOTE(review): this overwrites the column in place, so re-running the cell clips already-clipped data.
df['scrobbles'] = np.clip(df['scrobbles'], 0, search_superior(df['scrobbles'].values, percentage=0.03))

374
255
202


In [None]:
# Summary statistics again, now that the scrobbles column has been clipped.
df.describe()

Unnamed: 0,user,track,scrobbles
count,2564908.0,2564908.0,2564908.0
mean,1464.116,111068.8,36.96133
std,961.642,77917.42,45.84949
min,0.0,3307.0,1.0
25%,624.0,14056.0,8.0
50%,1472.0,131266.0,21.0
75%,2227.0,171293.0,45.0
max,3306.0,255320.0,202.0


# Embeddings

In [None]:
# Column vectors (one row per edge) of int32 ids used as model inputs.
x1 = np.asarray(df['user'], dtype=np.int32).reshape(-1, 1)
# Track node ids are shifted down by the number of users.
# presumably this makes them zero-based indices into the track embedding table — verify
x2 = np.asarray(df['track'], dtype=np.int32).reshape(-1, 1) - len(dataset['users'])

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Add, Lambda, Concatenate, Activation
from tensorflow.keras.models import Model
import functools
import tensorflow.keras.backend as K
from scipy.sparse import dok_matrix
import random


# Seed every random source used in this notebook with the same value.
seed_value = 42
# NOTE(review): hash randomisation is normally configured at interpreter start-up, so
# setting PYTHONHASHSEED here probably only reaches child processes — confirm intent.
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
# TF1-compat seeding call, kept alongside the TF2 call above.
tf.compat.v1.set_random_seed(seed_value)

In [None]:
def base_model(users, tracks, vector_dim=64):
    """
    Build and compile a user/track embedding model scored by a sigmoid dot product.

    Each input is a single integer id. The ids are looked up in two separate
    embedding tables, the two vectors are combined with a dot product, and a
    sigmoid turns the result into a score between 0 and 1.

    :param users: number of rows in the user embedding table
    :param tracks: number of rows in the track embedding table
    :param vector_dim: dimension of every embedding vector
    :return: the compiled Keras model (its summary is printed as a side effect)
    """
    user_input = Input((1,), name='user_id')
    track_input = Input((1,), name='track_id')

    # Layer names are fixed so the embedding tables can be identified after training.
    user_table = Embedding(users, vector_dim, input_length=1, name='user_emb')

    track_table = Embedding(tracks, vector_dim, input_length=1, name='track_emb')

    user_vector = user_table(user_input)
    track_vector = track_table(track_input)

    # Dot product over the embedding axis, flattened to a single value per sample
    similarity = Dot(axes=-1)([user_vector, track_vector])
    score = Reshape((1, ))(similarity)
    score = Activation('sigmoid')(score)

    model = Model(inputs=[user_input, track_input], outputs=score)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    model.summary()

    return model

In [None]:
# 10-dimensional embeddings; the track count is every node of the train graph minus the users.
model = base_model(len(dataset['users']), len(dataset['train'].nodes())-len(dataset['users']), 10)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
track_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_emb (Embedding)            (None, 1, 10)        33070       user_id[0][0]                    
__________________________________________________________________________________________________
track_emb (Embedding)           (None, 1, 10)        2520140     track_id[0][0]                   
______________________________________________________________________________________________

In [None]:
from tensorflow.keras.utils import Sequence
from sklearn.utils import shuffle
import math


class NegSamping(Sequence):
    """Keras batch sequence that pairs every observed batch with random negatives.

    Each batch holds a slice of the observed (user, track) pairs labelled 1,
    followed by the same number of uniformly drawn (user, track) id pairs
    labelled 0. Negatives are drawn independently of the observed pairs, so a
    drawn pair may coincide with a real one.

    :param users: array of user ids, one row per observed pair
    :param tracks: array of track ids, aligned row-by-row with ``users``
    :param n_users: exclusive upper bound for randomly drawn user ids
    :param n_tracks: exclusive upper bound for randomly drawn track ids
    :param batch_size: number of observed pairs per batch (a batch yields up to twice as many rows)
    """

    def __init__(self, users, tracks, n_users, n_tracks, batch_size):
        # Let the Keras base class initialise its own state before ours.
        super().__init__()
        self.users = users
        self.tracks = tracks
        self.n_users = n_users
        self.n_tracks = n_tracks
        self.batch_size = batch_size
        # Shuffle once up front so the first epoch is already in random order
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last one may be shorter)."""
        return math.ceil(self.tracks.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        """Return ``([user_ids, track_ids], labels)`` for batch ``idx``."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        x1 = self.users[start:stop, ...]
        x2 = self.tracks[start:stop, ...]
        y = np.ones((x1.shape[0], 1))
        # Negatives: as many random pairs as positives, users drawn before tracks
        x1_n = np.random.randint(0, self.n_users, size=x1.shape)
        x2_n = np.random.randint(0, self.n_tracks, size=x2.shape)
        y_n = np.zeros_like(y)
        return [np.concatenate((x1, x1_n), axis=0),
                np.concatenate((x2, x2_n), axis=0)], np.concatenate((y, y_n), axis=0)

    def on_epoch_end(self):
        """Reshuffle the observed pairs, keeping users and tracks aligned."""
        self.users, self.tracks = shuffle(self.users, self.tracks)

In [None]:
# Throwaway generator instance (batch size 500) used only to inspect a batch below.
ds = NegSamping(x1, x2, len(dataset['users']),len(dataset['train'].nodes())-len(dataset['users']), 500)

In [None]:
# Inspect the first batch: ([user ids, track ids], labels) with positives followed by negatives.
ds[0]

([array([[1363],
         [3160],
         [ 146],
         [2285],
         [ 321],
         [ 122],
         [ 247],
         [ 170],
         [ 447],
         [ 955],
         [ 127],
         [2303],
         [3056],
         [  37],
         [1680],
         [1671],
         [1291],
         [ 192],
         [1602],
         [   3],
         [  79],
         [ 450],
         [ 718],
         [1113],
         [1082],
         [ 100],
         [2778],
         [  27],
         [   6],
         [   5],
         [1914],
         [1388],
         [ 413],
         [1676],
         [1130],
         [   4],
         [1491],
         [1041],
         [1127],
         [1588],
         [1449],
         [1176],
         [   0],
         [2767],
         [ 116],
         [1068],
         [  11],
         [2776],
         [1678],
         [1632],
         [ 795],
         [1135],
         [2232],
         [2508],
         [3215],
         [ 827],
         [3108],
         [3228],
         [2449

In [None]:
# Reuse previously trained weights when they exist; otherwise train for 15 epochs and cache them.
weights_path = 'data/embeddedings.h5'
if os.path.exists(weights_path):
    model.load_weights(weights_path)
else:
    user_count = len(dataset['users'])
    train_batches = NegSamping(x1, x2,
                               user_count,
                               len(dataset['train'].nodes()) - user_count,
                               500)
    model.fit(train_batches, epochs=15)
    model.save_weights(weights_path)

  ...
    to  
  ['...']
Train for 5130 steps
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Fetch the embedding layers by the names given in base_model rather than by their
# position in model.layers, which shifts whenever the architecture changes.
user_layer = model.get_layer('user_emb')
track_layer = model.get_layer('track_emb')

print(user_layer.name)
users_embs = K.get_value(user_layer.embeddings)
print(track_layer.name)
tracks_embs = K.get_value(track_layer.embeddings)

user_emb
track_emb


In [None]:
# Sanity check: score the observed (user, track) pairs; x1/x2 contain only real edges.
model.predict([x1, x2], batch_size=10000)

array([[0.97153294],
       [0.9502234 ],
       [0.96148473],
       ...,
       [0.95455116],
       [0.97605115],
       [0.7787171 ]], dtype=float32)

In [None]:
# The embedding matrices were copied out above, so the model itself is no longer needed.
del model

# Cosines

In [None]:
# Cosine similarity between every user embedding and every track embedding,
# cached on disk because the full users x tracks matrix is expensive to rebuild.
if not os.path.exists('data/cos.npz'):
    # Per-row Euclidean norms as column vectors
    len_users = np.linalg.norm(users_embs, axis=1)[:, np.newaxis]
    len_tracks = np.linalg.norm(tracks_embs, axis=1)[:, np.newaxis]

    # Dot products divided by the outer product of the norms
    cos = (users_embs @ tracks_embs.T) / (len_users @ len_tracks.T)
    np.savez_compressed('data/cos.npz', cosines=cos)
else:
    # np.load returns an archive object that holds the file open; close it once the array is read
    with np.load('data/cos.npz') as cos_archive:
        cos = cos_archive['cosines']
del users_embs
del tracks_embs

In [None]:
# Quick look at the cosine matrix: the array itself, then its min, max, mean and standard deviation.
print(cos)
print(np.min(cos))
print(np.max(cos))
print(np.mean(cos))
print(np.std(cos))

[[ 0.6166293   0.7904514   0.73205227 ...  0.5750937   0.23726128
   0.61921895]
 [ 0.29186314  0.22068222  0.24940278 ...  0.5279246   0.18231632
   0.74171233]
 [ 0.11913287  0.07204851 -0.15346918 ... -0.474571   -0.3752259
  -0.35964486]
 ...
 [ 0.3360403   0.24041389  0.23741494 ... -0.64983857 -0.63236153
  -0.4218958 ]
 [ 0.04058007 -0.28635052 -0.17672122 ... -0.4882339  -0.43182966
  -0.5769507 ]
 [ 0.09148319 -0.1621145  -0.21006835 ... -0.7100018  -0.7428562
  -0.6279147 ]]
-0.99545115
0.98095804
-0.32127735
0.26847863


# Communities test

In [None]:
from networkx.algorithms.community import greedy_modularity_communities

In [None]:
# Community detection on the train graph is cached on disk; recompute only when the cache is missing.
# NOTE(review): pickle.load can execute arbitrary code — only read cache files written by this notebook.
communities_path = 'data/commutities.pickle'
if os.path.exists(communities_path):
    with open(communities_path, 'rb') as cache_file:
        communities = pickle.load(cache_file)
else:
    communities = list(greedy_modularity_communities(dataset['train']))
    with open(communities_path, 'wb') as cache_file:
        pickle.dump(communities, cache_file)

In [None]:
# Debug check of the container type returned by the cell above.
type(communities)

In [None]:
# Map each node to the index of the community that contains it.
node_community = {
    node: community_id
    for community_id, members in enumerate(communities)
    for node in members
}

In [None]:
from collections import deque

# Split the user-track cosines into pairs that share a community (sim), pairs that
# do not (diff), and all pairs (all_cos). Done with one boolean mask rather than
# Python loops over every user x track pair.
n_users = len(dataset['users'])
n_nodes = len(dataset['train'].nodes())

# Community id per node, looked up by integer node id: users are 0..n_users-1,
# tracks are n_users..n_nodes-1.
community_of = np.array([node_community[node] for node in range(n_nodes)])
# same_community[i, k] is True when user i and track (n_users + k) share a community
same_community = community_of[:n_users, np.newaxis] == community_of[np.newaxis, n_users:]

# float64 matches the dtype of the np.empty buffers used previously
pair_cos = np.asarray(cos[:n_users, :n_nodes - n_users], dtype=np.float64)

# Boolean indexing and ravel both walk the matrix row by row (user-major order)
all_cos = pair_cos.ravel()
sim = pair_cos[same_community]
diff = pair_cos[~same_community]

c_sim = int(np.count_nonzero(same_community))
c_all_cos = int(same_community.size)
c_diff = c_all_cos - c_sim
i_sim, i_diff, i_all_cos = len(sim), len(diff), len(all_cos)

# Consistency checks: every pair landed in exactly one of sim / diff
print(c_diff == i_diff)
print(c_sim == i_sim)
print(c_all_cos == i_all_cos)

In [None]:
# Size, mean and standard deviation of each cosine sample: all pairs, same-community, different-community.
for cosine_sample in (all_cos, sim, diff):
    print(len(cosine_sample))
    print(np.mean(cosine_sample))
    print(np.std(cosine_sample))

In [None]:
from scipy.stats import ttest_ind

# One-sided t-test without the equal-variance assumption:
# is the mean cosine of same-community pairs greater than that of different-community pairs?
ttest_ind(sim, diff, alternative='greater', equal_var=False)

In [None]:
# Same one-sided test, same-community pairs against all pairs.
# NOTE(review): all_cos contains every value in sim, so the two samples are not independent.
ttest_ind(sim, all_cos, alternative='greater', equal_var=False)

In [None]:
# Same one-sided test, all pairs against different-community pairs.
# NOTE(review): all_cos contains every value in diff, so the two samples are not independent.
ttest_ind(all_cos, diff, alternative='greater', equal_var=False)