In [None]:
from pymongo import MongoClient
import pymongo
import math
import random
import pandas as pd
from collections import Counter, defaultdict
import unicodedata
from tqdm.notebook import tqdm
import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Add, Lambda, Concatenate, Dense, BatchNormalization, ELU
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from tensorflow.keras.utils import Sequence
from spektral.utils import normalized_adjacency
from spektral.layers import GCNConv
#from scipy.sparse import dok_matrix


# Seed every random number generator this notebook touches so runs are repeatable.
seed_value = 42
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting
# it here presumably only reaches child processes — confirm if hash order matters.
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
# TF1-compat seeding call, kept alongside the TF2 call above.
tf.compat.v1.set_random_seed(seed_value)

# Data load/generator

In [None]:
# Load the pre-built dataset dictionary; later cells read its 'users', 'train',
# 'test' and 'artist-tracks' entries.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open('data/dataset.pickle', 'rb') as f:
    dataset = pickle.load(f)
    
# MongoClient() is created with its default connection settings; `music` is the
# database handle later handed to the data generator.
client = MongoClient()
music = client['music_recommender']

In [None]:
def load_social(file_users, file_edges, users_ids):
    """Build the directed social graph restricted to users known to the dataset.

    Args:
        file_users: Path to a tab-separated file with ``id`` and ``user`` columns
            (no header row).
        file_edges: Path to a space-separated file with ``origin`` and
            ``destination`` columns (no header row), expressed in the ids of
            ``file_users``.
        users_ids: Mapping from user name to the id used by the dataset.

    Returns:
        A ``networkx.DiGraph`` whose nodes are the dataset ids of every user
        found in ``users_ids`` and whose edges are the input edges with both
        endpoints among those users.
    """
    df_users = pd.read_csv(file_users, sep='\t', names=['id', 'user'])
    df_edges = pd.read_csv(file_edges, sep=' ', names=['origin', 'destination'])
    # Iterate the columns directly: DataFrame.iterrows() allocates a Series per
    # row, which dominates the run time on large node/edge lists.
    old_new = {}
    for old_id, user in tqdm(zip(df_users['id'], df_users['user']), total=len(df_users)):
        if user in users_ids:
            old_new[old_id] = users_ids[user]
    social_graph = nx.DiGraph()
    # Add the nodes first so users without any edge are still part of the graph.
    social_graph.add_nodes_from(old_new.values())
    for origin, destination in tqdm(zip(df_edges['origin'], df_edges['destination']), total=len(df_edges)):
        if origin in old_new and destination in old_new:
            social_graph.add_edge(old_new[origin], old_new[destination])
    return social_graph

# Social graph over the dataset's users, built from the Last.fm node/edge files.
social_graph = load_social('lastfm_sn/lastfm.nodes', 'lastfm_sn/lastfm.edges', dataset['users'])

In [None]:
# Disabled graph drawing: the code below is wrapped in a string literal, so it is
# never executed (the cell only evaluates to that string).
'''
pos = nx.spring_layout(social_graph, seed=42)

nodes = nx.draw_networkx_nodes(social_graph, pos, node_color="indigo")
edges = nx.draw_networkx_edges(
    social_graph,
    pos,
    arrowstyle="->",
    arrowsize=10,
    width=2,
)

pc = mpl.collections.PatchCollection(edges)#, cmap=cmap)
#pc.set_array(edge_colors)
#plt.colorbar(pc)

ax = plt.gca()
ax.set_axis_off()
plt.show()
'''

In [None]:
# Group users into connected components, ignoring edge direction.
# networkx's traversal replaces the hand-written search, which rebuilt the set of
# unvisited nodes on every step (quadratic) and raised KeyError on an empty graph.
ds = nx.to_undirected(social_graph)
communities = [list(component) for component in nx.connected_components(ds)]

len(communities)

In [None]:
# One row per node with the size of its adjacency dict
# (presumably the out-degree, since the graph is a DiGraph — confirm).
data = [[node, len(nbrs)] for node, nbrs in social_graph.adjacency()]

df = pd.DataFrame(data, columns=['node', 'degree'])

df.describe()

### Spotify 
'danceability' : 0

'energy' : 1

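'loudness' : 2 // original idea: divide by -60; the code below instead clips to [-60, -2e-4] and takes log10 of the negated value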

'mode' : 3

'speechiness' : 4

'acousticness' : 5

'instrumentalness' : 6

'liveness' : 7

'valence' : 8

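'tempo' : 9 // original idea: divide by 144; the code below instead standardises with the column mean/std and clips to [-2, 2]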

'key' : extra — stored in its own array, not in the 10-column feature matrix

Ignored:
'duration_ms' : e.g. 258787

'time_signature' : 4 (the value is 4 in 99% of cases)

'spotify_id' :

In [None]:
def spotify_feature_to_np(array, track_id, db_doc):
    """Copy the ten Spotify audio features of one document into a row of ``array``.

    Args:
        array: 2-D array with at least ten columns; modified in place.
        track_id: Row index to fill.
        db_doc: Mapping holding the feature values under the keys listed below.

    Returns:
        None. A missing key raises ``KeyError``.
    """
    # The position in this tuple is the destination column.
    feature_order = (
        'danceability',
        'energy',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
    )
    for column, feature in enumerate(feature_order):
        array[track_id, column] = db_doc[feature]


def compute_distance(graph, users, cos):
    """Turn a user/track similarity matrix into clipped distances.

    The similarities of the (user, track) pairs linked in ``graph`` give a mean
    and a standard deviation; every entry of ``cos`` is then mapped linearly so
    that ``mean - 2*std`` becomes 1 and ``mean + 2*std`` becomes 0, and the
    result is clipped to ``[0.1, 0.9]``.

    Args:
        graph: Object with ``neighbors(u)``; user nodes are ``0 .. users - 1``.
        users: Number of user nodes; subtracted from a neighbour id to get the
            column of ``cos``.
        cos: 2-D array indexed as ``cos[user, track_node - users]``.

    Returns:
        Array shaped like ``cos`` with values in ``[0.1, 0.9]``.
    """
    observed = np.asarray([
        cos[user, track - users]
        for user in range(users)
        for track in graph.neighbors(user)
    ])
    mean = observed.mean()
    std = observed.std()
    lower = mean - 2 * std
    span = 4 * std
    return np.clip(1 - (cos - lower) / span, 0.1, 0.9)
    

def remove_accents(input_str):
    """Return ``input_str`` lower-cased with combining marks (accents) removed.

    The text is NFKD-normalised first, so compatibility characters are also
    decomposed.
    """
    decomposed = unicodedata.normalize('NFKD', input_str.lower())
    kept = filter(lambda ch: not unicodedata.combining(ch), decomposed)
    return u"".join(kept)


class DataGenerator:
    """Prepares user-side and track-side model inputs.

    Construction loads tags, artists and Spotify audio features for every track
    from the database and builds a normalised adjacency matrix of the social
    graph. ``get_users_data`` and ``get_tracks_data`` then turn arrays of ids
    into padded NumPy batches.
    """

    def __init__(self, db, users_tracks, users, tracks, social, cos):
        """Load every per-track and per-user lookup structure.

        Args:
            db: Database handle exposing the ``track_info`` and
                ``track_spotify_features`` collections.
            users_tracks: Graph with ``nodes()`` and ``neighbors()``; user nodes
                are ``0 .. len(users) - 1`` and the remaining nodes are tracks.
            users: Sized collection of users (only its length is used here).
            tracks: Mapping ``artist -> {track title -> node id}``.
            social: Graph over user nodes with ``nodes`` and ``neighbors()``.
            cos: 2-D array indexed as ``cos[user, track_node - len(users)]``.
        """
        self.users = users
        self.tracks = tracks
        self.social = social
        self.distance = compute_distance(users_tracks, len(users), cos)
        self.users_count = len(users)
        self.track_count = len(users_tracks.nodes()) - len(users)
        self.process_tags_fast(db)
        self.process_spotify_fast(db)
        self.process_hat_d()

    def map_id_tracks(self, artist_tracks):
        """Invert ``artist -> {title -> node id}`` into a list indexed by track index.

        The track index is ``node id - self.users_count``. Entry ``i`` of the
        result is the list of ``(artist, title)`` pairs sharing track index
        ``i``. Track indices are expected to be contiguous from 0; a gap raises
        ``IndexError``.
        """
        id_track = defaultdict(list)
        for a, tracks in artist_tracks.items():
            for t, idx in tracks.items():
                id_track[idx - self.users_count].append((a, t))
        l = [None] * len(id_track)
        for i, v in id_track.items():
            l[i] = v
        return l

    def _index_tracks(self):
        """Return ``(id_track, track_id)``: pairs per track index and the inverse lookup."""
        id_track = self.map_id_tracks(self.tracks)
        track_id = {}
        for e, tracks in enumerate(id_track):
            for t in tracks:
                track_id[t] = e
        return id_track, track_id

    def process_tags_fast(self, db):
        """Populate ``tag_id``, ``artist_id``, ``tracks_tags`` and ``tracks_artist``.

        Reads every ``track_info`` document that has a ``spotify_id`` from
        ``db``. The tag/artist id mappings are loaded from
        ``data/tags_artist.pickle`` when that file exists, otherwise they are
        built and written there.

        Raises:
            IndexError: if a track index received no document (no artist).
        """
        print('Processing artists and tags...')
        id_track, track_id = self._index_tracks()
        # One set of tags and one set of artist names per track index.
        tracks_tags = [set() for _ in range(len(id_track))]
        tracks_artist = [set() for _ in range(len(id_track))]
        # Use the handle passed by the caller instead of a notebook global.
        query = {'spotify_id': {'$exists': True}}
        cursor = db.track_info.find(query, {'artist': True, 'track': True, 'tags': True})
        for r in tqdm(cursor, total=db.track_info.count_documents(query)):
            key = (r['artist'], r['track'])
            if key not in track_id:
                continue
            idx = track_id[key]
            tracks_tags[idx].update(remove_accents(t) for t in r['tags'])
            tracks_artist[idx].add(remove_accents(r['artist']))
        # Keep a single artist per track: the shortest name, ties broken alphabetically.
        for i in range(len(tracks_artist)):
            tracks_artist[i] = sorted(tracks_artist[i], key=lambda x: (len(x), x))[0]
        # Names to integer ids.
        if os.path.exists('data/tags_artist.pickle'):
            # NOTE(review): pickle.load can execute arbitrary code — trusted files only.
            with open('data/tags_artist.pickle', 'rb') as f:
                data = pickle.load(f)
            self.tag_id = data['tag_id']
            self.artist_id = data['artist_id']
        else:
            self.artist_id = {a: e for e, a in enumerate(set(tracks_artist))}
            tags_counter = Counter()
            for tags in tracks_tags:
                for tag in tags:
                    tags_counter[tag] += 1
            # Only tags attached to more than four tracks get an id; ids start
            # at 1 so that 0 stays free for padding in get_tracks_data.
            self.tag_id = {}
            for tag, c in tags_counter.items():
                if c > 4:
                    self.tag_id[tag] = len(self.tag_id) + 1
            with open('data/tags_artist.pickle', 'wb') as f:
                pickle.dump({'tag_id': self.tag_id, 'artist_id': self.artist_id}, f)
        self.tracks_tags = [
            [self.tag_id[t] for t in tags if t in self.tag_id]
            for tags in tracks_tags
        ]
        self.tracks_artist = np.zeros((len(id_track), 1), dtype=np.int32)
        for e, a in enumerate(tracks_artist):
            self.tracks_artist[e, 0] = self.artist_id[a]

    def process_spotify_fast(self, db):
        """Populate ``track_spotify_features`` and ``track_spotify_key`` from ``db``.

        Loudness (column 2) is clipped to ``[-60, -2e-4]``, negated and passed
        through ``log10``; tempo (column 9) is standardised with the column
        mean/std and clipped to ``[-2, 2]``. Rows that receive no Spotify
        document keep their zero initialisation before these transforms.
        """
        print('Processing spotify...')
        _, track_id = self._index_tracks()
        # Spotify id -> track index; the first document seen for an id wins.
        query = {'spotify_id': {'$exists': True}}
        spotify_track = {}
        cursor = db.track_info.find(query, {'artist': True, 'track': True, 'spotify_id': True})
        for r in tqdm(cursor, total=db.track_info.count_documents(query)):
            key = (r['artist'], r['track'])
            if key in track_id and r['spotify_id'] not in spotify_track:
                spotify_track[r['spotify_id']] = track_id[key]
        # NOTE(review): the row count is the number of (artist, title) pairs, which
        # can exceed the number of track indices. Left unchanged because the tempo
        # statistics below are computed over every row.
        self.track_spotify_features = np.zeros((len(track_id), 10))
        self.track_spotify_key = np.zeros((len(track_id), 1), dtype=np.int8)
        collection = db.track_spotify_features
        for spotify_features in tqdm(collection.find({}), total=collection.count_documents({})):
            if spotify_features['spotify_id'] not in spotify_track:
                continue
            t_id = spotify_track[spotify_features['spotify_id']]
            spotify_feature_to_np(self.track_spotify_features, t_id, spotify_features)
            self.track_spotify_key[t_id, 0] = spotify_features['key']
        # Loudness: clip to [-60, -2e-4], negate, log10.
        self.track_spotify_features[:, 2] = np.log10(-np.clip(self.track_spotify_features[:, 2], -60, -2e-4))
        # Tempo: standardise, then clip to [-2, 2].
        tempo_mean = np.mean(self.track_spotify_features[:, 9])
        tempo_stdev = np.std(self.track_spotify_features[:, 9])
        self.track_spotify_features[:, 9] = np.clip((self.track_spotify_features[:, 9] - tempo_mean) / tempo_stdev, -2, 2)

    def process_hat_d(self):
        """Build ``self.d_hat``: the normalised social adjacency matrix with self-loops.

        A dense ``n x n`` matrix is allocated and node ids are used directly as
        row/column indices, so they are expected to be ``0 .. n - 1``.
        """
        print('Processing hat d')
        a = np.eye(len(self.social.nodes))
        for u in tqdm(self.social.nodes):
            for n in self.social.neighbors(u):
                a[u, n] = 1
        self.d_hat = normalized_adjacency(a)

    def _allocate_user_batch(self, ids):
        """Return neighbour lists plus the id array and a zeroed weight array for ``ids``.

        ``users_ids`` has shape ``(len(ids), width)`` with the user in column 0,
        its neighbours next and zero padding; ``users_graph`` has shape
        ``(len(ids), 1, width)``; ``width`` is the largest neighbourhood plus one.
        """
        neighbor_lists = [list(self.social.neighbors(u)) for u in ids]
        width = max((len(nodes) for nodes in neighbor_lists), default=0) + 1
        users_ids = np.zeros((ids.shape[0], width), dtype=np.int32)
        users_graph = np.zeros((ids.shape[0], 1, width))
        for i, (u, nodes) in enumerate(zip(ids, neighbor_lists)):
            users_ids[i, 0] = u
            users_ids[i, 1:len(nodes) + 1] = nodes
        return neighbor_lists, users_ids, users_graph

    def get_users_data_slow(self, ids):
        """Like ``get_users_data`` but normalises a local sub-adjacency per user.

        For each user a small adjacency matrix over the user, its neighbours and
        their neighbours is built and normalised; its first row (truncated to
        the batch width) becomes the user's weight row.

        Args:
            ids: 1-D integer array of user ids.

        Returns:
            ``[users_ids, users_graph]`` as described in ``get_users_data``.
        """
        neighbor_lists, users_ids, users_graph = self._allocate_user_batch(ids)
        width = users_ids.shape[1]
        for i, (u, nodes) in enumerate(zip(ids, neighbor_lists)):
            # Local index: user -> 0, its e-th neighbour -> e + 1.
            # NOTE(review): assumes the user is not its own neighbour (no self-loop).
            id_map = {v: e for e, v in enumerate(nodes, start=1)}
            id_map[u] = 0
            # A separate loop variable keeps `u` bound to the batch user; the
            # previous code rebound `u` here and then used it below.
            for v in list(id_map.keys()):
                for n in self.social.neighbors(v):
                    if n not in id_map:
                        id_map[n] = len(id_map)
            a = np.eye(len(id_map))
            for ui in [u] + nodes:
                for n in self.social.neighbors(ui):
                    a[id_map[ui], id_map[n]] = 1
            d = normalized_adjacency(a)
            cols = min(d.shape[1], width)
            users_graph[i, 0, :cols] = d[0, :cols]
        return [users_ids, users_graph]

    def get_users_data(self, ids):
        """Build the user-side inputs for a batch of users.

        Args:
            ids: 1-D integer array of user ids.

        Returns:
            ``[users_ids, users_graph]``: ``users_ids`` is ``(batch, width)``
            int32 with the user first, then its neighbours, zero-padded;
            ``users_graph`` is ``(batch, 1, width)`` holding the matching
            entries of ``self.d_hat`` (zero where padded).
        """
        neighbor_lists, users_ids, users_graph = self._allocate_user_batch(ids)
        for i, (u, nodes) in enumerate(zip(ids, neighbor_lists)):
            users_graph[i, 0, 0] = self.d_hat[u, u]
            if nodes:
                users_graph[i, 0, 1:len(nodes) + 1] = self.d_hat[u, nodes]
        return [users_ids, users_graph]

    def get_tracks_data(self, ids):
        """Build the track-side inputs for a batch of track indices.

        Args:
            ids: 1-D integer array of track indices.

        Returns:
            ``[tracks_ids, tags_np, artists, spotify]`` where ``tracks_ids`` is
            ``(batch, 1)``, ``tags_np`` is the zero-padded ``(batch, max_tags)``
            int32 tag id matrix, ``artists`` is ``(batch, 1)`` and ``spotify``
            is ``(batch, 10)``. An empty ``ids`` yields empty arrays.
        """
        tracks_ids = ids[:, np.newaxis]
        tags = [self.tracks_tags[i] for i in ids]
        # default=0 keeps an empty batch from raising in max().
        width = max((len(t) for t in tags), default=0)
        tags_np = np.zeros((ids.shape[0], width), dtype=np.int32)
        for row, t in enumerate(tags):
            tags_np[row, :len(t)] = t
        artists = self.tracks_artist[ids, :]
        spotify = self.track_spotify_features[ids, ...]
        return [tracks_ids, tags_np, artists, spotify]
    
    

In [None]:
class GenUserTrack(DataGenerator, Sequence):
    """Batch sequence over either all users or all tracks.

    ``mode_user`` selects which side is served: when true the batches come from
    ``get_users_data``, otherwise from ``get_tracks_data``. Batches contain
    model inputs only (no targets).
    """

    def __init__(self, db, users_tracks, users, tracks, social, cos, batch_size=512, mode_user=True):
        super().__init__(db, users_tracks, users, tracks, social, cos)
        self.users_tracks = users_tracks
        self.batch_size = batch_size
        self.mode_user = mode_user

    def __len__(self):
        """Number of batches needed to cover the currently selected side."""
        total = self.users_count if self.mode_user else self.track_count
        return math.ceil(total / self.batch_size)

    def __getitem__(self, index):
        """Return the inputs for batch ``index``; the last batch may be shorter."""
        if self.mode_user:
            total, fetch = self.users_count, self.get_users_data
        else:
            total, fetch = self.track_count, self.get_tracks_data
        start = index * self.batch_size
        stop = min(start + self.batch_size, total)
        return fetch(np.arange(start, stop))

In [None]:
# Build the generator over the training graph; construction queries MongoDB and
# reads the cached cosine matrix from data/cos.npz.
# Note: `ds` was previously bound to the undirected social graph and is rebound here.
ds = GenUserTrack(music, dataset['train'], dataset['users'], 
                  dataset['artist-tracks'], social_graph,
                  np.load('data/cos.npz')['cosines'])#, batch_size=512)

In [None]:
# Sanity check: print the shape of every array in the first batch
# (a user batch, since mode_user defaults to True).
for i in ds[0]:
    print(i.shape)

# Network

In [None]:
def loss(y_true, y_pred):
    """Weighted squared error against ``log2(2 * value)``.

    Column 0 of ``y_true`` is the observed value and column 1 the per-sample
    weight; the prediction is compared with the base-2 logarithm of twice the
    value.
    """
    # NOTE(review): the slices below drop the last axis; if y_pred is (batch, 1)
    # the subtraction would broadcast to (batch, batch) — confirm the shapes.
    observed = y_true[:, 0]
    weight = y_true[:, 1]
    target = K.log(2 * observed) / K.log(2.0)
    return K.mean(weight * K.square(y_pred - target))

In [None]:
# Wide & deep recommender: a GCN over the user's social neighbourhood on the user
# side, and track / tag / artist embeddings plus Spotify features on the music side.
emb_size = 64
kernel_size = 32
# Number of stacked GCN layers. The name `deep` is rebound to the deep-branch
# tensor further down, so this value is only valid until then.
deep = 1

#Input users
# `users` carries the user id followed by its neighbours' ids; `user_graph`
# carries the matching adjacency weights (see DataGenerator.get_users_data).
i_users = Input((None,), name='users')
i_user_graph = Input((None, None), name='user_graph')

#Input music
i_tracks = Input((1,), name='tracks')
i_tags = Input((None,), name='tags')
i_artist = Input((1,), name='artist')
i_spotify = Input((10,), name='spotify')

#Process users
emb_user_base = Embedding(len(dataset['users']), emb_size, name="embedding_users")(i_users)
emb_user = emb_user_base
# Position 0 along axis 1 is the user itself; keep its raw embedding.
emb_user_base = Lambda(lambda x: x[:, 0,:], name='extract_emb_user_base')(emb_user_base)
for i in range(deep):
    emb_user = GCNConv(kernel_size, name='gcn_user_{}'.format(i))([emb_user, i_user_graph])

# Keep position 0 of the GCN output as the graph-aware user vector.
emb_user = Lambda(lambda x: x[:, 0,:], name='extract_gcn_user')(emb_user)
#Process music
def avg(x):
    # x = [embeddings, mask]: mean of the embeddings over axis -2, counting only
    # positions where the mask is true.
    i = x[0]
    m = x[1]
    i = i * tf.expand_dims(tf.cast(m, tf.float32), axis=-1)
    r = tf.reduce_sum(i, axis=-2) / tf.expand_dims(tf.reduce_sum(tf.cast(m, tf.float32), axis=-1), axis=-1)
    # Rows with an all-false mask divide by zero; replace NaN/inf with 0.
    return tf.where(tf.math.logical_or(tf.math.is_nan(r), tf.math.is_inf(r)), 0., r)

emb_tracks = Embedding(len(dataset['train'].nodes()) - len(dataset['users']), emb_size, name="embedding_tracks")(i_tracks)
emb_tracks = Lambda(lambda x: x[:, 0, :], name='extract_track')(emb_tracks)

# Tag id 0 is padding, so it is masked out of the average.
mask_tags = Lambda(lambda x: x != 0, name='mask_tags')(i_tags)
emb_tags = Embedding(len(ds.tag_id) + 1, emb_size, name='embedding_tags')(i_tags)
# output_shape is hard-coded to 64, which equals emb_size above.
emb_tags = Lambda(avg, name='masked_average_tags', output_shape=(64,))([emb_tags, mask_tags])

emb_artist = Embedding(len(ds.artist_id), emb_size, name='embedding_artist')(i_artist)
emb_artist = Lambda(lambda x: x[:, 0, :], name='extract_artist')(emb_artist)

emb_music = Concatenate(name='concatenate_embedding_track_tag_artist')([emb_tracks, emb_tags, emb_artist, i_spotify])
#emb_music = Dense(kernel_size, name='dense_music')(emb_music)

#Deep part
# NOTE(review): the Dense layers are created without an `activation` argument —
# confirm this matches the architecture the saved weights were trained with.
deep = Concatenate(name='concatenate_gcn_user_music')([emb_user_base, emb_user, emb_music])
deep = Dense(256, name='deep_dense_1')(deep)
deep = Dense(256, name='deep_dense_2')(deep)
deep = Dense(1, name='deep_dense_3')(deep)

#Wide 
# The layer name spelling is kept as-is: layers are matched by name when the
# weights are copied into model_fast.
wide = Concatenate(name='concatentate_user_track')([emb_user_base, emb_tracks, emb_tags, emb_artist])
wide = Dense(1, name='wide')(wide)

# Final score = deep branch + wide branch.
out = Add(name='deep_plus_wide')([deep, wide])

model = Model([i_users, i_user_graph, i_tracks, i_tags, i_artist, i_spotify], out)

model.summary()

model.compile(loss=loss, optimizer='adam')

In [None]:
# Restore the trained weights into the full model.
model.load_weights('model_dropout_complex_075/model.h5')

In [None]:
# Sub-models sharing the trained layers: one maps user inputs to the two user
# vectors, the other maps track inputs to the four music-side vectors.
model_user = Model([i_users, i_user_graph], [emb_user, emb_user_base])
model_track = Model([i_tracks, i_tags, i_artist, i_spotify], [emb_music, emb_tracks, emb_tags, emb_artist])

In [None]:
# Pre-compute the vectors once per user and once per track; `mode_user` switches
# which side the generator serves.
ds.mode_user = True
users = model_user.predict(ds)
ds.mode_user = False
tracks = model_track.predict(ds)

In [None]:
# Scoring head rebuilt on top of the pre-computed vectors. Layers reuse the names
# of the full model so the weights can be copied across by name.
i_emb_user = Input(emb_user.shape[1:], name='i_emb_user')
i_emb_user_base = Input(emb_user_base.shape[1:], name='i_emb_user_base')
i_emb_music = Input(emb_music.shape[1:], name='i_emb_music')
i_emb_tracks = Input(emb_tracks.shape[1:], name='i_emb_tracks')
i_emb_tags = Input(emb_tags.shape[1:], name='i_emb_tags')
i_emb_artist = Input(emb_artist.shape[1:], name='i_emb_artist')
#Deep part
deep = Concatenate(name='concatenate_gcn_user_music')([i_emb_user_base, i_emb_user, i_emb_music])
deep = Dense(256, name='deep_dense_1')(deep)
deep = Dense(256, name='deep_dense_2')(deep)
deep = Dense(1, name='deep_dense_3')(deep)


#Wide 
wide = Concatenate(name='concatentate_user_track')([i_emb_user_base, i_emb_tracks, i_emb_tags, i_emb_artist])
wide = Dense(1, name='wide')(wide)

out = Add(name='deep_plus_wide')([deep, wide])

# Input order here must match the order used when calling model_fast.predict.
model_fast = Model([i_emb_user, i_emb_user_base, i_emb_music, i_emb_tracks, i_emb_tags, i_emb_artist], out)

In [None]:
# Copy the trained weights into model_fast, pairing layers by name; layers with
# no namesake in the full model are left untouched.
for l_in in model_fast.layers:
    l_out = next((layer for layer in model.layers if layer.name == l_in.name), None)
    if l_out is not None:
        l_in.set_weights(l_out.get_weights())

In [None]:
# Reference scores from the full model: users 0-9 paired element-wise with tracks 0-9.
model.predict(ds.get_users_data(np.arange(10)) + ds.get_tracks_data(np.arange(10)))

In [None]:
# Same ten pairs scored from the pre-computed vectors; presumably meant to match
# the output of the full model above.
model_fast.predict([users[0][:10,...], users[1][:10,...], tracks[0][:10,...], tracks[1][:10,...], tracks[2][:10,...], tracks[3][:10,...]])

In [None]:
# Split the nodes of the test graph into users (ids below l_users) and tracks.
l_users = len(dataset['users'])
l_tracks = len(dataset['train'].nodes) - l_users

r_users = sorted(node for node in dataset['test'].nodes if node < l_users)
r_tracks = np.asarray(sorted(node for node in dataset['test'].nodes if not node < l_users))

# Position of every test user / test track within its sorted list.
r_user_id = {u: e for e, u in enumerate(r_users)}
r_track_id = {u: e for e, u in enumerate(r_tracks)}

In [None]:
# Rows of the pre-computed track vectors for the test tracks; subtracting l_users
# turns a node id into a track index.
n_tracks = [t[r_tracks - l_users, ...] for t in tracks]

In [None]:
# Alias for the training graph (not referenced in the cells shown here).
g = dataset['train']

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq


def _flush_predictions(frames, writer, schema):
    """Concatenate the buffered per-user frames and append them to the Parquet file."""
    table = pa.Table.from_pandas(pd.concat(frames), schema=schema)
    writer.write_table(table)


# Number of per-user frames buffered between writes.
FLUSH_EVERY = 100
PREDICTION_DTYPES = {'User': np.int32, 'Track': np.int32, 'Prediction': np.float32}

# An empty, correctly typed frame is used only to derive the Parquet schema.
df = pd.DataFrame(data=[], columns=['User', 'Track', 'Prediction']).astype(PREDICTION_DTYPES)
parquet_schema = pa.Table.from_pandas(df=df, preserve_index=False).schema
# Open a Parquet file for writing
parquet_writer = pq.ParquetWriter('model_dropout_complex_075/predictions-dropout_complex_075.parquet', parquet_schema, compression='GZIP')

partial = []
try:
    for u in tqdm(r_users):
        # Repeat this user's vectors once per candidate track and score them all in one batch.
        user_aux = [np.repeat(users[0][u:u+1, ...], len(r_tracks), axis=0), np.repeat(users[1][u:u+1, ...], len(r_tracks), axis=0)]
        # verbose=0 avoids one Keras progress bar per user on top of tqdm.
        pred = model_fast.predict(user_aux + n_tracks, batch_size=len(r_tracks), verbose=0)
        # Build each column with its final dtype; staging the ids in a float32
        # array would be lossy for ids above 2**24.
        df = pd.DataFrame({
            'User': np.full(len(r_tracks), u, dtype=np.int32),
            'Track': r_tracks.astype(np.int32),
            'Prediction': pred[:, 0].astype(np.float32),
        })

        # Drop tracks already linked to the user in the training graph.
        no_valid = set(dataset['train'].neighbors(u))
        df = df[~df['Track'].isin(no_valid)]

        partial.append(df)
        if len(partial) >= FLUSH_EVERY:
            _flush_predictions(partial, parquet_writer, parquet_schema)
            partial = []

    if len(partial) > 0:
        _flush_predictions(partial, parquet_writer, parquet_schema)
        partial = []
finally:
    # Always close the writer so the file is finalised even if a step fails.
    parquet_writer.close()