In [1]:
#####################################################
################## PACKAGES #########################
#####################################################
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import networkx as nx
from pyvis.network import Network
import sys 
import base64
from music_utils import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as ntx
import csv
import random

from datetime import datetime, timedelta, date
import pandas as pd
from itertools import combinations
from numpy.linalg import norm
import pickle
from music_utils import *
import os
from torch_geometric.data import Data
import torch
import torch_geometric.nn as graphnn
import torch.nn as nn
import torch
from torch.nn import Linear
import torch.nn.functional as F


local = True
colab = False
git = False
if local:
    DATA_PATH = './data/'

elif git:
    DATA_PATH = './data/'
    
elif colab:
    from google.colab import drive
    import sys
    DATA_PATH = './gdrive/MyDrive/MLNS_Spotify/data/'
    drive.mount('/content/gdrive', force_remount=True)
    sys.path.append('/content/gdrive/MyDrive/MLNS_Spotify')

PATH_TRAIN = DATA_PATH+"train.txt"
PATH_NODE_INFO = DATA_PATH+"node_information.csv"
PATH_TEST = DATA_PATH+"test.txt"
# sys.path.append('/content/gdrive/MyDrive/MLNS_Spotify')



class MLP_post(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super().__init__()
        torch.manual_seed(12345)
        self.num_layers = num_layers
        self.lin1 = Linear(in_channels, hidden_channels)
        self.list_FC = nn.ModuleList()

        for i in range(num_layers):
            self.list_FC.append(nn.Linear(hidden_channels, hidden_channels))
        
        self.last_lin = Linear(hidden_channels, out_channels)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.lin1(x)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        for i in range(self.num_layers):
            x = F.elu(self.list_FC[i](x))
            x = F.dropout(x, p=0.5, training=self.training)
        return self.softmax(self.last_lin(x))
        

class GAT_MLP(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers, embed_size, n_heads,  MLP_num_layers, MLP_hidden_channels, MLP_out_channels, dropout=False):
        super().__init__()
        self.n_heads = n_heads
        self.dropout = dropout

        self.num_layers = num_layers
        self.graphconv1 = graphnn.conv.GATConv(in_channels=in_channels, out_channels=hidden_channels, heads=n_heads, concat=True)
        self.list_GATC = nn.ModuleList()

        for i in range(num_layers):
            self.list_GATC.append(graphnn.conv.GATConv(in_channels=n_heads*hidden_channels, out_channels=hidden_channels, heads=n_heads, concat=True))
        
        self.last_conv = graphnn.conv.GATConv(in_channels=n_heads*hidden_channels, out_channels=embed_size, heads=n_heads, concat=False)

        self.elu = nn.LeakyReLU(negative_slope=0.2)
        self.MLP_post = MLP_post(in_channels=embed_size*2, hidden_channels=MLP_hidden_channels, num_layers=MLP_num_layers, out_channels=MLP_out_channels)
                  
    def forward(self, x, edge_index):
          # print('very first', x)
        # TO AVOID NAN PROPAGATION
          x = torch.nan_to_num(x)
          # print('first', x)
          x = self.graphconv1(x, edge_index)
        #   print('x beginnning', x)
          x = self.elu(x)
          for i in range(self.num_layers):
              x = x + self.elu(self.list_GATC[i](x, edge_index))
              x = F.dropout(x, p=0.2, training=self.training)

          x = self.last_conv(x, edge_index).relu()
          # print('end GAT', x)
          #we concatenate
          # Extract the source and target node indices from edge_index
          src_idx = edge_index[0]
          tgt_idx = edge_index[1]

          # Use indexing to extract the node features for the source and target nodes
          src_features = x[src_idx]
          tgt_features = x[tgt_idx]

          # Concatenate the features along the last dimension
          x = torch.cat([src_features, tgt_features], dim=-1)
          # print('concatenated', x.size())
          x = self.MLP_post.forward(x)
          # print('final', x)
        #   print('x',x)
          return x

def full_initialisation():
    ######################################################
    ######################################################
    ##               DATA INITIALISATION               ###
    ######################################################
    ######################################################


    spotify_600, artists_600 = read_spotify_600_for_brouillon(DATA_PATH=DATA_PATH, read=False)


    #####
    ## PARAMETERS
    #####
    start_date = datetime.strptime("1998-01-01 00:00:01", "%Y-%m-%d %H:%M:%S")
    end_date = datetime.strptime("2020-12-31 23:59:00", "%Y-%m-%d %H:%M:%S")
    n_month = 12
    interval = timedelta(days=365.25*n_month/12)

    # one to one spotify database rearrangement
    start_date_spotify_600 = spotify_600[spotify_600.release_date.dt.year >= start_date.year].copy()
    spot_600 = start_date_spotify_600.copy()

    song_artist_pairs = [(track_id, artist_pair[0], artist_pair[1]) for track_id, artists in spot_600[['track_id', 'id_artists']].values for artist_pair in combinations(set(artists), 2)]
    correspondace_spot_600 = pd.DataFrame(song_artist_pairs, columns=['track_id', 'artist_1', 'artist_2'])
    spot_600 = pd.merge(correspondace_spot_600, spot_600.drop(columns=['name', 'artists', 'id_artists', 'artist_id']), on='track_id', how='left').copy()

    # existing artists 
    in_spot_artists_600 = artists_600[artists_600.artist_id.isin(start_date_spotify_600.id_artists.explode().unique())].copy()


    ######################################################
    ## BUILD FEATURES OF ARTIST CONSIDERING START DATE ###


    #features of the artists
    artist_features = artists_features_creation(in_spot_artists_600,
                                            start_date_spotify_600,
                                            DATA_PATH, read=False,
                                            pkl_features_artist_path='features_artists_PYGT_yt_1999.pkl',
                                ).reset_index()

    # print(f'len of artist_features : {len(artist_features)}')

    ######################################################
    ##          ARTIST ID TO INT DICTIONNARY           ###

    # Reassign the location IDs (makes it easier later, because here the IDs didn't start at 0)
    artist_idname = artist_features['artist_id'].unique()
    new_ids = list(range(len(artist_idname)))
    mapping = dict(zip(artist_idname, new_ids))
    reversed_mapping = dict(zip(new_ids, artist_idname))

    artist_features['int_artist_id'] = artist_features['artist_id'].map(mapping)


    spot_600['artist_1'] = spot_600['artist_1'].map(mapping)
    spot_600['artist_2'] = spot_600['artist_2'].map(mapping)

    #We drop potential nans
    missings = spot_600[(spot_600.artist_1.isna()) | (spot_600.artist_2.isna())].copy()
    spot_600 = spot_600.dropna(subset=['artist_1', 'artist_2']).copy()


    int_to_name = dict(artist_features[['int_artist_id', 'name']].values)

    spot_600['artist_1_name'] = spot_600.artist_1.map(int_to_name)
    spot_600['artist_2_name'] = spot_600.artist_2.map(int_to_name)


    df_featurings = spot_600.groupby(['artist_1', 'artist_2']).agg(num_feats=('track_id', 'count')).reset_index()

    df_featurings['artist_1_name'] = df_featurings.artist_1.map(int_to_name)
    df_featurings['artist_2_name'] = df_featurings.artist_2.map(int_to_name)

    node_features = np.array(artist_features.drop(columns=['artist_id', 'genres', 'name', 'int_artist_id']).fillna(0))
    node_features = (node_features - node_features.mean(axis=0))/node_features.std(axis=0) 

    model = torch.load(DATA_PATH + 'best-model_GAT_MLP_TRAINED_normalized_working.pt',  map_location='cpu')

    return mapping, reversed_mapping, int_to_name, spot_600, artist_features, df_featurings, node_features, model


def test_Data_construction(df_select, node_features):
    """build the data test obect

    Args:
        df_select (pandas dataframe): the selected spotify_600 subset
        node_features (array): features of the nodes

    Returns:
        pytorch geometric Data: the test Data object
    """
    
    edge_list = torch.from_numpy(np.array(df_select[['artist_1','artist_2']].values).transpose())
    edge_attr = torch.from_numpy(np.array(df_select.num_feats.values).transpose())
    y = torch.from_numpy(np.array(df_select.done_feat.values).transpose())
    test_data = Data(x=torch.from_numpy(node_features).float(), 
    y_indices=edge_list.long(), 
    edge_index=edge_list, 
    edge_attr=edge_attr,
    y=y)

    print(test_data)
    return test_data

def y_labels_val(spot_600, df_select):
    labels_df = spot_600[(spot_600.release_date >= begin_date) & (spot_600.release_date <= end_date)].copy()
    
    labels_df = labels_df.groupby(['artist_1_name', 'artist_2_name']).agg(num_feats=('track_id', 'count')).reset_index()
    labels_df['done_feat'] = labels_df.num_feats.apply(lambda x: 1 if x >= 1 else 0)
    df_select = pd.merge(df_select, labels_df[['artist_1_name', 'artist_2_name', 'done_feat']],
                        on=['artist_1_name', 'artist_2_name'],
                        how='left'
                    )
    df_select.done_feat = df_select.done_feat.fillna(0)

    return df_select

mapping, reversed_mapping, int_to_name, spot_600, artist_features, df_featurings, node_features, model = full_initialisation()

In [29]:

# Define list of selection options and sort alphabetically
artist_list = ['Damso']

#Define the list of genres
genres_list = []


graph_type = 'Direct connections'
begin_date = date(1999, 1, 1)

end_date = date(2022, 1, 1)

begin_date = np.datetime64(begin_date)
end_date = np.datetime64(end_date)

selected_artists = ['Damso']

df_select = df_featurings.loc[df_featurings['artist_1_name'].isin(selected_artists) | \
                            df_featurings['artist_2_name'].isin(selected_artists)]
df_select = df_select.reset_index(drop=True)
# df_select = y_labels_val(spot_600, df_select)
            

In [30]:
df_select_2  = y_labels_val(spot_600, df_select)

In [31]:
df_select_2

Unnamed: 0,artist_1,artist_2,num_feats,artist_1_name,artist_2_name,done_feat
0,20185.0,32364.0,1,Kalash,Damso,1
1,32364.0,4347.0,1,Damso,Benash,1
2,32364.0,9120.0,1,Damso,Gato,1
3,32364.0,9137.0,1,Damso,Siboy,1
4,32364.0,9555.0,1,Damso,Ikaz Boi,1
5,32364.0,24283.0,2,Damso,Angèle,1
6,32364.0,26094.0,1,Damso,Lous and The Yakuza,1
7,32364.0,27684.0,1,Damso,Nekfeu,1
8,32364.0,31629.0,2,Damso,Ninho,1
9,32364.0,32248.0,1,Damso,Orelsan,1


In [None]:
def y_labels_val(spot_600, df_select):
    labels_df = spot_600[(spot_600.release_date >= begin_date) & (spot_600.release_date <= end_date)].copy()
    
    labels_df = labels_df.groupby(['artist_1_name', 'artist_2_name']).agg(num_feats=('track_id', 'count')).reset_index()
    labels_df['done_feat'] = labels_df.num_feats.apply(lambda x: 1 if x >= 1 else 0)
    df_select = pd.merge(df_select, labels_df[['artist_1_name', 'artist_2_name', 'done_feat']],
                        on=['artist_1_name', 'artist_2_name'],
                        how='left'
                    )
    df_select.done_feat = df_select.done_feat.fillna(0)

In [27]:
labels_df = spot_600[(spot_600.release_date >= begin_date) & (spot_600.release_date <= end_date)].copy()
    
labels_df = labels_df.groupby(['artist_1_name', 'artist_2_name']).agg(num_feats=('track_id', 'count')).reset_index()
labels_df['done_feat'] = labels_df.num_feats.apply(lambda x: 1 if x >= 1 else 0)

In [28]:
labels_df[labels_df.artist_2_name == 'Damso']

Unnamed: 0,artist_1_name,artist_2_name,num_feats,done_feat
29383,Kalash,Damso,1,1
32104,Lacrim,Damso,1,1


In [11]:
labels_df.artist_1_name.unique()

array(['-M-', '2Pac', '3OH!3', ..., '優客李林', '花月千鳥', '陳思安'], dtype=object)

In [15]:
df_featurings[df_featurings.artist_1_name == 'Damso']

Unnamed: 0,artist_1,artist_2,num_feats,artist_1_name,artist_2_name
40984,32364.0,4347.0,1,Damso,Benash
40985,32364.0,9120.0,1,Damso,Gato
40986,32364.0,9137.0,1,Damso,Siboy
40987,32364.0,9555.0,1,Damso,Ikaz Boi
40988,32364.0,24283.0,2,Damso,Angèle
40989,32364.0,26094.0,1,Damso,Lous and The Yakuza
40990,32364.0,27684.0,1,Damso,Nekfeu
40991,32364.0,31629.0,2,Damso,Ninho
40992,32364.0,32248.0,1,Damso,Orelsan
40993,32364.0,33435.0,1,Damso,Vald


In [24]:
short_spot_600 = spot_600[(spot_600.release_date >= begin_date) & (spot_600.release_date <= end_date)].copy()

df_featurings_2 = short_spot_600.groupby(['artist_1', 'artist_2']).agg(num_feats=('track_id', 'count')).reset_index()

df_featurings_2['artist_1_name'] = df_featurings_2.artist_1.map(int_to_name)
df_featurings_2['artist_2_name'] = df_featurings_2.artist_2.map(int_to_name)

In [25]:
df_featurings_2[df_featurings_2.artist_1_name == 'Damso']

Unnamed: 0,artist_1,artist_2,num_feats,artist_1_name,artist_2_name
39939,32364.0,4347.0,1,Damso,Benash
39940,32364.0,9120.0,1,Damso,Gato
39941,32364.0,9137.0,1,Damso,Siboy
39942,32364.0,9555.0,1,Damso,Ikaz Boi
39943,32364.0,24283.0,2,Damso,Angèle
39944,32364.0,26094.0,1,Damso,Lous and The Yakuza
39945,32364.0,27684.0,1,Damso,Nekfeu
39946,32364.0,31629.0,2,Damso,Ninho
39947,32364.0,32248.0,1,Damso,Orelsan
39948,32364.0,33435.0,1,Damso,Vald


In [26]:
short_spot_600

Unnamed: 0,track_id,artist_1,artist_2,track_popularity,duration_ms,explicit,release_date,danceability,energy,key,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,num_artists,artist_1_name,artist_2_name
0,6l82JF2bHWDnbdSp7mw5dP,31928.0,34726.0,14,231267,0,2006-11-07,0.3290,0.2240,10,...,0.0737,0.877000,0.000004,0.9590,0.2360,80.532,1,2,Count Basie,Frank Sinatra
1,6bgnQlDQKgni9fH7mEQ1sC,31928.0,34726.0,13,183960,0,2006-11-07,0.2920,0.2200,7,...,0.0431,0.886000,0.000001,0.9090,0.2210,108.978,1,2,Count Basie,Frank Sinatra
9,0yZl9h6TwQYsyDfA2IWlxD,18479.0,30720.0,0,837373,0,2017-06-16,0.0725,0.0115,7,...,0.0380,0.938000,0.964000,0.0653,0.0411,77.359,4,3,Orchestre National Radiodiffusion Française,Dmitri Shostakovich
10,0yZl9h6TwQYsyDfA2IWlxD,18479.0,28611.0,0,837373,0,2017-06-16,0.0725,0.0115,7,...,0.0380,0.938000,0.964000,0.0653,0.0411,77.359,4,3,Orchestre National Radiodiffusion Française,André Cluytens
11,0yZl9h6TwQYsyDfA2IWlxD,30720.0,28611.0,0,837373,0,2017-06-16,0.0725,0.0115,7,...,0.0380,0.938000,0.964000,0.0653,0.0411,77.359,4,3,Dmitri Shostakovich,André Cluytens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127392,2OChe6vvOEaphM4zFyImBQ,34365.0,39162.0,1,55656,0,2020-01-05,0.6080,0.9020,7,...,0.0402,0.000015,0.623000,0.1520,0.2460,138.012,4,2,Armin van Buuren,Tempo Giusto
127393,1HpzOCZbNWzxvvXfSGtSrX,33255.0,30969.0,64,176417,0,2019-05-30,0.6050,0.6640,8,...,0.0316,0.005480,0.000003,0.2040,0.1640,103.997,4,2,Camila Cabello,Mark Ronson
127394,3wGIJgoBmapSwFjafNJQ6X,21857.0,45905.0,1,55651,0,2020-01-05,0.4620,0.9530,1,...,0.1190,0.003440,0.934000,0.5960,0.3470,137.984,5,2,Davey Asprey,Beatsole
127398,1932nBdtEgMpDGkpl93cS4,37358.0,21696.0,1,125216,0,2020-01-05,0.4580,0.9970,6,...,0.1940,0.000898,0.872000,0.4600,0.0393,137.996,4,2,Stine Grove,RAM
