In [1]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
import re
import itertools
from tqdm import tqdm

# Creating song similarity networks from user's librairies

In [3]:
train =  pd.read_csv('data/train.csv')

In [4]:
songs = pd.read_csv('data/songs.csv')

First we sort the train table in order to group by user ID.

In [5]:
train.sort_values('msno', inplace=True)
train.reset_index(drop=True, inplace=True)

In [6]:
train

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,DZanD98cLDbuPBiXONSwLFSw8BfI69ErQS23vYJ9UU4=,my library,Local playlist more,local-library,1
1,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,E6Fm873UEDPv0LATidt2gKwBZZHww2Q04nxTg15N9iE=,my library,Local playlist more,local-library,1
2,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,0TZ4BKn9YES2upK9ZgKgsAUtUClVKHnct79pnA/iPiQ=,my library,Local playlist more,local-library,1
3,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,m2YSjnhR3nzx/2WccNYpGWywohyd8fT0xR+skQC9/JQ=,my library,Local playlist more,local-library,1
4,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,cXpk2ZXiSgkwGo8iGbswBc0lzucL9AH+j/2khmnmOLI=,my library,Local playlist more,local-playlist,0
5,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,l4oo7oWfRaS6ruFnWfHkAWEVjpcTr+J6T9Mf7l3/x+s=,my library,Local playlist more,local-playlist,0
6,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,upH6pOAUd+iV/MkpzeELvqEFoTEIVhsV9eML8N7/gUM=,my library,Local playlist more,local-library,1
7,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,0smF3rcYJymevsNGZbPZnTpUGLsgV/DvukNP9+3pZzI=,my library,Local playlist more,local-playlist,0
8,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,a1iJZwnK13y6WORmBLHLoX/4k9WLwGGQ03cXVvdqhzM=,my library,Local playlist more,local-playlist,0
9,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,9/M6UYUamqppFMo34jXxFApSujcu4eHl+yDaI3Gg6S4=,my library,Local playlist more,local-playlist,0


Then we create the adjacency matrix that contains all the songs in the dataset. We also keep track of whether the songs were found in the same library in a boolean adjacency matrix. We can then subtract the complete matrix with the boolean matrix to remove spurious song interactions and create a lighter graph.

In [7]:
unique_songs = train['song_id'].unique()
unique_song_id = dict(zip(unique_songs, range(len(unique_songs))))

In [34]:
from scipy.sparse import csr_matrix, dok_matrix

songs_adj_matrix = csr_matrix((len(unique_songs), len(unique_songs)), dtype=np.uint16)
songs_unique_adj = dok_matrix((len(unique_songs), len(unique_songs)), dtype=bool)

In [35]:
unique_users = train['msno'].unique()

for i, unique_user in tqdm(enumerate(unique_users[:1000])):
    # Retrieve unique user's "library" of songs listened to.
    start_index = train['msno'].searchsorted(unique_users[i])[0]
    if not i == (len(unique_users)-1):
        stop_index = train['msno'].searchsorted(unique_users[i+1])[0]
    else:
        stop_index = len(train)
    unique_user_library = train.iloc[start_index:stop_index]['song_id']
    
    # Add to the the global adjacency song matrix.
    dct = {}
    for edge in itertools.permutations(unique_user_library,2):
        dct[(unique_song_id[edge[0]], unique_song_id[edge[1]])] = 1
    
    unique_user_library_matrix = dok_matrix((len(unique_songs), len(unique_songs)), dtype=np.int16)
    songs_unique_adj.update(dct)
    unique_user_library_matrix.update(dct)
    
    songs_adj_matrix = unique_user_library_matrix + songs_adj_matrix


0it [00:00, ?it/s][A
1000it [1:27:53, 13.11s/it]


In [None]:
len(unique_users) * 10 / 60 / 60

In [None]:
songs_adj_matrix

<359966x359966 sparse matrix of type '<class 'numpy.int32'>'
	with 103595612 stored elements in Compressed Sparse Column format>

In [None]:
songs_adj_matrix = songs_adj_matrix - songs_unique_adj
song_adj_matrix

In [21]:
graph = nx.from_scipy_sparse_matrix(songs_adj_matrix)

In [22]:
subg = graph.subgraph([unique_song_id[song] for song in train[train['msno'] == unique_user]['song_id']])