In [None]:
import pickle
import pandas as pd
import numpy as np
import math
import random
from tqdm import tqdm

## Load Data

In [None]:
# NOTE: pickle.load can execute arbitrary code stored in the file; only load files
# that were produced by this project.
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# lookup tables between songs, users and persons
unserialized_song_user = _load_pickle('song_user.dict') #dict
unserialized_user_song = _load_pickle('user_song.dict') #dict
unserialized_song_person = _load_pickle('song_person.dict') #dict
unserialized_person_song = _load_pickle('person_song.dict') #dict

# id collections (pickled objects despite the .txt extension)
unserialized_song_id = _load_pickle('song_id.txt') #numpy.ndarray
unserialized_user_id = _load_pickle('user_id.txt') #numpy.ndarray
unserialized_person_id = _load_pickle('person_id.txt') #numpy.ndarray

## Sort Nodes By Degree

In [None]:
# song_degree_dict maps every song to its combined neighbours: the users from
# song_user followed by the persons from song_person (either part may be missing).
song_degree_dict = dict(unserialized_song_user)
for song, persons in unserialized_song_person.items():
    if song not in song_degree_dict:
        song_degree_dict[song] = persons
    else:
        # `+` builds a new object, so the loaded song_user entry is left untouched
        song_degree_dict[song] = song_degree_dict[song] + persons

# (song, degree) pairs, where degree is the number of combined neighbours
song_degree = []
for song, neighbours in song_degree_dict.items():
    song_degree.append((song, len(neighbours)))
print('There are %d songs in the network' % len(song_degree))

# highest degree first; the sort is stable, so ties keep their dict order
song_degree.sort(key=lambda entry: entry[1], reverse=True)

In [None]:
# sanity check: sizes of the song->user map, the song->person map and the song id collection
for song_collection in (unserialized_song_user, unserialized_song_person, unserialized_song_id):
    print(len(song_collection))

In [None]:
# (person, degree) pairs, where degree is the length of the person's entry in person_song
person_degree = []
for person, songs in unserialized_person_song.items():
    person_degree.append((person, len(songs)))
print('There are %d persons in the network' % len(person_degree))

# highest degree first; the sort is stable, so ties keep their dict order
person_degree.sort(key=lambda entry: entry[1], reverse=True)

In [None]:
# (user, degree) pairs, where degree is the length of the user's entry in user_song,
# ordered from highest to lowest degree (stable: ties keep their dict order)
user_degree = sorted(
    ((user, len(songs)) for user, songs in unserialized_user_song.items()),
    key=lambda entry: entry[1],
    reverse=True,
)
print('There are %d users in the network' % len(user_degree))

## Construct Subnetworks

### Construct Sparse Subnet (not using random sampling) 

In [None]:
# construct sparse subnetwork
sparse_nodes_percent = 0.1


def _least_connected(degree_pairs, fraction):
    """Select the last `fraction` of `degree_pairs`.

    `degree_pairs` is a list of (node_id, degree) tuples. The lists passed in below
    are sorted by descending degree, so the tail holds the lowest-degree nodes.

    Returns a `(holder, node_ids)` pair: the selected tuples and the node id
    (first tuple item) of each of them.
    """
    count = int(len(degree_pairs) * fraction)
    # Slice from an explicit start index. The shorthand degree_pairs[-count:]
    # returns the WHOLE list when count rounds down to 0, because -0 == 0.
    start = max(len(degree_pairs) - count, 0)
    holder = degree_pairs[start:]
    return holder, [pair[0] for pair in holder]


# find the nodes
print('finding the nodes...')
sparse_net_song_nodes_holder, sparse_net_song_nodes = _least_connected(song_degree, sparse_nodes_percent)
sparse_net_user_nodes_holder, sparse_net_user_nodes = _least_connected(user_degree, sparse_nodes_percent)
sparse_net_person_nodes_holder, sparse_net_person_nodes = _least_connected(person_degree, sparse_nodes_percent)

# all node ids of the subnet, in the order songs, users, persons
sparse_net_nodes = sparse_net_song_nodes + sparse_net_user_nodes + sparse_net_person_nodes

print('sparse net nodes: %d songs, %d users, %d persons.' % (len(sparse_net_song_nodes),
                                                             len(sparse_net_user_nodes),
                                                             len(sparse_net_person_nodes)))
print('total number sparse net nodes: %d.' % len(sparse_net_nodes))

In [None]:
# construct sparse subnetwork
# build the subnetwork --> find the edges
# An edge (node1, node2) is kept only when both endpoints are subnet nodes.
# NOTE(review): a node's type is inferred from which lookup dict contains its id;
# this assumes song, user and person ids never collide -- confirm.
print('finding the edges...')
sparse_net_edge = [] # a list of pairs (node1, node2)
connect = [] # a list of nodes that the node1 should connect to
# set copy of the node list so each membership test below is O(1) instead of a list scan
sparse_net_node_set = set(sparse_net_nodes)
for i in tqdm(sparse_net_nodes): # (node1, node2) and (node2, node1) both exist
    if i in unserialized_song_user or i in unserialized_song_person:
        # Song node: neighbours are its users followed by its persons.
        # Build a NEW list; calling .extend() on the dict's own list would grow the
        # loaded data in place and duplicate neighbours on every later use or re-run.
        connect = list(unserialized_song_user.get(i, [])) + list(unserialized_song_person.get(i, []))
    elif i in unserialized_user_song:
        connect = unserialized_user_song[i]
    elif i in unserialized_person_song:
        connect = unserialized_person_song[i]
    else:
        print('Error: key error!')
        # skip the node; otherwise it would inherit the previous node's neighbours
        continue
    new_edges = [(i, j) for j in connect if j in sparse_net_node_set]
    sparse_net_edge.extend(new_edges)
    # occasional progress output (only fires when the running total is a multiple of 100)
    if len(sparse_net_edge) % 100 == 0 and len(sparse_net_edge) != 0:
        print('num edges: ', len(sparse_net_edge))

### Construct Sparse Subnet (random sampling) 

In [None]:
# construct sparse subnetwork by uniform random sampling of each node type
rs_sparse_nodes_percent = 0.01

# Fixed seed so that the sampled subnet (and every statistic derived from it) is the
# same on each run. A dedicated Random instance leaves the global `random` state alone.
rs_sparse_seed = 42
rs_sparse_rng = random.Random(rs_sparse_seed)

# find the nodes
print('finding the nodes...')
# each holder is a list of (node_id, degree) tuples; the node id is the first item
rs_sparse_net_song_nodes_holder = rs_sparse_rng.sample(song_degree, int(len(song_degree)*rs_sparse_nodes_percent))
rs_sparse_net_song_nodes = [node_holder[0] for node_holder in rs_sparse_net_song_nodes_holder]

rs_sparse_net_user_nodes_holder = rs_sparse_rng.sample(user_degree, int(len(user_degree)*rs_sparse_nodes_percent))
rs_sparse_net_user_nodes = [node_holder[0] for node_holder in rs_sparse_net_user_nodes_holder]

rs_sparse_net_person_nodes_holder = rs_sparse_rng.sample(person_degree, int(len(person_degree)*rs_sparse_nodes_percent))
rs_sparse_net_person_nodes = [node_holder[0] for node_holder in rs_sparse_net_person_nodes_holder]

# all node ids of the subnet, in the order songs, users, persons
rs_sparse_net_nodes = rs_sparse_net_song_nodes + rs_sparse_net_user_nodes + rs_sparse_net_person_nodes

print('rs sparse net nodes: %d songs, %d users, %d persons.' % (len(rs_sparse_net_song_nodes),
                                                                len(rs_sparse_net_user_nodes),
                                                                len(rs_sparse_net_person_nodes)))
print('total number rs sparse net nodes: %d.' % len(rs_sparse_net_nodes))

In [None]:
# construct (randomly sampled) sparse subnetwork
# build the subnetwork --> find the edges
# An edge (node1, node2) is kept only when both endpoints are subnet nodes.
# NOTE(review): a node's type is inferred from which lookup dict contains its id;
# this assumes song, user and person ids never collide -- confirm.
print('finding the edges...')
rs_sparse_net_edge = [] # a list of pairs (node1, node2)
connect = [] # a list of nodes that the node1 should connect to
# set copy of the node list so each membership test below is O(1) instead of a list scan
rs_sparse_net_node_set = set(rs_sparse_net_nodes)
for i in tqdm(rs_sparse_net_nodes): # (node1, node2) and (node2, node1) both exist
    if i in unserialized_song_user or i in unserialized_song_person:
        # Song node: neighbours are its users followed by its persons.
        # Build a NEW list; calling .extend() on the dict's own list would grow the
        # loaded data in place and duplicate neighbours on every later use or re-run.
        connect = list(unserialized_song_user.get(i, [])) + list(unserialized_song_person.get(i, []))
    elif i in unserialized_user_song:
        connect = unserialized_user_song[i]
    elif i in unserialized_person_song:
        connect = unserialized_person_song[i]
    else:
        print('Error: key error!')
        # skip the node; otherwise it would inherit the previous node's neighbours
        continue
    new_edges = [(i, j) for j in connect if j in rs_sparse_net_node_set]
    rs_sparse_net_edge.extend(new_edges)
    # occasional progress output (only fires when the running total is a multiple of 100)
    if len(rs_sparse_net_edge) % 100 == 0 and len(rs_sparse_net_edge) != 0:
        print('num edges: ', len(rs_sparse_net_edge))

### Construct Dense Subnet (not using random sampling) 

In [None]:
# construct dense subnetwork: take the leading slice of each degree list
# (the lists are sorted by descending degree, so these are the highest-degree nodes)
dense_nodes_percent = 0.01

# find the nodes
print('finding the nodes...')
dense_song_count = int(len(song_degree)*dense_nodes_percent)
dense_net_song_nodes_holder = song_degree[:dense_song_count]
dense_net_song_nodes = [song for (song, _degree) in dense_net_song_nodes_holder]

dense_user_count = int(len(user_degree)*dense_nodes_percent)
dense_net_user_nodes_holder = user_degree[:dense_user_count]
dense_net_user_nodes = [user for (user, _degree) in dense_net_user_nodes_holder]

dense_person_count = int(len(person_degree)*dense_nodes_percent)
dense_net_person_nodes_holder = person_degree[:dense_person_count]
dense_net_person_nodes = [person for (person, _degree) in dense_net_person_nodes_holder]

# all node ids of the subnet, in the order songs, users, persons
dense_net_nodes = [*dense_net_song_nodes, *dense_net_user_nodes, *dense_net_person_nodes]

dense_counts = (len(dense_net_song_nodes), len(dense_net_user_nodes), len(dense_net_person_nodes))
print('dense net nodes: %d songs, %d users, %d persons.' % dense_counts)
print('total number dense net nodes: %d.' % len(dense_net_nodes))

In [None]:
# construct dense subnetwork
# build the subnetwork --> find the edges
# An edge (node1, node2) is kept only when both endpoints are subnet nodes.
# NOTE(review): a node's type is inferred from which lookup dict contains its id;
# this assumes song, user and person ids never collide -- confirm.
print('finding the edges...')
dense_net_edge = [] # a list of pairs (node1, node2)
connect = [] # a list of nodes that the node1 should connect to
# set copy of the node list so each membership test below is O(1) instead of a list scan
dense_net_node_set = set(dense_net_nodes)
for i in tqdm(dense_net_nodes): # (node1, node2) and (node2, node1) both exist
    if i in unserialized_song_user or i in unserialized_song_person:
        # Song node: neighbours are its users followed by its persons.
        # Build a NEW list; calling .extend() on the dict's own list would grow the
        # loaded data in place and duplicate neighbours on every later use or re-run.
        connect = list(unserialized_song_user.get(i, [])) + list(unserialized_song_person.get(i, []))
    elif i in unserialized_user_song:
        connect = unserialized_user_song[i]
    elif i in unserialized_person_song:
        connect = unserialized_person_song[i]
    else:
        print('Error: key error!')
        # skip the node; otherwise it would inherit the previous node's neighbours
        continue
    new_edges = [(i, j) for j in connect if j in dense_net_node_set]
    dense_net_edge.extend(new_edges)
    # Progress output now tracks this cell's own edge list (it previously read
    # sparse_net_edge) and uses the same condition as the other edge-building cells.
    if len(dense_net_edge) % 100 == 0 and len(dense_net_edge) != 0:
        print('num edges: ', len(dense_net_edge))

## Construct Entire Network

In [None]:
# Disabled (kept as a bare string literal) because building the entire network takes too long.
# NOTE(review): the disabled loop assigns a dict value to `connect` and then calls
# connect.extend(...), which would grow the loaded song->user lists in place; copy the
# list before extending if this cell is ever re-enabled.
'''
# construct entire network
# find the nodes
print('finding the nodes...')
entire_net_song_nodes = [node_holder[0] for node_holder in song_degree]
entire_net_user_nodes = [node_holder[0] for node_holder in user_degree]
entire_net_person_nodes = [node_holder[0] for node_holder in person_degree]
entire_net_nodes = entire_net_song_nodes + entire_net_user_nodes + entire_net_person_nodes

# build the subnetwork --> find the edges
print('finding the edges...')
entire_net_edge = [] # a list of pairs (node1, node2)
connect = [] # a list of nodes that the node1 should connect to
for i in entire_net_nodes: # (node1, node2) and (node2, node1) both exist
    if i in unserialized_song_user:
        connect = unserialized_song_user[i]
        if i in unserialized_song_person:
            connect.extend(unserialized_song_person[i])
    elif i in unserialized_song_person:
        connect = unserialized_song_person[i]
        if i in unserialized_song_user:
            connect.extend(unserialized_song_user[i])
    elif i in unserialized_user_song:
        connect = unserialized_user_song[i]
    elif i in unserialized_person_song:
        connect = unserialized_person_song[i]
    else:
        print('Error: key error!')
    new_edges = [(i, j) for j in connect]
    entire_net_edge.extend(new_edges)
'''

## Compute Statistics & Density Heuristic

In [None]:
# density heuristic: number of edge pairs divided by
# (number of songs) * (number of users + number of persons)
sparse_net_partner_count = len(sparse_net_user_nodes) + len(sparse_net_person_nodes)
sparse_net_pair_count = len(sparse_net_song_nodes) * sparse_net_partner_count
sparse_net_density = len(sparse_net_edge) / sparse_net_pair_count
print('sparse subnet density heuristic: ', sparse_net_density)

# node and edge totals
print('sparse subnet has %d nodes' % (len(sparse_net_nodes),))
print('sparse subnet has %d edges' % (len(sparse_net_edge),))

### Results:
sparse subnet density heuristic:  1.8770173365290796e-06

sparse subnet has 285475 nodes

sparse subnet has 24068 edges

In [None]:
# density heuristic: number of edge pairs divided by
# (number of songs) * (number of users + number of persons)
rs_sparse_net_partner_count = len(rs_sparse_net_user_nodes) + len(rs_sparse_net_person_nodes)
rs_sparse_net_pair_count = len(rs_sparse_net_song_nodes) * rs_sparse_net_partner_count
rs_sparse_net_density = len(rs_sparse_net_edge) / rs_sparse_net_pair_count
print('(randomly sampled) sparse subnet density heuristic: ', rs_sparse_net_density)

# node and edge totals
print('(randomly sampled) sparse subnet has %d nodes' % (len(rs_sparse_net_nodes),))
print('(randomly sampled) sparse subnet has %d edges' % (len(rs_sparse_net_edge),))

### Results:
rs sparse subnet density heuristic:  1.7277363754161006e-05

rs sparse subnet has 28546 nodes

rs sparse subnet has 2215 edges

In [None]:
# density heuristic: number of edge pairs divided by
# (number of songs) * (number of users + number of persons)
dense_net_partner_count = len(dense_net_user_nodes) + len(dense_net_person_nodes)
dense_net_pair_count = len(dense_net_song_nodes) * dense_net_partner_count
dense_net_density = len(dense_net_edge) / dense_net_pair_count
print('dense subnet density heuristic: ', dense_net_density)

# node and edge totals
print('dense subnet has %d nodes' % (len(dense_net_nodes),))
print('dense subnet has %d edges' % (len(dense_net_edge),))

In [None]:
# Disabled (kept as a bare string literal): the statistics below read the entire_net_*
# variables, which in this notebook are only assigned inside the disabled
# entire-network construction code.
# NOTE(review): the last print inside the string reports len(entire_net_edge) but its
# label says "nodes".
'''
# compute the density heuristic 
entire_net_density = len(entire_net_edge) / (len(entire_net_song_nodes) * \
                                                                (len(entire_net_user_nodes) + len(entire_net_person_nodes)))
print('entire subnet density heuristic: ', entire_net_density)

# number of nodes
print('entire subnet has %d nodes' % len(entire_net_nodes))

# number of edges 
print('entire subnet has %d nodes' % len(entire_net_edge))
'''

## Export Subnetworks

In [None]:
# Save the sparse subnet's node ids to sparse_net_nodes.txt.
# The file holds a pickled numpy array (binary), despite the .txt extension.
sparse_net_nodes_np = np.array(sparse_net_nodes)
with open('sparse_net_nodes.txt', 'wb') as nodes_file:
    pickle.dump(sparse_net_nodes_np, nodes_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# sparse_net_edges.dict
# key: source node, value: list of ALL target nodes of that source
# note that node1 -> node2 and node2 -> node1 are both present in this dictionary
sparse_net_edges = sparse_net_edge #TODO: fix the naming after running...

# Group the targets per source. A plain {source: target} comprehension keeps only the
# LAST target of each source, silently dropping every other edge of that node.
# NOTE(review): the saved values are now lists instead of single node ids; update any
# reader that expected one target per key.
sparse_net_edges_dict = {}
for source, target in sparse_net_edges:
    sparse_net_edges_dict.setdefault(source, []).append(target)

with open('sparse_net_edges.dict', 'wb') as handle:
    pickle.dump(sparse_net_edges_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#TODO: export dense subnetwork

## Test Loading The Saved Subnets

In [None]:
# Reload the saved node array to check that the export round-trips.
with open('sparse_net_nodes.txt', 'rb') as handle:
    unpickled_sparse_net_nodes = pickle.load(handle)
# Print the first entries explicitly: a bare expression is only displayed when it is
# the last statement of the cell, so the preview was previously never shown.
print(unpickled_sparse_net_nodes[:5])
len(unpickled_sparse_net_nodes)

In [None]:
# Reload the saved edge dictionary and show how many keys (source nodes) it contains.
with open('sparse_net_edges.dict', 'rb') as edges_file:
    unpickled_sparse_net_edges = pickle.load(edges_file)
len(unpickled_sparse_net_edges)

In [None]:
# TODO: dense subnetwork