In [1]:
import pandas as pd
import pickle

### Step 1: Load the Spotify playlists dataset, stored as a pickled pandas DataFrame

In [2]:
# Load the pickled playlists object. The context manager closes the file
# handle as soon as loading finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open('playlists.pkl', 'rb') as playlists_file:
    data = pickle.load(playlists_file)

### Step 2: Choose the range of playlists to read and create edges between songs that appear in the same playlist (stored as a dictionary)

In [3]:
# Build a directed adjacency list: every song maps to the list of songs that
# follow it in at least one of the selected playlists (first-seen order, no
# duplicates).
song_dict = {}
# Companion lookup (song -> set of neighbours already recorded) so the
# duplicate check is a constant-time set lookup instead of a list scan.
seen_neighbors = {}
start = 100  # first row index to include
end = 200    # last row index to include (inclusive)
for row in data.itertuples(index = True):
    # row[0] is the dataframe index and row[3] the third column, which is
    # indexed and sliced below as the playlist's sequence of songs.
    # NOTE(review): the early `break` assumes the index is sorted ascending - confirm.
    if row[0] < start:
        continue
    if row[0] > end:
        break
    playlist_songs = row[3]
    for i, current_song in enumerate(playlist_songs):
        if current_song not in song_dict:
            song_dict[current_song] = []
            seen_neighbors[current_song] = set()
        neighbors = song_dict[current_song]
        already_linked = seen_neighbors[current_song]
        # Link the current song to every song that comes after it in this playlist.
        for song in playlist_songs[i+1:]:
            if song not in already_linked:
                already_linked.add(song)
                neighbors.append(song)

In [4]:
# Keys view over song_dict: every song that received an adjacency entry above.
songs = song_dict.keys()

### Step 3: Write the song network to a text file, with each line showing one connection: song i --> song j

In [5]:
# Write the directed song network as an edge list, one "song_i song_j" pair
# per line. The `with` block flushes and closes the file even if a write
# fails part-way through.
# NOTE(review): the two identifiers are joined by a single space, which assumes
# song identifiers contain no whitespace - confirm.
# NOTE(review): Step 4 reads "songnet(mini).txt" rather than this file name -
# confirm which file is intended.
with open('[test]songnet.txt', 'w') as out_file:
    for song_node in songs:
        for song_nb in song_dict[song_node]:
            out_file.write(song_node + ' ' + song_nb + '\n')

In [7]:
# Make sure the edge-list file is closed so all buffered lines reach disk.
out_file.close()

### Step 4: (Run separately, after text file has been created) Generating a graph for the current song network

In [1]:
import networkx as nx

# Parse the whitespace-separated edge list into a directed graph, then report
# how many nodes and edges it contains.
file_name = "songnet(mini).txt"
songs = nx.read_edgelist(file_name, create_using=nx.DiGraph())
node = songs.number_of_nodes()
edge = songs.number_of_edges()
print("No. of nodes are=", node)
print("No. of edges are=", edge)

No. of nodes are= 7876
No. of edges are= 1299529


In [2]:
import networkx as nx
import numpy as np
import matplotlib 
import scipy
import warnings
warnings.filterwarnings('ignore')
import time
import os
import community
import igraph
from community import community_louvain

In [3]:
def find_largest_component(generator):
    """Return the graph with the most nodes from an iterable of graphs.

    Parameters
    ----------
    generator : iterable
        Iterable (for example a generator of connected-component subgraphs)
        whose items expose a sized ``nodes`` attribute.

    Returns
    -------
    object
        The item with the largest ``len(item.nodes)``. On ties the first
        such item encountered is returned.

    Raises
    ------
    ValueError
        If ``generator`` yields no items.
    """
    largest_component = None
    largest_count = -1
    # Single pass: only the current best candidate is kept alive, so the
    # other components do not all have to be held in memory at once.
    for graph in generator:
        count = len(graph.nodes)
        if count > largest_count:
            largest_count = count
            largest_component = graph
    if largest_component is None:
        raise ValueError("find_largest_component() received no graphs")
    return largest_component

### Step 5: Determine the largest connected subgraph in the song network (the one containing the largest number of nodes)

In [4]:
# Use an undirected version of the graph so connectivity ignores edge direction.
songs_ud = songs.to_undirected()
# nx.connected_component_subgraphs() was removed in NetworkX 2.4. Build the
# component subgraphs explicitly instead; .copy() makes each one an
# independent graph, so attributes set on it later do not touch songs_ud.
songs_ud_components = (
    songs_ud.subgraph(component).copy()
    for component in nx.connected_components(songs_ud)
)
songs_largest_component = find_largest_component(songs_ud_components)
# Cell output: number of nodes in the largest component.
songs_largest_component.number_of_nodes()

4280

### Step 6: (Optional) Run community detection on the largest subgraph

In [5]:
# Run community detection (best_partition from the `community` package's
# community_louvain module) on the largest component. The result is used
# below via .items() as a node -> community mapping.
# NOTE(review): no seed is passed here, so the partition may differ between
# runs - confirm whether a fixed random state is wanted.
songs_community = community_louvain.best_partition(songs_largest_component)

In [6]:
import operator

# Order the (node, community) pairs by community id; the sort is stable, so
# nodes within one community keep their original relative order.
songs_sorted = sorted(
    songs_community.items(),
    key=operator.itemgetter(1),
)
# Second name bound to the same sorted list.
sorted_x = songs_sorted
# Same node -> community mapping, rebuilt from the community-sorted pairs.
songs_coms = dict(songs_sorted)

In [7]:
# Node -> degree mapping for the largest component, materialised as a plain dict.
songs_degrees = dict(songs_largest_component.degree())

In [9]:
# Attach per-node metadata to the largest component.
# "community" and "partitions" are filled from songs_coms and songs_community,
# which hold the same node -> community pairs.
songs_coms = dict(songs_sorted)
nx.set_node_attributes(
    songs_largest_component, values=songs_coms, name="community"
)
nx.set_node_attributes(
    songs_largest_component, values=songs_degrees, name="degrees"
)
nx.set_node_attributes(
    songs_largest_component, values=songs_community, name="partitions"
)

### Step 7: Write the largest subgraph (our reduced song network) to a text file

In [31]:
# Write every edge of the largest component as "node_a node_b", one per line.
# The `with` block flushes and closes the file even if a write fails, and
# iterating the edge view directly avoids building a throwaway dict of all edges.
# NOTE(review): the two node names are joined by a single space, which assumes
# they contain no whitespace - confirm.
with open('reduced_songnet.txt', 'w') as out_file:
    for source, target in songs_largest_component.edges:
        out_file.write(source + ' ' + target + '\n')

In [32]:
# Make sure the reduced edge-list file is closed so all buffered lines reach disk.
out_file.close()

In [24]:
# Save the largest component graph to a GML file.
nx.write_gml(songs_largest_component, "songs_coms_TOTAL.gml")