# Import

In [None]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import time
from scipy import optimize
import pandas as pd
plt.rcParams.update({'font.size': 17})

In [None]:
import dsd

# Gemsec deezer

In [None]:
# Load the HR friendship edge list into a graph and drop self-loops, so that
# m only counts edges between distinct nodes.
G = nx.from_pandas_edgelist(
    pd.read_csv('data/deezer_clean_data/HR_edges.csv'),
    source="node_1",
    target='node_2',
)
G.remove_edges_from(nx.selfloop_edges(G))
n, m = G.number_of_nodes(), G.number_of_edges()

# Relabel nodes to consecutive ids 0..n-1 in the graph's iteration order;
# nodemap keeps the original label -> new id mapping for later lookups.
nodemap = {label: new_id for new_id, label in enumerate(G.nodes())}
G = nx.relabel_nodes(G, nodemap)
print(n, m)

## Node features

In [None]:
import json

# Per-user genre tags. The cells below iterate this as a mapping whose keys
# convert to int node labels and whose values are iterables of genre names.
# Use a context manager so the file handle is closed deterministically.
with open('data/deezer_clean_data/HR_genres.json', 'r') as genres_file:
    genres = json.load(genres_file)

In [None]:
# Collect the distinct genre names that occur across all users.
all_gens = set()
for user_genres in genres.values():
    all_gens.update(user_genres)
len(all_gens)

In [None]:
# Give every genre a feature-column id.
# Iterating a set of strings directly is not stable across kernel restarts
# (string hashing is randomised per process), which made the ids -- and every
# hard-coded genre index used in the query cells -- change from run to run.
# Sorting first makes the assignment deterministic.
# NOTE(review): the hard-coded indices in the query cells were chosen against
# some earlier, unsorted assignment; re-check them against genres_id_map.
genres_id_map = {name: gid for gid, name in enumerate(sorted(all_gens))}
id_genres_map = {gid: name for name, gid in genres_id_map.items()}

# node_feature[v, g] is 1.0 when relabelled node v lists genre g, else 0.0.
node_feature = np.zeros((n, len(all_gens)))
for user, tags in genres.items():
    row = nodemap[int(user)]  # genre keys are original node labels
    for tag in tags:
        node_feature[row, genres_id_map[tag]] = 1.0


In [None]:
# Number of users per genre, most popular genre first, on log-log axes.
plt.plot(np.sort(node_feature.sum(axis=0))[::-1])
plt.loglog()
plt.show()

In [None]:
# Column ids of the six genres with the highest user counts (ascending by count).
node_feature.sum(axis=0).argsort()[-6:]

In [None]:
# Display the genre name -> feature-column id mapping (presumably consulted to
# pick the hard-coded indices used when building the query vectors below).
genres_id_map

In [None]:
# Query vector over genres: +1 on columns 0 and 1, -1 on column 5, 0 elsewhere.
query = np.zeros(len(all_gens))
query[[0, 1]] = 1.0
query[5] = -1.0

# Node weight = dot product of each node's genre indicator row with the query.
nw = (node_feature @ query[:, np.newaxis]).ravel()

#### average node weight of the densest subgraph

In [None]:
# The original cell read `S`, which is only assigned further down (naive
# baseline), so it raised NameError on a fresh kernel. Compute the densest
# subgraph of the whole graph here, the same way the second experiment does,
# and report the average node weight over it.
densest_S, opt_d = dsd.exact_densest(G)
np.average(nw[densest_S])

In [None]:
# Write the solver input: a header line "n m", then one integer node weight per
# line (in node-id order), then one "u v 1" line per edge.
# The context manager guarantees the file is flushed and closed even if a write
# fails part-way through.
with open('deezer_HR_1.txt', 'w') as f:
    f.write(str(n) + " " + str(m) + "\n")
    for weight in nw:
        # int() truncates; the weights are sums of integer query entries here.
        f.write(str(int(weight)) + '\n')
    for u, v in G.edges():
        f.write(str(u) + " " + str(v) + ' 1\n')

### naive baseline

In [None]:
# Naive baseline candidates: ids of nodes whose weight exceeds 0.3.
S = np.flatnonzero(nw > 0.3)

S

In [None]:
# Naive baseline: run the exact densest-subgraph routine on the subgraph induced
# by the thresholded node set S.
# NOTE(review): dsd.exact_densest presumably returns (node list, density) --
# confirm against the dsd module.
densest_S, opt_d = dsd.exact_densest(G.subgraph(S))
# Show both results as the cell output.
densest_S, opt_d

### run c++ max-flow based solver

parameters are: guess accuracy, lambda accuracy, theta, density_upper_bound, input and output file paths.

In [None]:
%%bash
# Max-flow based solver. Arguments in order (per the note above this cell):
# guess accuracy=100, lambda accuracy=100, theta=0.3, density upper bound=100,
# input file, output file.
# NOTE(review): assumes output/deezer/ already exists -- confirm.
./code-greedy++/exactweighted 100 100 0.3 100 deezer_HR_1.txt output/deezer/deezer_HR_1.output

In [None]:
# Read the node ids selected by the max-flow solver (one integer per line).
# The context manager closes the file, which the original never did.
with open('output/deezer/deezer_HR_1.output', 'r') as f:
    S = [int(line.strip()) for line in f]

sub = G.subgraph(S)
# Smallest component first, so the largest one is drawn last.
ccs = sorted(nx.connected_components(sub), key=len)
cmap = plt.cm.coolwarm
for cc in ccs:
    component = G.subgraph(cc)
    # Fix the node order once and derive the colours from it. Building the
    # colours from the set `cc` while drawing in the subgraph's own node order
    # does not guarantee that colour k belongs to node k.
    nodelist = list(component.nodes())
    color = [nw[v] for v in nodelist]

    # Seeded layout so the figure is reproducible across runs.
    pos = nx.spring_layout(component, seed=0)
    nx.draw_networkx(component, pos, nodelist=nodelist, with_labels=False, node_color=color, node_size=20, cmap=cmap, vmin=-1, vmax=2, width=0.3)
    # Stand-alone mappable that only exists to feed the colorbar.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=-1, vmax=2))
    sm.set_array([])
    # A free-standing ScalarMappable has no axes of its own; name the axes
    # explicitly, since recent Matplotlib versions warn or raise otherwise.
    plt.colorbar(sm, ax=plt.gca())
    plt.show()

### run c++ greedy peeling based solver

Parameters are: theta, edge weight multiplier, d2 (node weight multiplier) upper bound, input and output file paths.

In [None]:
%%bash
# Greedy peeling solver. Arguments in order (per the note above this cell):
# theta=0.3, edge weight multiplier=10, d2 upper bound=1000, input file,
# output file.
./code-greedy++/ip 0.3 10 1000 deezer_HR_1.txt output/deezer/deezer_HR_1.peel

In [None]:
# Node ids selected by the greedy peeling solver, one integer per line.
# The context manager closes the file, which the original never did.
with open('output/deezer/deezer_HR_1.peel', 'r') as f:
    S = [int(line.strip()) for line in f]
    

### While the resulting subgraph is dense (close to the densest) and aligns well with the query, it is disconnected, and the average node feature of its only large connected component (CC) is not close to the query.

In [None]:

sub = G.subgraph(S)
# Smallest component first, so the largest one is drawn last.
ccs = sorted(nx.connected_components(sub), key=len)
cmap = plt.cm.coolwarm
for cc in ccs:
    component = G.subgraph(cc)
    # Fix the node order once and derive the colours from it. Building the
    # colours from the set `cc` while drawing in the subgraph's own node order
    # does not guarantee that colour k belongs to node k.
    nodelist = list(component.nodes())
    color = [nw[v] for v in nodelist]

    # Seeded layout so the figure is reproducible across runs.
    pos = nx.spring_layout(component, seed=0)
    nx.draw_networkx(component, pos, nodelist=nodelist, with_labels=False, node_color=color, node_size=20, cmap=cmap, vmin=-1, vmax=2, width=0.3)
    # Stand-alone mappable that only exists to feed the colorbar.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=-1, vmax=2))
    sm.set_array([])
    # A free-standing ScalarMappable has no axes of its own; name the axes
    # explicitly, since recent Matplotlib versions warn or raise otherwise.
    plt.colorbar(sm, ax=plt.gca())
    plt.show()


## contrast popular types, query = [pop+, dance-, rock-, Reggae-]

In [None]:
# Contrast query: +1 on column 64, -1 on columns 83, 78 and 39, 0 elsewhere.
query = np.zeros(len(all_gens))
query[[83, 78, 39]] = -1.
query[64] = 1.
# Node weight = dot product of each node's genre indicator row with the query.
nw = (node_feature @ query[:, np.newaxis]).ravel()

In [None]:
# Colour-scale limits shared by the figures below, then the distinct weights.
vmin, vmax = nw.min(), nw.max()
np.unique(nw)

#### average node weight of the densest subgraph

In [None]:

# Run the exact densest-subgraph routine on the whole graph, ignoring node weights.
# NOTE(review): dsd.exact_densest presumably returns (node list, density) -- confirm.
densest_S, opt_d = dsd.exact_densest(G)

In [None]:
# Display the second value returned by dsd.exact_densest and the number of selected nodes.
opt_d, len(densest_S)

In [None]:
# Mean node weight over the densest subgraph's nodes.
nw[densest_S].mean()

In [None]:

sub = G.subgraph(densest_S)
# Smallest component first. Every iteration overwrites the same PNG, so the
# saved file ends up showing the last (largest) component.
ccs = sorted(nx.connected_components(sub), key=len)
cmap = plt.cm.coolwarm
for cc in ccs:
    component = G.subgraph(cc)
    # Fix the node order once and derive the colours from it. Building the
    # colours from the set `cc` while drawing in the subgraph's own node order
    # does not guarantee that colour k belongs to node k.
    nodelist = list(component.nodes())
    color = [nw[v] for v in nodelist]

    # Seeded layout so the saved figure is reproducible across runs.
    pos = nx.spring_layout(component, seed=0)
    nx.draw_networkx(component, pos, nodelist=nodelist, with_labels=False, node_color=color, node_size=20, cmap=cmap, vmin=vmin, vmax=vmax, width=0.3)
    # Stand-alone mappable that only exists to feed the colorbar.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
    sm.set_array([])
    # A free-standing ScalarMappable has no axes of its own; name the axes
    # explicitly, since recent Matplotlib versions warn or raise otherwise.
    plt.colorbar(sm, ax=plt.gca())
    plt.tight_layout()
    plt.savefig('fig/densest_deezer.png')
    plt.show()


In [None]:
# Write the solver input for the contrast query: a header line "n m", then one
# integer node weight per line (in node-id order), then one "u v 1" line per edge.
# The context manager guarantees the file is flushed and closed even if a write
# fails part-way through.
with open('data/deezer_clean_data/deezer_HR_2.txt', 'w') as f:
    f.write(str(n) + " " + str(m) + "\n")
    for weight in nw:
        # int() truncates; the weights are sums of integer query entries here.
        f.write(str(int(weight)) + '\n')
    for u, v in G.edges():
        f.write(str(u) + " " + str(v) + ' 1\n')

### naive baseline

In [None]:
# Naive baseline candidates: ids of nodes whose weight exceeds 0.5.
S = np.flatnonzero(nw > 0.5)
len(S)

In [None]:
# Naive baseline: exact densest-subgraph routine on the subgraph induced by the
# thresholded node set S. Note that this overwrites opt_d from the full-graph run.
densest_AFS, opt_d = dsd.exact_densest(G.subgraph(S))
# Display: number of selected nodes, opt_d, and their mean node weight.
len(densest_AFS), opt_d, np.average(nw[densest_AFS])

In [None]:

sub = G.subgraph(densest_AFS)
# Smallest component first. Every iteration overwrites the same PNG, so the
# saved file ends up showing the last (largest) component.
ccs = sorted(nx.connected_components(sub), key=len)
cmap = plt.cm.coolwarm
for cc in ccs:
    component = G.subgraph(cc)
    # Fix the node order once and derive the colours from it. Building the
    # colours from the set `cc` while drawing in the subgraph's own node order
    # does not guarantee that colour k belongs to node k.
    nodelist = list(component.nodes())
    color = [nw[v] for v in nodelist]

    # Seeded layout so the saved figure is reproducible across runs.
    pos = nx.spring_layout(component, seed=0)
    # No colorbar on this figure, so no ScalarMappable is needed.
    nx.draw_networkx(component, pos, nodelist=nodelist, with_labels=False, node_color=color, node_size=20, cmap=cmap, vmin=vmin, vmax=vmax, width=0.3)
    plt.tight_layout()
    plt.savefig('fig/AF_deezer.png')
    plt.show()


### run c++ max-flow based solver

parameters are: guess accuracy, lambda accuracy, theta, density_upper_bound, input and output file paths.

In [None]:
%%bash
# Max-flow based solver. Arguments in order (per the note above this cell):
# guess accuracy=100, lambda accuracy=100, theta=0.5, density upper bound=100,
# input file, output file.
# NOTE(review): other solver cells in this notebook invoke
# ./code-greedy++/exactweighted -- confirm that ./src/ is the intended location.
./src/exactweighted 100 100 0.5 100 data/deezer_clean_data/deezer_HR_2.txt output/deezer/deezer_HR_2.flow

In [None]:
# Node ids selected by the max-flow solver, one integer per line.
# The context manager closes the file, which the original never did.
with open('output/deezer/deezer_HR_2.flow', 'r') as f:
    S = [int(line.strip()) for line in f]
    

In [None]:
# Number of nodes read from the solver output.
len(S)

In [None]:

sub = G.subgraph(S)
# Smallest component first. Every iteration overwrites the same PNG, so the
# saved file ends up showing the last (largest) component.
ccs = sorted(nx.connected_components(sub), key=len)
cmap = plt.cm.coolwarm
for cc in ccs:
    component = G.subgraph(cc)
    # Fix the node order once and derive the colours from it. Building the
    # colours from the set `cc` while drawing in the subgraph's own node order
    # does not guarantee that colour k belongs to node k.
    nodelist = list(component.nodes())
    color = [nw[v] for v in nodelist]

    # Seeded layout so the saved figure is reproducible across runs.
    pos = nx.spring_layout(component, seed=0)
    # No colorbar on this figure, so no ScalarMappable is needed.
    nx.draw_networkx(component, pos, nodelist=nodelist, with_labels=False, node_color=color, node_size=20, cmap=cmap, vmin=vmin, vmax=vmax, width=0.3)
    plt.tight_layout()
    plt.savefig('fig/alg1_deezer.png')
    plt.show()


### run c++ greedy peeling based solver

Parameters are: theta, edge weight multiplier, d2 (node weight multiplier) upper bound, input and output file paths.

In [None]:
%%bash
# Greedy peeling solver. Arguments in order (per the note above this cell):
# theta=0.5, edge weight multiplier=10, d2 upper bound=1000, input file,
# output file.
# NOTE(review): other solver cells in this notebook invoke ./code-greedy++/ip --
# confirm that ./src/ is the intended location.
./src/ip 0.5 10 1000 data/deezer_clean_data/deezer_HR_2.txt output/deezer/deezer_HR_2.peel

In [None]:
# Node ids selected by the greedy peeling solver, one integer per line.
# The context manager closes the file, which the original never did.
with open('output/deezer/deezer_HR_2.peel', 'r') as f:
    S = [int(line.strip()) for line in f]
    

In [None]:
# Summary of the peeling result: mean node weight, edges per node, node count.
sub = G.subgraph(S)
(
    nw[S].mean(),
    sub.number_of_edges() / sub.number_of_nodes(),
    len(S),
)

In [None]:

sub = G.subgraph(S)
# Smallest component first. Every iteration overwrites the same PNG, so the
# saved file ends up showing the last (largest) component.
ccs = sorted(nx.connected_components(sub), key=len)
cmap = plt.cm.coolwarm
for cc in ccs:
    component = G.subgraph(cc)
    # Fix the node order once and derive the colours from it. Building the
    # colours from the set `cc` while drawing in the subgraph's own node order
    # does not guarantee that colour k belongs to node k.
    nodelist = list(component.nodes())
    color = [nw[v] for v in nodelist]

    # Seeded layout so the saved figure is reproducible across runs.
    pos = nx.spring_layout(component, seed=0)
    # No colorbar on this figure, so no ScalarMappable is needed.
    nx.draw_networkx(component, pos, nodelist=nodelist, with_labels=False, node_color=color, node_size=20, cmap=cmap, vmin=vmin, vmax=vmax, width=0.3)
    plt.tight_layout()
    plt.savefig('fig/alg2_deezer.png')
    plt.show()


## tune the "dislikeness" of rock fans

### Spectrum of theta

In [None]:
%%bash
# Sweep theta with the max-flow solver; the k-th value (k = 1..10) writes
# output/deezer/deezer_HR_2_idea1_<k>.flow.
run=0
for theta in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 0.99; do
    run=$((run + 1))
    ./code-greedy++/exactweighted 100 100 "$theta" 100 data/deezer_clean_data/deezer_HR_2.txt "output/deezer/deezer_HR_2_idea1_${run}.flow"
done

In [None]:
# Theta behind each sweep output deezer_HR_2_idea1_<k>.flow.
# Runs 1..10 come from the solver sweep cell (the last one uses theta = 0.99,
# not 1.0). Run 0 (theta = 0) is not produced by any cell in this notebook, so
# it is plotted only when its file already exists on disk; previously its
# absence made this cell fail on a fresh run.
theta_by_run = {k: round(0.1 * k, 1) for k in range(0, 10)}
theta_by_run[10] = 0.99

avg_nw, density, x = [], [], []
# Per-genre share of fans inside each solution, keyed by genre column id.
fan_nums = {39: [], 64: [], 78: [], 83: []}
for run, theta in theta_by_run.items():
    fname = f'output/deezer/deezer_HR_2_idea1_{run}.flow'
    try:
        # Context manager: the original leaked one file handle per iteration.
        with open(fname, 'r') as f:
            S = [int(line.strip()) for line in f]
    except FileNotFoundError:
        print(f'skipping theta={theta}: {fname} not found')
        continue

    sub = G.subgraph(S)
    x.append(theta)
    avg_nw.append(np.average(nw[S]))
    density.append(sub.number_of_edges() / sub.number_of_nodes())
    # Fraction of solution nodes tagged with each genre, computed once per run.
    genre_share = np.sum(node_feature[S], axis=0) / len(S)
    for j in fan_nums:
        fan_nums[j].append(genre_share[j])

fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel(r'$\theta$')
ax1.set_ylabel('c(S)', color=color)
ax1.plot(x, avg_nw, label='c(S)', marker='d', color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('d(S)', color=color)  # we already handled the x-label with ax1
ax2.plot(x, density, label='d(S)', marker='s', color=color)
ax2.tick_params(axis='y', labelcolor=color)

# Ticks at every 0.1 step, labelled only at every second tick to avoid clutter.
tick_positions = [round(0.1 * i, 1) for i in range(11)]
tick_labels = [str(t) if i % 2 == 0 else "" for i, t in enumerate(tick_positions)]
plt.xticks(tick_positions, tick_labels)
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.savefig('fig/c_d_tradoff_deezer.png')
plt.show()

In [None]:
# One line per tracked genre: share of its fans in the solution, against theta.
for genre_id, shares in fan_nums.items():
    plt.plot(x, shares, marker='s', label=id_genres_map[genre_id])
plt.xlabel(r'$\theta$')
plt.ylabel('# fans in percentage')
plt.legend()
plt.tight_layout()
plt.savefig('fig/fan_percent_deezer.png')
plt.show()

In [None]:
# Write one solver input per penalty level j = 0..9, where j scales the negative
# weight on genre column 54 (columns 67 and 39 stay at +1 and -1).
# NOTE: this rebinds the notebook-level `query` and `nw` on every iteration.
for j in range(0, 10):
    query = np.zeros(len(all_gens))
    query[54] = -j
    query[67] = 1.
    query[39] = -1.
    nw = (node_feature @ query.reshape((-1, 1))).ravel()

    # Same file layout as the other solver inputs: "n m", node weights, edges.
    # The context manager closes the file even if a write fails.
    with open(f'data/deezer_clean_data/deezer_HR_2_idea2_{j}.txt', 'w') as f:
        f.write(str(n) + " " + str(m) + "\n")
        for weight in nw:
            f.write(str(int(weight)) + '\n')
        for u, v in G.edges():
            f.write(str(u) + " " + str(v) + ' 1\n')

In [None]:
%%bash
# Run the max-flow solver (theta = 0.5) on each of the ten idea2 inputs, 0..9.
for k in 0 1 2 3 4 5 6 7 8 9; do
    ./code-greedy++/exactweighted 100 100 0.5 100 "data/deezer_clean_data/deezer_HR_2_idea2_${k}.txt" "output/deezer/deezer_HR_2_idea2_${k}.flow"
done

In [None]:
avg_nw, density = [], []
# x-axis: the penalty level i, i.e. the magnitude of the weight on column 54.
x = list(range(10))
# Per-genre share of fans inside each solution, keyed by genre column id.
fan_nums = {39: [], 54: [], 67: []}
for i in range(0, 10):
    # Rebuild the node weights that were used to produce input file i.
    query = np.zeros(len(all_gens))
    query[54] = -i
    query[67] = 1.
    query[39] = -1.
    nw = (node_feature @ query.reshape((-1, 1))).ravel()

    fname = f'output/deezer/deezer_HR_2_idea2_{i}.flow'
    # Context manager: the original leaked one file handle per iteration.
    with open(fname, 'r') as f:
        S = [int(line.strip()) for line in f]

    sub = G.subgraph(S)
    avg_nw.append(np.average(nw[S]))
    density.append(sub.number_of_edges() / sub.number_of_nodes())
    # Fraction of solution nodes tagged with each genre, computed once per run.
    genre_share = np.sum(node_feature[S], axis=0) / len(S)
    for j in fan_nums:
        fan_nums[j].append(genre_share[j])

fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('query value')
ax1.set_ylabel('c(S)', color=color)
ax1.plot(x, avg_nw, label='c(S)', marker='d', color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('d(S)', color=color)  # we already handled the x-label with ax1
ax2.plot(x, density, label='d(S)', marker='s', color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

In [None]:
# One line per tracked genre: share of its fans in the solution, against the
# penalty level used in the query.
for genre_id, shares in fan_nums.items():
    plt.plot(x, shares, marker='s', label=id_genres_map[genre_id])
plt.xlabel('query value')
plt.ylabel('# fans in percentage')
plt.legend()
plt.show()