In [5]:
"""
cluster.py
"""
import networkx as nx
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import numpy as np
from numpy.linalg import eigh

def get_users(filename):
    with open(filename,'rb') as f:
        users = pickle.loads(f.read())
    return users

def follower_count(users):
    c = Counter()
    for user in users:
        c.update(user['followers'])
    return c

def following_count(users):
    c = Counter()
    for user in users:
        c.update(user['following'])
    return c

def create_graph(users):
    """Create edge between following and followers
    Only add edge to followers who follow more than 2 seed users
    and following that is followed by more than 3 seed users
    """
    graph = nx.Graph()
    c = follower_count(users)
    c1 = following_count(users)
    for user in users:
        graph.add_node(user['id'])
        for friend in c1.items():
            if friend[1] >= 3:
                if friend[0] in user['following']:
                    graph.add_edge(int(user['id']),friend[0])
        for follower in c.items():
            if follower[1] >= 2:
                if follower[0] in user['followers']:
                    graph.add_edge(int(user['id']),follower[0])
    return graph

def draw_network(graph,users,filename):
    plt.figure(figsize=(10,10))
    nx.draw_networkx(graph,with_labels = False, node_size = 100, alpha = 0.3)
    plt.savefig(filename)

def volume(nodes,graph):
    edges = []
    for node in nodes:
        for neighbor in graph.neighbors(node):
            edges.append(tuple(sorted([neighbor,node])))
    edges = list(set(edges))
    return len(edges)

def cut(S,T,graph):
    count = 0
    edges = graph.edges()
    for node1, node2 in edges:
        if (node1 in S and node2 in T) or (node2 in S and node1 in T):
            count+=1
    return count

def norm_cut(S,T,graph):
    return (cut(S,T,graph)/(volume(S,graph))) + (cut(S,T,graph)/volume(T,graph))

def adjacency_matrix(graph):
    return nx.adjacency_matrix(graph,sorted(graph.nodes()))

def degree_matrix(graph):
    degrees = graph.degree().items()
    #Sort to be in the same order as adjacency_matrix
    degrees = sorted(degrees, key = lambda x: x[0])
    degrees = [d[1] for d in degrees]
    return np.diag(degrees)

def laplacian_matrix(graph):
    return degree_matrix(graph) - adjacency_matrix(graph)

def get_eigen(laplacian):
    eig_vals, eig_vectors = eigh(laplacian)
    return np.round(eig_vals,2), np.round(eig_vectors,2)

def cluster(eig_vectors, nodes):
    #eig_vectors = np.sum(eig_vectors, axis = 1)
    nodes = sorted(nodes)
    first_comp = []
    second_comp = []
    components = []
    """for i in range(len(nodes)):
        if eig_vectors[i][0] >= 0:
            first_comp.append(nodes[i])
        elif eig_vectors[i][0] < 0:
            second_comp.append(nodes[i])"""
    first_comp = [nodes[0]]
    for i in range(1,len(nodes)):
        second_comp.append(nodes[i])
    components.append(first_comp)
    components.append(second_comp)
    return components

def main():
    users = get_users('users.txt')
    print('Getting 4 seed users:'+" "+', '.join([user['screen_name'] for user in users]))
    print('Creating graph...')
    graph = create_graph(users)
    print('Saving graph to file...')
    draw_network(graph,users,'network1.png')
    print('Graph has {} nodes and {} edges'.format(len(graph.nodes()),len(graph.edges())))
    matrix = laplacian_matrix(graph)
    eig_vals, eig_vectors = get_eigen(matrix)
    print('\nClustering graph into 2 components...')
    components = cluster(eig_vectors,graph.nodes())
    print('\tCluster 1 has {} nodes'.format(len(components[0])))
    print('\tCluster 2 has {} nodes'.format(len(components[1])))
    print('\nNormalized cut value for this partition is: {}'.format(norm_cut(components[0],components[1],graph)))

if __name__== '__main__':
    main()

Getting 4 seed users: akstanwyck, NikkiFinke, slashfilm, ErikDavis
Creating graph...
Saving graph to file...
Graph has 1290 nodes and 3042 edges

Clustering graph into 2 components...
	Cluster 1 has 1 nodes
	Cluster 2 has 1289 nodes

Normalized cut value for this partition is: 1.0013149243918475


In [6]:
graph = nx.Graph()
graph.add_edges_from([('A', 'B'), ('A', 'C'), ('B', 'C'), ('B', 'D'), ('D', 'E'), ('D', 'F'), ('D', 'G'), ('E', 'F'), ('G', 'F'),('C','G'),('H','C')])


In [9]:
components = [['H','A','B','C'],['D','E','F','G']]
print(norm_cut(components[0],components[1],graph))

0.6190476190476191
