In [1]:
import networkx as nx
import matplotlib.pyplot as plt  
from datetime import datetime
%matplotlib inline

In [2]:
def check_data_file():
    """Repair the raw ``brand_followers.tsv`` file, once.

    If ``brand_followers.tsv`` exists in the current working directory, its
    contents are written to ``brand_followers_fixed.tsv`` with the fused token
    ``1114073faithhill`` split by a newline (``1114073`` / ``faithhill``), and
    the raw file is then deleted.

    If the raw file is absent the function does nothing, so calling it again
    after a successful run is a harmless no-op.

    Returns
    -------
    None
    """
    import os

    if not os.path.exists('brand_followers.tsv'):
        return

    # Read the whole input *before* opening the output: opening with "w"
    # truncates, so a failed read must not leave an empty fixed file behind.
    with open('brand_followers.tsv', 'r') as data_file:
        data = data_file.read()

    data = data.replace('1114073faithhill', '1114073\nfaithhill')

    # Context manager guarantees the handle is closed even if write() raises.
    with open('brand_followers_fixed.tsv', 'w') as out_file:
        out_file.write(data)

    # Only drop the raw file once the fixed copy has been written successfully.
    os.remove('brand_followers.tsv')

In [3]:
def load_data(max_row_count=1404):
    """Load the brand -> follower-id mapping from ``brand_followers_fixed.tsv``.

    Each non-blank row is split on whitespace: the first token is the brand
    name and every remaining token is converted to ``int`` and collected into
    a set. Blank or whitespace-only rows are skipped and do not count towards
    ``max_row_count``. If a brand appears on more than one row, the last row
    wins.

    A progress line is printed per row, and the elapsed time at the end.

    Parameters
    ----------
    max_row_count : int, default 1404
        Stop reading once this many data rows have been processed. A value
        that is never reached (for example ``0``) reads the whole file.

    Returns
    -------
    dict
        Maps each brand name (``str``) to a ``set`` of ``int`` follower ids.

    Raises
    ------
    ValueError
        If a follower token cannot be parsed as an integer.
    OSError
        If ``brand_followers_fixed.tsv`` cannot be opened.
    """
    start = datetime.now()
    brand_followers = {}
    row_id = 0
    with open('brand_followers_fixed.tsv') as data_file:
        for row in data_file:
            fields = row.split()
            if not fields:
                # Blank line: there is no brand token to take, so skip it
                # instead of failing on an empty list.
                continue
            row_id += 1
            brand = fields[0]
            follower_tokens = fields[1:]
            brand_followers[brand] = {int(token) for token in follower_tokens}

            # Reports the number of tokens on the row (duplicates included).
            print(row_id, 'Brand:', brand, 'Followers:', len(follower_tokens))
            if row_id == max_row_count:
                print('Reached limit!')
                break
    stop = datetime.now()
    print("Elapsed Time:", stop - start)
    return brand_followers

In [5]:
def create_weighted_graph(brand_followers):
    """Build an undirected brand graph weighted by follower overlap.

    Every key of ``brand_followers`` becomes a node. For each unordered pair
    of brands that share at least one follower, an edge is added whose
    ``weight`` attribute is the Jaccard similarity of the two follower sets:
    ``|A & B| / |A | B|``. Pairs with no common followers get no edge.

    The elapsed time is printed before returning.

    Parameters
    ----------
    brand_followers : dict
        Maps a brand name to its follower collection. The values must support
        ``.intersection()`` and ``len()`` (e.g. ``set``).

    Returns
    -------
    networkx.Graph
        Graph with one node per brand and weighted edges as described above.
    """
    from itertools import combinations

    start = datetime.now()
    G = nx.Graph()
    brands = list(brand_followers)
    G.add_nodes_from(brands)

    for first, second in combinations(brands, 2):
        followers_a = brand_followers[first]
        followers_b = brand_followers[second]
        common_connections = len(followers_a.intersection(followers_b))
        if common_connections == 0:
            # No overlap means weight 0, so no edge. Checking this before the
            # division also covers two empty follower sets, where the union
            # size is 0 and dividing would raise ZeroDivisionError.
            continue
        # |A| + |B| - |A & B| is the union size without materialising the union.
        total_connections = len(followers_a) + len(followers_b) - common_connections
        G.add_edge(first, second, weight=common_connections / total_connections)

    stop = datetime.now()
    print("Elapsed Time:", stop - start)
    return G