In [57]:
#!/usr/bin/env python

# convert gml to json and add basic network statistics

# TODO: fix relative file paths for file input and export
# TODO: move definitions of algorithms to README.md
# TODO: keep label and id from gml

# import libraries
# import argparse
import json
import networkx as nx
from networkx.readwrite import json_graph
from modularity_maximization import partition
from modularity_maximization.utils import get_modularity
from random import randint as rand

In [58]:
outputfile_format = 'json'
gmlfile = 'TheDataFox.gml'
outputfile = 'TheDataFox'

In [59]:
# def analyze_convert(gmlfile, outputfile,outputfile_format='json'):

"""
Converts GML file to json while adding statistics and community information
using modularity_maximization. JSON output is usable with D3 force layout 
and GEXF with sigmajs

# see: https://cambridge-intelligence.com/keylines-faqs-social-network-analysis/

"""

print(outputfile_format.upper(), 'output file selected')
print('\nReading GML file:', gmlfile)

di_graph = nx.read_gml('../data/raw/' + gmlfile, label='label')

print('Identifying communities...')
comm_dict = partition(di_graph)

print('\nModularity of such partition for network is %.3f' % \
      get_modularity(di_graph, comm_dict))

# adds partition/community number as attribute named 'Modularity Class'
print('\nAssigning Communities...')
for n, d in di_graph.nodes(data=True):
    d['mc'] = comm_dict[n]

# set positions of nodes
pos = nx.spring_layout(di_graph)

for node, (x, y) in pos.items():
    di_graph.node[node]['x'] = float(x)
    di_graph.node[node]['y'] = float(y)

JSON output file selected

Reading GML file: TheDataFox.gml
Identifying communities...
Calculating modularity for directed graph

Modularity of such partition for network is 0.492

Assigning Communities...


In [72]:
di_graph.node[2]


KeyError: 2

In [8]:
# betweeness centrality
bc = nx.betweenness_centrality(di_graph)
nx.set_node_attributes(di_graph, name='bc', values=bc)

"""
Definition: Betweenness centrality measures the number of times a node lies on the shortest path between other 
nodes.

What it tells us: This measure shows which nodes act as ‘bridges’ between nodes in a network. It does this by 
identifying all the shortest paths and then counting how many times each node falls on one.

When to use it: For finding the individuals who influence the flow around a system.

A bit more detail: Betweenness is useful for analyzing communication dynamics, but should be used with care. A high 
betweenness count could indicate someone holds authority over, or controls collaboration between, disparate 
clusters in a network; or indicate they are on the periphery of both clusters.

"""
# degree centrality
dc = nx.degree_centrality(di_graph)
nx.set_node_attributes(di_graph, name='dc', values=dc)

"""
Definition: Degree centrality assigns an importance score based purely on the number of links held by each node. 

What it tells us: How many direct, ‘one hop’ connections each node has to other nodes within the network.

When to use it: For finding very connected individuals, popular individuals, individuals who are likely to hold
most information or individuals who can quickly connect with the wider network.

A bit more detail: Degree centrality is the simplest measure of node connectivity. Sometimes it’s useful to look 
at in-degree (number of inbound links) and out-degree (number of outbound links) as distinct measures, 
for example  when looking at transactional data or account activity.
"""
idc = nx.in_degree_centrality(di_graph)
nx.set_node_attributes(di_graph, name='idc', values=idc)

odc = nx.out_degree_centrality(di_graph)
nx.set_node_attributes(di_graph, name='odc', values=odc)


# eignevector centrality
edc = nx.eigenvector_centrality(di_graph)
nx.set_node_attributes(di_graph, name='odc', values=edc)
"""
Definition: Like degree centrality, EigenCentrality measures a node’s influence based on the number of links it 
has to other nodes within the network. EigenCentrality then goes a step further by also taking into account how 
well connected a node is, and how many links their connections have, and so on through the network.

What it tells us: By calculating the extended connections of a node, EigenCentrality can identify nodes with 
influence over the whole network, not just those directly connected to it.

When to use it: EigenCentrality is a good ‘all-round’ SNA score, handy for understanding human social networks, 
but also for understanding networks like malware propagation.

A bit more detail: KeyLines calculates each node’s EigenCentrality by converging on an eigenvector using the power 
iteration method. Learn more.
"""
# closeness centrality
cc = nx.closeness_centrality(di_graph)
nx.set_node_attributes(di_graph, name='cc', values=cc)
"""
Definition: This measure scores each node based on their ‘closeness’ to all other nodes within the network.

What it tells us: This measure calculates the shortest paths between all nodes, then assigns each node a score 
based 
on its sum of shortest paths.

When to use it: For finding the individuals who are best placed to influence the entire network most quickly.

A bit more detail: Closeness centrality can help find good ‘broadcasters’, but in a highly connected network 
you will often find all nodes have a similar score. What may be more useful is using Closeness to find influencers 
within a single cluster.

"""

# page rank
pr = nx.pagerank(di_graph)
nx.set_node_attributes(di_graph, name='pr', values=pr)

"""
Definition: PageRank is a variant of EigenCentrality, also assigning nodes a score based on their connections,
and their connections’ connections. The difference is that PageRank also takes link direction and weight into 
account – so links can only pass influence in one direction, and pass different amounts of influence.

What it tells us: This measure uncovers nodes whose influence extends beyond their direct connections into the 
wider network.

When to use it: Because it factors in directionality and connection weight, PageRank can be helpful for 
understanding citations and authority.

"""

'\nDefinition: PageRank is a variant of EigenCentrality, also assigning nodes a score based on their connections,\nand their connections’ connections. The difference is that PageRank also takes link direction and weight into \naccount – so links can only pass influence in one direction, and pass different amounts of influence.\n\nWhat it tells us: This measure uncovers nodes whose influence extends beyond their direct connections into the \nwider network.\n\nWhen to use it: Because it factors in directionality and connection weight, PageRank can be helpful for \nunderstanding citations and authority.\n\n'

In [9]:
# giant component filter
# giant = max(nx.connected_component_subgraphs(G), key=len)

di_graph['links'] = [
        {
            'source': di_graph['nodes'][link['source']]['id'],
            'target': di_graph['nodes'][link['target']]['id'],
            'id': link['id'], 'size': link['size'], 'color':'#bcdbf6'
        }
        for link in di_graph['links']]

KeyError: 'links'

In [37]:
di_graph = str(json.dumps(di_graph))

if outputfile_format.upper() == 'JSON':

    print('\nExporting JSON file..')

    # create a dictionary in a node-link format that is suitable for JSON serialization
    with open('../../../data/processed/' + outputfile + '.json', 'w') as outfile1:
        outfile1.write(str(json.dumps(json_graph.node_link_data(G=di_graph, attrs={'link':'edges', 'name':'label',
                                                                               'source':'source', 'target':'target'}))))
    print('Complete!')

elif outputfile_format.upper() == 'GEXF':
    print('\nExporting GEXF file..')
    nx.write_gexf(di_graph, '../../../data/processed/' + outputfile + '.gexf')
    print('\nComplete!')

else: print('Please enter a valid output file format: JSON or GEXF')

In [36]:
analyze_convert('TheDataFox.gml', 'TheDataFox', outputfile_format='json')

In [26]:
import json
import networkx as nx
from networkx.readwrite import json_graph
from random import randint as rand
 
def graphing(pair, node, word):
    word = word.lower()
 
    # Creates a primary word list and strips out the key word
    primary_list = [k.replace(" " + word, '') for k, v in pair.items() if v[1] == 'P']
 
    primary_list_test = [k.replace(" " + word, '') for k, v in pair.items()]
 
    # Creates a tertiary word list but retains the
    tertiary_list = [k.split() for k, v in pair.items() if v[1] == 'T']
 
    # Adds in words to the primary list
    for t in tertiary_list:
        for n in t:
            if n not in primary_list:
                primary_list.append(n)
 
    # Index used for providing an ID to the edge
    num = 1
    # creates a new NetworkX graph
    NG = nx.Graph()                                             
 
    for k, v in pair.items():                                   
        # Split the key into two words
        p = k.split(" ", 1)
        # Assign the weight variable
        w = v[0]/2
        # Add an edge with nodes, id, weight and color atributes
        NG.add_edge(p[0], p[1], id=num, weight=w, color='#bcdbf6', size=1)  
        # Iterates the ID
        num = num + 1
 
    # Maximum weight value for scaling
    maxval = max(node.values(), key=lambda x: x)
    prim_co_ord = {}
 
    # Adds the central node with maximum size and centred into frame
    NG.add_node(word, size=maxval, label=word, x=3, y=3, color='#F6851F',
                borderColor='#bcdbf6', borderWidth=2)
 
    # Adds in the primary nodes
    for p in primary_list:
        val = node[p]
        # normalizes size of node. This can be modified or even removed
        v = ( (val / (maxval*0.6) * 12) + 12 )
        # Assigns unique random co-ordinates for the graph
        co_ord_x = rand(90, 110) / float(100)
        co_ord_y = rand(50, 200) / float(100)
        # Store these for later
        prim_co_ord[p] = [co_ord_x, co_ord_y]
        # Add node to graph with parameters
        NG.add_node(p, size=v, label=p, x=co_ord_x, y=co_ord_y, color='#F6851F',
                    borderColor='#bcdbf6', borderWidth=2)
 
    for t in tertiary_list:
        # Retrieves the word and it's pairs for tertiary nodes
        u=t[0]
        w=t[1]
        # The weight for the node of interest
        val = node[u]
        # normalizes size of node. This can be modified or even removed
        v = ((val / (maxval * 0.6) * 12) + 12)
 
        # Adds in x, y co-ords close to the primary node.
        try:
            tert_co_ord_x = prim_co_ord[w][0] + (rand(-5, 5) / float(100))
            tert_co_ord_y = prim_co_ord[w][1] + (rand(-5, 5) / float(100))
        except:
            tert_co_ord_x = prim_co_ord[u][0] + (rand(-5, 5) / float(100))
            tert_co_ord_y = prim_co_ord[u][1] + (rand(-5, 5) / float(100))
 
        # Adds in node with attributes
        NG.add_node(u, size=v, label=u, x=tert_co_ord_x, y=tert_co_ord_y, 
                    color='#F6851F', borderColor='#bcdbf6', borderWidth=2)
 
    # Converts the Network graph to a JSON format
    fixNG = json_graph.node_link_data(NG)
    # Fixes the network so that edges use node names instead of integers
    fixNG['links'] = [
        {
            'source': fixNG['nodes'][link['source']]['id'],
            'target': fixNG['nodes'][link['target']]['id'],
            'id': link['id'], 'size': link['size'], 'color':'#bcdbf6'
        }
        for link in fixNG['links']]
    # Stringifies the json
    fixNG = str(json.dumps(fixNG))
    # Changes links to edges to comply with Sigma.JS
    rtnNG = fixNG.replace('links', 'edges')
 
    return rtnNG, fixNG
 
def parse(listDB, word):        # Parses the data into dictionaries
    pairDict = {}
    nodeDict = {}
    # Separates primary nodes into their own list and sorts on count
    # A maximum of 15 nodes are selected from t
    pLst = [l for l in listDB if l[3] == 'P']
    pLst = sorted(pLst, key=lambda x: x[2], reverse=True)[:15]
    # A temp list for tertiary nodes linked to primary nodes
    m_pLst = [n[0] for n in pLst]
 
    # Separates tertiary nodes into their own list if in m_pLst
    tLst = [l for l in listDB if l[3] == 'T' and l[1] in set(m_pLst)]
    tLst = sorted(tLst, key=lambda x: x[2], reverse=True)[:20]
 
    for l in tLst:
        pLst.append(l)
 
    for lst in pLst:
        # Defines 1st word, 2nd word and count
        x = lst[0].lower()
        y = lst[1].lower()
        z = lst[2]
        if x and y:
            # Defines key and swapped order key for pairs
            key = (x+" "+y)
            varkey = (y+" "+x)
            val = lst[2], lst[3]
            val = list(val)
            # If these don't exist, add to pairs dict
            if key in pairDict:
                pass
            elif varkey in pairDict:
                pass
            else:
                pairDict[key] = val
            # Adds weights to node dicts for each word in the pair
            if x in nodeDict:
                nodeDict[x] += z
            else:
                nodeDict[x] = z
            if y in nodeDict:
                nodeDict[y] += z
            else:
                nodeDict[y] = z
    return graphing(pairDict, nodeDict, word)
 
list_in = [['friend', 'country', 3, 'P'],
['look', 'country', 4, 'P'],
['person', 'country', 2, 'P'],
['make', 'country', 2, 'P'],
['mimisamat8', 'look', 2, 'T'],
['heoolll', 'look', 2, 'T'],
['look', 'look', 1, 'T'],
['judge', 'person', 1, 'T'],
['kind', 'person', 1, 'T'],
['looks', 'make', 1, 'T'],
['thing', 'make', 1, 'T'],
['personality', 'make', 1, 'T'],
['pasta', 'make', 1, 'T'],
['italy', 'make', 1, 'T']]
 
print(parse(list_in, 'country'))

TypeError: list indices must be integers or slices, not str

In [35]:
di_graph.node(data=True)

NodeDataView({'EAStarWars': {'graphics': {'x': -207.26126, 'y': 188.25581, 'z': 0.0, 'w': 10.0, 'h': 10.0, 'd': 10.0, 'fill': '#df89ff'}, 'user_id': '3033103596', 'file': 'TheDataFox.dat', 'image': 'E:\\Users\\Lee Joshi-Jones\\Documents\\Data Science\\GitHub\\sentiment-influencer-analysis\\src\\data\\influencer\\img\\3033103596.jpg', 'type': 'friends', 'statuses': '52474', 'friends': '75', 'followers': '468345', 'listed': '1503', 'ffr': '6244.6', 'lfr': '0.0321', 'shape': 'triangle-up', 'ModularityClass': '13', 'mc': 11, 'x': -0.13697854515090027, 'y': 0.1556508532887628, 'bc': 5.373733565508596e-05, 'dc': 0.0794392523364486, 'idc': 0.06542056074766354, 'odc': 0.11446838616518366, 'cc': 0.33012052599813446, 'pr': 0.0052783968287691335}, 'NVIDIAGeForce': {'graphics': {'x': -109.17039, 'y': 55.39589, 'z': 0.0, 'w': 10.0, 'h': 10.0, 'd': 10.0, 'fill': '#73c000'}, 'user_id': '86395621', 'file': 'TheDataFox.dat', 'image': 'E:\\Users\\Lee Joshi-Jones\\Documents\\Data Science\\GitHub\\sentime