# Introduction to Network Analysis with Python and NetworkX

**Avery Fernandez and Vincent Scalfani**

UA Libraries, Data Services

February 15, 2023

Code in this notebook is MIT licensed; a copy of the license is available here: https://github.com/ualibweb/UALIB_Workshops

Bibliographic data is credited to NCBI and NLM. Please see the NCBI Website and Data Usage Policies and Disclaimers for more information regarding the data: https://www.ncbi.nlm.nih.gov/home/about/policies/

NetworkX Docs: https://networkx.org/documentation/stable/

## Installing Libraries

We will need to install a few libraries. You can use conda or pip. Here is a conda recipe:

```
conda create --name my-env
conda activate my-env
conda install -c conda-forge jupyterlab numpy matplotlib pandas networkx
```
Or, using pip:

``` 
pip install numpy matplotlib pandas networkx
```

## A Basic Network Example

In [None]:
# Toy collaboration network: every key is a pair of collaborators and every
# value is the weight of the edge between them (1 for each pair here).
colabs = {
    pair: 1
    for pair in [
        ('Avery', 'Krishav'),
        ('Avery', 'Cyrus'),
        ('Avery', 'Jay'),
        ('Avery', 'Michael'),
        ('Adam', 'Avery'),
        ('Adam', 'Cyrus'),
        ('Jay', 'Michael'),
    ]
}

In [None]:
import networkx as nx

# Build an undirected graph with one weighted edge per collaborator pair.
G = nx.Graph()
for (person_a, person_b), n_collabs in colabs.items():
    G.add_edge(person_a, person_b, weight=n_collabs)

In [None]:
# Map every node to the text drawn on it (the node itself, as a string).
# A comprehension keeps the loop variable out of the notebook namespace, so
# re-running this cell cannot overwrite the `nodes` list that is built later
# in the notebook.
name_labels = {node: str(node) for node in G.nodes}

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# A fixed seed makes the spring layout place the nodes identically on every run.
pos = nx.spring_layout(G, seed=5)

nx.draw_networkx_nodes(G, pos, ax=ax, node_size=3000, node_color="plum")
nx.draw_networkx_labels(G, pos, labels=name_labels, ax=ax)
nx.draw_networkx_edges(G, pos, ax=ax)

# Hide the axes frame and ticks; only the network itself is of interest.
ax.axis("off")
fig.tight_layout()
plt.show()

## Create a Co-Author Network

### Import the author data

In [None]:
# Take a look at the raw tab-separated author file before parsing it.
# The leading `!` hands the command to the system shell, so a `head` utility
# must be available on the PATH.
!head UA_pubmed_authors.tsv

In [None]:
# Read the file into a list with one raw string per row (line endings kept).
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm that it matches the file's encoding.
with open("UA_pubmed_authors.tsv", "r") as tsv_file:
    data = list(tsv_file)

In [None]:
# Show the first ten raw rows.
data[:10]

In [None]:
# Number of rows read from the file.
print(len(data))

### Data Preparation

In [None]:
# Keep only the article rows with a small number of authors (for simplicity).
#
# Each raw row is split on tabs once and the resulting field list is stored.
# The loop variable is deliberately not called `authors`: that name holds the
# per-article author lists built later in the notebook, and re-running this
# cell would otherwise overwrite it.
#
# NOTE(review): the threshold counts tab-separated fields, not authors. The
# cleanup cell further down reads the fields in pairs after the first column,
# which suggests a row with k authors has 1 + 2k fields; if so, a limit of 10
# fields keeps rows with at most 4 authors rather than 5 — confirm the
# intended cutoff before changing it (later cells hard-code component indices
# that depend on this filter).
MAX_FIELDS = 10

clean_data = []
for line in data:
    fields = line.split("\t")
    if len(fields) <= MAX_FIELDS:
        clean_data.append(fields)

In [None]:
# Show the first ten filtered rows (each one is now a list of fields).
clean_data[:10]

In [None]:
# Number of rows that passed the filter.
print(len(clean_data))

In [None]:
# Turn every cleaned row into a list of "<field>, <field>" author strings.
# Column 0 is skipped; the remaining columns are consumed two at a time.
authors = []
for row in clean_data:
    article_authors = [
        # rstrip removes trailing whitespace, e.g. the line ending that is
        # still attached to the last field of each raw row
        f"{row[i]}, {row[i + 1]}".rstrip()
        for i in range(1, len(row), 2)
    ]
    authors.append(article_authors)

In [None]:
# Show the author lists of the first ten articles.
print(authors[:10])

In [None]:
# Expand each article's author list into every two-author combination and
# gather the pairs from all articles into a single flat list.
from itertools import combinations

author_pairs = []
for coauthors in authors:
    author_pairs.extend(combinations(coauthors, 2))

In [None]:
# Show the first ten author pairs.
author_pairs[:10]

In [None]:
# Total number of author pairs (repeated pairs are counted each time).
print(len(author_pairs))

In [None]:
# Put the two names of every pair in sorted order so that (A, B) and (B, A)
# end up as the same tuple.
sorted_pairs = [tuple(sorted(pair)) for pair in author_pairs]

In [None]:
# Show the first ten normalized pairs.
sorted_pairs[:10]

In [None]:
# Number of distinct author pairs.
len(set(sorted_pairs))

In [None]:
# Count occurrences: how many times each normalized pair appears.
counted_pairs = {}
for pair in sorted_pairs:
    counted_pairs[pair] = counted_pairs.get(pair, 0) + 1

In [None]:
# Show the first ten (pair, count) entries.
list(counted_pairs.items())[:10]

In [None]:
# Order the pairs from the highest to the lowest count and preview the top ten.
# The result gets its own name instead of rebinding `sorted_pairs`: that name
# holds the list of normalized pairs that the counting cell iterates over, and
# replacing it with a dict of unique pairs would make a re-run of the counting
# cell report a count of 1 for every pair.
pairs_by_frequency = dict(
    sorted(counted_pairs.items(), key=lambda item: item[1], reverse=True)
)
list(pairs_by_frequency.items())[0:10]

In [None]:
# Keep only the pairs that occur more than three times.
filtered_pairs = {
    pair: count
    for pair, count in counted_pairs.items()
    if count > 3
}

In [None]:
# Each retained pair becomes one edge of the graph built below.
len(filtered_pairs) # edges of graph

In [None]:
# Collect the distinct authors that appear in at least one retained pair.
# The set is sorted because the iteration order of a set of strings can differ
# between kernel sessions, which would make the preview of this list change
# from run to run.
nodes = sorted({author for pair in filtered_pairs for author in pair})

In [None]:
# Number of distinct authors among the retained pairs.
len(nodes)

In [None]:
# Show the first ten author names.
nodes[:10]

### Plotting the Network

In [None]:
import networkx as nx

In [None]:
# Create the co-author graph by adding one weighted edge per retained pair;
# the endpoint nodes are added automatically along with each edge.
G = nx.Graph()
for (first_author, second_author), n_shared in filtered_pairs.items():
    G.add_edge(first_author, second_author, weight=n_shared)

In [None]:
# Draw the whole co-author network with a spring layout.
# The layout is computed explicitly so that a seed can be supplied; this keeps
# the figure reproducible, like the other seeded layouts in this notebook.
nx.draw(G, pos=nx.spring_layout(G, seed=1), edgecolors="black", node_size=40)

In [None]:
# Split the network into its connected components, storing each one as an
# independent copy of the corresponding subgraph.
connected_graphs = []
for component_nodes in nx.connected_components(G):
    connected_graphs.append(G.subgraph(component_nodes).copy())

In [None]:
# Number of connected components in the co-author network.
len(connected_graphs)

In [None]:
# Pair every component's position in `connected_graphs` with its node count.
graph_sizes = [
    (idx, nx.number_of_nodes(subgraph))
    for idx, subgraph in enumerate(connected_graphs)
]

In [None]:
# Order the (index, size) tuples from the largest to the smallest component,
# using the second element (the node count) as the sort key.
sorted_graphs = list(graph_sizes)
sorted_graphs.sort(key=lambda entry: entry[1], reverse=True)

In [None]:
# The five largest components as (idx, number of nodes).
sorted_graphs[0:5]

In [None]:
# Quick look at component idx 48; the layout is seeded so the picture is the
# same on every run.
nx.draw(connected_graphs[48], pos=nx.spring_layout(connected_graphs[48], seed=1)) #idx 48

In [None]:
# Quick look at component idx 22, using the same seeded spring layout as the
# detailed figures of this component further down.
nx.draw(connected_graphs[22], pos=nx.spring_layout(connected_graphs[22], seed=1)) # idx 22

In [None]:
import matplotlib.pyplot as plt

# First step of the detailed figure: draw only the nodes of component 22.
coauthor_graph = connected_graphs[22]

fig, ax = plt.subplots()
pos = nx.spring_layout(coauthor_graph, seed=1)
nx.draw_networkx_nodes(
    coauthor_graph, pos, ax=ax, edgecolors="black", node_size=300
)

ax.axis("off")
fig.tight_layout()
plt.show()

In [None]:
# Give every author in component 22 a short numeric ID. The ID (as a string)
# is used as the node label in the plots, and the [ID, author] rows are kept
# so the IDs can be looked up in a table.
# The loop variable is not called `nodes`, so this cell no longer overwrites
# the `nodes` list of author names created earlier in the notebook.
table_data = []
custom_labels = {}
for idx, author_name in enumerate(connected_graphs[22].nodes):
    custom_labels[author_name] = str(idx)
    table_data.append([idx, author_name])

In [None]:
import matplotlib.pyplot as plt

# Second step: nodes of component 22 plus their numeric ID labels.
coauthor_graph = connected_graphs[22]

fig, ax = plt.subplots()
pos = nx.spring_layout(coauthor_graph, seed=1)
nx.draw_networkx_nodes(
    coauthor_graph, pos, ax=ax, edgecolors="black", node_size=300
)
nx.draw_networkx_labels(coauthor_graph, pos, labels=custom_labels, ax=ax)

ax.axis("off")
fig.tight_layout()
plt.show()

In [None]:
# Show which author each numeric node ID stands for.
import pandas as pd

columns = ("Node ID", "Author")
df = pd.DataFrame(data=table_data, columns=columns)
df

In [None]:
# Third step: nodes, ID labels, and now the edges of component 22.
import matplotlib.pyplot as plt

coauthor_graph = connected_graphs[22]

fig, ax = plt.subplots()
pos = nx.spring_layout(coauthor_graph, seed=1)
nx.draw_networkx_nodes(
    coauthor_graph, pos, ax=ax, edgecolors="black", node_size=300
)
nx.draw_networkx_labels(coauthor_graph, pos, labels=custom_labels, ax=ax)
nx.draw_networkx_edges(coauthor_graph, pos, ax=ax)

ax.axis("off")
fig.tight_layout()
plt.show()

In [None]:
# List the edges of component 22 together with their attribute dictionaries
# (each edge was created with a "weight" attribute).
connected_graphs[22].edges(data=True)

In [None]:
# Sort the edges of component 22 into three buckets by weight so that each
# bucket can be drawn with its own line thickness:
#   thick:  weight >= 10
#   medium: 5 < weight < 10
#   thin:   weight <= 5
# Adapted from: https://networkx.org/documentation/stable/auto_examples/drawing/plot_weighted_graph.html?highlight=weighted_graph

thick, medium, thin = [], [], []
for u, v, attrs in connected_graphs[22].edges(data=True):
    weight = attrs["weight"]
    if weight >= 10:
        thick.append((u, v))
    elif weight > 5:
        medium.append((u, v))
    else:
        thin.append((u, v))

In [None]:
import matplotlib.pyplot as plt

# Component 22 with edge thickness and colour encoding the edge weight.
coauthor_graph = connected_graphs[22]

fig, ax = plt.subplots()
pos = nx.spring_layout(coauthor_graph, seed=1)

nx.draw_networkx_nodes(
    coauthor_graph, pos, ax=ax, edgecolors="black", node_size=300, node_color="white"
)
nx.draw_networkx_labels(coauthor_graph, pos, labels=custom_labels, ax=ax, font_size=8)

# Base pass with default styling, then one pass per weight bucket on top of it.
nx.draw_networkx_edges(coauthor_graph, pos, ax=ax)
for bucket, line_width, line_color in (
    (thick, 4, "indigo"),
    (medium, 3, "mediumpurple"),
    (thin, 2, "thistle"),
):
    nx.draw_networkx_edges(
        coauthor_graph, pos, ax=ax, edgelist=bucket, width=line_width, edge_color=line_color
    )

ax.axis("off")
fig.tight_layout()
plt.show()

In [None]:
# Add another feature to the plot: node size = number of collaborations.
#
# For each author in component 22 this is the sum of the weights of all edges
# touching that author, i.e. the weighted degree, which NetworkX computes
# directly. Compared with accumulating the sums by hand, this also avoids a
# loop variable named `nodes` (which overwrote the `nodes` list of author names
# created earlier in the notebook) and a final dict-to-dict copy that changed
# nothing.
node_sizes = dict(connected_graphs[22].degree(weight="weight"))

In [None]:
# Marker size for every node, in the graph's own node order (the order the
# drawing functions use). The factor of 20 scales the collaboration totals to
# a visible marker area.
# A comprehension is used so that no loop variable named `nodes` is left
# behind to overwrite the `nodes` list created earlier in the notebook.
custom_sizes = [
    node_sizes[author_name] * 20
    for author_name in connected_graphs[22].nodes
]

In [None]:
import matplotlib.pyplot as plt

# Final figure for component 22: node size taken from `custom_sizes`, edge
# thickness and colour taken from the weight buckets.
coauthor_graph = connected_graphs[22]

fig, ax = plt.subplots()
pos = nx.spring_layout(coauthor_graph, seed=1)

nx.draw_networkx_nodes(
    coauthor_graph, pos, ax=ax, edgecolors="black", node_size=custom_sizes, node_color="white"
)
nx.draw_networkx_labels(coauthor_graph, pos, labels=custom_labels, ax=ax, font_size=8)

# Base pass with default styling, then one pass per weight bucket on top of it.
nx.draw_networkx_edges(coauthor_graph, pos, ax=ax)
for bucket, line_width, line_color in (
    (thick, 4, "indigo"),
    (medium, 3, "mediumpurple"),
    (thin, 2, "thistle"),
):
    nx.draw_networkx_edges(
        coauthor_graph, pos, ax=ax, edgelist=bucket, width=line_width, edge_color=line_color
    )

ax.axis("off")
fig.tight_layout()
plt.show()

In [None]:
# Extend the ID/author table with each author's collaboration total.
new_table = [
    [node_id, author_name, node_sizes[author_name]]
    for node_id, author_name in table_data
]
columns = ("Node ID", "Author", "Collaborations")
df = pd.DataFrame(new_table, columns=columns)
df

## Notes

NetworkX has many built-in network analysis functions:

https://networkx.org/documentation/stable/reference/index.html

See here for some of our examples:
https://github.com/vfscalfani/CSN_tutorial/blob/main/CSN_Jupyter_Notebooks/CSN_glucocorticoid_MCS_2.ipynb