In [46]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import itertools

G = nx.read_gml("GraphMissingEdges.gml")

In [47]:
print("Number of nodes:", len(G.nodes))
print("Number of edges:", len(G.edges))

Number of nodes: 4575
Number of edges: 18991


In [64]:
print("Attributes of each node:")
print(G.nodes[list(G.nodes())[900]])

Attributes of each node:
{'longitude': -79.952564, 'latitude': 40.481878, 'categories': '327,518', 'stars': '4.5', 'name': 'The Cigar Den', 'reviewCount': '28'}


In [49]:
print("Attributes of each edge:")
print(G.edges[list(G.edges())[20]])

Attributes of each edge:
{'weight': 1}


In [50]:
print(f"Average clustering: {nx.average_clustering(G)}")

Average clustering: 0.04489358044799769


In [51]:
all_categories = []
for node, data in G.nodes(data=True):
    cats = [c.strip() for c in data["categories"].split(",")]
    all_categories.extend(cats)

cat_counts = pd.Series(all_categories).value_counts()

print("Top 10 categories:")
print(cat_counts.head(10))

Top 10 categories:
1      1681
641     907
327     731
58      504
275     467
280     404
839     364
133     298
289     290
604     286
Name: count, dtype: int64


In [53]:
cat_edges = Counter()
for u, v in G.edges():
    cats_u = [c.strip() for c in G.nodes[u]["categories"].split(",")]
    cats_v = [c.strip() for c in G.nodes[v]["categories"].split(",")]
    for cu in cats_u:
        for cv in cats_v:
            if cu != "" and cv != "":
                pair = tuple(sorted([cu, cv]))
                cat_edges[pair] += 1

cat_edges_df = pd.DataFrame(cat_edges.items(), columns=["Connection", "Count"])
cat_edges_df = cat_edges_df.sort_values("Count", ascending=False).reset_index(drop=True)
print("Top 10 category-category connections:")
print(cat_edges_df.head(10))

Top 10 connections:
  CategoryPair  Count
0       (1, 1)  10681
1     (1, 641)   8983
2      (1, 58)   7303
3     (1, 275)   7105
4     (1, 420)   4760
5     (1, 604)   3947
6    (58, 641)   2985
7    (275, 58)   2955
8   (275, 641)   2899
9     (1, 289)   2853


In [66]:
star_edges = Counter()
for u, v in G.edges():
    stars_u = G.nodes[u]["stars"]
    stars_v = G.nodes[v]["stars"]
    
    if stars_u != "" and stars_v != "":
        pair = tuple(sorted([stars_u, stars_v]))
        star_edges[pair] += 1
    
star_edges_df = pd.DataFrame(star_edges.items(), columns=["Connection", "Count"])
star_edges_df = cat_edges_df.sort_values("Count", ascending=False).reset_index(drop=True)
print("Top 10 star-star connections:")
print(cat_edges_df.head(10))

Top 10 star-star connections:
  Connection  Count
0     (., 4)  21896
1     (., 5)  21305
2     (., .)  18991
3     (., 0)  17922
4     (., 3)  12580
5     (4, 5)  12353
6     (0, 4)  10280
7     (0, 5)   9793
8     (3, 5)   6965
9     (3, 4)   6792


In [68]:
review_count_edges = Counter()
for u, v in G.edges():
    rc_u = G.nodes[u]["reviewCount"]
    rc_v = G.nodes[v]["reviewCount"]
    
    if rc_u != "" and rc_v != "":
        pair = tuple(sorted([rc_u, rc_v]))
        review_count_edges[pair] += 1
    
review_count_edges_df = pd.DataFrame(review_count_edges.items(), columns=["Connection", "Count"])
review_count_edges_df = review_count_edges_df.sort_values("Count", ascending=False).reset_index(drop=True)
print("Top 10 review-count connections:")
print(review_count_edges_df.head(10))

Top 10 review-count connections:
Empty DataFrame
Columns: [Connection, Count]
Index: []
