In [7]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



# LOAD THE REDUCED CO-PURCHASE DATASET
df = pd.read_csv("reduceddataset14k.txt", sep="\t", header=None, names=["product", "coproduct"])
graph = nx.from_pandas_edgelist(df, 'product', 'coproduct', create_using=nx.Graph())






# DISPLAY BASIC GRAPH PROPERTIES
print("Graph info:")
print(f"Number of nodes: {graph.number_of_nodes()}")
print(f"Number of edges: {graph.number_of_edges()}")
print("\n")





# CALCULATE DEGREE CENTRALITY
deg_centrality = nx.degree_centrality(graph)

# CALCULATE BETWEENNESS CENTRALITY
btw_centrality = nx.betweenness_centrality(graph)



# SORT NODES BY DEGREE CENTRALITY IN DESCENDING ORDER
top_deg_sorted = sorted(deg_centrality.items(), key=lambda item: item[1], reverse=True)
# SORT NODES BY BETWEENNESS CENTRALITY IN DESCENDING ORDER
top_btw_sorted = sorted(btw_centrality.items(), key=lambda item: item[1], reverse=True)



# SELECT TOP 5 INFLUENTIAL NODES BASED ON BOTH CENTRALITIES
top_deg_nodes = top_deg_sorted[:5]
top_btw_nodes = top_btw_sorted[:5]



# DISPLAY THE TOP CENTRAL NODES
print("Top 5 products by degree centrality:", top_deg_nodes)
print("Top 5 products by betweenness centrality:", top_btw_nodes)




# DEFINE THE INDEPENDENT CASCADE MODEL FOR DIFFUSION SIMULATION
def run_cascade(graph, seed_node, prob=0.3, max_steps=5):
    active = set([seed_node])                 # SET OF ACTIVE (ADOPTED) NODES
    newly_active = set([seed_node])           # NODES ACTIVATED IN CURRENT ROUND

    for _ in range(max_steps):
        next_round = set()                    # STORE NEXT ROUND OF ACTIVATIONS
        for node in newly_active:
            neighbors = list(graph.neighbors(node))
            for neighbor in neighbors:
                if neighbor not in active:
                    if np.random.rand() < prob:
                        next_round.add(neighbor)
        if not next_round:
            break                              # STOP IF NO NEW ACTIVATIONS
        active.update(next_round)
        newly_active = next_round

    return active

# RUN ADOPTION SIMULATION FOR EACH TOP BETWEENNESS NODE
adoption_results = {}

for node, _ in top_btw_nodes:
    influenced = run_cascade(graph, seed_node=node, prob=0.3, max_steps=5)
    adoption_results[node] = influenced
    print(f"\nProduct {node} drives adoption of {len(influenced)} products:")
    print(influenced)


Graph Info:
Number of nodes: 12615
Number of edges: 21259



Top 5 Products by Degree Centrality: [('199628', 0.002616140795941018), ('430936', 0.002616140795941018), ('502784', 0.0025368638021246235), ('458358', 0.0022197558268590455), ('519449', 0.0020612018392262565)]
Top 5 Products by Betweenness Centrality: [('69693', 0.03580297602599901), ('520843', 0.032621983270913826), ('29075', 0.03222586857145321), ('332815', 0.03215333496930436), ('251978', 0.025428776154666932)]

Product 69693 drives adoption of the following 7 products:
{'129425', '168134', '461296', '537312', '396726', '69693', '449404'}

Product 520843 drives adoption of the following 13 products:
{'55580', '129425', '336619', '520843', '483957', '273835', '302206', '67456', '396726', '284825', '69693', '479073', '398712'}

Product 29075 drives adoption of the following 2 products:
{'29075', '332815'}

Product 332815 drives adoption of the following 1 products:
{'332815'}

Product 251978 drives adoption of the following