In [7]:
import pandas as pd
import networkx as nx
from community import community_louvain



# Read the tab-separated edge list. The first four lines of the file are
# skipped, and the two columns are named and parsed as integers.
edge_columns = ['source', 'target']
edge_list = pd.read_csv(
    'reduceddataset.txt',
    sep='\t',
    skiprows=4,
    names=edge_columns,
    dtype=dict.fromkeys(edge_columns, int),
)


# Build the graph from the edge table: one edge per row, with endpoints taken
# from the 'source' and 'target' columns.
G = nx.from_pandas_edgelist(
    edge_list,
    source='source',
    target='target',
)



# Louvain community detection (python-louvain). The result maps each node to
# the id of the community it was assigned to.
# random_state pins the algorithm's randomised node ordering so repeated runs
# of this cell produce the same bundles; without it the partition (and the
# printed report) can change from run to run.
# NOTE(review): resolution=0.85 was chosen on the assumption that values
# below 1 yield bigger clusters. In python-louvain the resolution appears to
# act the other way round (larger values -> larger communities), unlike
# networkx.community.louvain_communities -- confirm against the library docs
# before relying on this value.
community_mapping = community_louvain.best_partition(
    G,
    resolution=0.85,
    random_state=42,
)



# Invert the node -> community mapping into community id -> list of nodes.
clusters = {}
for member, label in community_mapping.items():
    # setdefault creates the empty list the first time a community id is seen.
    clusters.setdefault(label, []).append(member)





# Keep only communities with at least three nodes; smaller ones are dropped.
valid_clusters = [
    members for members in clusters.values() if len(members) >= 3
]



# Order the clusters from largest to smallest by member count.
valid_clusters = sorted(valid_clusters, key=len, reverse=True)



# Report the ten largest bundles: size plus a preview of up to five members.
for rank, members in enumerate(valid_clusters[:10], start=1):
    print(f"Bundle {rank} ({len(members)} products):")
    preview = ", ".join(str(node) for node in members[:5])
    # An ellipsis marks bundles that have more members than the preview shows.
    suffix = "..." if len(members) > 5 else ""
    print(preview + suffix)
    print()


Bundle 1 (2988 products):
10758, 62691, 262484, 192773, 260955...

Bundle 2 (2499 products):
1, 88160, 118052, 161555, 244916...

Bundle 3 (2215 products):
524298, 105389, 161337, 469532, 120...

Bundle 4 (1648 products):
262163, 34095, 277807, 279560, 524361...

Bundle 5 (1605 products):
8, 55461, 453513, 43, 6893...

Bundle 6 (1553 products):
45, 28318, 179783, 296035, 490336...

Bundle 7 (1457 products):
262153, 244004, 275016, 327921, 335281...

Bundle 8 (1419 products):
262169, 2920, 282371, 312302, 349408...

Bundle 9 (1369 products):
54686, 524431, 132233, 269773, 427847...

Bundle 10 (1368 products):
105, 278985, 490079, 520172, 524516...

