In [3]:
import collections
import pandas as pd
import networkx as nx
import community as community_louvain
from collections import Counter, defaultdict

In [4]:
G = nx.read_gml('../network/products.gml')
df = pd.read_csv('../dataset/new_features.csv')

In [None]:
aa_list = []
for u, v, score in nx.adamic_adar_index(G):
    aa_list.append((u, v, score))

adamic_adar_df = pd.DataFrame(aa_list, columns=['product_u', 'product_v', 'AA_score'])

In [8]:
jc_list = []
for u, v, score in nx.jaccard_coefficient(G):
    jc_list.append((u, v, score))

jaccard_df = pd.DataFrame(jc_list, columns=['product_u', 'product_v', 'Jaccard'])

In [9]:
proj_df = adamic_adar_df.copy()
proj_df['jaccard'] = jaccard_df['Jaccard']
proj_df.to_csv('../dataset/products_projection.csv', index=False)

In [5]:
c = community_louvain.best_partition(G)

In [6]:
# number of communities

len(set(c.values()))

3

In [7]:
# number of products per community

collections.Counter(c.values())

Counter({0: 1646, 2: 1126, 1: 1000})

In [8]:
# top 5 main_categories per community

communities_categories = defaultdict(list)
for node, comm in c.items():
    main_category = G.nodes[node].get('main_category')
    if main_category:
        communities_categories[comm].append(main_category)

for comm in sorted(communities_categories.keys()):
    categories = communities_categories[comm]

    category_counts = Counter(categories)
    top_5 = category_counts.most_common(5)
    
    print(f"\n community {comm}")
    for i, (category, count) in enumerate(top_5, 1):
        percentage = (count / len(categories)) * 100
        print(f"   {i}. {category.title()}: {count} products ({percentage:.1f}%)")


 community 0
   1. All Electronics: 639 products (38.8%)
   2. Computers: 482 products (29.3%)
   3. Cell Phones & Accessories: 174 products (10.6%)
   4. Camera & Photo: 76 products (4.6%)
   5. Home Audio & Theater: 66 products (4.0%)

 community 1
   1. Computers: 323 products (32.3%)
   2. All Electronics: 253 products (25.3%)
   3. Cell Phones & Accessories: 117 products (11.7%)
   4. Camera & Photo: 113 products (11.3%)
   5. Home Audio & Theater: 77 products (7.7%)

 community 2
   1. Computers: 384 products (34.1%)
   2. All Electronics: 346 products (30.7%)
   3. Home Audio & Theater: 161 products (14.3%)
   4. Cell Phones & Accessories: 73 products (6.5%)
   5. Camera & Photo: 42 products (3.7%)


In [9]:
# average price per community

communities_prices = defaultdict(list)

for node, comm in c.items():
    price = G.nodes[node].get('price')
    
    if price:
        try:
            communities_prices[comm].append(float(price))
        except ValueError:
            pass

for comm in sorted(communities_prices.keys()):
    prices = communities_prices[comm]
    avg_price = sum(prices) / len(prices)
    
    print(f"\n community {comm}")
    print(f" Average price: ${avg_price:.2f}")


 community 0
 Average price: $51.70

 community 1
 Average price: $87.90

 community 2
 Average price: $61.03


In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,parent_asin,user_id,main_category,average_rating,rating_number,price,categories,neg,...,user_deg,product_deg,user_pagerank,product_pagerank,user_closeness,product_closeness,user_eig,product_eig,product_community,user_community
0,0,5,B095LLDH4H,AFSKPY37N3C43SOI5IEXEK5JSIYA,all electronics,4.6,5564.0,17.97,"['electronics', 'power accessories', 'power co...",0.028,...,0.000288,0.002134,2.1e-05,0.000114,0.252682,0.295663,0.000719,0.002316,1,2
1,1,4,B0787GLJNQ,AFSKPY37N3C43SOI5IEXEK5JSIYA,computers,4.4,18317.0,11.99,"['electronics', 'computers & accessories', 'co...",0.04,...,0.000288,0.004037,2.1e-05,0.000192,0.252682,0.312346,0.000719,0.006395,1,2
2,2,4,B075QC3TZY,AFSKPY37N3C43SOI5IEXEK5JSIYA,cell phones & accessories,3.1,1833.0,12.46,"['electronics', 'gps, finders & accessories', ...",0.0,...,0.000288,0.002711,2.1e-05,0.000144,0.252682,0.300855,0.000719,0.003654,1,2
3,3,3,B00GXXJF72,AFSKPY37N3C43SOI5IEXEK5JSIYA,computers,4.1,1630.0,119.99,"['electronics', 'computers & accessories', 'co...",0.073,...,0.000288,0.00421,2.1e-05,0.000202,0.252682,0.310244,0.000719,0.005808,1,2
4,4,5,B003VAGXZC,AFSKPY37N3C43SOI5IEXEK5JSIYA,all electronics,4.5,12006.0,29.99,"['electronics', 'computers & accessories', 'co...",0.078,...,0.000288,0.00894,2.1e-05,0.000383,0.252682,0.321712,0.000719,0.021876,0,2


In [13]:
# average price per community

communities_prices = defaultdict(list)

for node, comm in c.items():
    price = G.nodes[node].get('average_rating')
    
    if price:
        try:
            communities_prices[comm].append(float(price))
        except ValueError:
            pass

for comm in sorted(communities_prices.keys()):
    prices = communities_prices[comm]
    avg_price = sum(prices) / len(prices)
    
    print(f"\n community {comm}")
    print(f" Average rating: {avg_price:.2f}")


 community 0
 Average rating: 4.46

 community 1
 Average rating: 4.43

 community 2
 Average rating: 4.43


In [14]:
# average price per community

communities_prices = defaultdict(list)

for node, comm in c.items():
    price = G.nodes[node].get('rating_number')
    
    if price:
        try:
            communities_prices[comm].append(float(price))
        except ValueError:
            pass

for comm in sorted(communities_prices.keys()):
    prices = communities_prices[comm]
    avg_price = sum(prices) / len(prices)
    
    print(f"\n community {comm}")
    print(f" Average rating: {avg_price:.2f}")


 community 0
 Average rating: 24173.62

 community 1
 Average rating: 12449.13

 community 2
 Average rating: 8689.85


In [None]:
df = pd.read_csv('../dataset/new_features.csv')
df['product_community'] = df['parent_asin'].map(c)
df.to_csv('../dataset/new_features.csv', index=False)