# **Link Analysis**

## Imports statements

##### If these packages are not already installed on your machine, please run: pip install networkx pandas numpy

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
from collections import deque

## Constants statements

In [2]:
# --- CONFIGURATION OF COLORS ---

GREEN = "\033[92m"
RED = "\033[91m"
BLUE = "\033[94m"
RESET = "\033[0m"

## Fetching of csv files

###### For Wikipedia

In [42]:
nodes_path = "data/wiki_nodes.csv"
edges_path = "data/wiki_edges.csv"

wiki_nodes = pd.read_csv(nodes_path)
wiki_edges = pd.read_csv(edges_path)

print(f"[Wiki] {len(wiki_nodes)} nodes")
print(f"[Wiki] {len(wiki_edges)} links")

[Wiki] 853 nodes
[Wiki] 857 links


###### For MindBodyGreen

In [43]:
nodes_path = "data/MBG_nodes.csv"
edges_path = "data/MBG_edges.csv"

MBG_nodes = pd.read_csv(nodes_path)
MBG_edges = pd.read_csv(edges_path)

print(f"[MBG] {len(MBG_nodes)} nodes")
print(f"[MBG] {len(MBG_edges)} edges")

[MBG] 1365 nodes
[MBG] 10588 edges


###### For Feedspot

In [44]:
nodes_path = "data/blogs_nodes.csv"
edges_path = "data/blogs_edges.csv"

blogs_nodes = pd.read_csv(nodes_path)
blogs_edges = pd.read_csv(edges_path)

print(f"[Blogs] {len(blogs_nodes)} nodes")
print(f"[Blogs] {len(blogs_edges)} edges")

[Blogs] 4050 nodes
[Blogs] 19951 edges


## Create Graphs

In [7]:
G_wiki = nx.from_pandas_edgelist(wiki_edges, source='source', target='target', create_using=nx.DiGraph())
G_MBG = nx.from_pandas_edgelist(MBG_edges, source='source', target='target', create_using=nx.DiGraph())
G_blogs = nx.from_pandas_edgelist(blogs_edges, source='source', target='target', create_using=nx.DiGraph())

In [71]:
def print_graph_stats(graph, name="Graphe"):
    
    n = graph.number_of_nodes()
    real_edges = graph.number_of_edges()
    
    if len(graph) > 0:
        largest_cc = max(nx.weakly_connected_components(graph), key=len)
        nodes_in_largest = len(largest_cc)
    else:
        nodes_in_largest = 0
            
    print(f"{BLUE}Graph Statistics:{RESET}")
    print(f"{BLUE}Total number of nodes: {n}{RESET}")
    print(f"{BLUE}Total number of direct edges: {real_edges}{RESET}")
    print(f"{BLUE}Graph density: {nx.density(graph):.6f}{RESET}")
    print(f"{BLUE}Nodes in the largest connected component: {nodes_in_largest} ({nodes_in_largest/n:.1%} of the total){RESET}")


In [72]:
print_graph_stats(G_wiki, "Wikipedia")
print_graph_stats(G_MBG, "MindBodyGreen")
print_graph_stats(G_blogs, "Blogs Feedspot")

[94mGraph Statistics:[0m
[94mTotal number of nodes: 849[0m
[94mTotal number of direct edges: 857[0m
[94mGraph density: 0.001190[0m
[94mNodes in the largest connected component: 849 (100.0% of the total)[0m
[94mGraph Statistics:[0m
[94mTotal number of nodes: 1365[0m
[94mTotal number of direct edges: 5547[0m
[94mGraph density: 0.002979[0m
[94mNodes in the largest connected component: 1365 (100.0% of the total)[0m
[94mGraph Statistics:[0m
[94mTotal number of nodes: 4048[0m
[94mTotal number of direct edges: 11651[0m
[94mGraph density: 0.000711[0m
[94mNodes in the largest connected component: 377 (9.3% of the total)[0m


## Degree Matrix

#### Implementation

In [68]:
def get_degree_stats(Graph):

    in_degrees = dict(Graph.in_degree()) # Optimised data retrieval from NetworkX
    out_degrees = dict(Graph.out_degree())
    
    df = pd.DataFrame({ # DataFrame creation
        'node_id': list(Graph.nodes()),
        'in_degree': [in_degrees.get(n, 0) for n in Graph.nodes()],
        'out_degree': [out_degrees.get(n, 0) for n in Graph.nodes()]})

    df.set_index('node_id', inplace=True) # We index the ID to facilitate searches
    
    print(f"\n{BLUE}Overview of calculated degrees: {RESET}")
    print(df.head()) # Display first rows

    active_in = len(df[df['in_degree'] > 0]) # Count active nodes (non-zero)
    print(f"\n{BLUE}Number of pages with at least 1 inbound link: {active_in}{RESET}")
    
    return df

#### Execution

In [69]:
wiki_degrees = get_degree_stats(G_wiki)
MBG_degrees = get_degree_stats(G_MBG)
blogs_degrees = get_degree_stats(G_blogs)


[94mOverview of calculated degrees: [0m
                                               in_degree  out_degree
node_id                                                             
https://en.wikipedia.org/wiki/Lifestyle                0          47
https://en.wikipedia.org/wiki/Milton_Rokeach           1           0
https://en.wikipedia.org/wiki/Modernity                1           0
https://en.wikipedia.org/wiki/German_language          1           0
https://en.wikipedia.org/wiki/William_Dufty            1           0

[94mNumber of pages with at least 1 inbound link: 845[0m

[94mOverview of calculated degrees: [0m
                                                    in_degree  out_degree
node_id                                                                  
https://www.mindbodygreen.com/                            168          33
https://www.mindbodygreen.com/articles/anna-lei...          2          23
https://www.mindbodygreen.com/articles/case-for...          1          25
h

## Degree centrality

#### Implementation

In [38]:
def degree_centrality(Graph):
    
    in_centrality = nx.in_degree_centrality(Graph) # We are particularly interested in In-Degree Centrality (Popularity)

    df = pd.DataFrame(list(in_centrality.items()), columns=['node_id', 'centrality_score'])
    df = df.sort_values('centrality_score', ascending=False) # Sort by descending score (stars first)
    
    print(df.head(10)) # Print only the 10 most central nodes

    return df 

#### Execution

In [15]:
wiki_centrality = degree_centrality(G_wiki)
MBG_centrality = degree_centrality(G_MBG)
blogs_centrality = degree_centrality(G_blogs)

                                               node_id  centrality_score
216        https://en.wikipedia.org/wiki/United_States          0.002358
67     https://en.wikipedia.org/wiki/Physical_exercise          0.002358
399            https://en.wikipedia.org/wiki/Pollution          0.002358
37          https://en.wikipedia.org/wiki/Anthropology          0.002358
35          https://en.wikipedia.org/wiki/Georg_Simmel          0.002358
11   https://en.wikipedia.org/wiki/Sedentary_lifestyle          0.002358
46   https://en.wikipedia.org/wiki/Ecological_footp...          0.002358
47         https://en.wikipedia.org/wiki/Simple_living          0.002358
76   https://en.wikipedia.org/wiki/Lifestyle_(socio...          0.002358
300  https://en.wikipedia.org/wiki/Greenhouse_gas_e...          0.002358
                                              node_id  centrality_score
0                      https://www.mindbodygreen.com/          0.123167
29  https://institute.mindbodygreen.com/functional...

###### For Wikipedia

## Shortest Path

#### Configuration

In [36]:
start_node_wiki = 'https://en.wikipedia.org/wiki/Lifestyle'
end_node_wiki = 'https://en.wikipedia.org/wiki/Healthy_lifestyle'

start_node_MBG = 'https://www.mindbodygreen.com/movement'
end_node_MBG = 'https://www.mindbodygreen.com/articles/health-coach-vs-life-coach'

start_node_blogs = 'https://40plusstyle.com/beauty-tips-for-women-over-40/'
end_node_blogs = 'https://40plusstyle.com/best-online-shops/'

#### Implementation

In [70]:
def shortest_path(G, start_node, end_node):
    
    # Checking for the presence of nodes
    if start_node not in G:
        print(f"{RED}Error: The start page does not exist in your data{RESET}")
        print(f"{GREEN}URL searched: {start_node}{RESET}")
        return None
    
    if end_node not in G:
        print(f"{RED}Error: The end page does not exist in your data{RESET}")
        print(f"{GREEN}URL searched: {end_node}{RESET}")
        return None
    
    # Shortest path calculation
    try:
        path = nx.shortest_path(G, source=start_node, target=end_node)
        print(f"{GREEN}Path found! Distance: {len(path)-1} click(s){RESET}\n")

        for i, page in enumerate(path):
            
            if i == 0:
                print(f"{BLUE}START: {page}{RESET}\n")
            
            elif i == len(path) - 1:
                print(f"{BLUE}ARRIVAL: {page}{RESET}\n")
            
            else:
                print(f"{BLUE}Step {i}: {page}{RESET}\n")

        return path
        
    except nx.NetworkXNoPath:
        print(f"{RED}No path found between these two pages in your data{RESET}")
        return None

In [61]:
def shortest_path_matrix(A: np.ndarray) -> np.ndarray:
    n = A.shape[0]  # Number of nodes = size of A
    SP_matrix = np.full((n, n), np.inf)  # We initalize the distance matrix with the same size as A and with an infinite value everywhere
    
    for start in range(n):
        visited = [False] * n
        queue = deque()   
        queue.append(start)
        visited[start] = True
        SP_matrix[start, start] = 0  # The distance A -> A is null
        
        while queue:
            current = queue.popleft()           
            neighbors = np.where(A[current] > 0)[0]

            for neighbor in neighbors:
                if not visited[neighbor]:
                    visited[neighbor] = True
                    SP_matrix[start, neighbor] = SP_matrix[start, current] + 1 
                    queue.append(neighbor)
    
    return SP_matrix

In [62]:
def shortest_path_matrix_from_graph(G):
    # Convertit le graphe en matrice d'adjacence numpy
    A = nx.to_numpy_array(G, dtype=int)
    nodes = list(G.nodes())
    
    # Appelle ta fonction existante
    SP = shortest_path_matrix(A)
    
    return SP, nodes

#### Execution

In [37]:
path_wiki = shortest_path(G_wiki, start_node_wiki, end_node_wiki)
path_MBG = shortest_path(G_MBG, start_node_MBG, end_node_MBG)
path_blogs = shortest_path(G_blogs, start_node_blogs, end_node_blogs)

[92mPath found! Distance: 1 click(s)[0m

[94mSTART: https://en.wikipedia.org/wiki/Lifestyle[0m

[94mARRIVAL: https://en.wikipedia.org/wiki/Healthy_lifestyle[0m

[92mPath found! Distance: 3 click(s)[0m

[94mSTART: https://www.mindbodygreen.com/movement[0m

[94mStep 1: https://www.mindbodygreen.com/[0m

[94mStep 2: https://institute.mindbodygreen.com/[0m

[94mARRIVAL: https://www.mindbodygreen.com/articles/health-coach-vs-life-coach[0m

[92mPath found! Distance: 2 click(s)[0m

[94mSTART: https://40plusstyle.com/beauty-tips-for-women-over-40/[0m

[94mStep 1: https://40plusstyle.com/[0m

[94mARRIVAL: https://40plusstyle.com/best-online-shops/[0m



In [65]:
SP_wiki, nodes_wiki = shortest_path_matrix_from_graph(G_wiki)
SP_MBG, nodes_MBG = shortest_path_matrix_from_graph(G_MBG)
SP_blogs, nodes_blogs = shortest_path_matrix_from_graph(G_blogs)

print(f"\n{BLUE} Computation of shortest path matrix for Wikipedia {RESET}")
print(SP_wiki)

print(f"\n{BLUE} Computation of shortest path matrix for MindBodyGreen {RESET}")
print(SP_MBG)

print(f"\n{BLUE} Computation of shortest path matrix for Feedspot {RESET}")
print(SP_blogs)


[94m Computation of shortest path matrix for Wikipedia [0m
[[ 0.  1.  1. ... inf inf inf]
 [inf  0. inf ... inf inf inf]
 [inf inf  0. ... inf inf inf]
 ...
 [inf inf inf ...  0. inf inf]
 [inf inf inf ... inf  0. inf]
 [inf inf inf ... inf inf  0.]]

[94m Computation of shortest path matrix for MindBodyGreen [0m
[[ 0.  1.  1. ...  3.  3.  3.]
 [ 1.  0.  2. ...  3.  3.  3.]
 [ 1.  2.  0. ...  3.  3.  3.]
 ...
 [inf inf inf ...  0. inf inf]
 [inf inf inf ... inf  0. inf]
 [inf inf inf ... inf inf  0.]]

[94m Computation of shortest path matrix for Feedspot [0m
[[ 0.  1.  1. ... inf inf inf]
 [inf  0. inf ... inf inf inf]
 [ 1.  2.  0. ... inf inf inf]
 ...
 [inf inf inf ...  0. inf inf]
 [inf inf inf ... inf  0. inf]
 [inf inf inf ... inf inf  0.]]


## Betweenness centrality

#### Implementation

In [39]:
def get_betweenness(Graph): # Calculates Betweenness Centrality (with optimisation k=100 if necessary)
    
    # If the graph is large (>2000 nodes), we approximate to save time.
    if len(Graph) > 2000:
        scores = nx.betweenness_centrality(Graph, k=100)
    else:
        scores = nx.betweenness_centrality(Graph)
        
    df = pd.DataFrame(list(scores.items()), columns=['node_id', 'Betweenness'])

    return df.sort_values('Betweenness', ascending=False)

#### Execution

In [73]:
print(f"\n{BLUE}--- Betweenness calculations ---{RESET}")

wiki_betweenness = get_betweenness(G_wiki)
MBG_betweenness = get_betweenness(G_MBG)
blogs_betweenness = get_betweenness(G_blogs)

print(f"\n{GREEN}Top 5 Wikipedia:{RESET}")
print(wiki_betweenness.head(5))

print(f"\n{GREEN}Top 5 MBG:{RESET}")
print(MBG_betweenness.head(5))

print(f"\n{GREEN}Top 5 Blogs:{RESET}")
print(blogs_betweenness.head(5))


[94m--- Betweenness calculations ---[0m

[92mTop 5 Wikipedia:[0m
                                              node_id  Betweenness
26   https://en.wikipedia.org/wiki/Sustainable_living       0.0004
833      https://en.wikipedia.org/wiki/Rudi_Gernreich       0.0000
817     https://en.wikipedia.org/wiki/Market_research       0.0000
2             https://en.wikipedia.org/wiki/Modernity       0.0000
3       https://en.wikipedia.org/wiki/German_language       0.0000

[92mTop 5 MBG:[0m
                                              node_id  Betweenness
0                      https://www.mindbodygreen.com/     0.067526
33  https://podcasts.apple.com/us/podcast/the-mind...     0.010389
19   https://www.mindbodygreen.com/integrative-health     0.006446
10  https://www.mindbodygreen.com/articles/peter-a...     0.006431
17                 https://www.mindbodygreen.com/food     0.005832

[92mTop 5 Blogs:[0m
                                                node_id  Betweenness
2387         

## Page Rank

#### Implementation

In [74]:
def get_pagerank(Graph, name="Graphe"):

    print(f"{BLUE}Calculation of advanced metrics for: {name}...{RESET}\n")

    pr = nx.pagerank(Graph, alpha=0.85) # Calcules PageRank with damping factor 0.85
    
    reverse_pr = nx.pagerank(Graph.reverse(), alpha=0.85)

    df = pd.DataFrame({
        'node_id': list(Graph.nodes()),
        'pagerank_score': [pr.get(n, 0) for n in Graph.nodes()],
        'reverse_pagerank_score': [reverse_pr.get(n, 0) for n in Graph.nodes()]
    })
    
    return df.sort_values('pagerank_score', ascending=False)

In [77]:
def get_personalized_pagerank(Graph, name="Graphe", keyword="lifestyle"):
    print(f"Computation of PageRank for: {name}")

    # We search for pages that contain the keyword.
    personalization = {}
    found_count = 0

    for node in Graph.nodes():

        if keyword in str(node).lower():
            personalization[node] = 100.0 # Big bonus
            found_count += 1
        else:
            personalization[node] = 1.0  # Normal weight
    
    if found_count == 0:
        print(f"{RED}No page found with the keyword '{keyword}'.{RESET}")
        return pd.DataFrame()
    else:
        print(f"{BLUE}Pages found with the keyword '{keyword}': {found_count}{RESET}")

    # Calculation of biased PageRank
    # If no page has the keyword, we perform a normal PageRank calculation
    try:
        total_weight = sum(personalization.values())
        personalization = {k: v/total_weight for k, v in personalization.items()}
        
        ppr_scores = nx.pagerank(Graph, alpha=0.85, personalization=personalization)
        
        df = pd.DataFrame(list(ppr_scores.items()), columns=['node_id', 'Personalized_PR'])
        return df.sort_values('Personalized_PR', ascending=False)
        
    except Exception as e:
        print(f"{RED}Error: {e}{RESET}")
        return pd.DataFrame()

#### Execution

In [76]:
print(f"\n{BLUE}--- Calculate PageRank (Normal and Reverse) ---{RESET}")

wiki_pagerank = get_pagerank(G_wiki, "Wikipedia")
MBG_pagerank = get_pagerank(G_MBG, "MindBodyGreen")
blogs_pagerank = get_pagerank(G_blogs, "Blogs Feedspot")

print(f"\n{GREEN}Top 5 Wikipedia:{RESET}")
print(wiki_pagerank.head(5))

print(f"\n{GREEN}Top 5 MBG:{RESET}")
print(MBG_pagerank.head(5))

print(f"\n{GREEN}Top 5 Blogs:{RESET}")
print(blogs_pagerank.head(5))


[94m--- Calculate PageRank (Normal and Reverse) ---[0m
[94mCalculation of advanced metrics for: Wikipedia...[0m

[94mCalculation of advanced metrics for: MindBodyGreen...[0m

[94mCalculation of advanced metrics for: Blogs Feedspot...[0m


[92mTop 5 Wikipedia:[0m
                                              node_id  pagerank_score  \
11  https://en.wikipedia.org/wiki/Sedentary_lifestyle        0.001212   
16      https://en.wikipedia.org/wiki/Pierre_Bourdieu        0.001197   
37         https://en.wikipedia.org/wiki/Anthropology        0.001197   
35         https://en.wikipedia.org/wiki/Georg_Simmel        0.001197   
46  https://en.wikipedia.org/wiki/Ecological_footp...        0.001197   

    reverse_pagerank_score  
11                0.000564  
16                0.000564  
37                0.000564  
35                0.000564  
46                0.000564  

[92mTop 5 MBG:[0m
                                              node_id  pagerank_score  \
0                  

In [78]:
print(f"{BLUE}--- Focus : Lifestyle ---{RESET}")

KEYWORD = "lifestyle" # We can change it easily to ‘health’, ‘food’, etc.
print(f"{BLUE}--- Custom PageRank (Keyword: '{KEYWORD}') ---{RESET}")

wiki_perso = get_personalized_pagerank(G_wiki, "Wikipedia", keyword=KEYWORD)
if not wiki_perso.empty:
    print(f"\n{GREEN}Top 5 Wiki (Focus {KEYWORD}) :{RESET}")
    print(wiki_perso.head(5))

MBG_perso = get_personalized_pagerank(G_MBG, "MBG", keyword=KEYWORD)
if not MBG_perso.empty:
    print(f"\n{GREEN}Top 5 MBG (From a lifestyle perspective):{RESET}")
    print(MBG_perso.head(5))

blogs_perso = get_personalized_pagerank(G_blogs, "Blogs", keyword=KEYWORD)
if not blogs_perso.empty:
    print(f"\n{GREEN}Top 5 Blogs (Focus {KEYWORD}) :{RESET}")
    print(blogs_perso.head(5))

[94m--- Focus : Lifestyle ---[0m
[94m--- Custom PageRank (Keyword: 'lifestyle') ---[0m
Computation of PageRank for: Wikipedia
[94mPages found with the keyword 'lifestyle': 10[0m

[92mTop 5 Wiki (Focus lifestyle) :[0m
                                              node_id  Personalized_PR
11  https://en.wikipedia.org/wiki/Sedentary_lifestyle         0.052788
32      https://en.wikipedia.org/wiki/Green_lifestyle         0.052780
10      https://en.wikipedia.org/wiki/Lifestyle_brand         0.052780
31   https://en.wikipedia.org/wiki/Lifestyle_medicine         0.052780
28    https://en.wikipedia.org/wiki/Healthy_lifestyle         0.052780
Computation of PageRank for: MBG
[94mPages found with the keyword 'lifestyle': 3[0m

[92mTop 5 MBG (From a lifestyle perspective):[0m
                                                node_id  Personalized_PR
20              https://www.mindbodygreen.com/lifestyle         0.048551
1016  https://www.mindbodygreen.com/articles/new-res...         0

## HITS

#### Implementation

In [79]:
def get_hits(Graph):
    # Calculates Hub and Authority scores
    try:
        hubs, authorities = nx.hits(Graph, max_iter=100, tol=1e-08)
        
        df = pd.DataFrame({
            'node_id': list(Graph.nodes()),
            'Hub_Score': [hubs.get(n, 0) for n in Graph.nodes()],
            'Authority_Score': [authorities.get(n, 0) for n in Graph.nodes()]
        })
        # Returns sorted by Authority by default
        return df.sort_values('Authority_Score', ascending=False)
        
    except Exception as e:
        print(f"{RED}HITS error (non-convergence): {e}{RESET}")

        return pd.DataFrame() 

#### Execution

In [80]:
print(f"\n{BLUE}--- HITS calculations ---{RESET}")

wiki_hits = get_hits(G_wiki)
MBG_hits = get_hits(G_MBG)
blogs_hits = get_hits(G_blogs)

print(f"\n{GREEN}Top 5 Wikipedia (Authorities) :{RESET}")
print(wiki_hits[['node_id', 'Authority_Score']].head(5))

print(f"\n{GREEN}Top 5 MBG (Authorities) :{RESET}")
print(MBG_hits[['node_id', 'Authority_Score']].head(5))

print(f"\n{GREEN}Top 5 Blogs (Authorities) :{RESET}")
print(blogs_hits[['node_id', 'Authority_Score']].head(5))


[94m--- HITS calculations ---[0m

[92mTop 5 Wikipedia (Authorities) :[0m
                                               node_id  Authority_Score
472               https://en.wikipedia.org/wiki/London         0.003460
399            https://en.wikipedia.org/wiki/Pollution         0.003460
300  https://en.wikipedia.org/wiki/Greenhouse_gas_e...         0.003460
46   https://en.wikipedia.org/wiki/Ecological_footp...         0.002864
47         https://en.wikipedia.org/wiki/Simple_living         0.002864

[92mTop 5 MBG (Authorities) :[0m
                                    node_id  Authority_Score
18     https://www.mindbodygreen.com/health         0.042218
17       https://www.mindbodygreen.com/food         0.042218
20  https://www.mindbodygreen.com/lifestyle         0.042218
16     https://www.mindbodygreen.com/beauty         0.042218
24     https://www.mindbodygreen.com/planet         0.042218

[92mTop 5 Blogs (Authorities) :[0m
                                       node_id  Au

## Merge

#### Implementation

In [115]:
def merge_metrics(pagerank_df, betweenness_df, hits_df): # Merges the DataFrames of the metrics into a single one

    df = pagerank_df.copy()
    
    if 'Betweenness' in betweenness_df.columns:
        df = df.join(betweenness_df[['Betweenness']], how='outer')
        
    if 'Authority_Score' in hits_df.columns:
        df = df.join(hits_df[['Authority_Score', 'Hub_Score']], how='outer')
        
    return df

#### Execution

In [97]:
print(f"\n{BLUE}--- Metrics fusion ---{RESET}")

wiki_metrics = merge_metrics(wiki_pagerank, wiki_betweenness, wiki_hits)
MBG_metrics = merge_metrics(MBG_pagerank, MBG_betweenness, MBG_hits)
blogs_metrics = merge_metrics(blogs_pagerank, blogs_betweenness, blogs_hits)

print(f"{GREEN}Metrics merge successful !{RESET}")
print(wiki_metrics.head())


[94m--- Metrics fusion ---[0m
[92mMetrics merge successful ![0m
                                                    pagerank_score  \
node_id                                                              
https://en.wikipedia.org/wiki/1300%E2%80%931400...        0.001176   
https://en.wikipedia.org/wiki/16th_century                0.001176   
https://en.wikipedia.org/wiki/1970s_fashion               0.001176   
https://en.wikipedia.org/wiki/2000s_fashion               0.001176   
https://en.wikipedia.org/wiki/2016_United_State...        0.001176   

                                                    reverse_pagerank_score  \
node_id                                                                      
https://en.wikipedia.org/wiki/1300%E2%80%931400...                0.000564   
https://en.wikipedia.org/wiki/16th_century                        0.000564   
https://en.wikipedia.org/wiki/1970s_fashion                       0.000564   
https://en.wikipedia.org/wiki/2000s_fashion       

## Analysis of inter- and intra- theme

#### Implementation

In [125]:
def analyze_themes(Graph, nodes_df, metrics_df, name, theme_col='topic'):
    # Analyzes homophily (links between similar themes) and theme‑level scores

    print(f"\n=== THEMATIC ANALYSIS: {name.upper()} ==={RESET}")

    if theme_col not in nodes_df.columns:  # Checking the theme column
        print(f"\n{RED}Column {theme_col} not found in nodes.csv for {name}{RESET}")
        print(f"\n{GREEN}Available columns: {list(nodes_df.columns)}{RESET}")
        return

    # Preparation: URL → Theme mapping
    id_col = 'node_id'
    url_to_theme = dict(zip(nodes_df[id_col], nodes_df[theme_col]))

    # Add the theme attribute to the graph
    nx.set_node_attributes(Graph, url_to_theme, name='theme')

    # Homophily calculation
    intra, inter, unknown = 0, 0, 0
    
    for u, v in Graph.edges():
        t_u = Graph.nodes[u].get('theme', 'Unknown')
        t_v = Graph.nodes[v].get('theme', 'Unknown')
        
        if t_u == 'Unknown' or t_v == 'Unknown' or pd.isna(t_u) or pd.isna(t_v):
            unknown += 1

        elif t_u == t_v:
            intra += 1

        else:
            inter += 1
            
    valid = intra + inter
    
    if valid > 0:
        print("Homophily (Tendency to cite within the same theme):")
        print(f"Intra-theme links: {intra} ({intra/valid:.1%})")
        print(f"Inter-theme links: {inter} ({inter/valid:.1%})")
    else:
        print("Not enough qualified links to compute homophily.")

    # Theme-level scores (merged with previously computed metrics)
    if metrics_df is not None and not metrics_df.empty:
        analysis = metrics_df.copy()
        analysis['theme'] = analysis.index.map(url_to_theme)
        
        # Keep only numerical columns for averaging
        cols_to_mean = ['pagerank_score', 'Betweenness', 'Authority']
        cols_existing = [c for c in cols_to_mean if c in analysis.columns]
        
        if cols_existing:
            stats = analysis.groupby('theme')[cols_existing].mean()
            print("\nAverage Influence by Theme (Top 3 PageRank):")
            print(stats.sort_values('pagerank_score', ascending=False).head(3))
            
            # Bridges (Top 5% Betweenness)
            if 'Betweenness' in analysis.columns:
                thresh = analysis['Betweenness'].quantile(0.95)
                bridges = analysis[analysis['Betweenness'] >= thresh]
                print("\nBridges (Top 5% Betweenness):")
                print(bridges[['Betweenness']].head(5))


#### Execution

In [87]:
nodes_path_topics = "data/blogs_nodes_topics.csv"

blogs_nodes_topics = pd.read_csv(nodes_path_topics)

In [126]:
try:
    analyze_themes(G_blogs, blogs_nodes_topics, blogs_metrics, "Blogs", theme_col='topic')

except Exception as e:
    print(f"{RED}Error during the loading or analysis: {e}{RESET}")


=== THEMATIC ANALYSIS: BLOGS ===[0m
Homophily (Tendency to cite within the same theme):
Intra-theme links: 2674 (69.0%)
Inter-theme links: 1203 (31.0%)

Average Influence by Theme (Top 3 PageRank):
       pagerank_score   Betweenness
theme                              
0.0          0.000255  1.882733e-06
8.0          0.000251  0.000000e+00
1.0          0.000250  1.385441e-07

Bridges (Top 5% Betweenness):
                                     Betweenness
node_id                                         
http://24diner.com/                          0.0
http://abeautifulmess.com/                   0.0
http://accessprivacy.hearstmags.com          0.0
http://alexagmarsh.blogspot.com/             0.0
http://amzn.to/1l0gzq0                       0.0


In [None]:
# Verify if the fact that the Betweenness is nul for all topic is normal
nx.betweenness_centrality(G_blogs)

{'https://40plusstyle.com/': 6.140772272803946e-06,
 'https://40plusstyle.com/arch-support-shoes/': 8.244696736236028e-06,
 'https://40plusstyle.com/beauty-tips-for-women-over-40/': 7.550004696423548e-06,
 'https://40plusstyle.com/best-online-shops/': 8.244696736236028e-06,
 'https://40plusstyle.com/category/fashion-tips-for-women-over-40/': 6.516145899339982e-06,
 'https://40plusstyle.com/category/how-to-dress-after-40/': 4.141978598442385e-06,
 'https://40plusstyle.com/category/how-to-dress-after-40/how-to-accessorize/': 7.39710701356372e-06,
 'https://40plusstyle.com/category/how-to-dress-after-40/how-to-create-a-wardrobe-your-love/': 7.736055657164534e-06,
 'https://40plusstyle.com/category/how-to-dress-after-40/how-to-dress-for-your-body-type/': 7.186409208082132e-06,
 'https://40plusstyle.com/category/how-to-dress-after-40/what-shoes-to-wear-after-40/': 1.1430901231551793e-05,
 'https://40plusstyle.com/category/how-to-dress-after-40/what-to-wear/': 6.148188137593151e-06,
 'https: