## Import statements

In [None]:
# Import required packages
import pandas as pd
import numpy as np
import random
import statistics
from scipy.stats import ttest_1samp

random.seed(123)

## Functions

In [2]:
# Calculate cosine similarities for each edge
def calculate_cosine_similarity(row, magnitudes):
    weight = row["Weight"]
    magnitude_1 = magnitudes[row["Source"]]
    magnitude_2 = magnitudes[row["Target"]]
    return weight / (magnitude_1 * magnitude_2)


In [3]:
# Normalize edges to handle undirected graphs
def normalize_edges(df):
    df["Source"], df["Target"] = zip(*df[["Source", "Target"]].apply(lambda x: sorted(x), axis=1))
    return df

In [4]:
def get_jaccard_index(col1, col2):
    numerator = 0
    denominator = 0
    for i in range(len(col1)):
        denominator += 1
        if(col1[i] != 0 and col2[i]!= 0):
            numerator += 1
    
    jaccard_similarity = numerator / denominator
    print(f"Unweighted Jaccard Similarity: {jaccard_similarity:.4f}")
    return (jaccard_similarity)

In [5]:
def get_weighted_jaccard_index(col1, col2):
    numerator = 0
    denominator = 0
    for i in range(len(col1)):
        weight_diff = abs(col1[i]-col2[i])

        denominator += weight_diff
        if(col1[i] != 0 and col2[i]!= 0):
            numerator += weight_diff
    
    jaccard_similarity = numerator / denominator
    print(f"Weighted Jaccard Similarity: {jaccard_similarity:.4f}")
    return (jaccard_similarity)

In [6]:
def generate_random_edge_weights_exp(merged, prop_zeros, avg_val):
    out = []

    for i in range(0, len(merged)):
        missingOrNot = random.random() < prop_zeros

        if(missingOrNot):
            out.append(0)
        else:
            x = np.random.exponential(avg_val)
            if 0 <= x <= 1:
                out.append(x)
            else:
                out.append(0)
    
    return out

In [7]:
def generate_random_edge_weights_unif(merged, prop_zeros):
    out = []

    for i in range(0, len(merged)):
        missingOrNot = random.random() < prop_zeros

        if(missingOrNot):
            out.append(0)
        else:
            out.append(random.random())
    
    return out

## Run analysis

In [8]:
# Import male and female networks
f_file_path = "/Users/vsriram/Desktop/GxS/ddnComp_personalComputer/ddnsForDDNComp/ssDDNneg4_femaleBlock_edgeMap.tsv"
f_edges_df = pd.read_csv(f_file_path, sep="\t").iloc[:, :3]
f_edges_df['Weight'] = pd.to_numeric(f_edges_df['Weight'], errors='coerce')

m_file_path = "/Users/vsriram/Desktop/GxS/ddnComp_personalComputer/ddnsForDDNComp/ssDDNneg4_maleBlock_edgeMap.tsv"
m_edges_df = pd.read_csv(m_file_path, sep="\t").iloc[:, :3]
m_edges_df['Weight'] = pd.to_numeric(m_edges_df['Weight'], errors='coerce')

In [9]:
# Step 1: Compute node magnitudes
node_weights_f = {}

# Accumulate the square of weights for each node
for _, row in f_edges_df.iterrows():
    node_weights_f[row["Source"]] = node_weights_f.get(row["Source"], 0) + row["Weight"]**2
    node_weights_f[row["Target"]] = node_weights_f.get(row["Target"], 0) + row["Weight"]**2

# Take the square root to get magnitudes
node_magnitudes_f = {node: np.sqrt(total) for node, total in node_weights_f.items()}


In [10]:
# Step 1: Compute node magnitudes
node_weights_m = {}

# Accumulate the square of weights for each node
for _, row in m_edges_df.iterrows():
    node_weights_m[row["Source"]] = node_weights_m.get(row["Source"], 0) + row["Weight"]**2
    node_weights_m[row["Target"]] = node_weights_m.get(row["Target"], 0) + row["Weight"]**2

# Take the square root to get magnitudes
node_magnitudes_m = {node: np.sqrt(total) for node, total in node_weights_m.items()}


In [11]:
f_edges_df["CosineSimilarity"] = f_edges_df.apply(
    calculate_cosine_similarity, axis=1, magnitudes=node_magnitudes_f
)

In [12]:
m_edges_df["CosineSimilarity"] = m_edges_df.apply(
    calculate_cosine_similarity, axis=1, magnitudes=node_magnitudes_m
)

In [13]:
normalized_edges_f = normalize_edges(f_edges_df)
normalized_edges_m = normalize_edges(m_edges_df)

In [14]:
merged = pd.merge(
    normalized_edges_f, 
    normalized_edges_m, 
    on=["Source", "Target"], how="outer", suffixes=("_F", "_M")
)

In [15]:
# Step 3: Fill missing weights with 0
merged["CosineSimilarity_F"] = merged["CosineSimilarity_F"].fillna(0)
merged["CosineSimilarity_M"] = merged["CosineSimilarity_M"].fillna(0)

In [16]:
# Step 4: Calculate min and max weights for each edge
merged["WeightDiff"] = abs(merged["CosineSimilarity_F"]-merged["CosineSimilarity_M"])

In [17]:
rand_edge_weights_f_exp = generate_random_edge_weights_exp(
    merged,
    merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
    np.mean(merged['CosineSimilarity_F'])
)

rand_edge_weights_m_exp = generate_random_edge_weights_exp(
    merged,
    merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
    np.mean(merged['CosineSimilarity_M'])
)

In [18]:
rand_edge_weights_f_unif = generate_random_edge_weights_unif(
    merged,
    merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
)

rand_edge_weights_m_unif = generate_random_edge_weights_unif(
    merged,
    merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
)

In [19]:
get_weighted_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M'])

Weighted Jaccard Similarity: 0.0584


0.058381935697765314

In [20]:
get_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M'])

Unweighted Jaccard Similarity: 0.2639


0.2638888888888889

In [21]:
weighted_jaccards_exp = []
weighted_jaccards_unif = []
unweighted_jaccards_exp = []
unweighted_jaccards_unif = []

for i in range(100):
    rand_edge_weights_g1_exp = generate_random_edge_weights_exp(
        merged,
        merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
        np.mean(merged['CosineSimilarity_F'])
    )

    rand_edge_weights_g2_exp = generate_random_edge_weights_exp(
        merged,
        merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
        np.mean(merged['CosineSimilarity_M'])
    )

    rand_edge_weights_g1_unif = generate_random_edge_weights_unif(
        merged,
        merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
    )

    rand_edge_weights_g2_unif = generate_random_edge_weights_unif(
        merged,
        merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
    )

    weighted_jaccard_exp = get_weighted_jaccard_index(rand_edge_weights_g1_exp, rand_edge_weights_g2_exp)
    weighted_jaccards_exp.append(weighted_jaccard_exp)

    weighted_jaccard_unif = get_weighted_jaccard_index(rand_edge_weights_g1_unif, rand_edge_weights_g2_unif)
    weighted_jaccards_unif.append(weighted_jaccard_unif)

    unweighted_jaccard_exp = get_jaccard_index(rand_edge_weights_g1_exp, rand_edge_weights_g2_exp)
    unweighted_jaccards_exp.append(unweighted_jaccard_exp)

    unweighted_jaccard_unif = get_jaccard_index(rand_edge_weights_g1_unif, rand_edge_weights_g2_unif)
    unweighted_jaccards_unif.append(unweighted_jaccard_unif)


Weighted Jaccard Similarity: 0.5199
Weighted Jaccard Similarity: 0.3849
Unweighted Jaccard Similarity: 0.4226
Unweighted Jaccard Similarity: 0.4038
Weighted Jaccard Similarity: 0.4611
Weighted Jaccard Similarity: 0.3297
Unweighted Jaccard Similarity: 0.3700
Unweighted Jaccard Similarity: 0.3750
Weighted Jaccard Similarity: 0.5301
Weighted Jaccard Similarity: 0.3778
Unweighted Jaccard Similarity: 0.4067
Unweighted Jaccard Similarity: 0.4157
Weighted Jaccard Similarity: 0.5103
Weighted Jaccard Similarity: 0.3501
Unweighted Jaccard Similarity: 0.4058
Unweighted Jaccard Similarity: 0.3889
Weighted Jaccard Similarity: 0.5231
Weighted Jaccard Similarity: 0.3946
Unweighted Jaccard Similarity: 0.4067
Unweighted Jaccard Similarity: 0.4018
Weighted Jaccard Similarity: 0.4739
Weighted Jaccard Similarity: 0.3460
Unweighted Jaccard Similarity: 0.4157
Unweighted Jaccard Similarity: 0.3889
Weighted Jaccard Similarity: 0.4777
Weighted Jaccard Similarity: 0.3715
Unweighted Jaccard Similarity: 0.3690
Un

In [28]:
np.mean(weighted_jaccards_unif)


0.36739092870587725

In [29]:
np.std(weighted_jaccards_unif)

0.0188933994804983

In [28]:
# Function to perform a t-test; is the difference between the male and female network
#  significantly different from differences in a random population?
def perform_statistical_analysis(original_value, random_values):
    t_stat, p_value = ttest_1samp(random_values, original_value)
    
    return {
        "Jaccard Index": original_value,
        "t_stat": t_stat,
        "p_value": p_value,
    }

In [29]:
perform_statistical_analysis(
    get_weighted_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    weighted_jaccards_exp
)

Weighted Jaccard Similarity: 0.0584


{'Jaccard Index': 0.058381935697765314,
 't_stat': 171.10498503401428,
 'p_value': 3.327448288146901e-124}

In [30]:
perform_statistical_analysis(
    get_weighted_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    weighted_jaccards_unif
)

Weighted Jaccard Similarity: 0.0584


{'Jaccard Index': 0.058381935697765314,
 't_stat': 162.7341158553271,
 'p_value': 4.689910711150324e-122}

In [31]:
perform_statistical_analysis(
    get_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    unweighted_jaccards_exp
)

Unweighted Jaccard Similarity: 0.2639


{'Jaccard Index': 0.2638888888888889,
 't_stat': 94.92443577413574,
 'p_value': 4.942688807593054e-99}

In [32]:
perform_statistical_analysis(
    get_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    unweighted_jaccards_unif
)

Unweighted Jaccard Similarity: 0.2639


{'Jaccard Index': 0.2638888888888889,
 't_stat': 81.43424299605815,
 'p_value': 1.5909772451429588e-92}