## Import statements

In [1]:
# Import required packages
import pandas as pd
import numpy as np
import random
import statistics
from scipy.stats import ttest_1samp

random.seed(123)

## Functions

In [2]:
# Calculate cosine similarities for each edge
def calculate_cosine_similarity(row, magnitudes):
    weight = row["Weight"]
    magnitude_1 = magnitudes[row["Source"]]
    magnitude_2 = magnitudes[row["Target"]]
    return weight / (magnitude_1 * magnitude_2)


In [3]:
# Normalize edges to handle undirected graphs
def normalize_edges(df):
    df["Source"], df["Target"] = zip(*df[["Source", "Target"]].apply(lambda x: sorted(x), axis=1))
    return df

In [4]:
def get_jaccard_index(col1, col2):
    numerator = 0
    denominator = 0
    for i in range(len(col1)):
        denominator += 1
        if(col1[i] != 0 and col2[i]!= 0):
            numerator += 1
    
    jaccard_similarity = numerator / denominator
    print(f"Unweighted Jaccard Similarity: {jaccard_similarity:.4f}")
    return (jaccard_similarity)

In [5]:
def get_weighted_jaccard_index(col1, col2):
    numerator = 0
    denominator = 0
    for i in range(len(col1)):
        weight_diff = abs(col1[i]-col2[i])

        denominator += weight_diff
        if(col1[i] != 0 and col2[i]!= 0):
            numerator += weight_diff
    
    jaccard_similarity = 1-(numerator / denominator)
    print(f"Weighted Jaccard Similarity: {jaccard_similarity:.4f}")
    return (jaccard_similarity)

In [6]:
def generate_random_edge_weights_exp(merged, prop_zeros, avg_val):
    out = []

    for i in range(0, len(merged)):
        missingOrNot = random.random() < prop_zeros

        if(missingOrNot):
            out.append(0)
        else:
            x = np.random.exponential(avg_val)
            if 0 <= x <= 1:
                out.append(x)
            else:
                out.append(0)
    
    return out

In [7]:
def generate_random_edge_weights_unif(merged, prop_zeros):
    out = []

    for i in range(0, len(merged)):
        missingOrNot = random.random() < prop_zeros

        if(missingOrNot):
            out.append(0)
        else:
            out.append(random.random())
    
    return out

## Run analysis

In [8]:
# Import male and female networks
f_file_path = "/Users/vsriram/Desktop/GxS/ddnComp_personalComputer/ddnsForDDNComp/ssDDNneg4_femaleBlock_edgeMap.tsv"
f_edges_df = pd.read_csv(f_file_path, sep="\t").iloc[:, :3]
f_edges_df['Weight'] = pd.to_numeric(f_edges_df['Weight'], errors='coerce')

m_file_path = "/Users/vsriram/Desktop/GxS/ddnComp_personalComputer/ddnsForDDNComp/ssDDNneg4_maleBlock_edgeMap.tsv"
m_edges_df = pd.read_csv(m_file_path, sep="\t").iloc[:, :3]
m_edges_df['Weight'] = pd.to_numeric(m_edges_df['Weight'], errors='coerce')

In [12]:
#np.median(m_edges_df['Weight'])
f_edges_df = f_edges_df[f_edges_df['Weight'] >= 5]
m_edges_df = m_edges_df[m_edges_df['Weight'] >= 8]

In [13]:
# Step 1: Compute node magnitudes
node_weights_f = {}

# Accumulate the square of weights for each node
for _, row in f_edges_df.iterrows():
    node_weights_f[row["Source"]] = node_weights_f.get(row["Source"], 0) + row["Weight"]**2
    node_weights_f[row["Target"]] = node_weights_f.get(row["Target"], 0) + row["Weight"]**2

# Take the square root to get magnitudes
node_magnitudes_f = {node: np.sqrt(total) for node, total in node_weights_f.items()}


In [14]:
# Step 1: Compute node magnitudes
node_weights_m = {}

# Accumulate the square of weights for each node
for _, row in m_edges_df.iterrows():
    node_weights_m[row["Source"]] = node_weights_m.get(row["Source"], 0) + row["Weight"]**2
    node_weights_m[row["Target"]] = node_weights_m.get(row["Target"], 0) + row["Weight"]**2

# Take the square root to get magnitudes
node_magnitudes_m = {node: np.sqrt(total) for node, total in node_weights_m.items()}


In [15]:
f_edges_df["CosineSimilarity"] = f_edges_df.apply(
    calculate_cosine_similarity, axis=1, magnitudes=node_magnitudes_f
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_edges_df["CosineSimilarity"] = f_edges_df.apply(


In [16]:
m_edges_df["CosineSimilarity"] = m_edges_df.apply(
    calculate_cosine_similarity, axis=1, magnitudes=node_magnitudes_m
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m_edges_df["CosineSimilarity"] = m_edges_df.apply(


In [17]:
normalized_edges_f = normalize_edges(f_edges_df)
normalized_edges_m = normalize_edges(m_edges_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Source"], df["Target"] = zip(*df[["Source", "Target"]].apply(lambda x: sorted(x), axis=1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Source"], df["Target"] = zip(*df[["Source", "Target"]].apply(lambda x: sorted(x), axis=1))


In [18]:
merged = pd.merge(
    normalized_edges_f, 
    normalized_edges_m, 
    on=["Source", "Target"], how="outer", suffixes=("_F", "_M")
)

In [19]:
# Step 3: Fill missing weights with 0
merged["CosineSimilarity_F"] = merged["CosineSimilarity_F"].fillna(0)
merged["CosineSimilarity_M"] = merged["CosineSimilarity_M"].fillna(0)

In [20]:
# Step 4: Calculate min and max weights for each edge
merged["WeightDiff"] = abs(merged["CosineSimilarity_F"]-merged["CosineSimilarity_M"])

In [21]:
rand_edge_weights_f_exp = generate_random_edge_weights_exp(
    merged,
    merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
    np.mean(merged['CosineSimilarity_F'])
)

rand_edge_weights_m_exp = generate_random_edge_weights_exp(
    merged,
    merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
    np.mean(merged['CosineSimilarity_M'])
)

In [22]:
rand_edge_weights_f_unif = generate_random_edge_weights_unif(
    merged,
    merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
)

rand_edge_weights_m_unif = generate_random_edge_weights_unif(
    merged,
    merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
)

In [23]:
get_weighted_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M'])

Weighted Jaccard Similarity: 0.8575


0.857530970196437

In [24]:
get_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M'])

Unweighted Jaccard Similarity: 0.3021


0.30214424951267055

In [25]:
weighted_jaccards_exp = []
weighted_jaccards_unif = []
unweighted_jaccards_exp = []
unweighted_jaccards_unif = []

for i in range(100):
    rand_edge_weights_g1_exp = generate_random_edge_weights_exp(
        merged,
        merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
        np.mean(merged['CosineSimilarity_F'])
    )

    rand_edge_weights_g2_exp = generate_random_edge_weights_exp(
        merged,
        merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
        np.mean(merged['CosineSimilarity_M'])
    )

    rand_edge_weights_g1_unif = generate_random_edge_weights_unif(
        merged,
        merged['CosineSimilarity_F'].value_counts().get(0, 0)/len(merged), 
    )

    rand_edge_weights_g2_unif = generate_random_edge_weights_unif(
        merged,
        merged['CosineSimilarity_M'].value_counts().get(0, 0)/len(merged), 
    )

    weighted_jaccard_exp = get_weighted_jaccard_index(rand_edge_weights_g1_exp, rand_edge_weights_g2_exp)
    weighted_jaccards_exp.append(weighted_jaccard_exp)

    weighted_jaccard_unif = get_weighted_jaccard_index(rand_edge_weights_g1_unif, rand_edge_weights_g2_unif)
    weighted_jaccards_unif.append(weighted_jaccard_unif)

    unweighted_jaccard_exp = get_jaccard_index(rand_edge_weights_g1_exp, rand_edge_weights_g2_exp)
    unweighted_jaccards_exp.append(unweighted_jaccard_exp)

    unweighted_jaccard_unif = get_jaccard_index(rand_edge_weights_g1_unif, rand_edge_weights_g2_unif)
    unweighted_jaccards_unif.append(unweighted_jaccard_unif)


Weighted Jaccard Similarity: 0.4745
Weighted Jaccard Similarity: 0.5479
Unweighted Jaccard Similarity: 0.4308
Unweighted Jaccard Similarity: 0.4815
Weighted Jaccard Similarity: 0.4689
Weighted Jaccard Similarity: 0.5941
Unweighted Jaccard Similarity: 0.4639
Unweighted Jaccard Similarity: 0.4620
Weighted Jaccard Similarity: 0.5088
Weighted Jaccard Similarity: 0.6182
Unweighted Jaccard Similarity: 0.3879
Unweighted Jaccard Similarity: 0.4016
Weighted Jaccard Similarity: 0.5506
Weighted Jaccard Similarity: 0.6335
Unweighted Jaccard Similarity: 0.3977
Unweighted Jaccard Similarity: 0.4113
Weighted Jaccard Similarity: 0.5093
Weighted Jaccard Similarity: 0.6391
Unweighted Jaccard Similarity: 0.4250
Unweighted Jaccard Similarity: 0.3899
Weighted Jaccard Similarity: 0.5018
Weighted Jaccard Similarity: 0.6267
Unweighted Jaccard Similarity: 0.4678
Unweighted Jaccard Similarity: 0.4094
Weighted Jaccard Similarity: 0.4928
Weighted Jaccard Similarity: 0.6347
Unweighted Jaccard Similarity: 0.4094
Un

In [35]:
np.mean(unweighted_jaccards_unif)


0.4239766081871344

In [None]:
np.std(unweighted_jaccards_unif)

0.019601221044313458

In [28]:
# Function to perform a t-test; is the difference between the male and female network
#  significantly different from differences in a random population?
def perform_statistical_analysis(original_value, random_values):
    t_stat, p_value = ttest_1samp(random_values, original_value)
    
    return {
        "Jaccard Index": original_value,
        "t_stat": t_stat,
        "p_value": p_value,
    }

In [29]:
perform_statistical_analysis(
    get_weighted_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    weighted_jaccards_exp
)

Weighted Jaccard Similarity: 0.8575


{'Jaccard Index': 0.857530970196437,
 't_stat': -110.55775081347778,
 'p_value': 1.5846625282082648e-105}

In [30]:
perform_statistical_analysis(
    get_weighted_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    weighted_jaccards_unif
)

Weighted Jaccard Similarity: 0.8575


{'Jaccard Index': 0.857530970196437,
 't_stat': -85.12190439462705,
 'p_value': 2.1087591484463154e-94}

In [31]:
perform_statistical_analysis(
    get_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    unweighted_jaccards_exp
)

Unweighted Jaccard Similarity: 0.3021


{'Jaccard Index': 0.30214424951267055,
 't_stat': 62.556378315865835,
 'p_value': 2.110127150773194e-81}

In [32]:
perform_statistical_analysis(
    get_jaccard_index(merged['CosineSimilarity_F'], merged['CosineSimilarity_M']),
    unweighted_jaccards_unif
)

Unweighted Jaccard Similarity: 0.3021


{'Jaccard Index': 0.30214424951267055,
 't_stat': 52.934929218770776,
 'p_value': 1.983591118841982e-74}