In [1]:
import Augusta

In [2]:
## Reference network produced by the actual Augusta package.
# The call's result is indexed with [0]; that first element is kept as GRN.
# NOTE(review): the recorded output reports that count-table normalization was
# skipped because no GenBank file was supplied, so normalization_type='TPM'
# had no effect in this run.
augusta_result = Augusta.RNASeq_to_GRN(
    count_table_input='../data/Ecoli_DREAM4.csv',
    promoter_length=1000,
    normalization_type='TPM',
    motifs_max_time=180,
)
GRN = augusta_result[0]

Count table uploaded.
Count table normalization not available - GenBank missing; skipped.
Mutual information computation...
Mutual information computation done.
GRN stored as "GRN.csv".


In [3]:
# Display the network returned by Augusta: a gene-by-gene table whose visible
# entries are -1, 0 or 1.
GRN

Unnamed: 0,BW25113_0564,BW25113_0979,BW25113_0978,BW25113_0972,BW25113_4401,BW25113_0313,BW25113_0039,BW25113_0734,BW25113_4122,BW25113_3516,...,BW25113_1130,BW25113_1608,BW25113_2217,BW25113_2062,BW25113_2060,BW25113_1013,BW25113_1285,BW25113_4063,BW25113_4062,BW25113_0995
,,,,,,,,,,,,,,,,,,,,,
BW25113_0564,0,1,0,0,0,0,-1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
BW25113_0979,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,0,0,0,0,0,0,0
BW25113_0978,-1,1,0,0,0,0,-1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
BW25113_0972,1,1,-1,0,-1,-1,-1,1,-1,0,...,-1,1,1,0,-1,1,-1,1,1,-1
BW25113_4401,-1,-1,1,0,0,0,1,0,0,0,...,1,1,-1,0,-1,-1,0,1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BW25113_1013,1,-1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,-1
BW25113_1285,1,-1,1,0,1,0,1,-1,0,0,...,1,1,-1,0,-1,-1,0,1,1,-1
BW25113_4063,1,-1,-1,0,0,0,1,0,0,0,...,1,1,-1,0,0,1,0,0,0,-1


In [4]:
import numpy as np
import pandas as pd

# Read the semicolon-delimited expression table (first column becomes the row
# index) and force every value to float in one chained expression.
df = pd.read_csv('../data/Ecoli_DREAM4.csv', sep=';', index_col=0).astype(float)

# Number of bins D as per Eq. (4): floor(sqrt(n_genes / 5)), capped at 10 and
# never smaller than 1.
n_genes, n_timepoints = df.shape
D = max(1, min(10, int(np.floor(np.sqrt(n_genes / 5)))))

# Rank-based discretisation, done for all rows at once.
# A double argsort along the time axis yields each value's ordinal rank
# (0 .. n_timepoints - 1) within its own row; ranks are then scaled to bin
# indices 0 .. D - 1 and clamped to the last bin.
data = df.values
ranks = data.argsort(axis=1).argsort(axis=1)
binned = np.minimum(np.floor(ranks * D / n_timepoints).astype(int), D - 1)

# Define a custom mutual information function to avoid log(0)
def mutual_information(x, y, num_bins):
    """
    Mutual information (in nats) between two discretised sequences.

    Parameters
    ----------
    x, y : 1-D sequences of non-negative integers
        Bin indices, expected to lie in ``0 .. num_bins - 1``. Both must have
        the same length; sample ``k`` of ``x`` is paired with sample ``k`` of ``y``.
    num_bins : int
        Number of bins; the joint histogram is ``num_bins x num_bins``.

    Returns
    -------
    float
        Sum over non-empty joint cells of ``p(a,b) * log(p(a,b) / (p(a) p(b)))``.
        Empty input returns ``0.0``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` are not 1-D sequences of equal length. Without this
        check a longer ``y`` would silently produce wrong marginals.
    IndexError
        If a bin index is ``>= num_bins``.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError(
            f"x and y must be 1-D sequences of equal length, got shapes {x.shape} and {y.shape}"
        )

    N = x.size
    if N == 0:
        # No samples: nothing to estimate, and dividing by N would only yield NaNs.
        return 0.0

    # Marginal distributions.
    p_x = np.bincount(x, minlength=num_bins) / N
    p_y = np.bincount(y, minlength=num_bins) / N

    # Joint histogram; np.add.at accumulates correctly for repeated (x, y) pairs.
    joint = np.zeros((num_bins, num_bins), dtype=float)
    np.add.at(joint, (x, y), 1)
    p_xy = joint / N

    # Only cells with positive probability contribute, which avoids log(0).
    # argwhere walks the cells in row-major order, i.e. the same order as a
    # nested a/b loop, so the floating-point sum is unchanged.
    mi = 0.0
    for a, b in np.argwhere(p_xy > 0):
        mi += p_xy[a, b] * np.log(p_xy[a, b] / (p_x[a] * p_y[b]))
    return mi

# Compute differences between adjacent time points
diffs = np.diff(data, axis=1)  # shape: (n_genes, n_timepoints-1)

# Index of each gene's largest absolute change, computed once for all genes
# instead of once per gene pair inside the loop below.
if diffs.shape[1]:
    msd_pos = np.argmax(np.abs(diffs), axis=1)
else:
    # A single time point has no differences; treating every gene as tied
    # yields no edges.
    msd_pos = np.zeros(n_genes, dtype=int)

# Initialize adjacency matrix
genes = df.index.tolist()
adj = np.zeros((n_genes, n_genes), dtype=int)

# Infer edges using custom MI and time-lag sign
for i in range(n_genes):
    ix = msd_pos[i]
    for j in range(i + 1, n_genes):
        iy = msd_pos[j]
        if ix == iy:
            continue  # ties are skipped; no need to compute MI for them
        if mutual_information(binned[i], binned[j], D) <= 0:
            continue  # no dependency detected
        # The gene whose largest change comes first is the source; the edge sign
        # is the sign of the target's largest change (a zero change maps to -1).
        if ix < iy:
            adj[i, j] = 1 if diffs[j, iy] > 0 else -1
        else:
            adj[j, i] = 1 if diffs[i, ix] > 0 else -1

# Create a DataFrame for user-friendly display
adj_df = pd.DataFrame(adj, index=genes, columns=genes)


In [5]:
import numpy as np
from sklearn.metrics import jaccard_score, precision_score, recall_score, f1_score

def compare_networks(df1, df2, verbose=True):
    """
    Compare two signed GRN adjacency matrices (values -1,0,1).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Adjacency matrices of identical shape. When both frames carry the same
        set of unique row labels and the same set of unique column labels,
        ``df2`` is reordered to ``df1``'s label order before comparison, so a
        different gene ordering does not distort the metrics. Otherwise the
        frames are compared position by position.
    verbose : bool, default True
        Print the metrics for each edge class.

    Returns
    -------
    dict
        ``{'any' | 'positive' | 'negative': {'jaccard', 'precision', 'recall', 'f1'}}``.
        ``df1`` is passed as the first argument of each scikit-learn metric
        (``y_true`` in their signatures), so precision and recall treat ``df1``
        as the reference and swap if the arguments are swapped.

    Raises
    ------
    ValueError
        If the two frames differ in shape.
    """
    if df1.shape != df2.shape:
        raise ValueError(
            f"Adjacency matrices must have the same shape, got {df1.shape} and {df2.shape}"
        )

    # Align df2 to df1 by label when that is unambiguous; flattening compares
    # purely by position, which is only meaningful if the gene order matches.
    labels_match = (
        df1.index.is_unique and df1.columns.is_unique
        and df2.index.is_unique and df2.columns.is_unique
        and set(df1.index) == set(df2.index)
        and set(df1.columns) == set(df2.columns)
    )
    if labels_match:
        df2 = df2.reindex(index=df1.index, columns=df1.columns)

    # flatten to 1D arrays
    y1 = df1.values.flatten()
    y2 = df2.values.flatten()

    metrics = {}
    # Each edge class is scored as a binary problem over all matrix cells.
    for label, mask1, mask2 in [
        ('any',    y1 != 0,    y2 != 0),
        ('positive', y1 == 1,    y2 == 1),
        ('negative', y1 == -1,   y2 == -1),
    ]:
        jac   = jaccard_score(mask1, mask2)
        prec  = precision_score(mask1, mask2)
        rec   = recall_score(mask1, mask2)
        f1    = f1_score(mask1, mask2)
        metrics[label] = {
            'jaccard':  jac,
            'precision': prec,
            'recall':    rec,
            'f1':        f1
        }
        if verbose:
            print(f"=== {label.capitalize()}-edge metrics ===")
            print(f"Jaccard:   {jac:.3f}")
            print(f"Precision: {prec:.3f}")
            print(f"Recall:    {rec:.3f}")
            print(f"F1 score:  {f1:.3f}\n")

    return metrics

# Example usage:
# metrics = compare_networks(grn_naive, grn_augusta)


In [6]:
# Compare the hand-written MI / time-lag network (adj_df) with the Augusta output (GRN).
# adj_df is passed as df1, i.e. as the first argument of each scikit-learn metric call
# inside compare_networks; the later comparison in this notebook passes GRN first instead.
compare_networks(adj_df, GRN)

=== Any-edge metrics ===
Jaccard:   1.000
Precision: 1.000
Recall:    1.000
F1 score:  1.000

=== Positive-edge metrics ===
Jaccard:   0.331
Precision: 0.508
Recall:    0.487
F1 score:  0.497

=== Negative-edge metrics ===
Jaccard:   0.297
Precision: 0.448
Recall:    0.469
F1 score:  0.458



{'any': {'jaccard': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
 'positive': {'jaccard': 0.330999719966396,
  'precision': 0.5081685296646603,
  'recall': 0.48702101359703337,
  'f1': 0.49737008205343985},
 'negative': {'jaccard': 0.2973529411764706,
  'precision': 0.44813829787234044,
  'recall': 0.4691415313225058,
  'f1': 0.4583994559056903}}

# Version that was corrected with gitingest

In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score

# Load E. coli DREAM4 time-series expression data
# (semicolon-delimited; the first column becomes the row index).
df = pd.read_csv('../data/Ecoli_DREAM4.csv', index_col=0, sep=';').astype(float)

# Compute expression differences (time adjacent)
data = df.values
diffs = np.diff(data, axis=1)  # shape: [n_genes, n_timepoints-1]

# Identify most significant difference (MSD) position for each gene:
# the index of the largest absolute change between adjacent time points.
max_expr_pos = np.argmax(np.abs(diffs), axis=1)

# Determine number of bins (Eq. 4): floor(sqrt(n_genes / 5)), capped at 10.
n_genes = df.shape[0]
bins = min(int(np.floor(np.sqrt(n_genes / 5))), 10)
# Fewer than 5 genes would give 0 bins, which np.histogram2d rejects;
# keep at least one bin, as the earlier rank-binning cell does.
bins = max(bins, 1)

# Initialize MI matrix
genes = df.index.tolist()
MI_matrix = pd.DataFrame(np.zeros((n_genes, n_genes)), index=genes, columns=genes)

# Compute pairwise MI only for pairs with distinct MSD positions.
# Each unordered pair is visited once (j > i): the (src, tgt) orientation
# depends only on the MSD positions, so visiting (j, i) as well would just
# recompute the same cell. Values are accumulated in a NumPy array and wrapped
# in a DataFrame at the end to avoid per-element pandas indexing.
expr_values = df.to_numpy()
mi_values = np.zeros((n_genes, n_genes))

for i in range(n_genes):
    for j in range(i + 1, n_genes):
        if max_expr_pos[i] == max_expr_pos[j]:
            continue
        # Determine direction: the gene whose MSD occurs earlier is the source
        if max_expr_pos[i] < max_expr_pos[j]:
            src, tgt = i, j
        else:
            src, tgt = j, i
        # Joint histogram (equal-width bins)
        bin_xy = np.histogram2d(expr_values[src], expr_values[tgt], bins)[0]
        # MI from contingency table
        mi_values[src, tgt] = mutual_info_score(None, None, contingency=bin_xy)

# Keep the labels of the pre-initialised matrix.
MI_matrix = pd.DataFrame(mi_values, index=MI_matrix.index, columns=MI_matrix.columns)

# Threshold MI>0 to get binary skeleton
skeleton = MI_matrix.to_numpy() > 0
signed = skeleton.astype(int)

# Assign sign based on MSD timepoint and difference signs.
# Each target column is processed at once instead of cell by cell.
for tgt in range(n_genes):
    pos_t = max_expr_pos[tgt]
    # Skip genes whose peak change is at the first interval:
    # there is no preceding interval to read the source's change from.
    if pos_t == 0:
        signed[:, tgt] = 0
        continue
    expr_t = diffs[tgt, pos_t]       # target's change at its own MSD
    expr_s = diffs[:, pos_t - 1]     # every gene's change one interval earlier
    # Flip to -1 if source and target changes have opposite signs
    # (a zero change on either side leaves the edge at +1).
    opposite = ((expr_s > 0) & (expr_t < 0)) | ((expr_s < 0) & (expr_t > 0))
    signed[skeleton[:, tgt] & opposite, tgt] = -1

GRN_gpt = pd.DataFrame(signed, index=MI_matrix.index, columns=MI_matrix.columns)



In [8]:
# Compare the Augusta output (GRN, passed as df1) with the re-implementation
# built in the previous cell (GRN_gpt); the recorded output shows every metric at 1.000.
compare_networks(GRN, GRN_gpt)

=== Any-edge metrics ===
Jaccard:   1.000
Precision: 1.000
Recall:    1.000
F1 score:  1.000

=== Positive-edge metrics ===
Jaccard:   1.000
Precision: 1.000
Recall:    1.000
F1 score:  1.000

=== Negative-edge metrics ===
Jaccard:   1.000
Precision: 1.000
Recall:    1.000
F1 score:  1.000



{'any': {'jaccard': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
 'positive': {'jaccard': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
 'negative': {'jaccard': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}}