In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests
from scipy.stats import fisher_exact, ttest_ind, spearmanr
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
def load_expression():
    # Define parameters for the negative binomial distribution
    n = 10  # Number of successes
    p = 0.5  # Probability of success
    
    # Generate data
    time_points = [f't{i}' for i in range(1, 8)]
    genes = [f'g{i}' for i in range(1, 21)] + [f'tf{i}' for i in range(1, 8)]
    samples = [f'p{j}' for j in range(1, 11)]

    data_frames = []

    for time_point in time_points:
        # Generate data for p1-p4 columns
        p1_p4_data = np.random.negative_binomial(n, p, (27, 4))
        
        # Generate data for p5-p10 columns such that p1 is the average
        p5_p10_data = []
        for row in p1_p4_data:
            p1_value = row[0]  # Take p1 value
            remaining_columns = np.random.randint(0, 7, size=(1, 6))  # Small random values to avoid zeros
            adjustment = (p1_value - remaining_columns.mean(axis=1)).astype(int)
            p5_p10_row = remaining_columns + adjustment[:, None]
            p5_p10_data.append(p5_p10_row.flatten())
        
        p5_p10_data = np.array(p5_p10_data)
        
        # Combine all columns
        data = np.column_stack((p1_p4_data, p5_p10_data))
        
        df = pd.DataFrame(data, index=genes, columns=samples)
        df['time'] = time_point
        data_frames.append(df)

    # Concatenate all data frames
    expression_data = pd.concat(data_frames)

    return expression_data

def load_tf_network():
    # Set random seed for reproducibility
    np.random.seed(42)

    # Define genes and TFs
    genes = [f'g{i}' for i in range(1, 21)]
    tfs = [f'tf{j}' for j in range(1, 8)]

    # Generate gene list with specific occurrence requirements
    gene_occurrences = genes + np.random.choice(genes, 4, replace=False).tolist() + [np.random.choice(genes)]
    np.random.shuffle(gene_occurrences)

    # Ensure the lengths match the requirement
    assert len(gene_occurrences) == 25

    # Generate random TF-target pairs
    tf_targets = {
        'TF': np.random.choice(tfs, len(gene_occurrences)),
        'Target': gene_occurrences
    }

    # Create DataFrame
    tf_network_df = pd.DataFrame(tf_targets)
    tf_network = tf_network_df.assign(Target_numeric=tf_network_df['Target'].str.extract(r'(\d+)').astype(int)).sort_values(by=['TF', 'Target_numeric']).drop(columns=['Target_numeric']).reset_index(drop=True)
    
    return tf_network

def load_target_genes():
    target_genes = ['g2','g4','g6','g8']    
    return target_genes

expression = load_expression()
tf_network = load_tf_network()
target_genes = load_target_genes()

In [3]:
expression

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,time
g1,7,7,15,14,4,4,6,6,10,7,t1
g2,5,7,11,8,6,3,7,7,2,2,t1
g3,12,19,19,8,10,11,13,12,9,15,t1
g4,12,9,8,12,13,12,11,10,11,10,t1
g5,12,5,6,6,15,10,11,13,9,11,t1
...,...,...,...,...,...,...,...,...,...,...,...
tf3,7,18,7,11,10,5,6,6,8,7,t7
tf4,11,5,5,10,10,12,14,9,10,11,t7
tf5,7,5,13,4,9,5,7,3,8,5,t7
tf6,12,7,2,12,14,11,10,10,10,12,t7


In [4]:
tf_network

Unnamed: 0,TF,Target
0,tf1,g2
1,tf1,g5
2,tf1,g16
3,tf2,g1
4,tf2,g4
5,tf2,g8
6,tf2,g11
7,tf2,g12
8,tf2,g13
9,tf3,g10


In [5]:
target_genes

['g2', 'g4', 'g6', 'g8']

In [6]:
def identify_degs(expression_data):
    time_points = expression_data['time'].unique()
    initial_time_point = expression_data[expression_data['time'] == 't1']
    
    deg_sets = {}
    differential_expression_levels = {}

    for time_point in time_points:
        if time_point == 't1':
            continue
        current_time_point = expression_data[expression_data['time'] == time_point]
        p_values = []
        deg_list = []
        diff_expression = {}

        for gene in expression_data.index.unique():
            initial_values = initial_time_point.loc[gene].values[:-1] 
            current_values = current_time_point.loc[gene].values[:-1]
            t_stat, p_val = ttest_ind(initial_values, current_values)
            p_values.append(p_val)
            diff_expression[gene] = np.mean(current_values) - np.mean(initial_values)
        
        # Multiple testing correction
        corrected_p_values = multipletests(p_values, alpha=0.05, method='fdr_bh')[1]
        
        deg_sets[time_point] = [expression_data.index.unique()[i] for i in range(len(corrected_p_values)) if corrected_p_values[i] < 0.05]
        differential_expression_levels[time_point] = pd.DataFrame.from_dict(diff_expression, orient='index', columns=['differential_expression'])

    return deg_sets, differential_expression_levels

deg_sets, de_levels = identify_degs(expression)

In [7]:
deg_sets

{'t2': ['g1', 'g4', 'g13', 'g16', 'g17', 'tf2', 'tf3', 'tf6'],
 't3': ['g1',
  'g2',
  'g4',
  'g6',
  'g7',
  'g9',
  'g11',
  'g17',
  'g18',
  'g19',
  'tf2',
  'tf3',
  'tf4',
  'tf5',
  'tf6'],
 't4': ['g1',
  'g2',
  'g4',
  'g6',
  'g7',
  'g8',
  'g10',
  'g11',
  'g13',
  'g16',
  'g17',
  'g18',
  'g19',
  'g20',
  'tf2',
  'tf6'],
 't5': ['g1',
  'g2',
  'g3',
  'g4',
  'g10',
  'g11',
  'g12',
  'g13',
  'g16',
  'g18',
  'g19',
  'g20',
  'tf2'],
 't6': ['g4', 'g5', 'g8', 'g10', 'g12', 'g14', 'g16', 'g18', 'g19', 'tf1'],
 't7': ['g1',
  'g2',
  'g3',
  'g4',
  'g5',
  'g10',
  'g16',
  'g18',
  'g19',
  'tf3',
  'tf6',
  'tf7']}

In [8]:
# Construct time-specific networks
def construct_time_specific_networks(tf_network, deg_sets, target_genes):
    time_specific_networks = {}
    
    for time_point, deg_set in deg_sets.items():
        # Intersected gene set
        vj = (set(deg_set) & set(target_genes)) | set(tf_network['TF'])
        
        # Construct network
        edges = []
        for _, row in tf_network.iterrows():
            if row['TF'] in vj and row['Target'] in vj:
                edges.append((row['TF'], row['Target']))
        
        time_specific_networks[time_point] = {
            'nodes': list(vj),
            'edges': edges
        }
    
    return time_specific_networks

# Main pipeline function
def propa_net_pipeline():
    expression = load_expression()
    tf_network = load_tf_network()
    target_genes = load_target_genes()
    deg_sets, de_levels = identify_degs(expression)
    time_specific_networks = construct_time_specific_networks(tf_network, deg_sets, target_genes)
    
    # Populate the dictionaries
    time_specific_nodes = {}
    time_specific_graphs = {}
    
    for key, value in time_specific_networks.items():
        time_specific_nodes[key] = value['nodes']
    
        # Create DataFrame for edges
        df_edges = pd.DataFrame(value['edges'], columns=['node1', 'node2'])
        time_specific_graphs[key] = df_edges
        
    return time_specific_nodes, time_specific_graphs

time_specific_nodes, time_specific_graphs = propa_net_pipeline()

In [11]:
time_specific_nodes

{'t2': ['tf7', 'tf4', 'tf6', 'tf3', 'tf2', 'tf1', 'g8', 'tf5', 'g2'],
 't3': ['tf7', 'tf4', 'tf6', 'tf3', 'tf2', 'tf1', 'g8', 'tf5', 'g2'],
 't4': ['tf7', 'tf4', 'tf6', 'tf3', 'tf2', 'tf1', 'tf5', 'g2', 'g6'],
 't5': ['tf7', 'tf4', 'tf6', 'tf3', 'tf2', 'tf1', 'g8', 'tf5'],
 't6': ['tf7', 'tf4', 'tf6', 'tf3', 'tf2', 'tf1', 'tf5', 'g6'],
 't7': ['tf7', 'tf4', 'tf6', 'tf3', 'tf2', 'tf1', 'tf5', 'g2']}

In [12]:
time_specific_graphs

{'t2':   node1 node2
 0   tf1    g2
 1   tf2    g8
 2   tf4    g2
 3   tf6    g2,
 't3':   node1 node2
 0   tf1    g2
 1   tf2    g8
 2   tf4    g2
 3   tf6    g2,
 't4':   node1 node2
 0   tf1    g2
 1   tf4    g2
 2   tf4    g6
 3   tf6    g2,
 't5':   node1 node2
 0   tf2    g8,
 't6':   node1 node2
 0   tf4    g6,
 't7':   node1 node2
 0   tf1    g2
 1   tf4    g2
 2   tf6    g2}

In [13]:
# Function to calculate influence
def calculate_influence(time_specific_networks, tf_network, deg_sets, target_genes, de_levels, rounds=100):
    influence_results = {}
    
    for time_point, network in time_specific_networks.items():
        nodes = network['nodes']
        edges = network['edges']
        vj = list(set(nodes) & set(target_genes) & set(deg_sets[time_point]))
        
        if not vj:
            continue
        
        tf_set = list(set(tf_network['TF'].unique()) & set(nodes))
        
        # Initialize DE(s) and IL(t) using precomputed differential expression levels
        de = de_levels[time_point].to_dict()['differential_expression']
        il = {tf: 0 for tf in tf_set}
        
        for _ in range(rounds):
            g_prime_edges = [(u, v) for u, v in edges if np.random.rand() > 0.5]
            g_prime = {node: [] for node in nodes}
            
            for u, v in g_prime_edges:
                g_prime[u].append(v)
            
            for tf in tf_set:
                reachable = set()
                queue = [tf]
                
                while queue:
                    current = queue.pop(0)
                    for neighbor in g_prime[current]:
                        if neighbor not in reachable and neighbor not in tf_set:
                            reachable.add(neighbor)
                            queue.append(neighbor)
                
                if reachable:
                    il[tf] += sum(de[node] for node in reachable if node in de) / len(reachable)
        
        for tf in il:
            il[tf] /= rounds
        
        influence_results[time_point] = il
    
    return influence_results

influence_results = calculate_influence(time_specific_networks, tf_network, deg_sets, target_genes, de_levels)

In [14]:
influence_results

{'t2': {'tf7': 0.0,
  'tf4': 0.0,
  'tf6': 0.0,
  'tf2': -3.76,
  'tf3': 0.0,
  'tf1': 0.0,
  'tf5': 0.0},
 't3': {'tf7': 0.0,
  'tf4': -3.6,
  'tf6': -2.45,
  'tf2': -2.36,
  'tf3': 0.0,
  'tf1': -2.45,
  'tf5': 0.0},
 't4': {'tf7': 0.0,
  'tf4': -4.36,
  'tf6': -3.57,
  'tf2': 0.66,
  'tf3': 0.0,
  'tf1': -3.5,
  'tf5': 0.0},
 't5': {'tf7': 0.0,
  'tf4': -3.57,
  'tf6': -4.13,
  'tf2': -3.96,
  'tf3': 0.0,
  'tf1': -3.78,
  'tf5': 0.0},
 't6': {'tf7': 0.0,
  'tf4': 0.0,
  'tf6': 0.0,
  'tf2': -0.28,
  'tf3': 0.0,
  'tf1': 0.0,
  'tf5': 0.0},
 't7': {'tf7': 0.0,
  'tf4': -3.76,
  'tf6': -4.24,
  'tf2': 2.35,
  'tf3': 0.0,
  'tf1': -4.32,
  'tf5': 0.0}}

In [22]:
def network_propagation(W, p0, alpha, iterations):
    p = p0
    for _ in range(iterations):
        p = alpha * p0 + (1 - alpha) * np.dot(W, p)
    return p

def identify_major_regulatory_tfs(time_specific_networks, tf_network, deg_sets, de_levels, alpha=0.7, iterations=100):
    major_tfs = {}
    
    for time_point, network in time_specific_networks.items():
        nodes = network['nodes']
        edges = network['edges']
        
        # Create adjacency matrix W
        node_index = {node: idx for idx, node in enumerate(nodes)}
        W = np.zeros((len(nodes), len(nodes)))
        for u, v in edges:
            W[node_index[u], node_index[v]] = 1
        
        # Normalize the adjacency matrix
        row_sums = W.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1  # Avoid division by zero
        W = W / row_sums
        
        # Initialize influence scores
        tf_influence_scores = {tf: influence_results[time_point].get(tf, 0) for tf in nodes if tf.startswith('tf')}
        sorted_tfs = sorted(tf_influence_scores.keys(), key=lambda x: tf_influence_scores[x], reverse=True)
        
        # Differential expression values
        de = de_levels[time_point]['differential_expression'].to_dict()
        de_values = np.array([de[node] if node in de else 0 for node in nodes])
        
        # Network propagation and selection of major regulatory TFs
        p0 = np.zeros(len(nodes))
        selected_tfs = []
        max_scc = -1
        
        for tf in sorted_tfs:
            p0[node_index[tf]] = tf_influence_scores[tf]
            propagated_values = network_propagation(W, p0, alpha, iterations)
            
            # Check for constant input arrays
            if np.all(propagated_values == propagated_values[0]) or np.all(de_values == de_values[0]):
                scc = -1
            else:
                scc, _ = spearmanr(propagated_values, de_values)
            
            if scc > max_scc:
                max_scc = scc
                selected_tfs.append(tf)
            else:
                p0[node_index[tf]] = 0
        
        major_tfs[time_point] = selected_tfs
        
        #for tf in sorted_tfs:
        #    p0[node_index[tf]] = tf_influence_scores[tf]
        #    propagated_values = network_propagation(W, p0, alpha, iterations)
        #    scc, _ = spearmanr(propagated_values, de_values)
            
        #    if scc > max_scc:
        #        max_scc = scc
        #        selected_tfs.append(tf)
        #    else:
        #        p0[node_index[tf]] = 0
        
        #major_tfs[time_point] = selected_tfs
    
    return major_tfs

major_tfs = identify_major_regulatory_tfs(time_specific_networks, tf_network, deg_sets, de_levels)

In [24]:
major_tfs

{'t2': ['tf2'],
 't3': ['tf2', 'tf4'],
 't4': ['tf2', 'tf1'],
 't5': ['tf4', 'tf1', 'tf2'],
 't6': ['tf2'],
 't7': ['tf2', 'tf4']}