In [73]:
from SDP_interaction_inference.constraints import Constraint
from SDP_interaction_inference import optimization
from SDP_interaction_inference.dataset import Dataset
from SDP_interaction_inference.correlation import Correlation
from SDP_interaction_inference import utils
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import scipy
import tqdm
from copy import deepcopy
import matplotlib.patches
import corner
import plotly
import plotly.graph_objects as go
import gurobipy as gp

# Interacting Model-Free Infeasibility

Running Int-MF in 'data-correlation-investigation-extend', only 878, 855 and 885 out of 1000 pairs were feasible (for 95%, 50% and 25% confidence respectively).

The constraints should only adjust moments for downsampling and restrict them to a valid set of moments. If the observed bounds contain a valid moment vector then so should the original moment bounds, and so there should exist a feasible point.

In [4]:
# load the saved interacting model-free results, using the first CSV column as the index
MF_int_df = pd.read_csv(
    "Results/interacting_MF.csv",
    index_col=0,
)

In [None]:
# tally the solver status at each confidence level (one blank line before each table)
for conf_level in (95, 50, 25):
    status_counts = MF_int_df[f'counts_d4_c{conf_level}_status'].value_counts()
    print("")
    print(status_counts)


counts_d4_c95_status
OPTIMAL       878
CUT_LIMIT     118
INFEASIBLE      4
Name: count, dtype: int64

counts_d4_c50_status
OPTIMAL       855
CUT_LIMIT     139
INFEASIBLE      6
Name: count, dtype: int64

counts_d4_c25_status
OPTIMAL       885
CUT_LIMIT      86
INFEASIBLE     29
Name: count, dtype: int64


Note that a relatively large number of samples reach the CUT_LIMIT of 100 cutting planes, none reach the TIME_LIMIT of 300 seconds, and only a few actually give an INFEASIBLE status.

Select an example of CUT_LIMIT and INFEASIBLE status

In [22]:
# first index label (mRNA name) with each status at 95% confidence
status_c95 = MF_int_df['counts_d4_c95_status']
mRNA_CUT_LIMIT = MF_int_df.index[status_c95 == "CUT_LIMIT"][0]
mRNA_INFEASIBLE = MF_int_df.index[status_c95 == "INFEASIBLE"][0]
mRNA_CUT_LIMIT, mRNA_INFEASIBLE

('ABCA5', 'ACTB')

Load in the data and see if these results are reproduced

In [24]:
def estimate_capture(df, beta_mean=0.1, mad_constant=1.0):
    """Estimate a per-cell capture efficiency (beta) from a count table.

    Python port of an R routine (its lines were quoted as comments in the
    original cell; it appears to be bayNorm's `BetaFun` - TODO confirm).
    Genes whose log-normalised expression is stable across cells and that
    have a low dropout rate are selected; each cell's total count over those
    genes, rescaled so that the values average to `beta_mean`, is its beta.

    Parameters
    ----------
    df : pandas.DataFrame
        Counts with genes as rows and cells as columns.
    beta_mean : float, default 0.1
        Target mean of the returned capture efficiencies (before clipping).
    mad_constant : float, default 1.0
        Scale factor applied to the median absolute deviation.
        NOTE(review): R's `mad()` multiplies by 1.4826 by default, whereas
        this port has always used the unscaled MAD. The default of 1.0 keeps
        existing results unchanged; pass 1.4826 to reproduce the R selection.

    Returns
    -------
    numpy.ndarray
        One capture efficiency per column (cell) of `df`.
    """
    # R: xx <- colSums(Data) -- total count per cell
    cell_totals = df.sum(axis=0)

    # R: Normcount <- t(t(Data)/xx) * mean(xx) -- every cell rescaled to the mean total
    norm_counts = (df / cell_totals) * cell_totals.mean()

    # log(Normcount + 1) is reused for the median, MAD and maximum below
    log_norm = np.log(norm_counts + 1)

    # per-gene median and median absolute deviation of the log-normalised counts
    med = log_norm.median(axis=1)
    mad = mad_constant * log_norm.sub(med, axis=0).abs().median(axis=1)

    # R: ind <- which(maxlogGene < med + 3 * mad) -- genes without extreme cells
    stable = log_norm.max(axis=1) < (med + 3 * mad)

    # fraction of cells in which each gene has a zero count
    dropout = (df == 0).mean(axis=1)

    # R: Select_ind <- intersect(ind, which(dropout < 0.35))
    selected = stable & (dropout < 0.35)

    # R: temppp <- colSums(Data[Select_ind, ]); BETA <- temppp/mean(temppp) * MeanBETA
    selected_totals = df[selected].sum(axis=0)
    beta = (selected_totals / selected_totals.mean()) * beta_mean

    # R: if (length(which(BETA >= 1)) > 0) BETA[BETA >= 1] = max(BETA[BETA < 1])
    # `.any()` tests for matching entries; the previous `.size > 0` was true
    # for every non-empty series regardless of its values.
    too_high = beta >= 1
    if too_high.any():
        beta[too_high] = beta[beta < 1].max()

    # R: if (length(which(BETA <= 0)) > 0) BETA[BETA <= 0] = min(BETA[BETA > 0])
    too_low = beta <= 0
    if too_low.any():
        beta[too_low] = beta[beta > 0].min()

    return beta.to_numpy()

In [137]:
def construct_single_dataset(mirna_sample, mrna_sample, beta, resamples=1000):
    """Wrap one (miRNA, mRNA) pair of count samples in a single-row Dataset.

    Parameters
    ----------
    mirna_sample, mrna_sample : sequences of counts, zipped cell by cell
    beta : capture efficiencies stored on the dataset as `beta`
    resamples : value stored on the dataset as `resamples` (default 1000)

    Returns
    -------
    Dataset with `count_dataset`, `cells`, `gene_pairs`, `resamples` and `beta` set.
    """
    n_pairs = 1
    n_cells = len(mirna_sample)

    # empty frame: one row for the pair, one column per cell
    row_labels = [f"Gene-pair-{i}" for i in range(n_pairs)]
    col_labels = [f"Cell-{j}" for j in range(n_cells)]
    counts_df = pd.DataFrame(index=row_labels, columns=col_labels)

    # every entry of the row is the tuple (miRNA count, mRNA count) for that cell
    counts_df.iloc[0] = list(zip(mirna_sample, mrna_sample))

    # populate the dataset object: counts, sizes, settings, capture
    data = Dataset()
    data.count_dataset = counts_df
    data.cells = n_cells
    data.gene_pairs = n_pairs
    data.resamples = resamples
    data.beta = beta

    return data

In [None]:
# read fibroblast transcript counts (for genes with mean expression > 1)
data_FIB = pd.read_csv("../../Real-Data/Datasets/GSE151334_FIB_counts_thresh.csv", index_col=0)

# remove outlier cell (a column of the count table)
data_counts = data_FIB.drop(labels=['Fibroblasts_P23_S383'], axis=1)

# estimate per-cell capture efficiency from the remaining cells
beta_counts = estimate_capture(data_counts)

# read RNA types in from file; the context manager closes the file handle,
# which the previous bare `json.load(open(...))` left open
with open("../../Real-Data/Datasets/GSE151334_RNA_types_thresh.json") as biotypes_file:
    biotypes_dict = json.load(biotypes_file)
biotypes_dict = {int(i): btype for i, btype in biotypes_dict.items()}
biotypes = list(biotypes_dict.values())

# select indices of protein coding (~mRNA) and non-coding miRNA
# NOTE(review): positions in `biotypes` are used as row positions below, so this
# assumes the JSON entries are in the same order as the rows of the count table - confirm
pcRNA_indices = [idx for idx, val in enumerate(biotypes) if val == "protein_coding"]
miRNA_indices = [idx for idx, val in enumerate(biotypes) if val == "miRNA"]

# separate data
data_counts_pcRNA = data_counts.iloc[pcRNA_indices]
data_counts_miRNA = data_counts.iloc[miRNA_indices]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
# miRNA paired with both example mRNAs
miRNA = "MIR100"

# one single-pair dataset per example status
dataset_CUT_LIMIT = construct_single_dataset(
    data_counts_miRNA.loc[miRNA],
    data_counts_pcRNA.loc[mRNA_CUT_LIMIT],
    beta_counts,
)
dataset_INFEASIBLE = construct_single_dataset(
    data_counts_miRNA.loc[miRNA],
    data_counts_pcRNA.loc[mRNA_INFEASIBLE],
    beta_counts,
)

# bootstrap with 95% confidence (CUT_LIMIT example first, then INFEASIBLE)
for example_dataset in (dataset_CUT_LIMIT, dataset_INFEASIBLE):
    example_dataset.confidence = 0.95
    example_dataset.bootstrap(d=4, tqdm_disable=False)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 15.54it/s]
100%|██████████| 1/1 [00:00<00:00, 22.27it/s]


## CUT_LIMIT sample

Over repeated bootstraps and optimizations the number of cuts is 17-18 on average, but occasionally reaches 40-60. So, due to random variation in the bootstrap, some samples may exceed the CUT_LIMIT of 100.

Time taken remains very low, so the cut limit could be increased significantly or even removed (although time limits may then need to be reduced to avoid long runs).

In [118]:
# Interacting model free: moment bounds and moment matrices, no factorization
constraints = Constraint(moment_bounds=True, moment_matrices=True, factorization=False)

# optimize the CUT_LIMIT example with d=4 and show the per-sample results
opt_MF_int_CUT_LIMIT = optimization.ModelFreeOptimization(
    dataset_CUT_LIMIT,
    d=4,
    constraints=constraints,
    printing=False,
    silent=True,
)
opt_MF_int_CUT_LIMIT.analyse_dataset()
opt_MF_int_CUT_LIMIT.result_dict

100%|██████████| 1/1 [00:00<00:00,  9.92it/s]


{0: {'status': 'OPTIMAL',
  'time': 0.006000518798828125,
  'cuts': 44,
  'correlation': np.float64(-0.10755260738048607)}}

## INFEASIBLE sample

Infeasible over repeated bootstraps, but only when d = 4 (e.g. reducing to d = 3 makes the problem feasible).

Optimizing with $B = I$ (no capture adjustment) is feasible, so the input bounds do contain a valid set of moments. In the full optimization this should then be adjusted by $B$ to produce a valid set of original moments, i.e. a feasible point. It is unclear why this does not happen.

In [136]:
# Interacting model free: moment bounds and moment matrices, no factorization
constraints = Constraint(moment_bounds=True, moment_matrices=True, factorization=False)

# optimize the INFEASIBLE example with d=4, passing save_model="model_inf.lp"
opt_MF_int_INFEASIBLE = optimization.ModelFreeOptimization(
    dataset_INFEASIBLE,
    d=4,
    constraints=constraints,
    printing=False,
    silent=True,
    save_model="model_inf.lp",
)
opt_MF_int_INFEASIBLE.analyse_dataset()
opt_MF_int_INFEASIBLE.result_dict

100%|██████████| 1/1 [00:00<00:00, 12.72it/s]


{0: {'status': 'INFEASIBLE',
  'time': 0.002000093460083008,
  'cuts': 1,
  'correlation': None}}

In [120]:
# Load the LP model file "model_inf.lp" (the name passed as save_model in the
# INFEASIBLE optimization) and compute an IIS: a minimal set of constraints and
# bounds that is infeasible on its own.
model = gp.read("model_inf.lp")
model.computeIIS()
# write the computed IIS to an .ilp file for inspection
model.write('iis_model_inf.ilp')

Read LP format model from file model_inf.lp
Reading time = 0.03 seconds
: 35 rows, 15 columns, 113 nonzeros
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (win64 - Windows 11+.0 (26200.2))

CPU model: Intel(R) Core(TM) i5-1035G1 CPU @ 1.00GHz, instruction set [SSE2|AVX|AVX2|AVX512]
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads

Iteration    Objective       Primal Inf.    Dual Inf.      Time
       0    0.0000000e+00   6.189164e+13   0.000000e+00      0s

IIS computed: 14 constraints and 1 bounds
IIS runtime: 0.01 seconds (0.00 work units)


Repeat optimization with no capture efficiency adjustment to check if the input bounds on observed moments satisfy SDP constraints of a valid moment set

In [132]:
# independent copy of the INFEASIBLE dataset with every capture efficiency set to 1
dataset_observed = deepcopy(dataset_INFEASIBLE)
dataset_observed.beta = np.ones(len(beta_counts))

# Interacting model free: moment bounds and moment matrices, no factorization
constraints = Constraint(moment_bounds=True, moment_matrices=True, factorization=False)

# optimize the copy with d=4 and show the per-sample results
opt_MF_int_observed = optimization.ModelFreeOptimization(
    dataset_observed,
    d=4,
    constraints=constraints,
    printing=False,
    silent=True,
)
opt_MF_int_observed.analyse_dataset()
opt_MF_int_observed.result_dict

100%|██████████| 1/1 [00:00<00:00, 13.75it/s]


{0: {'status': 'OPTIMAL',
  'time': 0.0019998550415039062,
  'cuts': 0,
  'correlation': np.float64(0.08808233968378555)}}

# Repeat analysis with higher cut limits

Consistent results under re-optimization, but will inevitably vary over re-bootstrapping

Only a small number of samples (6) move from CUT_LIMIT to OPTIMAL when increasing cut_limit from 100 to 1000, but there is only a small increase in time, so this seems reasonable.

Removing cut_limit (setting it to infinity) leads to optimizations reaching time_limit instead; the time limit would need to be reduced from 5 minutes for this to be reasonable to use.

In [138]:
def construct_dataset(mirna_sample, mrna_dataset, beta, resamples=1000):
    """Pair one miRNA sample with every row of an mRNA count table in a Dataset.

    Parameters
    ----------
    mirna_sample : sequence of miRNA counts, zipped cell by cell with each mRNA row
    mrna_dataset : 2-D table (rows = mRNA genes, columns = cells), read via `.shape` and `.iloc`
    beta : capture efficiencies stored on the dataset as `beta`
    resamples : value stored on the dataset as `resamples` (default 1000)

    Returns
    -------
    Dataset with `count_dataset`, `cells`, `gene_pairs`, `resamples` and `beta` set.
    """
    n_pairs, n_cells = mrna_dataset.shape

    # empty frame: one row per gene pair, one column per cell
    counts_df = pd.DataFrame(
        index=[f"Gene-pair-{i}" for i in range(n_pairs)],
        columns=[f"Cell-{j}" for j in range(n_cells)],
    )

    # row i holds the tuples (miRNA count, count of mRNA i), one per cell
    for row in range(n_pairs):
        counts_df.iloc[row] = list(zip(mirna_sample, mrna_dataset.iloc[row]))

    # populate the dataset object: counts, sizes, settings, capture
    data = Dataset()
    data.count_dataset = counts_df
    data.cells = n_cells
    data.gene_pairs = n_pairs
    data.resamples = resamples
    data.beta = beta

    return data

In [140]:
# miRNA paired against the first `genes` protein-coding rows
miRNA = "MIR100"
genes = 1000
mRNA = data_counts_pcRNA.index[:genes]

# dataset with one (miRNA, mRNA) pair per selected mRNA
dataset_counts = construct_dataset(
    data_counts_miRNA.loc[miRNA],
    data_counts_pcRNA.loc[mRNA],
    beta_counts,
)

# bootstrap at 95% confidence with d=4
dataset_counts.confidence = 0.95
dataset_counts.bootstrap(d=4, tqdm_disable=False)

100%|██████████| 1000/1000 [00:42<00:00, 23.66it/s]


### CUT_LIMIT = 100

In [152]:
# Interacting model free: moment bounds and moment matrices, no factorization
constraints = Constraint(moment_bounds=True, moment_matrices=True, factorization=False)

# optimize every pair with d=4; no cut_limit is passed, so the optimizer's own default applies
opt_MF_int = optimization.ModelFreeOptimization(
    dataset_counts,
    d=4,
    constraints=constraints,
    printing=False,
    silent=True,
)
opt_MF_int.analyse_dataset()

100%|██████████| 1000/1000 [01:11<00:00, 14.06it/s]


In [153]:
# distinct solver statuses and how many pairs ended with each
statuses_default = [res['status'] for res in opt_MF_int.result_dict.values()]
np.unique(statuses_default, return_counts=True)

(array(['CUT_LIMIT', 'INFEASIBLE', 'OPTIMAL'], dtype='<U10'),
 array([ 99,   7, 894]))

### CUT_LIMIT = 1000

In [150]:
# Interacting model free: moment bounds and moment matrices, no factorization
constraints = Constraint(moment_bounds=True, moment_matrices=True, factorization=False)

# same optimization as the default run, but with cut_limit raised to 1000
opt_MF_int_1000 = optimization.ModelFreeOptimization(
    dataset_counts,
    d=4,
    constraints=constraints,
    printing=False,
    silent=True,
    cut_limit=1000,
)
opt_MF_int_1000.analyse_dataset()

100%|██████████| 1000/1000 [02:27<00:00,  6.76it/s]


In [151]:
# distinct solver statuses and how many pairs ended with each (cut_limit=1000 run)
statuses_1000 = [res['status'] for res in opt_MF_int_1000.result_dict.values()]
np.unique(statuses_1000, return_counts=True)

(array(['CUT_LIMIT', 'INFEASIBLE', 'OPTIMAL'], dtype='<U10'),
 array([ 93,   7, 900]))

### CUT_LIMIT = None

In [156]:
# Interacting model free: moment bounds and moment matrices, no factorization
constraints = Constraint(moment_bounds=True, moment_matrices=True, factorization=False)

# same optimization with the cut limit removed (cut_limit=np.inf)
# WARNING: slow - the recorded run was interrupted by hand after 22 of 1000 pairs (~6 minutes)
opt_MF_int_None = optimization.ModelFreeOptimization(
    dataset_counts,
    d=4,
    constraints=constraints,
    printing=False,
    silent=True,
    cut_limit=np.inf,
)
opt_MF_int_None.analyse_dataset()

  2%|▏         | 22/1000 [06:14<4:37:17, 17.01s/it] 


KeyboardInterrupt: 

# Appendix

## B invertible

In [88]:
from SDP_interaction_inference.optimization_utils import compute_B

# build B from the estimated capture efficiencies (S=2, no U, d=4) and invert it;
# np.linalg.inv raises LinAlgError if B is exactly singular
B = compute_B(beta=beta_counts, S=2, U=[], d=4)
B_inv = np.linalg.inv(B)

# compare B @ B_inv with an identity sized from B itself (previously a hard-coded 15),
# so the check stays valid if S or d are changed
np.allclose(B @ B_inv, np.eye(B.shape[0]))

True