# MA 592 Project

## Import Data


In [1]:
from scipy.io import mmread
def _read_names(path):
    """Return the lines of the text file at ``path`` with line endings removed.

    Only the trailing newline characters are stripped, so a final line that
    has no terminating newline keeps its last character.
    """
    with open(path, 'r') as f:
        return [line.rstrip("\r\n") for line in f]


# Read count-matrix data and names (normalised data set).
# The un-normalised equivalents are ./CD14_mtxs/CD14_counts.mtx,
# ./CD14_mtxs/CD14_c_colnames.txt and ./CD14_mtxs/CD14_c_rownames.txt.
counts_mat = mmread("./dataset_norm/CD14_norm_counts.mtx").todense()
# The "colnames" file feeds the *row* names (and vice versa) because the
# matrix is transposed when the DataFrame is built further down.
counts_rownames = _read_names("./dataset_norm/CD14_norm_c_colnames.txt")
counts_colnames = _read_names("./dataset_norm/CD14_norm_c_rownames.txt")

# Read perturbation-matrix data and names (normalised data set).
# The un-normalised equivalents are ./CD14_mtxs/CD14_perturbations.mtx,
# ./CD14_mtxs/CD14_p_colnames.txt and ./CD14_mtxs/CD14_p_rownames.txt.
pert_mat = mmread("./dataset_norm/CD14_norm_p.mtx").todense()
pert_rownames = _read_names("./dataset_norm/CD14_norm_p_colnames.txt")
pert_colnames = _read_names("./dataset_norm/CD14_norm_p_rownames.txt")


## Conversion to Pandas Dataframe

In [2]:
import numpy as np
import pandas as pd

# Counts frame: transpose the matrix and label its columns with counts_colnames.
counts_df = pd.DataFrame(counts_mat.T, columns=counts_colnames)
# Put the row names in front as a "Cell" column.
counts_df.insert(0, "Cell", counts_rownames)

# Perturbation frame, built the same way.
pert_df = pd.DataFrame(pert_mat.T, columns=pert_colnames)
pert_df.insert(0, "Cell", pert_rownames)

In [3]:
# Count CDC42 values per bin of width 0.01 (bin id = value // 0.01).
# The bin ids are computed in one vectorised step and passed as a plain array,
# rather than through a per-row lambda that looks each label up again.
counts_df["CDC42"].groupby(by=(counts_df["CDC42"] // 0.01).to_numpy()).count()

0.0      1657
36.0        1
37.0        2
38.0        1
40.0        1
         ... 
340.0       1
342.0       1
352.0       1
353.0       1
374.0       1
Name: CDC42, Length: 283, dtype: int64

## Instrumental Variables

In [4]:
import matplotlib.pyplot as plt

def getFHat(x, n=None):
    """Empirical cumulative distribution of ``x`` on the integer grid ``0..n``.

    Parameters
    ----------
    x : pandas.Series
        Observations. Values of any floating-point dtype are cast to int
        (truncated) before counting.
    n : int, optional
        Largest grid point. Defaults to the largest observed value.

    Returns
    -------
    pandas.Series
        Indexed by ``0..n``. Entry ``k`` is the number of observations whose
        value lies in ``0..k`` divided by the total number of observations;
        values outside ``0..n`` only contribute to the denominator.
    """
    # Check the dtype family rather than comparing with 'float': that
    # comparison is only true for float64, so float32 data would skip the
    # cast and its non-integer values would be lost in the reindex below.
    if pd.api.types.is_float_dtype(x.dtype):
        x = x.astype(int)
    f = x.value_counts().sort_index()
    if n is None:
        n = f.index.max()
    # Fill grid points that were never observed with a zero count.
    return f.reindex(range(n+1), fill_value=0).cumsum() / f.sum()

def getThreshold(fx0, fx1, d=0.25, n=None):
    """Pick the grid point where the weighted gap ``fx1 - fx0`` is largest.

    Parameters
    ----------
    fx0, fx1 : pandas.Series
        The two curves to compare, indexed by ``0..n``.
    d : float, default 0.25
        Exponent of the ``(1 + k)`` term in the weight denominator.
    n : int, optional
        Largest grid point. Defaults to the largest index label of ``fx0``.

    Returns
    -------
    int
        Position of the maximum of ``weights * (fx1 - fx0)``, where
        ``weights[k] = (n - k) / (n * (1 + k) ** d)``.
    """
    if n is None:
        n = fx0.index.max()
    if n == 0:
        # A one-point grid has only one candidate; the weight formula would
        # evaluate 0/0 there and argmax over NaN is meaningless.
        return 0
    grid = pd.Series(range(n+1))
    # Original author's note: considering setting the weights to 1.
    weights = (n-grid)/(n*(1+grid)**d)
    return (weights*(fx1-fx0)).argmax()

def IV(Z, X, Y, d=0.25):
    """Instrumental-variable style ratio estimate from thresholded X and Y.

    Rows are split by ``Z == 0`` and ``Z == 1``. For each of ``X`` and ``Y`` a
    cut-off is chosen with ``getThreshold`` from the two groups' ``getFHat``
    curves, the values are turned into "above cut-off" indicators, and the
    difference of the indicator means (``Z == 0`` minus ``Z == 1``) is taken.

    Parameters
    ----------
    Z, X, Y : pandas.Series
        Instrument, exposure and outcome, row-aligned.
    d : float, default 0.25
        Passed through to ``getThreshold``.

    Returns
    -------
    tuple
        ``(LATE, Adiff, Bdiff)`` with ``LATE = Bdiff / Adiff``. There is no
        guard against ``Adiff == 0``.
    """
    in_group0, in_group1 = Z == 0, Z == 1

    def _indicator_gap(values):
        # Split by group, derive the cut-off from the two CDF curves, then
        # compare the share of observations above it.
        # (A fixed cut-off of 0.5 was tried as an alternative at some point.)
        group0, group1 = values.loc[in_group0], values.loc[in_group1]
        top = int(values.max())
        cutoff = getThreshold(getFHat(group0, top), getFHat(group1, top), d, top)
        above0 = (group0 > cutoff).astype(int)
        above1 = (group1 > cutoff).astype(int)
        return above0.mean() - above1.mean()

    Adiff = _indicator_gap(X)
    Bdiff = _indicator_gap(Y)
    return Bdiff / Adiff, Adiff, Bdiff

## Bootstrap

In [5]:
%matplotlib inline
def bootstrap(Z, X, Y, reps=500, prog = True, seed=None):
    """Bootstrap the ``IV`` estimate by resampling rows with replacement.

    Parameters
    ----------
    Z, X, Y : pandas.Series
        Row-aligned inputs forwarded to ``IV`` after resampling.
    reps : int, default 500
        Number of bootstrap replicates.
    prog : bool, default True
        If true, print the replicate number every 1000 replicates.
    seed : int, optional
        When given, resampling uses ``np.random.default_rng(seed)`` so the
        run is reproducible. When omitted, the global NumPy random state is
        used, as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(LATEs, KEs)``: the first and second values returned by ``IV`` for
        each replicate.
    """
    LATEs = []
    KEs = []
    n = X.shape[0]
    rng = np.random.default_rng(seed) if seed is not None else None
    for rep in range(reps):
        if prog and rep % 1000 == 0:
            print(rep)
        if rng is None:
            indices = np.random.randint(0, n, n)
        else:
            indices = rng.integers(0, n, n)
        # Positional selection: plain [] on a Series is label-based and only
        # resamples correctly when the index happens to be 0..n-1.
        LATE, KE, _ = IV(Z.iloc[indices], X.iloc[indices], Y.iloc[indices])  # KE = gene Knockout Effect (on X)
        LATEs.append(LATE)
        KEs.append(KE)
    return np.asarray(LATEs), np.asarray(KEs)

def save_graph(ZX_name, Y_name, figure):
    """Save ``figure`` as ``./figures/<ZX_name> vs <Y_name>``.

    The ``./figures`` directory is created first if it does not exist, so the
    save does not fail on a fresh checkout. The figure is written with a
    white, non-transparent background.
    """
    import os  # local import so this function does not rely on another cell

    os.makedirs("./figures", exist_ok=True)
    figure.savefig("./figures/{} vs {}".format(ZX_name, Y_name), facecolor='white', transparent=False)

def find_CI(data, CI):
    """Two-sided percentile interval covering ``CI`` percent of ``data``.

    Parameters
    ----------
    data : array-like
        Sample values (e.g. bootstrap replicates).
    CI : float
        Coverage in percent, e.g. ``95``.

    Returns
    -------
    tuple
        ``(lower, upper)`` percentiles, each rounded to 3 decimals.
    """
    # Split the excluded mass evenly between the two tails. Using the
    # (100 - CI)th and CI-th percentiles would cover only 2*CI - 100 percent
    # (90% for CI=95).
    tail = (100 - CI) / 2
    return round(np.percentile(data, tail),3), round(np.percentile(data, 100 - tail),3)

def bootstrap_n_save(ZX_name, Y_name, CI = 95, print_below = True, save = True, reps=2500, prog=False):
    """Bootstrap one gene pair and draw a histogram for each returned effect.

    The bootstrap is run on ``pert_df[ZX_name]``, ``counts_df[ZX_name]`` and
    ``counts_df[Y_name]``. Two histograms are produced: one for the KE values
    and one for the LATE values. Each title carries the bounds returned by
    ``find_CI``.

    Parameters
    ----------
    ZX_name : str
        Column used for both the perturbation and the exposure.
    Y_name : str
        Outcome column.
    CI : number, default 95
        Forwarded to ``find_CI`` and shown in the title.
    print_below : bool, default True
        Turn interactive mode on and show each figure; otherwise turn it off
        and close the figure.
    save : bool, default True
        Save each figure through ``save_graph``.
    reps, prog
        Forwarded to ``bootstrap``.

    Returns
    -------
    tuple
        ``(LATEs, KEs)`` as returned by ``bootstrap``.
    """
    # Interactive mode follows the display choice.
    (plt.ion if print_below else plt.ioff)()

    LATEs, KEs = bootstrap(pert_df[ZX_name], counts_df[ZX_name], counts_df[Y_name], reps, prog)

    # (values, first label, second label) for each histogram, KE first.
    panels = (
        (KEs, ZX_name+" gene", ZX_name + " rna"),
        (LATEs, ZX_name, Y_name),
    )
    for values, first_label, second_label in panels:
        low, high = find_CI(values, CI)

        fig, ax = plt.subplots()
        ax.hist(values)
        ax.set_title("{} vs {} ({}% CI: {}-{})".format(first_label, second_label, CI, low, high))
        ax.set_xlabel("Effect")
        ax.set_ylabel("Count")

        if save:
            save_graph(first_label, second_label, fig)

        # Either render the figure in the notebook or discard it.
        if print_below:
            plt.show()
        else:
            plt.close()

    return LATEs, KEs

## Hypothesis Testing

In [6]:
def HT(Xvals, two_sided=True):
    """p-value for "effect is zero" from a sample of effect values.

    Parameters
    ----------
    Xvals : numpy.ndarray
        Effect values (e.g. bootstrap replicates).
    two_sided : bool, default True
        If true, return twice the smaller of the two tail shares, capped at 1.
        Otherwise return the share of values ``>= 0`` (the original author
        describes this as the right-tailed test, since the statistic being
        tested is a treatment effect).

    Returns
    -------
    float
        The p-value, always within ``[0, 1]``.
    """
    leftQ, rightQ = (Xvals <= 0).mean(), (Xvals >= 0).mean()

    if two_sided:
        # Values equal to 0 are counted in both tails, so the doubled minimum
        # can exceed 1 (it is 2 when every value is 0); clamp it.
        return min(1.0, 2*min(leftQ, rightQ))
    return rightQ

### Import gene pairs for analysis

In [7]:
markers_df = pd.read_csv("KOvsControl_markers - high priority.csv")

# The X marker is the leading run of capital letters/digits before the first
# underscore of "compare". Raw string so "\d" is not an invalid escape; the
# "|" that used to sit inside the character class only matched a literal pipe.
Xmarkers = markers_df["compare"].str.extract(r'^([A-Z\d]+)_').values[:,0]
Ymarkers = markers_df["gene"].to_list()

# One row per (X, Y) pair: p-values for the knockout effect and the LATE,
# plus the median bootstrap LATE.
results = []
for Xmarker,Ymarker in zip(Xmarkers, Ymarkers):
    bLATEs, bKEs = bootstrap_n_save(Xmarker, Ymarker, print_below=False, reps=2500)
    # bootstrap_n_save returns NumPy arrays, which have no .median() method.
    LATE_median = np.median(bLATEs)
    KE_pval, LATE_pval = HT(bKEs), HT(bLATEs)
    print(Xmarker, Ymarker, KE_pval, LATE_pval, LATE_median)
    results.append([Xmarker, Ymarker, KE_pval, LATE_pval, LATE_median])
    
out_df = pd.DataFrame(results, columns=["XZ", "Y", "KE_pval", "LATE_pval", "LATE_median"])
out_df.to_csv("results.csv")

TRAF6 TRAF1 0.5488 0.5488
TRAF6 CD83 0.528 0.528
TRAF6 NFKBIA 0.5216 0.5216
TRAF6 NFKB1 0.5352 0.5352
TRAF6 CD44 0.544 0.544
TRAF6 CD40 0.5328 0.5328
TRAF6 CD300A 0.54 0.5392
TRAF6 TRAF3IP3 0.5392 0.5432
TRAF6 TRAPPC6A 0.5584 0.5592
TRAF6 CD1D 0.5144 0.5152
TRAF6 MAP3K1 0.5416 0.5456
TRAF6 CD164 0.5144 0.5176
TRAF6 CD82 0.5208 0.5256
TRAF6 MAP3K11 0.5176 0.5176
TRAF6 IKBKG 0.484 0.4888
TRAF6 CDC123 0.5056 0.5056
NFKB1 NFKB1 0.0 0.0
NFKB1 TRAF1 0.0 0.0
NFKB1 NFKB2 0.0 0.0
NFKB1 MAP3K8 0.0 0.0
NFKB1 CD82 0.0 0.0064
NFKB1 CD69 0.0 0.0008
NFKB1 NFKBIA 0.0 0.0136
NFKB1 MAPKBP1 0.0 0.0024
NFKB1 IKBKE 0.0 0.0088
NFKB1 CD58 0.0 0.0232
NFKB1 CD44 0.0 0.0328
MAPK1 CD52 0.0008 0.0008
MAPK1 CD74 0.0024 0.0024
MAPK1 CD209 0.0 0.0
MAPK1 CD80 0.0 0.0
MAPK1 CD81 0.0008 0.0784
MAPK1 MAPK1 0.0008 0.0
MAPK1 CD151 0.0 0.0
MAPK1 IRF8 0.0 0.0088
MAPK1 CD274 0.0 0.0048
MAPK1 CD68 0.0008 0.0016
MAPK1 CD47 0.0016 0.0024
MAPK1 CD40 0.0016 0.0032
MAPK1 CDC20 0.0008 0.0008
MAPK1 CDC27 0.0 0.056
MAPK1 MAP4K4 0.003

KeyboardInterrupt: 