# MA 592 Project

## Import Data


In [1]:
from scipy.io import mmread
# Read the count matrix and its axis labels.
# NOTE: the *_colnames.txt file is loaded into `counts_rownames` and the
# *_rownames.txt file into `counts_colnames`. The swap is intentional: the
# matrix is transposed when the DataFrame is built, so the file's column labels
# end up labelling the rows.
counts_mat = mmread("./CD14_mtxs/CD14_counts.mtx").todense()
with open("./CD14_mtxs/CD14_c_colnames.txt", 'r') as f:
    # Strip only the line terminator. Slicing with [:-1] would chop the last
    # character of the final label when the file has no trailing newline.
    counts_rownames = [x.rstrip("\n") for x in f]
with open("./CD14_mtxs/CD14_c_rownames.txt", 'r') as f:
    counts_colnames = [x.rstrip("\n") for x in f]

# Read the perturbation matrix and its axis labels (same swapped naming and
# the same newline handling as for the counts above).
pert_mat = mmread("./CD14_mtxs/CD14_perturbations.mtx").todense()
with open("./CD14_mtxs/CD14_p_colnames.txt", 'r') as f:
    pert_rownames = [x.rstrip("\n") for x in f]
with open("./CD14_mtxs/CD14_p_rownames.txt", 'r') as f:
    pert_colnames = [x.rstrip("\n") for x in f]


## Conversion to Pandas Dataframe

In [2]:
import numpy as np
import pandas as pd

# Counts table: the matrix is transposed so that each entry of
# `counts_rownames` labels one row and each entry of `counts_colnames`
# labels one column.
counts_df = pd.DataFrame(counts_mat.T, columns=counts_colnames)
# Put the row labels in a leading "Cell" column.
counts_df.insert(0, "Cell", counts_rownames)

# Perturbation table, built exactly the same way.
pert_df = pd.DataFrame(pert_mat.T, columns=pert_colnames)
pert_df.insert(0, "Cell", pert_rownames)

## Instrumental Variables

In [3]:
import matplotlib.pyplot as plt

def getFHat(x, n=None):
    """Empirical CDF of a sample, tabulated on the integer grid 0..n.

    Parameters
    ----------
    x : pandas.Series
        Observed values. Only values equal to one of 0, 1, ..., n land on the
        grid; the code is therefore meant for non-negative integer data such
        as counts.
    n : int or integer-valued float, optional
        Largest grid point. Defaults to the largest observed value. It is
        converted with ``int()`` so that e.g. a ``numpy.float64`` maximum is
        accepted (a fractional part would be truncated).

    Returns
    -------
    pandas.Series
        Indexed by 0..n; entry ``k`` is the number of observations with value
        in 0..k divided by the total number of (non-missing) observations.
        Observations larger than ``n`` do not appear on the grid but still
        count in the denominator.

    Raises
    ------
    ValueError
        If ``x`` has no observations and ``n`` is not given, because the grid
        size cannot be inferred.
    """
    f = x.value_counts().sort_index()
    if n is None:
        if f.empty:
            raise ValueError(
                "getFHat: cannot infer the grid size from an empty sample; pass n"
            )
        n = f.index.max()
    # range() only accepts true integers; a float-typed maximum would fail here.
    n = int(n)
    return f.reindex(range(n + 1), fill_value=0).cumsum() / f.sum()

def IV(Z, X, Y, d=0.25):
    """Instrumental-variable ratio estimate on binarized exposure and outcome.

    The sample is split by the instrument ``Z`` (rows with ``Z == 0`` versus
    ``Z == 1``). For each of ``X`` and ``Y`` a cut-point is chosen as the grid
    value ``k`` maximizing ``(F1(k) - F0(k)) / (1 + k) ** d``, where ``F0`` and
    ``F1`` are the empirical CDFs in the two arms (see ``getFHat``). ``X`` and
    ``Y`` are then dichotomized as "greater than the cut-point" and the
    estimate is the ratio of the between-arm differences in means.

    Parameters
    ----------
    Z : pandas.Series
        Instrument; only rows where it equals 0 or 1 are used.
    X : pandas.Series
        Exposure, aligned with ``Z``; expected to hold non-negative integer
        values because it is tabulated on the grid 0..max.
    Y : pandas.Series
        Outcome, aligned with ``Z``; same expectations as ``X``.
    d : float, default 0.25
        Exponent of the ``(1 + k) ** d`` penalty that discounts large
        cut-points.

    Returns
    -------
    float
        ``(mean(B0) - mean(B1)) / (mean(A0) - mean(A1))`` where ``A`` and ``B``
        are the dichotomized ``X`` and ``Y``. When both arms have the same
        share of ``X`` above its cut-point the denominator is zero and the
        result is not finite.
    """
    X0, X1 = X.loc[Z == 0], X.loc[Z == 1]
    Y0, Y1 = Y.loc[Z == 0], Y.loc[Z == 1]

    def best_cut(V0, V1, top):
        # Grid value maximizing the penalized gap between the two arm CDFs.
        top = int(top)
        gap = getFHat(V1, top) - getFHat(V0, top)
        penalty = (1 + pd.Series(range(top + 1))) ** d
        return (gap / penalty).argmax()

    alpha = best_cut(X0, X1, X.max())
    # Y gets its own grid 0..Y.max(). Reusing X.max() here would silently
    # ignore every candidate cut-point for Y above the largest X value.
    beta = best_cut(Y0, Y1, Y.max())

    A0, A1 = (X0 > alpha).astype(int), (X1 > alpha).astype(int)
    B0, B1 = (Y0 > beta).astype(int), (Y1 > beta).astype(int)

    LATE = (B0.mean() - B1.mean()) / (A0.mean() - A1.mean())
    return LATE

In [4]:
# Z: NFKB1 column of the perturbation table; X: NFKB1 counts; Y: CD14 counts.
print(IV(Z=pert_df["NFKB1"], X=counts_df["NFKB1"], Y=counts_df["CD14"]))

0.2533244204645833
