# MA 592 Project

## Import Data


In [1]:
from scipy.io import mmread
# NOTE: the *_rownames / *_colnames variables are named for the layout of the
# DataFrames built later in the notebook, where each matrix is transposed
# before being wrapped. That is why every "*_colnames.txt" file is loaded into
# a *_rownames variable and every "*_rownames.txt" file into a *_colnames
# variable. The crossover is deliberate -- do not "fix" it without also
# changing the DataFrame construction.


def _read_names(path):
    """Return the lines of the text file at ``path`` as a list of strings.

    Only the newline terminator is removed from each line. Slicing off the
    last character unconditionally would truncate the final name whenever the
    file does not end with a newline.
    """
    with open(path, 'r') as f:
        return [line.rstrip("\n") for line in f]


# Read count matrix data and names
counts_mat = mmread("./CD14_mtxs/CD14_counts.mtx").todense()
counts_rownames = _read_names("./CD14_mtxs/CD14_c_colnames.txt")
counts_colnames = _read_names("./CD14_mtxs/CD14_c_rownames.txt")

# Read perturbation matrix data and names
pert_mat = mmread("./CD14_mtxs/CD14_perturbations.mtx").todense()
pert_rownames = _read_names("./CD14_mtxs/CD14_p_colnames.txt")
pert_colnames = _read_names("./CD14_mtxs/CD14_p_rownames.txt")


## Basic Read Value Info

In [2]:
# Report the shape and both name lists for the counts matrix.
print("Counts Matrix Info")
for label, value in [
    ("Counts Matrix Shape: ", counts_mat.shape),
    ("Counts Col Names: ", counts_colnames),
    ("Counts Row Names: ", counts_rownames),
]:
    print(label, value)

# Same report for the perturbation matrix; the leading "\n" in the heading
# separates the two reports with a blank line.
print("\nPertubations Matrix Info")
for label, value in [
    ("Pertubations Matrix Shape: ", pert_mat.shape),
    ("Pertubation Col Names: ", pert_colnames),
    ("Pertubation Row Names: ", pert_rownames),
]:
    print(label, value)

Counts Matrix Info
Counts Matrix Shape:  (310, 6204)

Pertubations Matrix Info
Pertubations Matrix Shape:  (47, 6204)


## Conversion to pandas DataFrames

In [4]:
import numpy as np
import pandas as pd

# Counts table: the transposed counts matrix provides the data and
# counts_colnames labels its columns.
counts_df = pd.DataFrame(counts_mat.T, columns=counts_colnames)
# Prepend a "Cell" column populated from counts_rownames.
counts_df.insert(0, "Cell", counts_rownames)

# Perturbation table: built the same way from the perturbation matrix.
pert_df = pd.DataFrame(pert_mat.T, columns=pert_colnames)
pert_df.insert(0, "Cell", pert_rownames)

In [5]:
# Show counts_df: as the cell's final expression it is rendered as the cell output.
counts_df

Unnamed: 0,Cell,CDK11B,CDK11A,CDA,CDC42,CD52,CDCA8,CDC20,CDKN2C,CDC7,...,MAP3K10,MAPK1,MAPK12,MAPK11,MAPK8IP2,MAP3K7CL,IKBKE,IKBKG,IKBKB,IKBIP
0,AAAGAACCAGCCCAGT-1,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AAAGGATCATATTCGG-1,0,0,0,3,5,0,0,0,0,...,0,0,1,0,0,2,0,0,0,0
2,AAAGGGCAGCAAATGT-1,0,0,0,1,10,0,1,0,0,...,0,0,1,0,0,5,0,0,0,0
3,AACAAAGAGGAAACGA-1,0,0,0,4,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AACAAAGCACGCTGCA-1,0,0,0,4,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6199,TTTACCAAGGTTGACG-1,0,0,0,3,12,0,0,1,0,...,0,0,2,0,0,1,0,0,0,0
6200,TTTACCAGTACCTAGT-1,0,0,0,5,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6201,TTTAGTCGTTATTCCT-1,0,0,0,8,2,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
6202,TTTCACAGTCTGATCA-1,0,0,0,2,8,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Show pert_df: as the cell's final expression it is rendered as the cell output.
pert_df

Unnamed: 0,Cell,AKT1,CD14,CD151,CD19,CD36,CD40,CD47,CDHR3,CDK1,...,NFKB2,NFKBIA,NFKBIB,NFKBIE,NFKBIZ,TBK1,TRAF2,TRAF3,TRAF6,TRAPPC9
0,AAAGAACCAGCCCAGT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AAAGGATCATATTCGG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AAAGGGCAGCAAATGT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AACAAAGAGGAAACGA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AACAAAGCACGCTGCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6199,TTTACCAAGGTTGACG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6200,TTTACCAGTACCTAGT-1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6201,TTTAGTCGTTATTCCT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6202,TTTCACAGTCTGATCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
