In [2]:
import numpy as np
import pandas as pd
import pickle

# Loading in the Gene list

### Creating the Gene list from text file (only run once)

In [64]:
# Read the raw probe -> gene map, one text line per probe set (trailing "\n" is kept on each line).
with open("MicroarrayData/probe-gene-map.txt") as f:
    content = f.readlines()

# The first space-delimited token of every raw line is the probe-set ID.
probe_sets = [raw_line.split(" ")[0] for raw_line in content]

# Normalise the lines that do not fit the simple "<probe> <gene>" layout.
for i, line in enumerate(content):
    if "chromosome" in line:
        # e.g. line 41724 is "232469_x_at chromosome 1 open reading frame 191":
        # keep only the first two tokens. This rule wins over the "///" rule below.
        content[i] = " ".join(line.split(" ")[:2]) + "\n"
    elif "///" in line:
        # Several genes are listed for this probe; keep only the first one, for simplicity.
        content[i] = line.split("///")[0].strip() + "\n"

In [61]:
# Persist the cleaned lines; every entry of `content` already ends with its own newline.
with open('MicroarrayData/gene_list.txt', 'w') as f:
    f.writelines(content)

### Load list

In [75]:
# The file has no header row and is space-separated: the first field (probe-set ID)
# becomes the index, the remaining field is exposed as the "gene_id" column.
gene_list = pd.read_csv(
    "MicroarrayData/gene_list.txt",
    sep=' ',
    header=None,
    names=["gene_id"],
    index_col=0,
)
print(gene_list.shape)
gene_list.head()

(54675, 1)


Unnamed: 0,gene_id
1007_s_at,DDR1
1053_at,RFC2
117_at,HSPA6
121_at,PAX8
1255_g_at,GUCA1A


# Modifying Data so it's easier to process further down the line

In [8]:
def change_index_to_genes(dataframe, mapping=None):
    """
    Relabel (in place) the row index of ``dataframe`` from probe-set IDs to gene names.

    Rows whose probe-set ID has no gene name in the mapping keep their original
    probe-set ID as the index label; no row is dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose index holds probe-set IDs. Its index is replaced in place.
    mapping : pandas.DataFrame, optional
        Probe -> gene table. Defaults to the notebook-level ``gene_list``.
        Two layouts are accepted:
        * probe-set IDs as the index and a ``gene_id`` column (the layout
          produced by loading ``gene_list.txt`` in this notebook), or
        * ``affy_hg_u133_plus_2`` / ``external_gene_name`` columns (the layout
          this function was originally written against).

    Returns
    -------
    None
    """
    if mapping is None:
        mapping = gene_list  # notebook-level table

    if "gene_id" in mapping.columns:
        symbols = mapping["gene_id"]
    else:
        symbols = mapping.set_index("affy_hg_u133_plus_2")["external_gene_name"]

    # When a probe is listed more than once, the first entry wins; entries
    # without a gene name are discarded so those probes fall back to their ID.
    symbols = symbols[~symbols.index.duplicated(keep="first")].dropna()
    lookup = symbols.to_dict()

    # One dictionary lookup per row instead of scanning the whole table each time.
    dataframe.index = [lookup.get(g, g) for g in dataframe.index.values]

def save_obj(obj, name ):
    """
    Pickle ``obj`` to the file ``name + '.pkl'`` using the highest pickle protocol.

    ``name`` is the target path without the ``.pkl`` extension; an existing
    file at that path is overwritten.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """
    Read back the pickled object stored in ``name + '.pkl'`` and return it.

    ``name`` is the source path without the ``.pkl`` extension.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [6]:
# Cell-line identifiers (kept as strings); the loops below splice them into the
# per-cell-line input and output paths under MicroarrayData/.
celllines = ["5", "4", "16", "13"]

In [9]:
# Quick look at the raw layout of one cell line's table. Only the first 10 rows
# are parsed: enough to inspect the columns without loading and dumping the whole file.
pd.read_csv("MicroarrayData/Cellline4-BCNU/4_data.txt", sep="\t", nrows=10)

Unnamed: 0.1,Unnamed: 0,A9319_1.CEL,A9320_2.CEL,A9321_3.CEL,A9322_4.CEL,A9323_5.CEL,A9324_6.CEL,A9325_7.CEL,A9326_8.CEL,A9327_9.CEL,...,A9580_37_(HG-U133_Plus_2).CEL,A9581_38_(HG-U133_Plus_2).CEL,A9582_39_(HG-U133_Plus_2).CEL,A9583_40_(HG-U133_Plus_2).CEL,A9584_41_(HG-U133_Plus_2).CEL,A9585_42_(HG-U133_Plus_2).CEL,A9586_43_(HG-U133_Plus_2).CEL,A9587_44_(HG-U133_Plus_2).CEL,A9588_45_(HG-U133_Plus_2).CEL,A9716_131_(HG-U133_Plus_2).CEL
0,1007_s_at,9.614120,9.317007,9.557054,9.983838,9.538065,10.392179,9.374719,10.513953,9.542631,...,10.042059,9.577050,10.579412,9.535178,10.806542,9.328724,10.971776,9.571900,10.661037,9.608350
1,1053_at,11.288549,11.131825,11.149245,11.285809,11.247078,11.383692,11.267291,11.189586,11.171014,...,11.283307,11.189730,11.137159,11.115018,10.956815,10.953049,10.956784,10.783763,10.853566,11.026246
2,117_at,7.532313,7.311026,7.429654,7.436265,7.452250,7.661363,7.297420,7.468830,7.569166,...,7.488520,7.690997,7.466839,7.868699,7.453535,7.342618,7.493996,8.085367,7.177791,7.660766
3,121_at,6.578588,6.478339,6.742864,6.558411,6.314287,6.512833,6.626737,6.600509,6.525123,...,6.439403,6.703377,6.429105,6.446616,6.613078,6.406999,6.429651,6.554999,6.773787,6.442475
4,1255_g_at,3.455370,3.341523,3.316277,3.436467,3.559902,3.494685,3.514012,3.557127,3.357294,...,3.522626,3.597370,3.392545,3.432974,3.467480,3.323485,3.328946,3.366585,3.596141,3.724903
5,1294_at,9.522993,9.373580,9.240497,9.416922,9.370624,9.724590,9.185869,9.618335,9.562678,...,9.314895,9.694549,9.778225,10.047796,10.018283,9.695454,10.265788,10.021979,10.163301,9.895834
6,1316_at,5.679076,5.463477,5.535111,5.424751,5.606574,5.739798,5.594149,5.545216,5.762319,...,5.460147,5.541916,5.690536,5.310960,5.562798,5.637890,5.489237,5.843059,5.717703,5.695093
7,1320_at,6.003964,6.001544,6.122189,6.127270,6.098458,6.014337,6.236785,5.987300,6.171864,...,6.135524,5.985417,5.799154,6.010188,6.040142,5.904425,6.082417,6.046559,6.038845,6.055862
8,1405_i_at,10.254576,10.321505,10.094156,10.189178,9.979919,9.904687,10.169376,10.116100,9.968643,...,9.994074,9.477634,9.705880,9.602106,9.617950,9.831240,10.006492,10.129081,10.122253,10.283888
9,1431_at,5.062330,4.855556,5.082433,5.030144,4.902557,4.940250,4.787906,5.065020,4.960783,...,4.930939,4.836857,4.700422,4.689429,4.793325,4.552692,4.665486,4.789989,4.876204,4.715222


In [6]:
for c in celllines:
    # Tab-separated expression table of this cell line.
    df = pd.read_csv("MicroarrayData/Cellline" + c + "-BCNU/" + c + "_data.txt", sep="\t")
    print (df.shape)

    # "Unnamed: 0" is the column where the probe set ids are stored:
    # take it out of the frame and use its values as the row index.
    df.index = df.pop("Unnamed: 0").values

    # Relabelling rows with gene symbols via change_index_to_genes(df) is disabled here.
    save_obj(df, "MicroarrayData/" + c + "_unlabeled")

(54675, 31)
(54675, 31)
(54675, 31)
(54675, 31)


In [12]:
## relabel columns with metadata
for c in celllines:
    line = pd.read_pickle("MicroarrayData/" + c + "_unlabeled.pkl")

    # Time course shared by the regular layout: a single 0 h sample followed by
    # a pair of samples at each later timepoint.
    time_course = [0, 4, 4, 8, 8, 12, 12, 24, 24, 36, 36, 48, 48, 72, 72]

    # get replicate and timepoints (cell line "4" has its columns in a different order)
    if c == "4":
        replicates = [1]*14 + [2]*15 + [1]
        timepoints = [4, 4, 12, 12, 24, 24, 36, 36, 48, 48, 72, 72, 8, 8] + time_course + [0]
    else:
        replicates = [1]*15 + [2]*15
        timepoints = time_course + time_course

    # get treated/untreated: one untreated/treated pair per distinct timepoint,
    # not counting one of them (the 0 h sample is labelled separately below)
    base = ["untreated", "treated"]
    pairs = base * (len(np.unique(timepoints)) - 1)
    if c == "4":
        treated = pairs + ["untreated"] + pairs + ["untreated"]
    else:
        treated = (["untreated"] + pairs) * 2

    # make columns: "<cell line>_<replicate>_<timepoint>_<treated|untreated>"
    line.columns = [
        "_".join([str(c), str(replicate), str(timepoint), state])
        for replicate, timepoint, state in zip(replicates, timepoints, treated)
    ]
    save_obj(line, "MicroarrayData/" + c + "_processed")