In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the GSE80655 expression table. `read_table` uses a tab separator by
# default; the first column of the file becomes the row index.
data = pd.read_table(
    "../../data/input/GSE80655_GeneExpressionData_Updated_3-26-2018.txt",
    index_col=0,
)

In [3]:
# Preview the first rows of the expression table (rows are indexed by the
# file's first column, columns are the sample names).
data.head()

Unnamed: 0_level_0,SL31501,SL31502,SL31503,SL31504,SL31505,SL31506,SL31507,SL31508,SL32225,SL32226,...,SL7849,SL7851,SL7852,SL7853,SL7854,SL7855,SL7856,SL7857,SL7858,SL8435
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972,0,0,0,0,0,0,0,0,3,0,...,1,0,0,0,0,2,0,1,1,0
ENSG00000227232,93,80,116,50,233,113,119,146,225,156,...,147,166,118,74,88,194,121,83,418,50
ENSG00000243485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000268020,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0


In [4]:
# (number of rows, number of sample columns) of the expression table.
data.shape

(57905, 281)

In [5]:
# Read the whole SOFT family file into a single string. The context manager
# closes the file handle as soon as the text has been read, instead of
# leaving an open handle for the garbage collector to clean up.
with open("../../data/input/GSE80655_family.soft") as soft_file:
    metadata = soft_file.read()

In [6]:
# Split the SOFT text into one chunk per sample record; the slice drops
# everything that precedes the first "^SAMPLE" marker.
metadata_split = metadata.split("^SAMPLE")[1:]

In [7]:
# Number of sample records found; this should agree with the number of
# sample columns in the expression table.
len(metadata_split)

281

In [8]:
# Build {sample description: (human_id, region, diagnosis)} from the
# per-sample chunks of the SOFT text.
# NOTE(review): fields are picked purely by position (entries 0, 11, 13 and
# 21 of each record, and characters 15-19 of entry 0), so this presumably
# relies on every sample record having an identical layout — confirm
# against the file.
name_to_region_diagnosis = {}
for sample_text in metadata_split:
    # Each record is broken on "\n!"; the text before the first "!" line is
    # discarded.
    fields = sample_text.split("\n!")[1:]
    subject_id = fields[0][15:20]
    brain_region = fields[11].split(": ")[-1]
    diagnosis_label = fields[13].split(": ")[-1]
    sample_name = fields[21].split(" = ")[-1]
    name_to_region_diagnosis[sample_name] = (
        subject_id,
        brain_region,
        diagnosis_label,
    )

In [9]:
# One row per sample (keyed by its description), one column per parsed field.
metadata = pd.DataFrame.from_dict(
    name_to_region_diagnosis,
    orient="index",
    columns=["human_id", "region", "diagnosis"],
)
metadata.head()

Unnamed: 0,human_id,region,diagnosis
SL31501,X1834,AnCg,Control
SL31502,X2315,AnCg,Major Depression
SL31503,X2566,AnCg,Bipolar Disorder
SL31504,X3031,AnCg,Major Depression
SL31505,X2353,nAcc,Schizophrenia


In [10]:
# Keep only the two groups of interest; samples with any other diagnosis
# are dropped.
is_control_or_scz = metadata["diagnosis"].isin(["Control", "Schizophrenia"])
metadata = metadata.loc[is_control_or_scz]
metadata.head()

Unnamed: 0,human_id,region,diagnosis
SL31501,X1834,AnCg,Control
SL31505,X2353,nAcc,Schizophrenia
SL31506,X2664,nAcc,Control
SL31507,X2805,nAcc,Control
SL31508,X2976,nAcc,Schizophrenia


In [11]:
# `data` has samples as columns, so transpose it to samples-as-rows before
# joining on the sample-name index. `join` is a left join by default (spelled
# out here), so the result has exactly the rows of `metadata`.
data_with_metadata = metadata.join(data.transpose(), how="left")
data_with_metadata.head()

Unnamed: 0,human_id,region,diagnosis,ENSG00000223972,ENSG00000227232,ENSG00000243485,ENSG00000237613,ENSG00000268020,ENSG00000240361,ENSG00000186092,...,ENSG00000198886,ENSG00000210176,ENSG00000210184,ENSG00000210191,ENSG00000198786,ENSG00000198695,ENSG00000210194,ENSG00000198727,ENSG00000210195,ENSG00000210196
SL31501,X1834,AnCg,Control,0,93,0,0,1,0,0,...,556653,1,0,0,119564,21277,0,195813,13,186
SL31505,X2353,nAcc,Schizophrenia,0,233,0,0,0,0,0,...,1290628,0,1,0,324555,47014,0,541391,43,613
SL31506,X2664,nAcc,Control,0,113,0,0,0,0,0,...,840452,2,0,0,242680,42762,2,448690,29,362
SL31507,X2805,nAcc,Control,0,119,0,0,0,0,0,...,1004983,0,0,1,372854,69277,1,528267,24,634
SL31508,X2976,nAcc,Schizophrenia,0,146,0,0,0,0,0,...,1073688,0,1,1,287422,42661,0,494724,38,549


In [12]:
# Map the short region codes found in the sample metadata to longer region
# labels.
# NOTE(review): two labels carry a numeric prefix ("34 ...", "56 ...") while
# "Cingulate Anterior" does not — presumably these have to match names used
# elsewhere; confirm the AnCg label is intentional.
replace_dict = {
    "DLPFC": "34 Dorsolateral Prefrontal (BA9)",
    "nAcc": "56 Nucleus Accumbens",
    "AnCg": "Cingulate Anterior",
}

In [13]:
# Translate the region codes to their long labels. The result is assigned
# back to the column rather than calling `.replace(..., inplace=True)` on
# `data_with_metadata["region"]`: that chained in-place form acts on an
# intermediate Series, which pandas warns about and which, with
# Copy-on-Write enabled, leaves the DataFrame itself unchanged.
data_with_metadata["region"] = data_with_metadata["region"].replace(replace_dict)
data_with_metadata.head()

Unnamed: 0,human_id,region,diagnosis,ENSG00000223972,ENSG00000227232,ENSG00000243485,ENSG00000237613,ENSG00000268020,ENSG00000240361,ENSG00000186092,...,ENSG00000198886,ENSG00000210176,ENSG00000210184,ENSG00000210191,ENSG00000198786,ENSG00000198695,ENSG00000210194,ENSG00000198727,ENSG00000210195,ENSG00000210196
SL31501,X1834,Cingulate Anterior,Control,0,93,0,0,1,0,0,...,556653,1,0,0,119564,21277,0,195813,13,186
SL31505,X2353,56 Nucleus Accumbens,Schizophrenia,0,233,0,0,0,0,0,...,1290628,0,1,0,324555,47014,0,541391,43,613
SL31506,X2664,56 Nucleus Accumbens,Control,0,113,0,0,0,0,0,...,840452,2,0,0,242680,42762,2,448690,29,362
SL31507,X2805,56 Nucleus Accumbens,Control,0,119,0,0,0,0,0,...,1004983,0,0,1,372854,69277,1,528267,24,634
SL31508,X2976,56 Nucleus Accumbens,Schizophrenia,0,146,0,0,0,0,0,...,1073688,0,1,1,287422,42661,0,494724,38,549


In [14]:
# Write the merged metadata + expression table to the intermediate data
# directory; the sample-name index is written as the first CSV column
# (pandas default).
data_with_metadata.to_csv("../../data/intermediate/GSE80655.csv")

# GSE78936

In [15]:
# Load the GSE78936 FPKM matrix (tab-separated; `read_table` defaults to a
# tab separator). This rebinds `data`, replacing the GSE80655 table.
# NOTE(review): no index column is requested, yet the preview and
# `data.columns` show only the sample columns, so pandas appears to be
# inferring the row index from the file layout — confirm, and consider
# making it explicit.
data = pd.read_table("../../data/input/GSE78936_fpkmMatrix.txt")

  data = pd.read_csv("../../data/input/GSE78936_fpkmMatrix.txt", sep="\t")


In [17]:
# Preview the first rows of the FPKM matrix.
data.head()

Unnamed: 0,BA11_1,BA11_2,BA11_3,BA11_4,BA11_5,BA11_6,BA11_7,BA11_8,BA11_9,BA11_10,...,BA9_10,BA9_11,BA9_12,BA9_13,BA9_14,BA9_15,BA9_16,BA9_17,BA9_18,BA9_19
1,3.909188,1.968322,2.720665,3.229451,2.349099,3.129997,2.703979,3.011143,3.017315,0.377656,...,1.601336,0.91061,0.354513,1.343493,1.20569,1.947324,1.71646,1.162038,1.670975,1.428307
10,0.0,0.251369,0.0,0.0,0.251997,0.0,0.0,0.278464,0.0,0.0,...,0.113014,0.0,0.237688,0.120102,0.0,0.118692,0.0,0.239724,0.0,0.0
100,4.286465,0.864368,0.224016,5.753298,1.191477,0.5306,1.558494,0.59846,0.119937,0.65301,...,0.388616,1.049697,0.612993,0.825974,0.496374,2.040692,0.312416,0.515204,0.506895,0.484255
1000,8.394588,4.736731,1.764683,4.018277,2.337188,3.852677,3.487385,3.771491,5.586674,1.118278,...,6.022812,7.709723,8.46797,9.759894,12.478585,7.828082,7.169154,8.43464,5.833343,8.392375
10000,8.761966,5.637057,1.449253,4.473051,2.486505,6.444563,4.119753,4.296327,5.331356,1.703467,...,10.279508,14.944918,11.854477,6.593274,9.074382,7.623143,8.280194,16.772853,6.600951,10.429361


In [18]:
# List the sample column names; the prefixes (BA11_, BA24_, BA9_) are the
# only grouping information visible in this table.
data.columns

Index(['BA11_1', 'BA11_2', 'BA11_3', 'BA11_4', 'BA11_5', 'BA11_6', 'BA11_7',
       'BA11_8', 'BA11_9', 'BA11_10', 'BA11_11', 'BA11_12', 'BA11_13',
       'BA11_14', 'BA11_15', 'BA11_16', 'BA11_17', 'BA11_18', 'BA11_19',
       'BA11_20', 'BA11_21', 'BA11_22', 'BA11_23', 'BA11_24', 'BA11_25',
       'BA11_26', 'BA11_27', 'BA11_28', 'BA11_29', 'BA11_30', 'BA11_31',
       'BA11_32', 'BA11_33', 'BA11_34', 'BA11_35', 'BA11_36', 'BA11_37',
       'BA11_38', 'BA11_39', 'BA11_40', 'BA11_41', 'BA11_42', 'BA11_43',
       'BA11_44', 'BA24_1', 'BA24_2', 'BA24_3', 'BA24_4', 'BA24_5', 'BA24_6',
       'BA24_7', 'BA24_8', 'BA24_9', 'BA24_10', 'BA24_11', 'BA24_12',
       'BA24_13', 'BA24_14', 'BA24_15', 'BA24_16', 'BA24_17', 'BA24_18',
       'BA24_19', 'BA9_1', 'BA9_2', 'BA9_3', 'BA9_4', 'BA9_5', 'BA9_6',
       'BA9_7', 'BA9_8', 'BA9_9', 'BA9_10', 'BA9_11', 'BA9_12', 'BA9_13',
       'BA9_14', 'BA9_15', 'BA9_16', 'BA9_17', 'BA9_18', 'BA9_19'],
      dtype='object')

In [2]:
# NOTE(review): this cell fails with the UnicodeDecodeError shown below —
# `read_csv` decodes the file as UTF-8 text and byte 0x8c at position 8 is
# not valid UTF-8, so the .CEL file is presumably a binary format. A later
# cell reads a CEL file with Bio.Affy.CelFile instead; consider removing
# this cell so the notebook runs top to bottom.
data = pd.read_csv("../../data/input/GSM1304918_8_SCZ_HPC.CEL")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8c in position 8: invalid start byte

In [13]:
from Bio.Affy import CelFile
import gzip

# Parse one gzip-compressed CEL file with Biopython's CelFile reader. "rb"
# is gzip.open's default mode; it is spelled out to make the binary read
# explicit.
with gzip.open(
    "../../data/input/GSM1304950_4_C_B46.CEL.gz",
    mode="rb",
) as handle:
    c = CelFile.read(handle)

In [15]:
# Array dimensions reported by the parsed CEL record (columns, then rows).
print(c.ncols, c.nrows)

1164 1164


In [16]:
# Intensity matrix of the parsed CEL record.
# NOTE(review): the printed values range from ~1e-39 to ~1e+29 and include
# negatives, which looks more like misdecoded bytes than intensities —
# verify the parse before using these numbers.
print(c.intensities)

[[ 4.61442085e-08  4.53892426e-08  4.61395331e-08 ...  4.53583198e-08
   2.30810274e+23  2.32043900e+23]
 [ 2.31236854e+23 -2.04439552e+08  4.55092959e-08 ... -4.03347484e-23
   6.16194655e-39  4.58292106e-08]
 [ 6.12284472e-39  4.52091626e-08  6.11782247e-39 ...  6.15548937e-39
   6.11782247e-39  2.29744974e+23]
 ...
 [ 4.54310793e-08  6.11208275e-39 -4.03040522e-23 ... -2.04865536e+08
  -2.04763136e+08  6.11495261e-39]
 [-2.05076480e+08  2.29652740e+23  4.53546818e-08 ...  6.13360669e-39
   2.34006244e+23 -2.04177408e+08]
 [-4.09876223e-23  2.29910995e+23  6.44632046e-39 ... -5.13203308e+02
   1.45002424e+14 -1.63770820e+29]]


In [17]:
# Standard-deviation matrix of the parsed CEL record.
# NOTE(review): almost every printed value is ~1e-23 and the last row has
# entries such as -2.7e+36, which is not plausible for standard deviations —
# verify the parse.
print(c.stdevs)

[[ 1.01876487e-23  1.00322305e-23  1.01872259e-23 ...  1.00027460e-23
   1.00417623e-23  1.00811422e-23]
 [ 1.00579182e-23  1.00054920e-23  1.00431357e-23 ...  1.00193863e-23
   1.00206792e-23  1.01035183e-23]
 [ 1.00069467e-23  9.94943165e-24  1.00020999e-23 ...  1.00142168e-23
   9.99079084e-24  1.00001609e-23]
 ...
 [ 1.00229409e-23  9.98432849e-24  1.00113083e-23 ...  1.00333609e-23
   1.00127622e-23  9.99402201e-24]
 [ 1.00325531e-23  9.99208299e-24  1.00092084e-23 ...  1.00085623e-23
   1.01713374e-23  1.00103388e-23]
 [ 1.01683611e-23  1.00082389e-23  1.01671423e-23 ... -5.13781433e+02
   5.66432629e+11 -2.67630546e+36]]


In [18]:
# Pixel-count matrix of the parsed CEL record.
# NOTE(review): the printed corners are zero except for the final row, which
# contains a negative count (-6652) — another hint that the file is not
# being decoded correctly; verify.
print(c.npix)

[[    0     0     0 ...     0     0     0]
 [    0     0     0 ...     0     0     0]
 [    0     0     0 ...     0     0     0]
 ...
 [    0     0     0 ...     0     0     0]
 [    0     0     0 ...     0     0     0]
 [    0     0     0 ...  5635 -6652 20739]]
