# Split-seq + UNCURL

Data source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE110823

In [2]:
import numpy as np
import pandas as pd
import scipy.io
import scipy.sparse

In [2]:
# Unpack GSE110823_RAW.tar (downloaded from the GEO accession linked above) into the working directory.
!tar -xf GSE110823_RAW.tar

In [3]:
# List the working directory to see which files the archive contained.
!ls

2018-04-19+AICS+Analysis.ipynb
2018-05-11-split-seq-loading.ipynb
GSE110823_RAW.tar
GSM3017260_100_CNS_nuclei.mat.gz
GSM3017261_150000_CNS_nuclei.mat
GSM3017262_same_day_cells_nuclei_3000_UBCs.mat.gz
GSM3017263_same_day_cells_nuclei_300_UBCs.mat.gz
GSM3017264_frozen_preserved_cells_nuclei_1000_UBCs.mat.gz
GSM3017265_frozen_preserved_cells_nuclei_200_UBCs.mat.gz


In [6]:
# Decompress the 150000-nuclei CNS dataset; the load cell reads the resulting .mat file.
# NOTE(review): this cell's execution count is later than the load cell's, so the
# notebook was run out of order — on a fresh kernel this must run before loadmat.
!gunzip GSM3017261_150000_CNS_nuclei.mat.gz

In [4]:
# Load the MATLAB file into a dict-like object; later cells read the keys
# 'DGE', 'genes', 'sample_type', 'cluster_assignment' and 'spinal_cluster_assignment'.
mat = scipy.io.loadmat('GSM3017261_150000_CNS_nuclei.mat')

In [32]:
# Re-compress the .mat file now that its contents have been loaded.
!gzip GSM3017261_150000_CNS_nuclei.mat

In [5]:
# Transpose the stored expression matrix and hold it in CSC format, so that
# columns line up with the per-cell annotations used for column slicing below.
DGE = scipy.sparse.csc_matrix(mat['DGE'].transpose())

In [6]:
# Show the matrix dimensions after transposing (recorded run: 26894 rows x 156049 columns).
DGE.shape

(26894, 156049)

In [7]:
# Gene names as a Series, with the space padding removed from both ends of each name.
genes = (
    pd.Series(mat['genes'])
    .str.strip(' ')
)

In [8]:
# Display the cleaned gene-name Series (pandas truncates the middle of long Series).
genes

0        0610005C13Rik
1        0610007N19Rik
2        0610007P14Rik
3        0610008F07Rik
4        0610009B14Rik
5        0610009B22Rik
6        0610009D07Rik
7        0610009E02Rik
8        0610009L18Rik
9        0610009O20Rik
10       0610010F05Rik
11       0610010K14Rik
12       0610011F06Rik
13       0610012D04Rik
14       0610012H03Rik
15       0610025J13Rik
16       0610030E20Rik
17       0610031J06Rik
18       0610037L13Rik
19       0610038L08Rik
20       0610038P03Rik
21       0610039K10Rik
22       0610040B09Rik
23       0610040B10Rik
24       0610040F04Rik
25       0610040J01Rik
26       0610043K17Rik
27       1010001B22Rik
28       1010001N08Rik
29       1100001G20Rik
             ...      
26864          n-R5s29
26865           n-R5s3
26866          n-R5s30
26867          n-R5s33
26868          n-R5s40
26869          n-R5s41
26870          n-R5s45
26871          n-R5s47
26872          n-R5s48
26873          n-R5s54
26874          n-R5s56
26875          n-R5s58
26876      

In [9]:
# Per-cell sample label as a Series, with space padding stripped from both ends.
sample_type = (
    pd.Series(mat['sample_type'])
    .str.strip(' ')
)

In [10]:
# Display the cleaned sample-type labels (one entry per column of DGE).
sample_type

0          p2_brain
1         p11_brain
2         p11_brain
3          p2_brain
4          p2_brain
5          p2_spine
6          p2_spine
7          p2_brain
8          p2_brain
9          p2_brain
10         p2_brain
11         p2_brain
12         p2_brain
13         p2_brain
14        p11_brain
15        p11_brain
16        p11_spine
17        p11_spine
18        p11_brain
19        p11_brain
20         p2_brain
21         p2_brain
22         p2_spine
23         p2_spine
24         p2_brain
25        p11_brain
26        p11_brain
27        p11_brain
28        p11_brain
29        p11_spine
            ...    
156019     p2_brain
156020    p11_brain
156021    p11_brain
156022    p11_brain
156023    p11_brain
156024    p11_spine
156025    p11_spine
156026    p11_brain
156027    p11_brain
156028     p2_brain
156029     p2_brain
156030     p2_brain
156031     p2_spine
156032     p2_brain
156033     p2_brain
156034     p2_brain
156035    p11_brain
156036    p11_brain
156037    p11_brain


In [11]:
# List the distinct sample-type labels; the masks built later compare against these exact strings.
sample_type.unique()

array(['p2_brain', 'p11_brain', 'p2_spine', 'p11_spine'], dtype=object)

In [12]:
# Cluster label for every cell, with space padding stripped from both ends.
cluster_assignment = (
    pd.Series(mat['cluster_assignment'])
    .str.strip(' ')
)

In [13]:
# Spinal-cord cluster label for every cell, with space padding stripped from both ends.
spinal_cluster_assignment = (
    pd.Series(mat['spinal_cluster_assignment'])
    .str.strip(' ')
)

In [14]:
# One boolean mask per sample type (age x tissue), each aligned with sample_type's index.
p2_brain, p11_brain, p2_spine, p11_spine = (
    sample_type == label
    for label in ('p2_brain', 'p11_brain', 'p2_spine', 'p11_spine')
)

In [15]:
# Number of cells labelled 'p2_brain' (count of True values in the mask).
p2_brain.sum()

74862

In [16]:
# Number of cells labelled 'p11_brain' (count of True values in the mask).
p11_brain.sum()

58573

In [17]:
# Number of cells labelled 'p2_spine' (count of True values in the mask).
p2_spine.sum()

7028

In [18]:
# Number of cells labelled 'p11_spine' (count of True values in the mask).
p11_spine.sum()

15586

In [19]:
# Re-derive the spinal-cord cluster labels. Unlike the earlier definition this uses the
# default strip(), which removes all leading/trailing whitespace rather than spaces only.
spinal_cluster_assignment = (
    pd.Series(mat['spinal_cluster_assignment'])
    .str.strip()
)

In [20]:
# Re-derive the per-cell cluster labels (same expression as the earlier definition).
cluster_assignment = (
    pd.Series(mat['cluster_assignment'])
    .str.strip(' ')
)

In [21]:
# Count the cells whose cluster label is exactly '53 Unresolved'.
(cluster_assignment == '53 Unresolved').sum()

61044

In [24]:
# Drop the reference to the loaded .mat contents; the pieces used from here on
# (DGE, genes, sample_type and the cluster assignments) were extracted in earlier cells.
del mat

# Data Notes

p2 = postnatal day 2 mouse

p11 = postnatal day 11 mouse

# Writing out spinal cord data as mtx, and saving gene names and labels

In [26]:
# Boolean NumPy mask used to select columns of DGE: True where the sample is
# spinal cord at either age. `.values` replaces `Series.as_matrix()`, which was
# deprecated in pandas 0.23 and removed in 1.0; `.values` works on old and new pandas.
spine_indices = (p2_spine | p11_spine).values

In [40]:
# Number of cells selected by the spinal-cord mask (p2_spine plus p11_spine).
spine_indices.sum()

22614

In [28]:
# Select the spinal cord columns of DGE with the boolean mask; all rows are kept.
# (The file is written in a later cell — this cell only slices.)

spinal_cord_data = DGE[:, spine_indices]

In [29]:
# Check the slice: the column count should equal the number of True entries in spine_indices.
spinal_cord_data.shape

(26894, 22614)

In [30]:
# Write the spinal cord submatrix to 'spinal_cord.mtx' in Matrix Market format.
scipy.io.mmwrite('spinal_cord.mtx', spinal_cord_data)

In [31]:
# Compress the Matrix Market file that was just written.
!gzip spinal_cord.mtx

In [35]:
# Write one gene name per line to genes.txt, without the index.
# `header=False` is passed explicitly: pandas >= 0.24 changed the Series.to_csv
# default to emit a header row, which would put a spurious first line in the file
# and break the line-per-row alignment with the matrix.
genes.to_csv('genes.txt', index=False, header=False, sep=' ')

In [43]:
# Write the cluster label of each spinal cord cell, one per line, in the same
# order as the columns of spinal_cord_data (both are selected with spine_indices).
# `header=False` is passed explicitly: pandas >= 0.24 changed the Series.to_csv
# default to emit a header row, which would shift labels by one line.
# NOTE(review): with sep=' ', any label that contains a space is wrapped in quotes
# by the CSV writer — confirm downstream readers expect that.
spinal_cluster_assignment[spine_indices].to_csv(
    'spinal_cluster_assignment.txt', index=False, header=False, sep=' '
)