In [1]:
%matplotlib inline

In [2]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Switch seaborn to its "talk" plotting context for the figures in this notebook.
sns.set_context(context="talk")

In [3]:
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

# Handle to the embedded R session; calling r("<R code>") evaluates the string in R.
r = robjects.r

# Load the R packages used by this notebook and keep Python handles to them.
nsn = importr("NanoStringNorm")
egr = importr("edgeR")

# R -> pandas converter. rpy2 3.x renamed pandas2ri.ri2py to pandas2ri.rpy2py,
# so pick whichever name this rpy2 installation provides (old name as fallback).
py = getattr(pandas2ri, "rpy2py", None) or pandas2ri.ri2py

# Read in RCC data

In [4]:
# Root folder of this comparison; NOTE(review): absolute, machine-specific path.
base_path = u'/home/gus/MEGAsync/zim/main/BCH/Projects/James/Nanostring_pipeline/OKT3/data/compare_ours_to_existing/treg2'

# The two RCC input folders are sub-folders of the shared base path.
our_rcc_path, thr_rcc_path = (base_path + leaf for leaf in (u"/ours", u"/theirs"))

In [5]:
# R expression template: read.markup.RCC is evaluated in R on one folder path.
rcc_reader_call = u"""read.markup.RCC(rcc.path="{PATH}")"""

# For each folder: run the R call, take the element named 'x' of the result,
# and convert it to a pandas object with `py`.
our_rcc = r(rcc_reader_call.format(PATH=our_rcc_path))
our_data = py(our_rcc.rx2('x'))

thr_rcc = r(rcc_reader_call.format(PATH=thr_rcc_path))
thr_data = py(thr_rcc.rx2('x'))

In [6]:
# Dimensions of our_data as (rows, columns).
our_data.shape

(608, 8)

In [7]:
# Dimensions of thr_data as (rows, columns).
thr_data.shape

(540, 6)

In [8]:
# Preview the first rows of our_data.
our_data.head()

Unnamed: 0,CodeClass,Name,Accession,BWH006_Treg_w0_12,BWH008_Treg_w0_01,MGH001_Treg_w0_04,MGH004_Treg_w0_01,MGH007_Treg_w0_09
1,Endogenous,VTN,NM_000638.3,2,2,3,0,0
2,Endogenous,HLA-DQB1,NM_002123.3,2,0,2,0,0
3,Endogenous,KIT,NM_000222.2,1,2,1,0,1
4,Endogenous,LAG3,NM_002286.5,1,0,4,2,1
5,Endogenous,SOCS3,NM_003955.3,1,1,5,0,2


In [9]:
# Distinct values of the Name column of each table, as Python sets.
our_genes, thr_genes = (set(tbl["Name"].unique()) for tbl in (our_data, thr_data))

In [10]:
# Number of Name values present in both tables.
len(our_genes & thr_genes)

520

## Join tables by gene names

- We will only deal with the genes that are in common between the tables from now on.
- That's 520, including the positive/negative controls, etc.

In [11]:
# Inner-join the two tables on the shared Name column. Other column names that
# occur in both tables receive the _OUR / _THR suffix.
cmb_data = our_data.merge(thr_data, how='inner', on="Name",
                          suffixes=('_OUR', '_THR'))

# Take Name out of the columns and use its values as the (unnamed) row index.
name_values = cmb_data.pop("Name").values
cmb_data.index = name_values
cmb_data.head()

Unnamed: 0,CodeClass_OUR,Accession_OUR,BWH006_Treg_w0_12,BWH008_Treg_w0_01,MGH001_Treg_w0_04,MGH004_Treg_w0_01,MGH007_Treg_w0_09,CodeClass_THR,Accession_THR,GSM1518464_CpTp,GSM1518469_CpTp,GSM1518474_CpTp
VTN,Endogenous,NM_000638.3,2,2,3,0,0,Endogenous,NM_000638.3,10,32,18
HLA-DQB1,Endogenous,NM_002123.3,2,0,2,0,0,Endogenous,NM_002123.3,11,22,14
KIT,Endogenous,NM_000222.2,1,2,1,0,1,Endogenous,NM_000222.1,17,94,31
SOCS3,Endogenous,NM_003955.3,1,1,5,0,2,Endogenous,NM_003955.3,305,1604,904
TCF7,Endogenous,NM_003202.2,3,3,2,5,16,Endogenous,NM_003202.2,134,182,199


In [12]:
# Boolean mask: True for rows whose two CodeClass labels differ.
code_class_missmatch = cmb_data["CodeClass_OUR"].ne(cmb_data["CodeClass_THR"])
# Number of rows flagged by the mask.
int(code_class_missmatch.sum())

15

15 rows have CodeClass designations that don't match up...

In [13]:
# Show both CodeClass labels for the rows where they disagree.
cmb_data.loc[code_class_missmatch, ['CodeClass_OUR', 'CodeClass_THR']]

Unnamed: 0,CodeClass_OUR,CodeClass_THR
EEF1G,Housekeeping,Endogenous
TUBB,Housekeeping,Endogenous
TBP,Housekeeping,Endogenous
POLR2A,Housekeeping,Endogenous
GUSB,Housekeeping,Endogenous
HPRT1,Housekeeping,Endogenous
GAPDH,Housekeeping,Endogenous
SDHA,Housekeeping,Endogenous
OAZ1,Housekeeping,Endogenous
PPIA,Housekeeping,Endogenous


Looks like it's not too big a deal.  The mismatched rows listed above are labelled Housekeeping in our table and Endogenous in theirs.  They just added some genes as housekeepers.

## OK, let's get rid of the duplicate (`_THR`) CodeClass and Accession columns and write to file

In [14]:
our_annotation = ['CodeClass_OUR', 'Accession_OUR']
thr_annotation = ['CodeClass_THR', 'Accession_THR']

# Keep an independent copy of all four annotation columns.
cmb_data_extra_cols = cmb_data[our_annotation + thr_annotation].copy()

# Working table: only the _THR annotation columns are removed here.
cmb_data_nsn = cmb_data.drop(thr_annotation, axis=1)

In [15]:
# Strip the _OUR suffix from the two remaining annotation columns.
plain_names = {'CodeClass_OUR': 'CodeClass', 'Accession_OUR': 'Accession'}
cmb_data_nsn = cmb_data_nsn.rename(columns=plain_names)

In [16]:
# Preview the first rows of cmb_data_nsn after the rename.
cmb_data_nsn.head()

Unnamed: 0,CodeClass,Accession,BWH006_Treg_w0_12,BWH008_Treg_w0_01,MGH001_Treg_w0_04,MGH004_Treg_w0_01,MGH007_Treg_w0_09,GSM1518464_CpTp,GSM1518469_CpTp,GSM1518474_CpTp
VTN,Endogenous,NM_000638.3,2,2,3,0,0,10,32,18
HLA-DQB1,Endogenous,NM_002123.3,2,0,2,0,0,11,22,14
KIT,Endogenous,NM_000222.2,1,2,1,0,1,17,94,31
SOCS3,Endogenous,NM_003955.3,1,1,5,0,2,305,1604,904
TCF7,Endogenous,NM_003202.2,3,3,2,5,16,134,182,199


### Move the index back into a column and reorder the first three columns

In [17]:
# Move the unnamed row index into a regular column called 'Name'.
# Guarded so that re-running this cell is harmless: without the check, a second
# run would reset the new integer index into another 'index' column and rename
# it, leaving two columns called 'Name'.
if 'Name' not in cmb_data_nsn.columns:
    cmb_data_nsn = cmb_data_nsn.reset_index().rename(columns={'index':'Name'})

In [18]:
# Preview the first rows of cmb_data_nsn after the index was moved into a column.
cmb_data_nsn.head()

Unnamed: 0,Name,CodeClass,Accession,BWH006_Treg_w0_12,BWH008_Treg_w0_01,MGH001_Treg_w0_04,MGH004_Treg_w0_01,MGH007_Treg_w0_09,GSM1518464_CpTp,GSM1518469_CpTp,GSM1518474_CpTp
0,VTN,Endogenous,NM_000638.3,2,2,3,0,0,10,32,18
1,HLA-DQB1,Endogenous,NM_002123.3,2,0,2,0,0,11,22,14
2,KIT,Endogenous,NM_000222.2,1,2,1,0,1,17,94,31
3,SOCS3,Endogenous,NM_003955.3,1,1,5,0,2,305,1604,904
4,TCF7,Endogenous,NM_003202.2,3,3,2,5,16,134,182,199


In [19]:
# Put the three annotation columns in CodeClass, Name, Accession order ...
annotation_part = cmb_data_nsn[['CodeClass', 'Name', 'Accession']]
# ... followed by every column from the fourth position onward, unchanged.
remaining_part = cmb_data_nsn.iloc[:, 3:]

cmb_data_nsn = pd.concat([annotation_part, remaining_part], axis=1)
cmb_data_nsn.head()

Unnamed: 0,CodeClass,Name,Accession,BWH006_Treg_w0_12,BWH008_Treg_w0_01,MGH001_Treg_w0_04,MGH004_Treg_w0_01,MGH007_Treg_w0_09,GSM1518464_CpTp,GSM1518469_CpTp,GSM1518474_CpTp
0,Endogenous,VTN,NM_000638.3,2,2,3,0,0,10,32,18
1,Endogenous,HLA-DQB1,NM_002123.3,2,0,2,0,0,11,22,14
2,Endogenous,KIT,NM_000222.2,1,2,1,0,1,17,94,31
3,Endogenous,SOCS3,NM_003955.3,1,1,5,0,2,305,1604,904
4,Endogenous,TCF7,NM_003202.2,3,3,2,5,16,134,182,199


# Finally write the file out

In [21]:
# Destination CSV lives directly under the base data folder.
cmb_data_path = base_path + u"/combo_nanostring_data_common_tags.csv"

# Write all columns with a header row; the row index is not written.
cmb_data_nsn.to_csv(cmb_data_path, index=False)