In [1]:
%matplotlib inline

In [2]:
import os

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply seaborn's "talk" plotting context to figures drawn in this notebook.
sns.set_context("talk")

In [3]:
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

# Shorthand handle to the embedded R session; used below to evaluate R code strings.
r = robjects.r

# Import the R packages through rpy2.
# NOTE(review): `nsn` and `egr` are not referenced by name in the cells visible here;
# presumably importing NanoStringNorm is what makes `read.markup.RCC` callable via
# `r(...)` below -- confirm before removing either line.
nsn = importr("NanoStringNorm")
egr = importr("edgeR")

# Alias for the rpy2 converter applied to R objects returned by `r(...)` below.
# NOTE(review): `ri2py` presumably only exists in older rpy2 releases -- confirm
# against the installed version.
py = pandas2ri.ri2py

In [4]:
# Label of the cell type this comparison is about.
# NOTE(review): not referenced by any cell visible here; the data path below
# hard-codes its own 'cd14' directory -- confirm whether the two should be linked.
cell_type = 'CD14'

# Read in RCC data

In [5]:
# Root directory holding the RCC files for this comparison.
# It can be overridden per machine with the NANOSTRING_BASE_PATH environment
# variable; when that is unset the original location is used, so existing
# runs behave exactly as before.
base_path = os.environ.get(
    'NANOSTRING_BASE_PATH',
    u'/home/gus/MEGAsync/zim/main/BCH/Projects/James/Nanostring_pipeline/OKT3/data/compare_ours_to_existing/cd14',
)

# Sub-directories with our RCC files and with the ones being compared against.
our_rcc_path = base_path + u"/ours"
thr_rcc_path = base_path + u"/theirs"

In [6]:
def _read_rcc_table(rcc_dir):
    """Read the RCC files under `rcc_dir` and return the converted 'x' element.

    Evaluates `read.markup.RCC` in the R session for the given directory,
    extracts the element named 'x' from the result, and converts it with `py`.
    """
    r_call = u"""read.markup.RCC(rcc.path="{PATH}")""".format(PATH=rcc_dir)
    return py(r(r_call).rx2('x'))


our_data = _read_rcc_table(our_rcc_path)
thr_data = _read_rcc_table(thr_rcc_path)

In [7]:
our_data.shape

(608, 9)

In [8]:
thr_data.shape

(540, 13)

In [9]:
our_data.head()

Unnamed: 0,CodeClass,Name,Accession,BWH006_CD14_w0_04,BWH008_CD14_w0_09,MGH001_CD14_w0_07,MGH002_CD14_w0_05,MGH004_CD14_w0_09,MGH007_CD14_w0_10
1,Endogenous,VTN,NM_000638.3,1,0,4,0,2,1
2,Endogenous,HLA-DQB1,NM_002123.3,8,8,2,11,0,2
3,Endogenous,KIT,NM_000222.2,2,0,3,1,3,2
4,Endogenous,LAG3,NM_002286.5,2,7,3,1,1,0
5,Endogenous,SOCS3,NM_003955.3,11,23,387,15,7,206


In [10]:
# Distinct values of the Name column in each table, as sets for comparison.
our_genes, thr_genes = (set(table.Name.unique()) for table in (our_data, thr_data))

In [11]:
len(our_genes.intersection(thr_genes))

520

## Join tables by gene names

- We will only deal with the genes that are in common between the tables from now on
- That's 520, including the pos/neg controls etc.

In [22]:
# Inner-join the two tables on the shared Name column; columns present in
# both tables are disambiguated with the _OUR / _THR suffixes.
# (All other merge options are left at their defaults.)
cmb_data = pd.merge(our_data, thr_data, how='inner', on="Name", suffixes=('_OUR', '_THR'))

# Use the gene name as the (unnamed) row index instead of a regular column.
cmb_data = cmb_data.set_index('Name')
cmb_data.index.name = None
cmb_data.head()

Unnamed: 0,CodeClass_OUR,Accession_OUR,BWH006_CD14_w0_04,BWH008_CD14_w0_09,MGH001_CD14_w0_07,MGH002_CD14_w0_05,MGH004_CD14_w0_09,MGH007_CD14_w0_10,CodeClass_THR,Accession_THR,GSM973583,GSM973584,GSM973585,GSM973586,GSM973587,GSM973588,GSM973589,GSM973590,GSM973591,GSM973592
VTN,Endogenous,NM_000638.3,1,0,4,0,2,1,Endogenous,NM_000638.3,20,44,36,27,22,23,9,22,27,20
HLA-DQB1,Endogenous,NM_002123.3,8,8,2,11,0,2,Endogenous,NM_002123.3,790,4686,4274,27,11,16,1778,2381,8212,2225
KIT,Endogenous,NM_000222.2,2,0,3,1,3,2,Endogenous,NM_000222.1,42,43,32,36,35,27,22,37,29,31
SOCS3,Endogenous,NM_003955.3,11,23,387,15,7,206,Endogenous,NM_003955.3,4265,8859,6741,5401,4226,2073,3250,6298,3883,4945
TCF7,Endogenous,NM_003202.2,0,1,3,1,0,1,Endogenous,NM_003202.2,44,81,59,138,38,44,30,58,62,33


In [23]:
# Boolean mask of rows where the two sources disagree on CodeClass,
# followed by the number of such rows.
code_class_missmatch = cmb_data['CodeClass_OUR'].ne(cmb_data['CodeClass_THR'])
code_class_missmatch.sum()

15

15 rows have CodeClass designations that don't match up...

In [24]:
cmb_data[code_class_missmatch][['CodeClass_OUR','CodeClass_THR']]

Unnamed: 0,CodeClass_OUR,CodeClass_THR
EEF1G,Housekeeping,Endogenous
TUBB,Housekeeping,Endogenous
TBP,Housekeeping,Endogenous
POLR2A,Housekeeping,Endogenous
GUSB,Housekeeping,Endogenous
HPRT1,Housekeeping,Endogenous
GAPDH,Housekeeping,Endogenous
SDHA,Housekeeping,Endogenous
OAZ1,Housekeeping,Endogenous
PPIA,Housekeeping,Endogenous


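Looks like it's not too big a deal.  The mismatches listed above are genes that our data labels as Housekeeping while theirs labels them as Endogenous, i.e. some genes were just added as housekeepers.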

## OK, let's get rid of the redundant (THR) CodeClass and Accession columns and write to file

In [25]:
# Keep a separate copy of the annotation columns from both sources.
our_annotation_cols = ['CodeClass_OUR', 'Accession_OUR']
thr_annotation_cols = ['CodeClass_THR', 'Accession_THR']
cmb_data_extra_cols = cmb_data.loc[:, our_annotation_cols + thr_annotation_cols].copy()

# Working table: drop the THR annotation columns, keep the OUR ones.
cmb_data_nsn = cmb_data.drop(labels=thr_annotation_cols, axis=1)

In [30]:
# Strip the _OUR suffix from the remaining annotation columns.
# Reassigning (instead of inplace=True) leaves the previous frame object untouched.
cmb_data_nsn = cmb_data_nsn.rename(
    columns={'CodeClass_OUR': 'CodeClass', 'Accession_OUR': 'Accession'}
)

In [31]:
cmb_data_nsn.head()

Unnamed: 0,CodeClass,Accession,BWH006_CD14_w0_04,BWH008_CD14_w0_09,MGH001_CD14_w0_07,MGH002_CD14_w0_05,MGH004_CD14_w0_09,MGH007_CD14_w0_10,GSM973583,GSM973584,GSM973585,GSM973586,GSM973587,GSM973588,GSM973589,GSM973590,GSM973591,GSM973592
VTN,Endogenous,NM_000638.3,1,0,4,0,2,1,20,44,36,27,22,23,9,22,27,20
HLA-DQB1,Endogenous,NM_002123.3,8,8,2,11,0,2,790,4686,4274,27,11,16,1778,2381,8212,2225
KIT,Endogenous,NM_000222.2,2,0,3,1,3,2,42,43,32,36,35,27,22,37,29,31
SOCS3,Endogenous,NM_003955.3,11,23,387,15,7,206,4265,8859,6741,5401,4226,2073,3250,6298,3883,4945
TCF7,Endogenous,NM_003202.2,0,1,3,1,0,1,44,81,59,138,38,44,30,58,62,33


### Move the index into a column and reorder the first three columns

In [34]:
# Move the gene names from the row index into a regular 'Name' column.
# The guard makes this cell safe to re-run: without it, a second execution
# would insert the integer RangeIndex as another column that is also
# renamed to 'Name', silently producing two 'Name' columns.
if 'Name' not in cmb_data_nsn.columns:
    cmb_data_nsn = cmb_data_nsn.reset_index().rename(columns={'index': 'Name'})

In [35]:
cmb_data_nsn.head()

Unnamed: 0,Name,CodeClass,Accession,BWH006_CD14_w0_04,BWH008_CD14_w0_09,MGH001_CD14_w0_07,MGH002_CD14_w0_05,MGH004_CD14_w0_09,MGH007_CD14_w0_10,GSM973583,GSM973584,GSM973585,GSM973586,GSM973587,GSM973588,GSM973589,GSM973590,GSM973591,GSM973592
0,VTN,Endogenous,NM_000638.3,1,0,4,0,2,1,20,44,36,27,22,23,9,22,27,20
1,HLA-DQB1,Endogenous,NM_002123.3,8,8,2,11,0,2,790,4686,4274,27,11,16,1778,2381,8212,2225
2,KIT,Endogenous,NM_000222.2,2,0,3,1,3,2,42,43,32,36,35,27,22,37,29,31
3,SOCS3,Endogenous,NM_003955.3,11,23,387,15,7,206,4265,8859,6741,5401,4226,2073,3250,6298,3883,4945
4,TCF7,Endogenous,NM_003202.2,0,1,3,1,0,1,44,81,59,138,38,44,30,58,62,33


In [39]:
# Put the three annotation columns first, in the order CodeClass, Name,
# Accession, followed by every remaining column in its existing order.
# Selecting by name rather than by position (`iloc[:, 3:]`) means the result
# does not depend on the annotation columns occupying positions 0-2.
lead_cols = ['CodeClass', 'Name', 'Accession']
other_cols = [col for col in cmb_data_nsn.columns if col not in lead_cols]
cmb_data_nsn = cmb_data_nsn[lead_cols + other_cols]
cmb_data_nsn.head()

Unnamed: 0,CodeClass,Name,Accession,BWH006_CD14_w0_04,BWH008_CD14_w0_09,MGH001_CD14_w0_07,MGH002_CD14_w0_05,MGH004_CD14_w0_09,MGH007_CD14_w0_10,GSM973583,GSM973584,GSM973585,GSM973586,GSM973587,GSM973588,GSM973589,GSM973590,GSM973591,GSM973592
0,Endogenous,VTN,NM_000638.3,1,0,4,0,2,1,20,44,36,27,22,23,9,22,27,20
1,Endogenous,HLA-DQB1,NM_002123.3,8,8,2,11,0,2,790,4686,4274,27,11,16,1778,2381,8212,2225
2,Endogenous,KIT,NM_000222.2,2,0,3,1,3,2,42,43,32,36,35,27,22,37,29,31
3,Endogenous,SOCS3,NM_003955.3,11,23,387,15,7,206,4265,8859,6741,5401,4226,2073,3250,6298,3883,4945
4,Endogenous,TCF7,NM_003202.2,0,1,3,1,0,1,44,81,59,138,38,44,30,58,62,33


# Finally write the file out

In [42]:
# Destination for the combined table, inside the data directory.
# NOTE(review): writing the legacy .xls format depends on the Excel writer
# engine available to the installed pandas -- confirm before upgrading pandas.
cmb_data_path = base_path + u"/combo_nanostring_data_common_tags.xls"

# Header row and all columns are written (the defaults); the row index is not.
cmb_data_nsn.to_excel(cmb_data_path, index=False)