## Datasets used
1. Synthetic data
2. [Titanic survivors](https://www.kaggle.com/c/titanic/data)
3. [Breast Cancer Wisconsin (Prognostic) Data Set](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Prognostic)) (loaded below from `wpbc.data`)
4. Harmonizome
  + ProteomicsDB Cell Type and Tissue Protein Expression Profiles ([Gene-Attribute Matrix Cleaned](http://amp.pharm.mssm.edu/static/hdfs/harmonizome/data/proteomicsdb/gene_attribute_matrix_cleaned.txt.gz))
  + [HGNC gene family](http://www.genenames.org/cgi-bin/genefamilies/download-all/tsv)


In [1]:
import pandas as pd
from utils import *


In [2]:
## Synthetic data
# Two-class toy problem with a 10%/90% class weighting and no label flipping.
# NOTE(review): make_classification is not imported explicitly in this
# notebook; it presumably arrives through `from utils import *` -- confirm.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10)
# Show the feature-matrix and label-vector shapes on one line.
print("%s %s" % (X.shape, y.shape))

(1000, 20) (1000,)


In [3]:
## Load Titanic
# The path is relative to the notebook's working directory.
titanic_csv = 'datasets/Titanic/train.csv'
titanic = pd.read_csv(titanic_csv)
# Report (rows, columns), then preview the first rows as the cell output.
print(titanic.shape)
titanic.head()


(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [4]:
## Examine class ratio
# 'Survived' is a 0/1 column, so its sum is the number of survivors.
n_survived = titanic['Survived'].sum()
print(n_survived)
# Fraction of passengers that survived (float() forces true division).
print(n_survived / float(len(titanic)))

342
0.383838383838


In [5]:
## Process features
from sklearn.preprocessing import LabelEncoder

# String-valued columns that must become integer codes before the frame is
# converted to a numeric array.
CATEGORICAL_COLS = ['Sex', 'Cabin', 'Embarked']
# Explicit category used for absent values in the categorical columns.
MISSING_LABEL = 'missing'

le = LabelEncoder()
for col in CATEGORICAL_COLS:
    # Missing entries (e.g. the blank Cabin cells visible in titanic.head())
    # are float NaN mixed into a column of strings. Fitting the encoder on
    # that mix forces str-vs-float comparisons while the unique labels are
    # computed, which appears to be what produced the numpy warning captured
    # under this cell and makes the resulting codes unreliable. Fold all
    # missing values into one explicit string category before encoding.
    # Note: `le` is refit on every pass, so after the loop it only holds the
    # classes of the last column.
    titanic[col] = le.fit_transform(titanic[col].fillna(MISSING_LABEL))

## Split df into X and y and convert to numpy.array
# Drop the identifier, the label and the free-text columns; everything that
# remains is used as a feature.
# NOTE(review): numeric columns are passed through unchanged, so any missing
# values they contain (possibly Age) remain NaN in X -- confirm that the
# downstream code handles them.
X = titanic.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1).values
y = titanic['Survived'].values
print("%s %s" % (X.shape, y.shape))

(891, 8) (891,)


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [6]:
## Load breast cancer data
# Column names are supplied explicitly: an identifier, the outcome label and
# 33 generically named attribute columns (Attr0 .. Attr32).
bc_columns = ['ID', 'outcome'] + ['Attr%s' % i for i in range(33)]
bc = pd.read_csv('datasets/wpbc.data', names=bc_columns)
# Report (rows, columns), then preview the first rows as the cell output.
print(bc.shape)
bc.head()

(198, 35)


Unnamed: 0,ID,outcome,Attr0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,...,Attr23,Attr24,Attr25,Attr26,Attr27,Attr28,Attr29,Attr30,Attr31,Attr32
0,119513,N,31,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5
1,8423,N,61,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,2
2,842517,N,116,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,0
3,843483,N,123,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,0
4,843584,R,27,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,0


In [7]:
# Class balance of the outcome label.
outcome_counts = bc['outcome'].value_counts()
print(outcome_counts)

N    151
R     47
dtype: int64


In [32]:
## Split df into X and y and convert to numpy.array
# Features: every column except the identifier and the label.
X = bc.drop(['ID', 'outcome'], axis=1).values
# Label: 'N' is encoded as 0 and 'R' as 1.
y = bc['outcome'].map({'N':0, 'R':1}).values

print("%s %s" % (X.shape, y.shape))
# Fraction of samples carrying the positive (1) label.
print(y.sum() / float(len(y)))

(198, 33) (198,)
0.237373737374


In [8]:
## Load ProteomicsDB data
# Gzip-compressed, tab-separated matrix; the first two lines of the file are
# skipped before the header is read.
proteome = pd.read_csv(
    'datasets/Harmonizome/gene_attribute_matrix_cleaned.txt.gz',
    compression='gzip',
    sep='\t',
    skiprows=2)
# Report (rows, columns), then preview the first rows as the cell output.
print(proteome.shape)
proteome.head()

(2776, 56)


Unnamed: 0,GeneSym,UniprotAcc,GeneID/Brenda Tissue Ontology BTO:,38,675,1615,568,2806,1938,18,...,1158,1363,1078,763,2553,1129,776,289,2417,914
0,LOC102724985,Q6P996,102724985,5.017355,4.830592,5.783336,5.04944,4.858041,4.605708,4.868083,...,5.395317,4.734884,4.835634,4.659831,4.64315,4.376222,4.026439,4.659961,3.929821,4.854218
1,PDXDC1,H3BU11,23042,4.744159,4.601433,5.267198,4.741168,4.571314,4.51861,4.565993,...,4.894014,4.403376,4.440597,4.472297,4.47905,4.216782,3.893532,4.520178,3.725262,4.458436
2,COG5,Q9UP83,10466,4.213385,4.213385,5.759206,4.528063,4.035077,4.35953,4.30975,...,4.634878,4.799939,4.51071,4.04824,3.924763,4.041728,4.135629,4.715816,4.092942,4.545271
3,HEXA,H3BU85,3073,4.535638,4.740634,5.118528,4.18138,4.731774,4.665979,4.865457,...,4.708274,4.937703,4.5984,5.290727,5.423842,4.154158,3.857923,5.115823,4.369458,5.008221
4,HEXB,H0Y9B6,3074,4.60552,4.591239,6.294882,4.139686,5.471996,4.442832,5.214406,...,5.13961,4.979131,5.654624,5.278758,5.283525,4.972013,4.536124,5.388605,4.576318,5.425797


In [9]:
## Load HGNC gene family
# Tab-separated table with one row per gene / family assignment.
gene_family = pd.read_csv(
    'datasets/Harmonizome/HGNC_gene_family.txt',
    sep='\t')
# Report (rows, columns), then preview the first rows as the cell output.
print(gene_family.shape)
gene_family.head()

(20086, 12)


Unnamed: 0,HGNC ID,Approved Symbol,Approved Name,Status,Previous Symbols,Synonyms,Chromosome,Accession Numbers,RefSeq IDs,Gene Family Tag,Gene family description,Gene family ID
0,324,AGPAT1,1-acylglycerol-3-phosphate O-acyltransferase 1,Approved,,LPAAT-alpha,6p21.3,U56417,NM_006411,AGPAT,1-acylglycerol-3-phosphate O-acyltransferases,46
1,325,AGPAT2,1-acylglycerol-3-phosphate O-acyltransferase 2,Approved,BSCL,LPAAT-beta,9q34.3,AF000237,NM_006412,AGPAT,1-acylglycerol-3-phosphate O-acyltransferases,46
2,326,AGPAT3,1-acylglycerol-3-phosphate O-acyltransferase 3,Approved,,LPAAT-gamma,21q22.3,AF156774,NM_020132,AGPAT,1-acylglycerol-3-phosphate O-acyltransferases,46
3,20885,AGPAT4,1-acylglycerol-3-phosphate O-acyltransferase 4,Approved,,"LPAAT-delta, dJ473J16.2",6q25.3,AF156776,NM_020133,AGPAT,1-acylglycerol-3-phosphate O-acyltransferases,46
4,20886,AGPAT5,1-acylglycerol-3-phosphate O-acyltransferase 5,Approved,,"FLJ11210, LPAAT-e, LPAAT-epsilon",8p23.1,AF375789,NM_018361,AGPAT,1-acylglycerol-3-phosphate O-acyltransferases,46


In [10]:
# Number of rows per gene family description.
family_sizes = gene_family['Gene family description'].value_counts()
print(family_sizes)

MicroRNAs                                            1777
Zinc fingers C2H2-type                                720
RNAs, 7SL, cytoplasmic                                689
Cytoplasmic transfer RNAs                             588
Solute carriers                                       395
CD molecules                                          394
Small nucleolar RNAs, C/D box                         326
Ring finger proteins                                  275
WD repeat domain containing                           262
Immunoglobulin-like domain containing                 245
Ankyrin repeat domain containing                      242
Endogenous ligands                                    236
EF-hand domain containing                             222
RNA binding motif containing                          213
Long non-coding RNAs                                  211
Pleckstrin homology domain containing                 206
Immunoglobulin heavy locus at 14q32.33                185
Protein phosph

In [11]:
# Count kinases
# Case-insensitive substring match on the family description.
family_descriptions = gene_family['Gene family description']
# Distinct family names that mention "kinase".
print(list(name for name in family_descriptions.unique() if 'kinase' in name.lower()))
# Number of rows (genes) whose family mentions "kinase".
print(sum(1 for name in family_descriptions if 'kinase' in name.lower()))

['6-phosphofructo-2-kinases/fructose-2,6-biphosphatases', 'A-kinase anchoring proteins', 'Adenylate kinases', 'C2 domain containing protein kinases', 'CDC like kinases', 'Cyclin-dependent kinases', 'Death-associated protein kinases', 'Diacylglycerol kinases', 'Erb-b2 receptor tyrosine kinases', 'Glycerol kinases', 'MAP kinase phosphatases', 'Membrane-associated guanylate kinases', 'Mitogen-activated protein kinase kinase kinase kinases', 'Mitogen-activated protein kinase kinase kinases', 'Mitogen-activated protein kinase kinases', 'mitogen-activated protein kinase-activated protein kinases', 'Mitogen-activated protein kinases', 'MOB kinase activators', 'Phosphatidylinositol 3-kinase subunits', 'Receptor Tyrosine Kinases', 'SCY1-like, kinase-like proteins', 'Type I receptor serine/threonine kinases', 'Type II receptor serine/threonine kinases']
273


In [12]:
# Count TF
# Case-insensitive substring match on the family description.
family_descriptions = gene_family['Gene family description']
# Distinct family names that mention "transcription factor".
print(list(name for name in family_descriptions.unique() if 'transcription factor' in name.lower()))
# Number of rows (genes) whose family mentions "transcription factor".
print(sum(1 for name in family_descriptions if 'transcription factor' in name.lower()))

['E2F transcription factors', 'General transcription factors', 'Kruppel-like transcription factors', 'Sp transcription factors', 'TEA domain transcription factors', 'Transcription factor Dp family']
66


In [13]:
# Inner join with proteome data
# NOTE: this cell rebinds both `gene_family` and `proteome`, so it can only be
# run once per kernel session; on a second run the columns selected/dropped
# below no longer exist.

# Keep only the family description, keyed by the approved gene symbol.
gene_family = (gene_family[['Approved Symbol', 'Gene family description']]
               .set_index('Approved Symbol'))
print(gene_family.shape)

# Drop the accession/ID columns and key the expression matrix by gene symbol.
proteome = (proteome
            .drop(['UniprotAcc', 'GeneID/Brenda Tissue Ontology BTO:'], axis=1)
            .set_index('GeneSym'))
print(proteome.shape)

# Index-on-index inner join: only symbols present in both tables survive, and
# the family description is appended as the last column.
proteome = pd.merge(proteome, gene_family,
                    left_index=True, right_index=True, how='inner')
print(proteome.shape)

(20086, 1)
(2776, 53)
(1863, 54)


In [14]:
# Split X and y
# Features: every column except the gene family label.
features = proteome.drop(['Gene family description'], axis=1)
print(features.columns)
X = features.values
# Binary label: 1 when the family description mentions "kinase"
# (case-insensitive), 0 otherwise.
# NOTE(review): `np` is not imported explicitly in this notebook; it
# presumably arrives through `from utils import *` -- confirm.
y = np.array(
    list('kinase' in description.lower()
         for description in proteome['Gene family description']),
    dtype=np.int64)
# Positive count, label shape and feature shape on one line.
print("%s %s %s" % (y.sum(), y.shape, X.shape))


Index([u'38', u'675', u'1615', u'568', u'2806', u'1938', u'18', u'4860', u'200001', u'200005', u'3884', u'3323', u'1939', u'2178', u'200004', u'1961', u'200002', u'1908', u'200003', u'179', u'4189', u'2696', u'599', u'1321', u'7', u'93', u'975', u'773', u'200043', u'1008', u'661', u'567', u'1890', u'269', u'200034', u'200045', u'2805', u'1061', u'664', u'562', u'1175', u'759', u'988', u'1158', u'1363', u'1078', u'763', u'2553', u'1129', u'776', u'289', u'2417', u'914'], dtype='object')
34 (1863,) (1863, 53)
