In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper processed in this notebook; both are used below to build file names
paper_pmid: int = 24723613
paper_name: str = 'lee_giaever_2014'

In [3]:
# Read this paper's dataset list: tab-separated, no header row, columns are (dataset_id, name)
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset table by dataset id so it can be reindexed by id later on
datasets = datasets.set_index('dataset_id')

# Load the compound library & the screen-to-dataset mapping

In [5]:
# Load the 'compound library' sheet of the raw-data workbook
# (`compounds` is not referenced again in the visible part of this notebook)
compounds = pd.read_excel(
    'raw_data/1250217s1.xlsx',
    sheet_name='compound library',
)

In [6]:
# Lookup table indexed by compound/screen id, giving the matching
# hom dataset id, condition set id and het dataset id
screenid_datasetid = pd.read_excel(
    'extras/screenid_datasetid.xlsx',
    sheet_name='screenid_datasetid',
    names=['compound', 'hom_dataset_id', 'conditionset_id', 'het_dataset_id'],
    index_col='compound',
)

In [7]:
# Preview the first rows of the compound -> dataset id lookup table
screenid_datasetid.head()

Unnamed: 0_level_0,hom_dataset_id,conditionset_id,het_dataset_id
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SGTC_1,1353,4497,12712
SGTC_10,1354,4498,12713
SGTC_100,1355,4499,12714
SGTC_1000,1356,4500,14394
SGTC_1001,1357,4501,12715


# Load & process the hom fitness defect matrix

In [8]:
# Raw hom fitness defect matrix (tab-separated): one row per strain, one column per screen
original_data1 = pd.read_csv(
    'raw_data/fitness_defect_matrix_hom.txt',
    sep='\t',
)

In [9]:
# Report the size of the raw matrix (rows x columns, identifier column included)
num_rows, num_cols = original_data1.shape
print('Original data dimensions: %d x %d' % (num_rows, num_cols))

Original data dimensions: 4810 x 3357


In [10]:
# The identifier column has no header in the raw file, so pandas labels it 'Unnamed: 0'
original_data1 = original_data1.rename(columns={'Unnamed: 0': 'orfs'})

In [11]:
# Force the identifier column to strings before it is cleaned and translated
original_data1 = original_data1.astype({'orfs': str})

In [12]:
# Eliminate all white spaces & capitalize
raw_identifiers = original_data1['orfs']
original_data1['orfs'] = clean_orf(raw_identifiers)

In [13]:
# Translate to ORFs
cleaned_identifiers = original_data1['orfs']
original_data1['orfs'] = translate_sc(cleaned_identifiers, to='orf')

In [14]:
# Make sure everything translated ok:
# print the rows whose identifier still does not look like an ORF (expected: none)
is_orf = looks_like_orf(original_data1['orfs'])
print(original_data1.loc[~is_orf])

Empty DataFrame
Columns: [orfs, SGTC_1, SGTC_2, SGTC_3, SGTC_4, SGTC_5, SGTC_6, SGTC_7, SGTC_8, SGTC_9, SGTC_10, SGTC_11, SGTC_12, SGTC_13, SGTC_14, SGTC_15, SGTC_16, SGTC_17, SGTC_18, SGTC_19, SGTC_20, SGTC_21, SGTC_22, SGTC_23, SGTC_24, SGTC_25, SGTC_26, SGTC_27, SGTC_28, SGTC_29, SGTC_30, SGTC_31, SGTC_32, SGTC_33, SGTC_34, SGTC_35, SGTC_36, SGTC_37, SGTC_38, SGTC_39, SGTC_40, SGTC_41, SGTC_42, SGTC_43, SGTC_44, SGTC_45, SGTC_46, SGTC_47, SGTC_48, SGTC_49, SGTC_50, SGTC_51, SGTC_52, SGTC_53, SGTC_54, SGTC_55, SGTC_56, SGTC_57, SGTC_58, SGTC_59, SGTC_60, SGTC_61, SGTC_62, SGTC_63, SGTC_64, SGTC_65, SGTC_66, SGTC_67, SGTC_68, SGTC_69, SGTC_70, SGTC_71, SGTC_72, SGTC_73, SGTC_74, SGTC_75, SGTC_76, SGTC_77, SGTC_78, SGTC_79, SGTC_80, SGTC_81, SGTC_82, SGTC_83, SGTC_84, SGTC_85, SGTC_86, SGTC_87, SGTC_88, SGTC_89, SGTC_90, SGTC_91, SGTC_92, SGTC_93, SGTC_94, SGTC_95, SGTC_96, SGTC_97, SGTC_98, SGTC_99, ...]
Index: []

[0 rows x 3357 columns]


In [15]:
# Move the ORF identifiers into the row index and call that index 'orf'
original_data1 = original_data1.set_index('orfs')
original_data1.index.names = ['orf']

In [16]:
# Collapse rows that share the same ORF by averaging them
original_data1 = original_data1.groupby(level=0).mean()

In [17]:
# Matrix size after averaging rows with the same ORF
original_data1.shape

(4808, 3356)

In [18]:
# Switch sign (as per convention: lower values = lower phenotype)
# NOTE: not idempotent -- re-running this cell flips the sign back
original_data1 = original_data1.mul(-1)

In [19]:
# Match to dataset ids

In [20]:
# Look up the hom dataset id of every screen column, in column order;
# screens absent from the lookup table come back as NaN
hom_dataset_ids = (
    screenid_datasetid['hom_dataset_id']
    .reindex(original_data1.columns.values)
    .values
)

In [21]:
# Number of screens without a matching dataset id
np.isnan(hom_dataset_ids).sum()

0

In [22]:
# Average the replicates (same dataset id)
# Step 1: relabel the screen columns with dataset ids, then put datasets on the rows
original_data1.columns = hom_dataset_ids
original_data1 = original_data1.transpose()

In [23]:
# Step 2: rows carrying the same dataset id are replicates -- average them
original_data1 = original_data1.groupby(level=0).mean()
original_data1.shape

(3351, 4808)

In [24]:
# Back to the ORFs x datasets orientation
original_data1 = original_data1.transpose()

# Prepare the final dataset

In [25]:
# Work on an independent copy so the processed matrix itself stays untouched
data = original_data1.copy(deep=True)

In [26]:
# Restrict and reorder the dataset table so its rows line up with the data columns
datasets = datasets.reindex(index=original_data1.columns.values)
dataset_ids = datasets.index.values

In [27]:
# Label every column as (dataset_id, 'value')
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * datasets.shape[0]],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [28]:
# Preview the matrix with its (dataset_id, data_type) column labels
data.head()

dataset_id,1353,1354,1355,1356,1357,1358,1359,1360,1361,1362,...,4699,4700,4701,4702,4703,4704,4705,4706,4707,4708
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL002W,-3.564202,-0.35322,-1.978155,-0.941862,-0.336995,-1.206435,0.264837,-0.35034,0.412929,0.424757,...,-0.085767,-0.012407,0.66936,1.454074,-0.7378,0.222105,-0.639059,0.716066,-1.469485,-0.71237
YAL004W,0.885645,-1.013808,0.622587,-2.297572,-0.347374,-0.101408,0.635994,-0.40835,-3.195733,0.083219,...,0.517328,-2.653658,-0.109715,1.161742,0.977498,-4.146284,1.13049,-1.481961,2.246581,1.041353
YAL005C,0.810421,-0.929355,-0.505755,0.118226,0.246723,-0.106781,0.258916,0.912332,-1.645592,-0.845358,...,-1.175725,0.472672,0.070476,-0.189061,0.39374,-2.125223,0.296836,-0.934903,1.196578,-1.505333
YAL007C,-0.750878,1.786408,-0.716383,-0.048343,0.308352,0.462982,-0.62415,-0.187065,-2.05626,-1.851755,...,-1.773152,1.809751,-0.126539,1.300545,0.040817,-1.969714,1.321792,0.275722,2.729434,0.204023
YAL008W,1.883027,0.189418,0.002204,1.259501,-0.964669,-0.522965,0.71205,-1.781294,0.066292,-0.078822,...,-0.253605,0.278854,-0.034391,1.240425,0.025583,-0.458295,1.049414,-1.408832,0.020373,-1.389672


## Subset to the genes currently in SGD

In [29]:
# Gene table re-keyed by systematic name so the ORFs in `data` can be mapped to gene ids
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# ORFs that are absent from the gene table get NaN as their id
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 14


In [30]:
# Attach the gene ids, drop the ORFs that have none, and index rows by (gene_id, orf)
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the column assignment below
# does not act on a slice of the unfiltered frame (avoids SettingWithCopyWarning)
data = data.loc[data['gene_id'].notnull()].copy()
# Safe to cast now: the rows with a missing (NaN) gene id are gone
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

In [31]:
# Preview the matrix, now indexed by (gene_id, orf)
data.head()

Unnamed: 0_level_0,dataset_id,1353,1354,1355,1356,1357,1358,1359,1360,1361,1362,...,4699,4700,4701,4702,4703,4704,4705,4706,4707,4708
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,-3.564202,-0.35322,-1.978155,-0.941862,-0.336995,-1.206435,0.264837,-0.35034,0.412929,0.424757,...,-0.085767,-0.012407,0.66936,1.454074,-0.7378,0.222105,-0.639059,0.716066,-1.469485,-0.71237
1863,YAL004W,0.885645,-1.013808,0.622587,-2.297572,-0.347374,-0.101408,0.635994,-0.40835,-3.195733,0.083219,...,0.517328,-2.653658,-0.109715,1.161742,0.977498,-4.146284,1.13049,-1.481961,2.246581,1.041353
4,YAL005C,0.810421,-0.929355,-0.505755,0.118226,0.246723,-0.106781,0.258916,0.912332,-1.645592,-0.845358,...,-1.175725,0.472672,0.070476,-0.189061,0.39374,-2.125223,0.296836,-0.934903,1.196578,-1.505333
5,YAL007C,-0.750878,1.786408,-0.716383,-0.048343,0.308352,0.462982,-0.62415,-0.187065,-2.05626,-1.851755,...,-1.773152,1.809751,-0.126539,1.300545,0.040817,-1.969714,1.321792,0.275722,2.729434,0.204023
6,YAL008W,1.883027,0.189418,0.002204,1.259501,-0.964669,-0.522965,0.71205,-1.781294,0.066292,-0.078822,...,-0.253605,0.278854,-0.034391,1.240425,0.025583,-0.458295,1.049414,-1.408832,0.020373,-1.389672


# Normalize

In [32]:
# Normalize the scores with a helper that is not defined in this notebook
# (presumably provided by ../yp_utils.py, which is run in the first cell).
# NOTE(review): the normalization itself and the meaning of has_tested=True
# are not visible here -- confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [33]:
# Assign proper column names: every normalized column becomes (dataset_id, 'valuez')
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * datasets.shape[0]],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [34]:
# Wherever the raw value is missing, blank out the normalized value too
ix = np.isnan(data.values)
data_norm = pd.DataFrame(
    data=np.where(ix, np.nan, data_norm.values),
    index=data_norm.index,
    columns=data_norm.columns,
)

In [35]:
# Put raw ('value') and normalized ('valuez') columns side by side, aligned on the row index
data_all = data.join(data_norm, how='left')

In [36]:
# Preview the combined table of 'value' and 'valuez' columns
data_all.head()

Unnamed: 0_level_0,dataset_id,1353,1354,1355,1356,1357,1358,1359,1360,1361,1362,...,4699,4700,4701,4702,4703,4704,4705,4706,4707,4708
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,-3.564202,-0.35322,-1.978155,-0.941862,-0.336995,-1.206435,0.264837,-0.35034,0.412929,0.424757,...,-0.156562,-0.05801,0.627496,1.345876,-0.671576,0.216722,-0.602522,0.601191,-1.341568,-0.678575
1863,YAL004W,0.885645,-1.013808,0.622587,-2.297572,-0.347374,-0.101408,0.635994,-0.40835,-3.195733,0.083219,...,0.355291,-2.451122,-0.081059,1.089638,0.615451,-3.554135,0.452632,-1.348896,1.902119,0.694645
4,YAL005C,0.810421,-0.929355,-0.505755,0.118226,0.246723,-0.106781,0.258916,0.912332,-1.645592,-0.845358,...,-1.08162,0.381497,0.082822,-0.094381,0.177444,-1.809526,-0.044462,-0.863547,0.98559,-1.299491
5,YAL007C,-0.750878,1.786408,-0.716383,-0.048343,0.308352,0.462982,-0.62415,-0.187065,-2.05626,-1.851755,...,-1.588663,1.59296,-0.096361,1.211303,-0.087362,-1.675288,0.566702,0.210518,2.323592,0.03899
6,YAL008W,1.883027,0.189418,0.002204,1.259501,-0.964669,-0.522965,0.71205,-1.781294,0.066292,-0.078822,...,-0.299008,0.205888,-0.012554,1.158606,-0.098792,-0.370609,0.404288,-1.284016,-0.041098,-1.208924


# Print out

In [37]:
# Write one wide tab-separated file per data type, with dataset names as
# column headers and the ORF as the only row label
for f in ('value', 'valuez'):
    wide = data_all.xs(f, level='data_type', axis=1).copy()
    # datasets was reindexed to the data's dataset ids, so the names line up by position
    wide.columns = datasets['name'].values
    df = wide.droplevel('gene_id', axis=0)
    out_path = paper_name + '_hom_' + f + '.txt'
    df.to_csv(out_path, sep='\t')

# Save to DB

In [42]:
# Special approach (dataset too large): reshape to long format and write a headerless CSV file

In [43]:
# Raw values only, with rows keyed by gene_id alone
f = 'value'
df = (
    data_all
    .xs(f, level='data_type', axis=1)
    .copy()
    .droplevel('orf', axis=0)
)

In [44]:
# Wide -> long: one row per (gene_id, dataset_id) pair with its raw value
df_long_value = df.reset_index().melt(id_vars=['gene_id'])

In [45]:
# Size of the long-format table of raw values
df_long_value.shape

(16064694, 3)

In [46]:
# Same reshaping for the normalized values: slice, key rows by gene_id, melt to long format
f = 'valuez'
df = (
    data_all
    .xs(f, level='data_type', axis=1)
    .copy()
    .droplevel('orf', axis=0)
)
df_long_valuez = df.reset_index().melt(id_vars=['gene_id'])

In [47]:
# Preview the long-format table of normalized values
df_long_valuez.head()

Unnamed: 0,gene_id,dataset_id,value
0,2,1353,-3.321909
1,1863,1353,0.841825
2,4,1353,0.771437
3,5,1353,-0.689474
4,6,1353,1.775078


In [48]:
# Size of the long-format table of normalized values
df_long_valuez.shape

(16064694, 3)

In [49]:
# Combine raw and normalized values per (gene_id, dataset_id);
# both inputs have a 'value' column, so pandas suffixes them as value_x (raw) and value_y (normalized)
df_long = pd.merge(
    df_long_value,
    df_long_valuez,
    how='outer',
    on=['gene_id', 'dataset_id'],
)

In [50]:
# Size of the merged long-format table
df_long.shape

(16064694, 4)

In [51]:
# Keep only the rows that have a raw value.
# .copy() makes the result an independent frame, so adding columns to it afterwards
# does not operate on a slice of the merged table (avoids SettingWithCopyWarning)
df_long = df_long[df_long['value_x'].notnull()].copy()

In [52]:
# Give every row a consecutive integer id starting at 97509664
# NOTE(review): the starting offset is a hard-coded magic number -- presumably the
# first free id in the target table; confirm before re-running
df_long['id'] = np.arange(97509664, 97509664 + df_long.shape[0])

In [53]:
# Rename the merge-suffixed columns explicitly instead of relabelling all columns by position:
# value_x holds the raw values and value_y the normalized ones. An explicit mapping cannot
# silently mislabel columns if their order ever changes.
df_long = df_long.rename(columns={'value_x': 'value', 'value_y': 'valuez'})

In [57]:
# Inspect the last rows, including the final ids that were assigned
df_long.tail()

Unnamed: 0,gene_id,dataset_id,value,valuez,id
16064689,6083,4708,0.817928,0.519697,113568089
16064690,6084,4708,1.49858,1.052669,113568090
16064691,6085,4708,-0.687806,-0.659341,113568091
16064692,6086,4708,3.526903,2.640911,113568092
16064693,6087,4708,0.52736,0.292173,113568093


In [59]:
# Write the long table as a comma-separated file without header or index,
# with columns in the order id, dataset_id, value, gene_id, valuez.
# The file name is derived from paper_name, like the wide output files written earlier,
# instead of repeating the paper name as a literal.
df_long[['id','dataset_id','value','gene_id','valuez']].to_csv(paper_name + '_hom_long.txt',
                                                               sep=',', header=False, index=False)

In [56]:
# Largest id assigned to the rows of the long table
df_long['id'].max()

113568093