In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the source publication.
# paper_pmid is used below to build the path of the datasets list file;
# paper_name is used as the prefix of the exported matrix file names.
paper_pmid = 24723613
paper_name = 'lee_giaever_2014' 

In [3]:
# Dataset ids and names for this paper (tab-separated file without a header row)
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset_id
datasets = datasets.set_index('dataset_id')

# Load the compound-to-dataset ID mapping

In [5]:
# Lookup table indexed by compound, giving the hom/het dataset ids and the condition-set id
screenid_datasetid = pd.read_excel(
    'extras/screenid_datasetid.xlsx',
    sheet_name='screenid_datasetid',
    names=['compound', 'hom_dataset_id', 'conditionset_id', 'het_dataset_id'],
    index_col='compound',
)

In [6]:
# Preview the first rows of the compound-to-dataset lookup table
screenid_datasetid.head()

Unnamed: 0_level_0,hom_dataset_id,conditionset_id,het_dataset_id
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SGTC_1,1353,4497,12712
SGTC_10,1354,4498,12713
SGTC_100,1355,4499,12714
SGTC_1000,1356,4500,14394
SGTC_1001,1357,4501,12715


# Load & process the data

In [7]:
# Raw matrix from the tab-separated source file
original_data1 = pd.read_csv(
    'raw_data/fitness_defect_matrix_het.txt',
    sep='\t',
)

In [8]:
# Report rows x columns of the matrix as loaded
print('Original data dimensions: %d x %d' % tuple(original_data1.shape))

Original data dimensions: 1095 x 3357


In [9]:
# The first column has no header in the source file; call it 'orfs'
original_data1 = original_data1.rename(columns={'Unnamed: 0': 'orfs'})

In [10]:
# Make sure the identifiers are strings before cleaning them
original_data1 = original_data1.astype({'orfs': str})

In [11]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably it comes from
# yp_utils.py); the description above is the original author's — confirm against the helper.
original_data1['orfs'] = clean_orf(original_data1['orfs'])

In [12]:
# Translate to ORFs
# NOTE(review): translate_sc is not defined in this notebook (presumably it comes from
# yp_utils.py); what it does with names it cannot translate is not visible here.
original_data1['orfs'] = translate_sc(original_data1['orfs'], to='orf')

In [13]:
# Make sure everything translated ok: print the rows flagged as not looking like an ORF
t = looks_like_orf(original_data1['orfs'])
print(original_data1.loc[~t])

Empty DataFrame
Columns: [orfs, SGTC_1, SGTC_2, SGTC_3, SGTC_4, SGTC_5, SGTC_6, SGTC_7, SGTC_8, SGTC_9, SGTC_10, SGTC_11, SGTC_12, SGTC_13, SGTC_14, SGTC_15, SGTC_16, SGTC_17, SGTC_18, SGTC_19, SGTC_20, SGTC_21, SGTC_22, SGTC_23, SGTC_24, SGTC_25, SGTC_26, SGTC_27, SGTC_28, SGTC_29, SGTC_30, SGTC_31, SGTC_32, SGTC_33, SGTC_34, SGTC_35, SGTC_36, SGTC_37, SGTC_38, SGTC_39, SGTC_40, SGTC_41, SGTC_42, SGTC_43, SGTC_44, SGTC_45, SGTC_46, SGTC_47, SGTC_48, SGTC_49, SGTC_50, SGTC_51, SGTC_52, SGTC_53, SGTC_54, SGTC_55, SGTC_56, SGTC_57, SGTC_58, SGTC_59, SGTC_60, SGTC_61, SGTC_62, SGTC_63, SGTC_64, SGTC_65, SGTC_66, SGTC_67, SGTC_68, SGTC_69, SGTC_70, SGTC_71, SGTC_72, SGTC_73, SGTC_74, SGTC_75, SGTC_76, SGTC_77, SGTC_78, SGTC_79, SGTC_80, SGTC_81, SGTC_82, SGTC_83, SGTC_84, SGTC_85, SGTC_86, SGTC_87, SGTC_88, SGTC_89, SGTC_90, SGTC_91, SGTC_92, SGTC_93, SGTC_94, SGTC_95, SGTC_96, SGTC_97, SGTC_98, SGTC_99, ...]
Index: []

[0 rows x 3357 columns]


In [14]:
# Index the matrix by the identifier column and name that index 'orf'
original_data1 = original_data1.set_index('orfs').rename_axis('orf')

In [15]:
# Collapse rows that share the same index label by taking their mean
original_data1 = original_data1.groupby(level=0).mean()

In [16]:
# Dimensions after averaging rows that share an index label
original_data1.shape

(1095, 3356)

In [17]:
# Switch sign (as per convention: lower values = lower phenotype)
# Re-running this cell on its own flips the sign back.
original_data1 = original_data1 * -1

In [18]:
# Match to dataset ids

In [19]:
# For each data column, look up its het dataset id in the lookup table
het_dataset_ids = (
    screenid_datasetid['het_dataset_id']
    .reindex(original_data1.columns.values)
    .values
)

In [20]:
# Number of data columns with no matching dataset id
np.isnan(het_dataset_ids).sum()

0

In [21]:
# Average the replicates (same dataset id)
# Step 1: label the columns with dataset ids and put them on the rows so they can be grouped
original_data1.columns = het_dataset_ids
original_data1 = original_data1.transpose()

In [22]:
# Step 2: take the mean of rows that share a dataset id, then show the new dimensions
original_data1 = original_data1.groupby(level=0).mean()
original_data1.shape

(3351, 1095)

In [23]:
# Back to one row per ORF and one column per dataset id
original_data1 = original_data1.transpose()

# Prepare the final dataset

In [24]:
# Work on an independent copy so original_data1 stays untouched from here on
data = original_data1.copy(deep=True)

In [25]:
# Put the datasets table in the same order as the data columns
dataset_ids = original_data1.columns.values
datasets = datasets.reindex(dataset_ids, axis=0)

In [26]:
# Dataset ids that have no name in the datasets table
datasets[datasets['name'].isnull()]

Unnamed: 0_level_0,name
dataset_id,Unnamed: 1_level_1


In [27]:
# Two-level column labels: (dataset_id, 'value')
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * datasets.shape[0]],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [28]:
# Preview the matrix with its (dataset_id, data_type) column labels
data.head()

dataset_id,12627,12628,12629,12630,12631,12632,12633,12634,12635,12636,...,15973,15974,15975,15976,15977,15978,15979,15980,15981,15982
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL001C,2.854542,2.517708,-1.153309,-0.952176,1.92091,2.296125,-0.466234,1.027668,2.067729,-4.000097,...,-1.003608,0.139967,-2.348368,-3.200169,0.312923,1.26075,0.187037,0.412209,0.480659,-0.149563
YAL003W,2.065927,-0.963073,0.837815,0.609618,-1.494138,-0.03759,-0.478915,-0.591904,0.879805,0.469542,...,0.337857,1.778926,0.090333,-1.331668,-1.071445,0.232153,0.250174,-1.527926,0.176402,-0.603382
YAL025C,-0.771042,-1.008536,0.495327,-1.001732,-0.484244,-2.9545,-0.79852,0.189401,-1.293275,0.859315,...,-0.010238,0.863016,1.379514,-1.426761,-1.040631,2.344398,-1.4898,-0.737161,0.872477,0.080705
YAL032C,-1.097148,-2.154759,-0.274118,-1.247344,-0.21382,0.013804,0.531671,-0.596314,1.481032,-1.208819,...,-1.135614,1.722025,-2.145999,-0.508375,-1.047365,-1.093894,0.970007,0.477508,0.368423,0.489545
YAL033W,-1.792244,0.104146,-1.003579,-0.70657,-0.55436,2.762787,-0.216875,0.70512,-1.032982,-0.22923,...,0.238247,-0.615638,-0.436098,-0.361598,0.36172,-0.140814,0.900067,0.265923,-0.59729,0.174155


## Subset to the genes currently in SGD

In [29]:
# Gene table re-keyed by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# One gene id per data row; rows with no match in the gene table get NaN
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [30]:
# Attach the gene ids, keep only rows that have one, and index rows by (gene_id, orf).
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the column assignment on the
# next line does not act on a slice of the previous frame (SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# Safe to cast now that rows without a gene id have been dropped
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

In [31]:
# Preview the matrix now indexed by (gene_id, orf)
data.head()

Unnamed: 0_level_0,dataset_id,12627,12628,12629,12630,12631,12632,12633,12634,12635,12636,...,15973,15974,15975,15976,15977,15978,15979,15980,15981,15982
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,2.854542,2.517708,-1.153309,-0.952176,1.92091,2.296125,-0.466234,1.027668,2.067729,-4.000097,...,-1.003608,0.139967,-2.348368,-3.200169,0.312923,1.26075,0.187037,0.412209,0.480659,-0.149563
3,YAL003W,2.065927,-0.963073,0.837815,0.609618,-1.494138,-0.03759,-0.478915,-0.591904,0.879805,0.469542,...,0.337857,1.778926,0.090333,-1.331668,-1.071445,0.232153,0.250174,-1.527926,0.176402,-0.603382
23,YAL025C,-0.771042,-1.008536,0.495327,-1.001732,-0.484244,-2.9545,-0.79852,0.189401,-1.293275,0.859315,...,-0.010238,0.863016,1.379514,-1.426761,-1.040631,2.344398,-1.4898,-0.737161,0.872477,0.080705
30,YAL032C,-1.097148,-2.154759,-0.274118,-1.247344,-0.21382,0.013804,0.531671,-0.596314,1.481032,-1.208819,...,-1.135614,1.722025,-2.145999,-0.508375,-1.047365,-1.093894,0.970007,0.477508,0.368423,0.489545
31,YAL033W,-1.792244,0.104146,-1.003579,-0.70657,-0.55436,2.762787,-0.216875,0.70512,-1.032982,-0.22923,...,0.238247,-0.615638,-0.436098,-0.361598,0.36172,-0.140814,0.900067,0.265923,-0.59729,0.174155


# Normalize

In [32]:
# Compute the normalized scores with a project helper (presumably from yp_utils.py).
# NOTE(review): the meaning of has_tested=True is not visible in this notebook — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [33]:
# Assign proper column names: (dataset_id, 'valuez')
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * datasets.shape[0]],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [34]:
# Boolean mask of the cells that are NaN in the un-normalized data
ix = np.isnan(data.to_numpy())

In [35]:
# Blank out normalized scores wherever the un-normalized value is missing.
# Take an explicit writable copy: `.values` can return a view of the frame's data,
# which is read-only under pandas Copy-on-Write and otherwise silently modifies
# data_norm in place.
data_norm_values = data_norm.to_numpy(copy=True)
data_norm_values[ix] = np.nan

In [36]:
# Rebuild the normalized frame from the masked array, keeping its row and column labels
data_norm = pd.DataFrame(
    data_norm_values,
    index=data_norm.index,
    columns=data_norm.columns,
)

In [37]:
# Side-by-side table: 'value' columns followed by the 'valuez' columns
data_all = data.join(data_norm, how='left')

In [38]:
# Preview the combined value / valuez table
data_all.head()

Unnamed: 0_level_0,dataset_id,12627,12628,12629,12630,12631,12632,12633,12634,12635,12636,...,15973,15974,15975,15976,15977,15978,15979,15980,15981,15982
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,2.854542,2.517708,-1.153309,-0.952176,1.92091,2.296125,-0.466234,1.027668,2.067729,-4.000097,...,-0.891425,0.055741,-2.205544,-2.970069,0.19268,1.001627,0.134087,0.264971,0.442474,-0.290169
3,YAL003W,2.065927,-0.963073,0.837815,0.609618,-1.494138,-0.03759,-0.478915,-0.591904,0.879805,0.469542,...,0.362341,1.582325,0.000702,-1.296534,-1.067811,0.180979,0.194375,-1.289249,0.167416,-0.702663
23,YAL025C,-0.771042,-1.008536,0.495327,-1.001732,-0.484244,-2.9545,-0.79852,0.189401,-1.293275,0.859315,...,0.037003,0.729214,1.166999,-1.381705,-1.039754,1.866197,-1.467085,-0.655777,0.796689,-0.080868
30,YAL032C,-1.097148,-2.154759,-0.274118,-1.247344,-0.21382,0.013804,0.531671,-0.596314,1.481032,-1.208819,...,-1.014801,1.529325,-2.022464,-0.559146,-1.045886,-0.876985,0.881727,0.31728,0.341009,0.290744
31,YAL033W,-1.792244,0.104146,-1.003579,-0.70657,-0.55436,2.762787,-0.216875,0.70512,-1.032982,-0.22923,...,0.269243,-0.648055,-0.47555,-0.427684,0.23711,-0.116586,0.814942,0.147782,-0.532024,0.004073


# Write the value and valuez matrices to tab-separated files

In [40]:
# One tab-separated file per data type, rows labelled by ORF and columns by dataset name
for f in ['value', 'valuez']:
    df = (
        data_all.xs(f, level='data_type', axis=1)
        .copy()
        .droplevel('gene_id', axis=0)
    )
    # datasets was reindexed to the data's column order earlier, so names line up by position
    df.columns = datasets['name'].values
    df.to_csv(paper_name + '_het_' + f + '.txt', sep='\t')

# Save to DB

In [41]:
# Special approach (dataset too large)

In [42]:
# 'value' columns only, with rows keyed by gene_id alone
f = 'value'
df = data_all.xs(f, level='data_type', axis=1).copy().droplevel('orf', axis=0)

In [43]:
# Long format: one row per (gene_id, dataset_id) pair
df_long_value = df.reset_index().melt(id_vars=['gene_id'])

In [44]:
# Dimensions of the long-format 'value' table
df_long_value.shape

(3669345, 3)

In [45]:
# Same slice-and-melt as for 'value', applied to the 'valuez' columns
f = 'valuez'
df = data_all.xs(f, level='data_type', axis=1).copy().droplevel('orf', axis=0)
df_long_valuez = df.reset_index().melt(id_vars=['gene_id'])

In [46]:
# Preview the long-format 'valuez' table
df_long_valuez.head()

Unnamed: 0,gene_id,dataset_id,value
0,1,12627,2.131003
1,3,12627,1.570766
2,23,12627,-0.444634
3,30,12627,-0.676302
4,31,12627,-1.170102


In [47]:
# Dimensions of the long-format 'valuez' table
df_long_valuez.shape

(3669345, 3)

In [48]:
# Pair each value with its valuez on (gene_id, dataset_id); the two score columns
# come out as value_x and value_y
df_long = df_long_value.merge(
    df_long_valuez,
    how='outer',
    on=['gene_id', 'dataset_id'],
)

In [49]:
# Dimensions of the merged long table
df_long.shape

(3669345, 4)

In [50]:
# Keep only the rows that have a raw value. The explicit .copy() makes the result
# an independent frame, so adding the 'id' column to it afterwards does not raise
# SettingWithCopyWarning.
df_long = df_long[df_long['value_x'].notnull()].copy()

In [57]:
# Consecutive ids, one per row, starting at 113568094.
# NOTE(review): presumably the next free id in the target table — confirm before re-running.
df_long['id'] = 113568094 + np.arange(df_long.shape[0])

In [58]:
# Give the merged score columns their final names (value_x -> value, value_y -> valuez)
df_long = df_long.rename(columns={'value_x': 'value', 'value_y': 'valuez'})

In [59]:
# Preview the final long table
df_long.head()

Unnamed: 0,gene_id,dataset_id,value,valuez,id
0,1,12627,2.854542,2.131003,113568094
1,3,12627,2.065927,1.570766,113568095
2,23,12627,-0.771042,-0.444634,113568096
3,30,12627,-1.097148,-0.676302,113568097
4,31,12627,-1.792244,-1.170102,113568098


In [63]:
# Write the long table as a header-less, comma-separated file with columns in the
# order id, dataset_id, value, gene_id, valuez.
# The file name is built from paper_name, like the matrix exports above, instead of
# repeating the literal; the resulting path is unchanged.
df_long[['id','dataset_id','value','gene_id','valuez']].to_csv(
    paper_name + '_het_long.txt',
    sep=',',
    header=False,
    index=False,
)

In [60]:
# Smallest id assigned to the exported rows
df_long['id'].min()

113568094