In [1]:
%run ../yp_utils.py

# Initial setup

In [4]:
# Identifiers of the source publication; they are used further down to build
# input/output file names and to key the database upload.
paper_name = 'ayers_gallagher_2020'
paper_pmid = 33109726

In [5]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [6]:
# Index the dataset list by its id so datasets can be looked up by dataset_id.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [7]:
# Raw screen results: one sheet of the authors' spreadsheet.
original_data = pd.read_excel(
    'raw_data/MCHMs KO full list.xlsx',
    sheet_name='ScreenKOs_withNames_v3',
)

In [8]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 930 x 20


In [9]:
# Copy 'Systematic.Name' into a string-typed working column 'orfs'; the
# original column is left untouched.
original_data['orfs'] = original_data['Systematic.Name'].astype(str)

In [10]:
# Eliminate all white spaces & capitalize.
# (clean_orf is not defined in this notebook — presumably provided by
# yp_utils.py via %run; its exact behavior is not visible here.)
original_data['orfs'] = clean_orf(original_data['orfs'])

In [11]:
# Translate the identifiers to ORF names with translate_sc (to='orf').
# (translate_sc is an external helper — presumably from yp_utils.py.)
original_data['orfs'] = translate_sc(original_data['orfs'], to='orf')

In [12]:
# Make sure everything translated ok: `t` marks the rows whose 'orfs' value
# passes looks_like_orf, and the rows that do NOT pass are printed for review.
# `t` is reused by the following cell to filter the table.
t = looks_like_orf(original_data['orfs'])
print(original_data.loc[~t,])

            Systematic.Name Gene Name  Plate  Row  Column Rep.1 Rep.2 Rep.3  \
index_input                                                                   
815                     NaN       NaN    3.0    D     4.0   NaN   NaN     X   
816                     NaN       NaN    3.0    D     7.0   NaN     X   NaN   
821                     NaN       NaN    4.0    F     9.0     X     X   NaN   
822                     NaN       NaN    4.0    F    10.0   NaN     X   NaN   
838                     NaN       NaN    7.0    F    12.0     X   NaN     X   
862                     NaN       NaN   15.0    F     4.0     X   NaN     X   
863                     NaN       NaN   15.0    F     5.0     X   NaN   NaN   
864                     NaN       NaN   15.0    F     6.0     X     X     X   
867                     NaN       NaN   16.0    A     7.0   NaN     X   NaN   
868                     NaN       NaN   16.0    A     8.0   NaN     X     X   
872                     NaN       NaN   16.0    G   

In [13]:
# Keep only the rows flagged by the mask `t` (rows that passed looks_like_orf).
original_data = original_data.loc[t,:]

In [14]:
# Recode the replicate / summary flag columns as integers:
#   'X'     -> -1
#   missing ->  0
for c in ['Rep.1','Rep.2','Rep.3','Definite']:
    # Work on a private copy of the column so the recoding steps never write
    # through a view of the frame.
    flags = original_data[c].copy()
    flags[flags == 'X'] = -1
    flags[flags.isnull()] = 0
    # Replace the whole column with `df[c] = ...`. Since pandas 2.0,
    # `df.loc[:, c] = values` writes into the existing (object-dtype) column in
    # place, so the integer cast would be silently lost and downstream numeric
    # aggregation would no longer see these columns as numeric.
    original_data[c] = flags.astype(int)

In [15]:
# Per-strain score: sum of the three recoded replicate flags.
replicate_columns = ['Rep.1', 'Rep.2', 'Rep.3']
original_data['data'] = original_data[replicate_columns].sum(axis=1)

In [16]:
# Use the cleaned ORF names as the row index and label that index 'orf'.
original_data = original_data.set_index('orfs').rename_axis('orf')

In [17]:
# Preview the first rows now that the table is indexed by ORF.
original_data.head()

Unnamed: 0_level_0,Systematic.Name,Gene Name,Plate,Row,Column,Rep.1,Rep.2,Rep.3,questionable,Definite,...,SGD ID,X4,Verified ORF?,X7,Chromosome,Location Start,Location End,Description,all 3 plates but not most extreme,data
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YKL134C,YKL134C,2019-10-01 00:00:00,18.0,F,7.0,0,-1,0,,0,...,S000001617,OCTapeptidyl aminopeptidase,Verified,metalloendopeptidase,chrXI,189124.0,191442.0,Mitochondrial intermediate peptidase; cleaves ...,,-1
YBR085W,YBR085W,AAC3,1.0,H,4.0,0,-1,0,X,0,...,S000000289,ADP/ATP Carrier,Verified,ADP/ATP carrier protein AAC3|ANC3,chrII,415983.0,416906.0,Mitochondrial inner membrane ADP/ATP transloca...,,-1
YDL243C,YDL243C,AAD4,7.0,D,3.0,-1,0,-1,,0,...,S000002402,Aryl-Alcohol Dehydrogenase,Verified,putative aryl-alcohol dehydrogenase,chrIV,17577.0,18566.0,Putative aryl-alcohol dehydrogenase; involved ...,,-2
YMR072W,YMR072W,ABF2,29.0,C,11.0,-1,-1,-1,,-1,...,S000004676,ARS-Binding Factor,Verified,DNA-binding protein ABF2|HM|mtTFA|p19,chrXIII,411569.0,412120.0,Mitochondrial DNA-binding protein; involved in...,,-3
YGR037C,YGR037C,ACB1,11.0,F,7.0,-1,0,-1,,-1,...,S000003269,Acyl-CoA-Binding,Verified,long-chain fatty acid transporter ACB1,chrVII,559731.0,559994.0,Acyl-CoA-binding protein; transports newly syn...,,-2


In [18]:
# Collapse rows that share the same ORF by averaging them.
# numeric_only=True is passed explicitly: the frame still holds text columns
# (names, descriptions, ...), and pandas >= 2.0 raises a TypeError on them
# instead of silently dropping them as older versions did.
original_data = original_data.groupby(original_data.index).mean(numeric_only=True)

In [19]:
# Dimensions after grouping by ORF.
original_data.shape

(889, 10)

In [20]:
# Preview the per-ORF averaged table.
original_data.head()

Unnamed: 0_level_0,Plate,Column,Rep.1,Rep.2,Rep.3,Definite,Location Start,Location End,all 3 plates but not most extreme,data
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
YAL010C,39.0,8.0,-1.0,-1.0,-1.0,-1.0,134184.0,135665.0,,-3.0
YAL016W,32.0,2.0,-1.0,-1.0,-1.0,-1.0,124879.0,126786.0,,-3.0
YAL024C,36.0,1.0,-1.0,-1.0,-1.0,0.0,101565.0,105872.0,,-3.0
YAL035W,39.0,1.0,-1.0,-1.0,-1.0,-1.0,76427.0,79435.0,,-3.0
YAL039C,39.0,10.0,0.0,0.0,-1.0,0.0,68716.0,69525.0,,-1.0


# Load & process tested strains

In [21]:
# Strain list of the knockout collection used for the screen.
tested = pd.read_excel(
    'raw_data/MATalpha yeast knockout collection.xls',
    sheet_name='list',
)

In [22]:
# Copy the 'ORF' column into a string-typed working column 'orfs'.
tested['orfs'] = tested['ORF'].astype(str)

In [23]:
# Same clean-up as applied to the screen data (clean_orf is an external
# helper — presumably from yp_utils.py).
tested['orfs'] = clean_orf(tested['orfs'])

In [24]:
# Translate the identifiers to ORF names with translate_sc (to='orf').
tested['orfs'] = translate_sc(tested['orfs'], to='orf')

In [25]:
# Make sure everything translated ok: print the rows of the strain list whose
# 'orfs' value does not pass looks_like_orf.
# Note that this overwrites the mask `t` computed earlier for the screen data.
t = looks_like_orf(tested['orfs'])
print(tested.loc[~t,])

            instructions below fill in record information  count   \
index_input                                                         
80                         NaN                        NaN    81.0   
81                         NaN                        NaN    82.0   
82                         NaN                        NaN    83.0   
83                         NaN                        NaN    84.0   
148                        NaN                        NaN   149.0   
...                        ...                        ...     ...   
5851                       NaN                        NaN     NaN   
5852                       NaN                        NaN     NaN   
5853                       NaN                        NaN     NaN   
5854                       NaN                        NaN     NaN   
5855                       NaN                        NaN     NaN   

             record number  ORF batch original box position Unnamed: 7  \
index_input                 

In [26]:
# Keep only the strain-list rows that passed the ORF check.
tested = tested.loc[t, :]

In [27]:
# Sorted array of the distinct ORFs present in the tested-strain list.
tested_orfs = np.unique(tested['orfs'].to_numpy())

In [28]:
# ORFs that have screen data but do not appear in the tested-strain list.
# A set is used for the membership test so each lookup is O(1) instead of a
# full scan of the tested_orfs array per ORF.
tested_lookup = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_lookup]
missing

['YPL183W-C']

In [29]:
# 'YPL183W-C' has screen data but is absent from the tested-strain list, so it
# is added to the tested set by hand. The guard keeps the cell safe to re-run:
# without it a second execution would append a duplicate label.
extra_orf = 'YPL183W-C'
if extra_orf not in tested_orfs:
    tested_orfs = np.append(tested_orfs, extra_orf)

In [30]:
# tested_orfs

In [31]:
# Expand the table to every tested ORF. ORFs with no row in the screen data
# are added with 0 in every column (fill_value=0).
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [32]:
# Final dataset starts as an independent one-column frame holding the score.
data = original_data.loc[:, ['data']].copy()

In [33]:
# Dataset ids covered by this notebook, in the column order of `data`;
# restrict (and order) the dataset list accordingly.
dataset_ids = [16680]
datasets = datasets.reindex(dataset_ids)

In [34]:
# Label each data column with a (dataset_id, 'value') pair.
value_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data.columns = value_columns

In [35]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,16680
data_type,value
orf,Unnamed: 1_level_2
YAL002W,0.0
YAL004W,0.0
YAL005C,0.0
YAL007C,0.0
YAL008W,0.0


In [36]:
# Dimensions of the dataset table (ORFs x dataset columns).
data.shape

(4952, 1)

## Subset to the genes currently in SGD

In [37]:
# Gene table keyed by systematic name, keeping the original 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the gene id of every ORF in `data`; ORFs without a match yield NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [38]:
# Attach the looked-up gene ids as a column.
data['gene_id'] = gene_ids
# Drop the ORFs that had no match in the gene table (gene_id is NaN).
data = data.loc[data['gene_id'].notnull()]
# With the NaNs gone, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Rows are identified by the (gene_id, orf) pair from here on.
data = data.reset_index().set_index(['gene_id','orf'])

In [39]:
# Preview after moving gene_id and orf into the row index.
data.head()

Unnamed: 0_level_0,dataset_id,16680
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0.0
1863,YAL004W,0.0
4,YAL005C,0.0
5,YAL007C,0.0
6,YAL008W,0.0


# Normalize

In [40]:
# Normalize the scores with the shared helper (defined outside this notebook).
# has_tested=True presumably signals that the table covers all tested strains,
# not only the hits — TODO confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [41]:
# Assign proper column names: label each normalized column with a
# (dataset_id, 'valuez') pair.
valuez_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data_norm.columns = valuez_columns

In [42]:
# Blank out normalized scores wherever the raw score is missing.
# NOTE(review): `data` columns are labeled (dataset_id, 'value') while
# `data_norm` columns are labeled (dataset_id, 'valuez'); confirm that the
# boolean mask aligns with data_norm's columns as intended.
data_norm[data.isnull()] = np.nan

In [43]:
# Put raw ('value') and normalized ('valuez') columns side by side, joined on
# the row index.
data_all = data.join(data_norm)

In [44]:
# Preview raw and normalized values together.
data_all.head()


Unnamed: 0_level_0,dataset_id,16680,16680
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0.0,0.0
1863,YAL004W,0.0,0.0
4,YAL005C,0.0,0.0
5,YAL007C,0.0,0.0
6,YAL008W,0.0,0.0


# Print out

In [45]:
# Write one tab-separated file per data type, with datasets named by their
# human-readable name and rows keyed by ORF only.
for data_type in ['value','valuez']:
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [47]:
from IO.save_data_to_db3 import *

In [48]:
# Upload the combined table for this PMID. Per the logged output below, the
# call first deletes all existing datasets for the PMID and then inserts the
# new data, so it replaces rather than appends.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 33109726...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.40s/it]

Updating the data_modified_on field...



