In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed in this notebook:
# a short name used for output file names, and the PubMed ID used for lookups.
paper_name = 'morton_coote_2007'
paper_pmid = 17846143

In [3]:
# Load the list of datasets for this paper: a tab-separated file without a header
# row, read into two columns named 'pmid' and 'name'.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['pmid', 'name'],
)

In [4]:
# Index the dataset list by its first column (read in under the name 'pmid').
datasets = datasets.set_index('pmid')

# Load & process the data

In [5]:
# Read the raw table from the first worksheet of the paper's spreadsheet.
raw_table_path = 'raw_data/Table1.xlsx'
original_data = pd.read_excel(raw_table_path, sheet_name='Sheet1')

In [6]:
# Report the size of the loaded table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 32 x 5


In [7]:
# Preview the first five rows of the raw table.
original_data.head(n=5)

Unnamed: 0,Functional category,% deletion mutants,DsS3(1-16) and Mag 2,DsS3(1-16) only,Mag 2 only
0,Metabolism,7,ARV1,"PPM1, TMA29, AMD1, YMR010W, GLO1, ARO8, SAM1","PGM1, PRP18"
1,Cell cycle,7,"KIN3, CLG1, CDC50","SRC1, SAP4, BFR1, KAR3, MSC1",HOP2
2,DNA repair,5,"RSC1, MMS22, RSC2, IMP2′","MRE11, CTF4, DPB3",
3,Biosynthesis,5,,"HFA1, GLY1, RTS1, DGA1, SUR4, AAT2","FEN1, ERG4"
4,Cell wall organization and biogenesis,5,LDB7,"GAS1, ECM19, YLR020C","CWH41, ROM1"


In [9]:
# Collect the gene names reported for each of the two peptides.
# The table has one row per functional category; every column after the first
# two holds a comma-separated gene list:
#   cols[0] -> genes listed for both peptides (added to both lists)
#   cols[1] -> genes listed for the first peptide only  (d_genes)
#   cols[2] -> genes listed for the second peptide only (mag_genes)
d_genes = []
mag_genes = []

cols = original_data.columns.values[2:]
for _, row in original_data.iterrows():
    for c in cols:
        # str() turns an empty cell (NaN) into the literal 'nan', which is filtered
        # out in a later step; non-breaking spaces are removed before splitting.
        # str.split always returns a list, so no extra list-wrapping is needed.
        genes = str(row[c]).replace('\xa0', '').split(',')
        if c == cols[0]:
            d_genes.extend(genes)
            mag_genes.extend(genes)
        elif c == cols[1]:
            d_genes.extend(genes)
        elif c == cols[2]:
            mag_genes.extend(genes)

In [10]:
# Discard the 'nan' placeholders produced by empty cells, then trim the
# whitespace surrounding each remaining gene name.
d_genes = [name.strip() for name in d_genes if name != 'nan']
mag_genes = [name.strip() for name in mag_genes if name != 'nan']

In [11]:
# Pass both gene lists through the shared gene-name cleaning helper
# (d_genes first, then mag_genes).
d_genes, mag_genes = [clean_genename(names) for names in (d_genes, mag_genes)]

In [12]:
# Translate both gene lists with translate_sc(..., to='orf')
# (d_genes first, then mag_genes).
d_orfs, mag_orfs = [translate_sc(names, to='orf') for names in (d_genes, mag_genes)]

In [13]:
# Convert both ORF collections to NumPy arrays so they support boolean masking.
d_orfs, mag_orfs = np.array(d_orfs), np.array(mag_orfs)

In [14]:
# Manual fix-up: any entry still reading 'TMA29' after translation is replaced
# with 'YMR226C'.
is_tma29 = (d_orfs == 'TMA29')
d_orfs[is_tma29] = 'YMR226C'

In [15]:
# Sanity check: print the d_orfs entries that looks_like_orf does not accept.
d_orf_like = np.array(looks_like_orf(d_orfs))
print(d_orfs[~d_orf_like])

[]


In [16]:
# Sanity check: print the mag_orfs entries that looks_like_orf does not accept.
mag_orf_like = np.array(looks_like_orf(mag_orfs))
print(mag_orfs[~mag_orf_like])

[]


In [17]:
# Sorted, de-duplicated union of the ORFs reported for either peptide.
combined_orfs = np.concatenate((d_orfs, mag_orfs))
all_orfs = np.unique(combined_orfs)

In [18]:
# One row per ORF and one column per peptide ('D', 'M'), initialised to 0.
n_orfs = len(all_orfs)
data = pd.DataFrame(
    data=np.zeros((n_orfs, 2)),
    index=all_orfs,
    columns=['D', 'M'],
)

In [19]:
# Flag every reported ORF with -1 in the column of the corresponding peptide.
for column, hit_orfs in (('D', d_orfs), ('M', mag_orfs)):
    data.loc[hit_orfs, column] = -1

In [21]:
# Label the row index so it becomes the 'orf' column when the index is reset.
data.index = data.index.rename('orf')

# Prepare the final dataset

In [22]:
# Dataset IDs in the same order as the data columns ('D', 'M'); restrict and
# reorder the dataset list to exactly these IDs.
dataset_ids = [16536, 16535]
datasets = datasets.reindex(dataset_ids)

In [23]:
# Relabel the data columns as (dataset_id, 'value') pairs, one per dataset.
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(
    value_columns,
    names=['dataset_id', 'data_type'],
)

In [24]:
# Preview the first five rows of the relabelled table.
data.head(n=5)

dataset_id,16536,16535
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL002W,-1.0,0.0
YAL010C,-1.0,0.0
YAR002W,-1.0,-1.0
YAR018C,-1.0,-1.0
YBL006C,-1.0,-1.0


In [25]:
# Collapse rows that share the same ORF by averaging their values.
rows_by_orf = data.groupby(by=data.index)
data = rows_by_orf.mean()

## Subset to the genes currently in SGD

In [26]:
# Load the gene table, key it by systematic name, and look up the 'id' of every
# ORF in the data; ORFs absent from the table come back as NaN and are counted.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [27]:
# Attach the gene IDs, keep only ORFs that have one, and index rows by
# (gene_id, orf).
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: the next line assigns a column on
# the result, and assigning into a filtered slice is the chained-assignment
# pattern pandas warns about (SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16536,16535
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,-1.0,0.0
8,YAL010C,-1.0,0.0
62,YAR002W,-1.0,-1.0
68,YAR018C,-1.0,-1.0
94,YBL006C,-1.0,-1.0


# Normalize

In [29]:
# Compute normalized scores from the raw values using the shared helper.
# NOTE(review): has_tested=False presumably signals that no list of tested
# strains is available for this paper — confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [30]:
# Label the normalized columns as (dataset_id, 'valuez') pairs, one per dataset.
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(
    valuez_columns,
    names=['dataset_id', 'data_type'],
)

In [31]:
# Propagate missing raw values to the normalized scores, then place raw
# ('value') and normalized ('valuez') columns side by side.
# The mask is applied positionally (.values) on purpose: data and data_norm carry
# different column labels ('value' vs 'valuez'), so a label-aligned boolean frame
# would match no columns and silently mask nothing.
# NOTE(review): assumes data_norm keeps the same row and column order as data — confirm.
data_norm[data.isnull().values] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16536,16535,16536,16535
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,-1.0,0.0,-6.330794,0.0
8,YAL010C,-1.0,0.0,-6.330794,0.0
62,YAR002W,-1.0,-1.0,-6.330794,-9.284802
68,YAR018C,-1.0,-1.0,-6.330794,-9.284802
94,YBL006C,-1.0,-1.0,-6.330794,-9.284802


# Print out

In [32]:
# Write one tab-separated file per data type, with columns named after the
# datasets and rows keyed by ORF only (the gene_id index level is dropped).
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [33]:
from IO.save_data_to_db3 import *

In [34]:
# Store the combined table in the database under this paper's PMID.
# Per the log printed by this call, existing datasets for the PMID are deleted
# before the new data are inserted, so re-running replaces the earlier upload.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 17846143...
Inserting the new data...


100%|██████████| 2/2 [00:00<00:00,  3.33it/s]

Updating the data_modified_on field...



