In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed: PubMed ID and the short name used for output files.
paper_pmid, paper_name = 17950387, 'galvan_smith_2008'

In [3]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper.
datasets_list_file = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_file,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by dataset_id so datasets can be looked up by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw measurements as provided with the paper (first worksheet only).
raw_data_file = 'raw_data/IMELDA 08Feb2006GDAraw data.xls'
original_data = pd.read_excel(raw_data_file, sheet_name='Sheet1')

In [6]:
# Report the size of the raw table as rows x columns.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 6144 x 13


In [7]:
# Preview the first rows of the raw table to check the column layout.
original_data.head()

Unnamed: 0,Plate Number,Plate X,Plate Y,Area CON,Deviation CON,Area TEST,Deviation TEST,Area Difference ( CON - TEST ),Percentage of reduction,CON Normalized,TEST Normalized,Normalized Difference,Percentage
0,1,1,1,347,-85.474,411,159.57,-64,-18.444,0.80236,1.6347,-0.83229,-83.229
1,1,1,2,527,94.526,627,375.57,-100,-18.975,1.2186,2.4937,-1.2752,-127.52
2,1,1,3,418,-14.474,480,228.57,-62,-14.833,0.96653,1.9091,-0.94255,-94.255
3,1,1,4,630,197.53,647,395.57,-17,-2.6984,1.4567,2.5733,-1.1166,-111.66
4,1,1,5,603,170.53,533,281.57,70,11.609,1.3943,2.1199,-0.72558,-72.558


In [8]:
# Load the plate maps (first worksheet of the master plate list).
plate_map_file = 'raw_data/yGDA-Master_Plate_list_Combined(New).xlsx'
pm = pd.read_excel(plate_map_file, sheet_name='Sheet1')

In [11]:
# Index the plate map by plate position so it can be joined on those keys.
plate_position_keys = ['Plate Number','Plate X','Plate Y']
pm = pm.set_index(plate_position_keys)

In [12]:
# Index the measurements by plate position so they can be joined on those keys.
plate_position_keys = ['Plate Number','Plate X','Plate Y']
original_data = original_data.set_index(plate_position_keys)

In [13]:
# Attach the plate-map columns to the measurements, matching rows on the shared index.
# how='left' keeps every measurement row, including positions absent from the plate map.
original_data = original_data.join(pm, how='left')

In [15]:
# Start the 'orf' column from the plate map's systematic names, forced to strings.
systematic_names = original_data['Systematic Name']
original_data['orf'] = systematic_names.astype(str)

In [16]:
# Eliminate all white spaces & capitalize
uncleaned_orfs = original_data['orf']
original_data['orf'] = clean_orf(uncleaned_orfs)

In [21]:
# Manual correction: rewrite the 'YPL072WA' entries as 'YPL072W'.
is_ypl072wa = original_data['orf'] == 'YPL072WA'
original_data.loc[is_ypl072wa, 'orf'] = 'YPL072W'

In [22]:
# Translate to ORFs
names_to_translate = original_data['orf'].values
original_data['orf'] = translate_sc(names_to_translate, to='orf')

In [23]:
# Make sure everything translated ok: list the rows whose 'orf' value is not flagged by looks_like_orf
t = looks_like_orf(original_data['orf'])
rows_not_orf = original_data.loc[~t,]
print(rows_not_orf)

                              Area CON  Deviation CON  Area TEST  \
Plate Number Plate X Plate Y                                       
8            2       5               0      -470.6600          0   
                     7               0      -470.6600          0   
                     9               0      -470.6600          0   
                     11              0      -470.6600          0   
                     13              0      -470.6600          0   
...                                ...            ...        ...   
16           16      20            579        79.8070        297   
                     21              0      -499.1900          0   
                     22            533        33.8070        229   
                     23              0      -499.1900          0   
                     24            505         5.8073        224   

                              Deviation TEST  Area Difference ( CON - TEST )  \
Plate Number Plate X Plate Y       

In [24]:
# Keep only the rows flagged by the looks_like_orf check.
# Take an explicit copy: later cells add a column to this frame and re-index it
# in place, and doing that on a filtered slice is the chained-assignment pattern
# that can raise SettingWithCopyWarning.
original_data = original_data.loc[t,:].copy()

In [25]:
# Preview the rows kept by the ORF filter, now carrying the plate-map columns and 'orf'.
original_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Area CON,Deviation CON,Area TEST,Deviation TEST,Area Difference ( CON - TEST ),Percentage of reduction,CON Normalized,TEST Normalized,Normalized Difference,Percentage,Gene,Systematic Name,Description,orf
Plate Number,Plate X,Plate Y,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1,1,347,-85.474,411,159.57,-64,-18.444,0.80236,1.6347,-0.83229,-83.229,TCI1,YDR161W,Protein that interacts with protein phosphatas...,YDR161W
1,1,2,527,94.526,627,375.57,-100,-18.975,1.2186,2.4937,-1.2752,-127.52,YCR017C,YCR017C,Protein involved in sensitivity to certain drugs,YCR017C
1,1,3,418,-14.474,480,228.57,-62,-14.833,0.96653,1.9091,-0.94255,-94.255,NBP2,YDR162C,"Nap1p-binding protein, has an SH3 domain",YDR162C
1,1,4,630,197.53,647,395.57,-17,-2.6984,1.4567,2.5733,-1.1166,-111.66,MAK32,YCR019W,Protein required for structural stability of L...,YCR019W
1,1,5,603,170.53,533,281.57,70,11.609,1.3943,2.1199,-0.72558,-72.558,YDR163W,YDR163W,Protein of unknown function,YDR163W


In [26]:
# Score = 'TEST Normalized' divided by 'CON Normalized'.
# A control value of exactly 0 would produce inf (x/0), which would then carry
# into the per-ORF mean computed downstream; mark those positions as missing instead.
# Missing control values still yield NaN through the division itself.
control_norm = original_data['CON Normalized']
test_norm = original_data['TEST Normalized']
original_data['data'] = (test_norm / control_norm).where(control_norm != 0)

In [27]:
# Replace the plate-position index with the ORF name.
original_data = original_data.set_index('orf')

In [28]:
# Drop everything except the computed score; keep it as a one-column frame.
score_columns = ['data']
original_data = original_data.loc[:, score_columns].copy()

In [29]:
# Collapse rows that share the same index value (ORF) into their mean score.
per_orf = original_data.groupby(level=0)
original_data = per_orf.mean()

In [30]:
# Dimensions after averaging: (number of unique ORFs, number of score columns).
original_data.shape

(4645, 1)

# Prepare the final dataset

In [31]:
# Work on an independent copy so the per-ORF table above stays untouched.
data = original_data.copy(deep=True)

In [32]:
# Dataset ids, in the same order as the columns of `data`.
dataset_ids = [134]
# Restrict (and order) the dataset list to exactly those ids.
datasets = datasets.reindex(dataset_ids)

In [33]:
# Label each column with a (dataset_id, data_type) pair; these are the raw 'value' columns.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [34]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,134
data_type,value
orf,Unnamed: 1_level_2
YAL002W,0.73041
YAL004W,1.241952
YAL005C,1.068869
YAL007C,1.258973
YAL008W,0.147912


## Subset to the genes currently in SGD

In [35]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id for every ORF in `data`; ORFs absent from the gene table come back as NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 21


In [36]:
# Attach the looked-up gene ids and drop ORFs that have none.
data['gene_id'] = gene_ids
# Copy the filtered rows: the next line assigns a column on the result, and doing
# that on a boolean-filtered slice can raise SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs removed, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,134
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0.73041
1863,YAL004W,1.241952
4,YAL005C,1.068869
5,YAL007C,1.258973
6,YAL008W,0.147912


# Normalize

In [37]:
# Compute normalized scores from the raw values.
# normalize_phenotypic_scores is not defined in this notebook (presumably it comes
# from yp_utils.py); the exact normalization and the meaning of has_tested=True
# should be confirmed against that helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [38]:
# Assign proper column names: same dataset ids as the raw table, data_type 'valuez'
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [39]:
# Propagate missing raw values to the normalized scores.
# A boolean-DataFrame mask is aligned on row AND column labels before it is applied,
# and unmatched positions are treated as "leave unchanged". `data` is labelled
# 'value' while `data_norm` is labelled 'valuez', so masking with data.isnull()
# directly matches no column and silently does nothing. Relabel the mask with
# data_norm's columns (same datasets, same order) so it actually lines up.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw and normalized scores side by side, matching rows on the shared index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,134,134
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0.73041,-0.658646
1863,YAL004W,1.241952,0.947866
4,YAL005C,1.068869,0.404293
5,YAL007C,1.258973,1.001322
6,YAL008W,0.147912,-2.488002


# Print out

In [40]:
# Write one tab-separated file per data type, named <paper_name>_<data_type>.txt.
for data_type in ['value','valuez']:
    out_file = paper_name + '_' + data_type + '.txt'
    # Select the columns of this data type; the data_type level is dropped by xs.
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Label the columns with the dataset names instead of the ids.
    table.columns = datasets['name'].values
    # Rows are written keyed by ORF only.
    table.droplevel('gene_id', axis=0).to_csv(out_file, sep='\t')

# Save to DB

In [41]:
from IO.save_data_to_db3 import *

In [42]:
# Store the combined raw + normalized table in the database under this paper's PMID.
# NOTE: per the logged output, this first deletes all existing datasets for the PMID
# and then inserts the new data, so re-running the cell replaces earlier results.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 17950387...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.54s/it]

Updating the data_modified_on field...



