In [1]:
# activate inline plotting
%matplotlib inline

import sys
sys.path.append("/home/kaspar")

import numpy as np
import scipy as sp
import pandas as pd

from y10k_prediction.helper_functions import summarise_Rsq

### Get data

In [2]:
# Load the data set used throughout this notebook from a single HDF5 file.
from y10k_prediction.data_import import get_data_with_parents

# Path is relative to the notebook's working directory.
file_name = 'data/y10k_hybrids_Yield.hdf5'
# Of the returned objects, later cells use:
#   Y            - 2-D array, indexed as Y[rows, j:j+1]; its columns are labelled by `environments`
#   K            - passed unchanged to get_BLUPs (presumably a kinship/relatedness matrix - TODO confirm)
#   parent1/2    - passed to get_4foldCV_close_and_distant to build the CV partitions
#   environments - column labels for the R^2 tables
# snps, individuals and dataset are not used in the cells visible here.
Y, snps, K, parent1, parent2, individuals, dataset, environments = get_data_with_parents(file_name)

  self.f = tables.openFile(self.file_name,'r')
  self.f = tables.openFile(self.file_name,'r')


### 4-fold CV partitions into a test set and two training sets (distant and close relatives)

In [3]:
from y10k_prediction.train_and_test_sets import get_4foldCV_close_and_distant

# Seed NumPy's global RNG directly. `sp.random` was only a re-export of
# `numpy.random` in the top-level SciPy namespace (an alias that newer SciPy
# releases no longer provide), and `np.random` is the generator that
# `gradual_sample` / `gradual_sample2` draw from via `np.random.choice`.
# Presumably it also drives the fold assignment below - TODO confirm.
np.random.seed(0)

# One boolean mask per fold for each of: the held-out test individuals, the
# "distant" training individuals and the "close" training individuals.
Itest_list, Idistant_list, Iclose_list = get_4foldCV_close_and_distant(parent1, parent2)

### Prediction accuracy of BLUP, gradually increasing training set size

In [4]:
def gradual_sample(Itrain, n_values):
    """Draw nested random subsets of increasing size from ``Itrain``.

    Parameters
    ----------
    Itrain : boolean array
        Mask of the individuals that may be selected.
    n_values : sequence of int
        Target subset sizes, in non-decreasing order.

    Returns
    -------
    list of boolean arrays
        One mask per entry of ``n_values``. Mask ``i`` has ``n_values[i]``
        True entries, all inside ``Itrain``, and contains every individual
        selected in mask ``i - 1``.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when a requested size exceeds the
        number of individuals still available, or when sizes decrease.
    """
    Itrain = np.asarray(Itrain, dtype=bool)
    out = []
    last_selection = np.zeros_like(Itrain)
    for n in n_values:
        ind = last_selection.copy()
        # Only top up with individuals that have not been picked yet.
        candidates = np.where(Itrain & ~last_selection)[0]
        n_new = n - last_selection.sum()
        ind[np.random.choice(candidates, size=n_new, replace=False)] = True
        # Carry the selection forward so that the next, larger subset extends
        # this one instead of being an unrelated fresh draw.
        last_selection = ind
        out.append(ind)
    return out

def gradual_sample2(Itrain, n_values, Iadded, n_added):
    """Build nested training masks in two stages.

    Stage 1 grows a random subset of ``Itrain`` through the sizes in
    ``n_values``. Stage 2 keeps the final stage-1 subset fixed and adds a
    growing random subset of ``Iadded`` with the sizes in ``n_added``.

    Parameters
    ----------
    Itrain : boolean array
        Mask of the individuals sampled in stage 1.
    n_values : sequence of int
        Non-decreasing stage-1 subset sizes.
    Iadded : boolean array
        Mask of the individuals sampled in stage 2 (same shape as ``Itrain``).
    n_added : sequence of int
        Non-decreasing stage-2 subset sizes.

    Returns
    -------
    list of boolean arrays
        ``len(n_values) + len(n_added)`` masks: first the stage-1 subsets,
        then the final stage-1 subset OR-ed with each stage-2 subset.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when a requested size exceeds the
        number of individuals still available, or when sizes decrease.
    """
    Itrain = np.asarray(Itrain, dtype=bool)
    Iadded = np.asarray(Iadded, dtype=bool)
    out = []

    # Stage 1: nested subsets of Itrain.
    last_selection = np.zeros_like(Itrain)
    for n in n_values:
        ind = last_selection.copy()
        # Size the top-up from n_values (the stage-1 targets), not n_added.
        newsize = n - last_selection.sum()
        ind[np.random.choice(np.where(Itrain & ~last_selection)[0], size=newsize, replace=False)] = True
        last_selection = ind
        out.append(ind)

    # Final stage-1 subset; an all-False mask when n_values is empty.
    all_ind = last_selection.copy()

    # Stage 2: nested subsets of Iadded, each merged with the stage-1 result.
    last_selection = np.zeros_like(Itrain)
    for n in n_added:
        ind = last_selection.copy()
        newsize = n - last_selection.sum()
        ind[np.random.choice(np.where(Iadded & ~last_selection)[0], size=newsize, replace=False)] = True
        last_selection = ind
        out.append(all_ind | last_selection)
    return out

In [5]:
# Connect to an already running IPython parallel cluster.
# NOTE(review): IPython.parallel was split out into the separate `ipyparallel`
# package as of IPython 4.0 - confirm which environment this notebook targets.
from IPython.parallel import Client
c = Client()
# View over all engines; used below to push data and map work across them.
cluster = c[:]

In [6]:
# Work unit mapped over the engines: run get_BLUPs for column j of Y (kept 2-D
# via the j:j+1 slice). Y, K, Itrain and Itest are not bound in this cell; they
# are pushed to the engines with cluster.push before each map, and get_BLUPs is
# imported on the engines by the execute call below.
myfunction_blup = lambda j: get_BLUPs(Y[:, j:j+1], K, Itrain, Itest)

# Make the project package importable on every engine and import get_BLUPs
# there. NOTE(review): the absolute path must exist on each engine's host.
cluster.execute('''
import sys
sys.path.append("/home/kaspar")
from y10k_prediction.BLUP import get_BLUPs
''')

<AsyncResult: execute>

In [7]:
def _blup_rsq_table(train_masks, test_mask, sizes, fold):
    """Run BLUP prediction for each training mask and tabulate the R^2 summary.

    Parameters
    ----------
    train_masks : sequence of boolean arrays
        Training-set masks, one per entry of ``sizes``.
    test_mask : boolean array
        Mask of the held-out individuals for this fold.
    sizes : sequence of int
        Training-set size label stored in column ``"n"`` for each mask.
    fold : int
        Fold number stored in column ``"fold"``.

    Returns
    -------
    pandas.DataFrame
        One row per training mask: one column per entry of ``environments``
        holding the output of ``summarise_Rsq``, plus ``"n"`` and ``"fold"``.
    """
    n_envs = Y.shape[1]
    rsq = np.zeros((len(sizes), n_envs))
    for row in range(len(sizes)):
        # Only the masks change between runs; Y and K are already on the engines.
        cluster.push(dict(Itrain=train_masks[row], Itest=test_mask))
        # One BLUP fit per column of Y, spread over the engines.
        res = cluster.map_sync(myfunction_blup, range(n_envs))
        ypred = np.array([obj.ravel() for obj in res]).T
        rsq[row, :] = summarise_Rsq(ypred, Y[test_mask, :]).T
    table = pd.DataFrame(rsq, columns=environments)
    table["n"] = sizes
    table["fold"] = [fold] * table.shape[0]
    return table


n_folds = 4
# Training-set sizes at which prediction accuracy is evaluated.
n_values = [25, 50, 100, 200, 400, 800, 1640]

# Y and K are identical for every fold and training set, so ship them to the
# engines once instead of re-sending them before every single map.
cluster.push(dict(Y=Y, K=K))

close_tables = []
distant_tables = []
for i in range(n_folds):
    Itest = Itest_list[i]
    Itrain_distant = Idistant_list[i]
    Itrain_close = Iclose_list[i]

    # Size labels for the distant series: distant-only sizes first, then the
    # sizes after close relatives are added on top.
    # NOTE(review): the second half assumes the distant stage ends with all
    # Itrain_distant.sum() individuals selected, whereas gradual_sample2 stops
    # at n_values[-1] of them - confirm the two are equal for every fold.
    n_values_distant = np.concatenate((n_values, [Itrain_distant.sum() + n for n in n_values]))

    # create gradually increasing samples (same draw order as before: close first)
    Itrain_close_list = gradual_sample(Itrain_close, n_values)
    Itrain_distant_list = gradual_sample2(Itrain_distant, n_values, Itrain_close, n_values)

    close_tables.append(_blup_rsq_table(Itrain_close_list, Itest, n_values, i))
    distant_tables.append(_blup_rsq_table(Itrain_distant_list, Itest, n_values_distant, i))

# Stack the per-fold tables once, rather than growing a frame inside the loop.
Rsq_close = pd.concat(close_tables)
Rsq_distant = pd.concat(distant_tables)

In [7]:
# Write both R^2 tables (distant first, then close) to CSV files under output/.
for frame, target in ((Rsq_distant, "output/fig2c_distant.csv"),
                      (Rsq_close, "output/fig2c_close.csv")):
    frame.to_csv(target)