In [None]:
from tpot import TPOTRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# TPOT regression demo on the Boston housing data: 75/25 split, a short
# evolutionary search, then the hold-out score and an exported pipeline script.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell only
# runs against older scikit-learn releases.
housing = load_boston()
features, target = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(
    features, target, train_size=0.75, test_size=0.25, random_state=42
)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
# score on the held-out quarter of the data
test_score = tpot.score(X_test, y_test)
print(test_score)
# write the best pipeline found as a standalone Python script
tpot.export('tpot_boston_pipeline.py')

In [None]:
# Display the shape of the housing target array loaded in the previous cell.
housing.target.shape

In [1]:
from tpot import TPOTRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
import os
import joblib
import time
import timeit
import joblib
import pickle
from ExpressionExpert_Functions import Data_Src_Load, make_DataDir, split_train_test, ExpressionScaler, Sequence_Conserved_Adjusted, Est_Grad_Save, Est_Grad_Feat
from sklearn.model_selection import GroupShuffleSplit

%matplotlib inline

In [2]:
# Parse the plain-text configuration file into a flat {key: value} dictionary.
# Meaningful lines have the form "key: value"; lines starting with '#' are comments.
Name_Dict = dict()
with open('config_EcolPtai.txt') as Conf:
    myline = Conf.read().splitlines()
    for line in myline:
        # Skip comments as well as blank or malformed lines that carry no
        # "key: value" separator; unpacking them would raise a ValueError.
        if line.startswith('#') or ':' not in line:
            continue
        # split on the first ':' only, so values may themselves contain colons
        (key, val) = line.split(':', 1)
        Name_Dict[str(key.strip())] = val.strip()


Data_File = Name_Dict['Data_File']
# extract the filename for naming of newly generated files
File_Base = Name_Dict['File_Base']
# the generated files will be stored in a subfolder with custom name
Data_Folder = Name_Dict['Data_Folder']
# column name of expression values
# NOTE(review): eval() executes whatever text the config file contains. If the
# value is always a plain literal (string or list of strings), ast.literal_eval
# would be the safer choice -- confirm against the config format before changing.
Y_Col_Name = eval(Name_Dict['Y_Col_Name'])
# figure file type
Fig_Type = Name_Dict['Figure_Type']
# create the output directory (it reports when the directory already exists)
make_DataDir(Name_Dict)

Already existent data directory  data-PromLib_EcolPtai .


In [3]:
# Load the sequence/expression table via the project helper, driven by the
# parsed configuration (presumably the file named in Name_Dict['Data_File'] --
# confirm in ExpressionExpert_Functions), then preview the first three rows.
SeqDat = Data_Src_Load(Name_Dict)
SeqDat.head(3)

Unnamed: 0,Strain ID Ecol,Strain ID Ptai,Sequence,Ecol Promoter Activity,Ptai Promoter Activity,Sequence_label-encrypted,Sequence_letter-encrypted,GC-content
0,SN_rep3 20180122,SynPro35_1,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1,...",0.043,0.014,"[3, 3, 3, 0, 3, 3, 3, 2, 0, 1, 0, 3, 2, 1, 2, ...",TTTATTTGACATGCGTGATGTTTAGAATTATAATTTGGGG,0.3
1,SN_rep3 20180123,SynPro35_2,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1,...",0.033,0.015,"[3, 3, 3, 0, 3, 3, 3, 2, 0, 1, 0, 3, 2, 1, 2, ...",TTTATTTGACATGCGTGATGTTTAGAATTATAATTTGGGG,0.3
2,SN_rep3 20180124,SynPro35_3,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1,...",0.036,0.018,"[3, 3, 3, 0, 3, 3, 3, 2, 0, 1, 0, 3, 2, 1, 2, ...",TTTATTTGACATGCGTGATGTTTAGAATTATAATTTGGGG,0.3


In [5]:
# Show the nucleotide-letter form of every sequence. Replicate measurements share
# the same string, which is why this column is used as the grouping key for the
# train/test split further down.
# NOTE(review): this prints the whole column; a slice or .unique() would keep the output short.
SeqDat['Sequence_letter-encrypted'].values

array(['TTTATTTGACATGCGTGATGTTTAGAATTATAATTTGGGG',
       'TTTATTTGACATGCGTGATGTTTAGAATTATAATTTGGGG',
       'TTTATTTGACATGCGTGATGTTTAGAATTATAATTTGGGG',
       'GCCCATTGACAAGGCTCTCGCGGCCAGGTATAATTGCACG',
       'GCCCATTGACAAGGCTCTCGCGGCCAGGTATAATTGCACG',
       'GCCCATTGACAAGGCTCTCGCGGCCAGGTATAATTGCACG',
       'GCCCATTGACAACACTATTTTTTGATACTATAATTGCACG',
       'GCCCATTGACAACACTATTTTTTGATACTATAATTGCACG',
       'GCCCATTGACAACACTATTTTTTGATACTATAATTGCACG',
       'GCCCATTGACACGTGTACTGTGTACGGTTATAATTGCACG',
       'GCCCATTGACACGTGTACTGTGTACGGTTATAATTGCACG',
       'GCCCATTGACACGTGTACTGTGTACGGTTATAATTGCACG',
       'GCCCATTGACATGGTAATGAAGCCGGTGTATAATTGCACG',
       'GCCCATTGACATGGTAATGAAGCCGGTGTATAATTGCACG',
       'GCCCATTGACATGGTAATGAAGCCGGTGTATAATTGCACG',
       'GCCCATTGACAAGGGAGTTCGGAGGTGTTATAATTGCACG',
       'GCCCATTGACAAGGGAGTTCGGAGGTGTTATAATTGCACG',
       'GCCCATTGACAAGGGAGTTCGGAGGTGTTATAATTGCACG',
       'GCCCATTGACAACGCGTCCACGTGGGGGTATAATTGCACG',
       'GCCCATTGACAACGCGTCCACGT

In [None]:
# Hold out one final test set. Rows are grouped by their (upper-cased) sequence
# string so that all measurements of the same sequence end up on the same side
# of the split.
# SeqTrain, SeqTest = split_train_test(SeqDat)
# NOTE(review): eval() executes text taken from the config file; if TestRatio is
# always a plain number, float() would be the safer choice -- confirm first.
train_size = 1 - eval(Name_Dict['TestRatio'])
# split number '1' because we only use one final test set. Cross validation comes later
# random_state is fixed (same seed as the TPOT runs) so the split is reproducible
gss = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=42)
X = SeqDat['Sequence']
y = SeqDat[Y_Col_Name]
groups = SeqDat['Sequence_letter-encrypted'].str.upper()
# only one split is generated, so take it directly from the generator
Train_Idx, Test_Idx = next(gss.split(X, y, groups))
SeqTest = SeqDat.iloc[Test_Idx].reset_index(drop=True)
SeqTrain = SeqDat.iloc[Train_Idx].reset_index(drop=True)

# Persist the split, stamped with today's date, in the configured data folder.
TrainTest_Data = {'Train': SeqTrain, 'Test': SeqTest}
TrainTest_File = os.path.join(Data_Folder, '{}_{}_TrainTest-Data.pkl'.format(time.strftime('%Y%m%d'), File_Base))
# the context manager guarantees the file is flushed and closed after writing
with open(TrainTest_File, 'wb') as TrainTest_Handle:
    pickle.dump(TrainTest_Data, TrainTest_Handle)


### Test 2: TPOT on one-hot sequence features + GC-content (target: scaled Ptai promoter activity)

In [None]:
# Scale the expression values of the training set. Expr_Scaler is indexed later
# with 'Ptai Promoter Activity_Scaler' to transform the test-set targets.
# NOTE(review): SeqTrain is rebound to the helper's return value, so re-running
# this cell feeds already-processed data back in -- confirm ExpressionScaler is
# safe to apply twice.
SeqTrain, Expr_Scaler = ExpressionScaler(SeqTrain, Name_Dict)
# removing non-informative positions where no base diversity exists, base one hot encoding
SeqTrain_Hadj, Positions_removed, PSEntropy = Sequence_Conserved_Adjusted(SeqTrain, Name_Dict, n=1)
# SeqOH is a second name for the adjusted training frame (same object, no copy)
SeqOH = SeqTrain_Hadj

In [None]:
# Build the training design matrix from the adjusted training frame: the
# per-sample one-hot encoding is flattened to one row per sample and GC-content
# is appended as one extra feature column.
# The one-hot column is converted once, from SeqTrain_Hadj only, so the shape
# and the data cannot come from two different frames if cells ran out of order.
OneHot_Train = np.array(SeqTrain_Hadj['OneHot'].values.tolist())
Sequence_Samples, Sequence_Positions, Sequence_Bases = OneHot_Train.shape
X_pref = OneHot_Train.reshape(Sequence_Samples, Sequence_Positions*Sequence_Bases)
# adding a column to X for the additional feature, here GC-content
X_pref = np.append(X_pref, SeqTrain_Hadj['GC-content'].values.reshape(-1, 1), axis=1)
# regression target: the scaled Ptai promoter activity
Y_pref = SeqTrain_Hadj['Ptai Promoter Activity_scaled'].values
print(X_pref.shape, Y_pref.shape)

In [None]:
from ExpressionExpert_Functions import list_onehot
# Encode the held-out test sequences for the model: drop the positions that were
# removed from the training set, one-hot encode the remaining labels and flatten
# to one row per sample.
Test_Labels = np.array(SeqTest['Sequence_label-encrypted'].tolist())
Test_Labels_Adj = np.delete(Test_Labels, Positions_removed, axis=1)
X_pref_tmp = list_onehot(list(Test_Labels_Adj))
X_pref_test = np.array(X_pref_tmp).reshape(len(SeqTest), -1)
# adding the additional feature, here GC-content, as one extra column
GC_Test = SeqTest['GC-content'].values.reshape(-1, 1)
X_pref_test = np.concatenate((X_pref_test, GC_Test), axis=1)
# scale the measured test activities with the scaler returned for the training set
Ptai_Scaler = Expr_Scaler['Ptai Promoter Activity_Scaler']
Y_pref_test = Ptai_Scaler.transform(SeqTest['Ptai Promoter Activity'].values.reshape(-1, 1))
print(X_pref_test.shape, Y_pref_test.shape)

In [None]:
# Evolutionary pipeline search on the one-hot features: 100 generations with a
# population of 100, optimising R^2, on all available cores (n_jobs=-1).
# Expect a long runtime.
tpot_pref = TPOTRegressor(generations=100, population_size=100, verbosity=2, random_state=42, scoring='r2', n_jobs=-1)
tpot_pref.fit(X_pref, Y_pref)
# score of the best pipeline on the held-out test set
print(tpot_pref.score(X_pref_test, Y_pref_test))
# write the best pipeline as a standalone Python script
# NOTE(review): the file name says "Ecol" although the target fitted above is the
# scaled Ptai promoter activity -- confirm the intended name.
tpot_pref.export('tpot_Ecol-Expression-pref_pipeline.py')

In [None]:
import joblib
# Persist the fitted TPOT search object to the working directory.
# NOTE(review): pickling a whole TPOT object is reported to fail in some TPOT
# versions; saving tpot_pref.fitted_pipeline_ may be more robust -- confirm with
# the installed version.
joblib.dump(tpot_pref, 'tpot100_EcolPtai-selFeat.pkl')


### Test 1: TPOT on label-encoded sequence features + GC-content (target: Ecol promoter activity)

In [None]:
def _label_features(frame):
    """Return the label-encoded sequences of `frame` with GC-content as trailing column."""
    labels = np.array(frame['Sequence_label-encrypted'].values.tolist())
    gc_column = frame['GC-content'].values.reshape(-1,1)
    return np.append(labels, gc_column, axis=1)


# Features: label-encoded sequence plus GC-content; target: unscaled Ecol activity.
X_train = _label_features(SeqTrain)
Y_train = SeqTrain['Ecol Promoter Activity'].values

X_test = _label_features(SeqTest)
Y_test = SeqTest['Ecol Promoter Activity'].values

# Same search budget as the one-hot experiment: 100 generations x 100 individuals.
tpot = TPOTRegressor(generations=100, population_size=100, verbosity=2, random_state=42, scoring='r2', n_jobs=-1)
tpot.fit(X_train, Y_train)
# score of the best pipeline on the held-out test set
test_r2 = tpot.score(X_test, Y_test)
print(test_r2)
# write the best pipeline as a standalone Python script
tpot.export('tpot_Ecol-Expression_pipeline.py')