In [1]:
import pandas as pd
import numpy as np
# ML algorithms
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor

# ML functions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold,GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import mean_squared_error,r2_score

In [2]:

#dataframe final
df_final = pd.read_csv("../data/DF_train15noChangeContact_skempiAB_modeller_final.csv",index_col=0)


pdb_names = df_final.index
features_names = df_final.drop('ddG_exp',axis=1).columns
df_final.shape

(1392, 78)

In [3]:
# Split train and independent test data

X_train, X_test, y_train, y_test = train_test_split(df_final.drop('ddG_exp',axis=1), df_final['ddG_exp'], 
                                                    test_size=0.2, random_state=13)           

In [4]:
# join train data for Exploratory analisis of training data
train = X_train.join(y_train)

In [5]:
# Drop outliers
train = train.drop(train[(train['van_der_waals_change']>3)].index)
train = train.drop(train[(train['dg_change'].abs()>8)].index)


In [6]:
y_train = train['ddG_exp']
X_train = train.drop('ddG_exp',axis=1)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeaturesInteractions(BaseEstimator, TransformerMixin):
    
    #Class constructor method that takes ..
    def __init__(self, interaction1, interaction2 ):
        self.interaction1 = interaction1
        self.interaction2 = interaction2
        
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        
        return self
    
    def transform(self, X , y=None ):
        
        X_interactions = X.loc[:,self.interaction1].mul(X[self.interaction2],axis=0)
        X_interactions.columns = X_interactions.columns.values+'/{}'.format(self.interaction2)
        # set columns names
        
        X = X.join(X_interactions)
        
        return X
    
class SkewTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold=0.6, method='quantile'):
        self.threshold = threshold
        self.method = method
        
    #Return self nothing else to do here
    def fit(self, X, y = None  ):
        skewes_ = X.skew().sort_values(ascending=False)
        self.skew_features = skewes_[skewes_.abs() > self.threshold]
        
        if self.method == 'quantile':
            self.t = QuantileTransformer(output_distribution="normal",random_state=13)
            self.t.fit(X[self.skew_features.index])
        
        return self
    
    def transform(self, X, y=None):
        
        X[self.skew_features.index] = self.t.transform(X[self.skew_features.index])
        
        return X
        
class ZeroThreshold(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold=90.):
        self.threshold = threshold
    
    
    def fit(self, X, y = None  ):
        
        self.feature_names = X.columns
        self.overfit = []
        for i in X.columns:
            counts = X[i].value_counts()
            zeros = counts.iloc[0]
            if zeros / len(X) * 100 >self.threshold:
                self.overfit.append(i)
        
        return self
    
    def transform(self, X, y=None):
        
        X.drop(self.overfit,axis=1,inplace=True)
        
        return X




In [8]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1109, 77), (1109,), (279, 77), (279,))

In [52]:
#1)  ORIGINAL
## Pipeline preprocessing
interactions = FeaturesInteractions(interaction1=X_train.columns[:15].tolist(),interaction2="van_der_waals_change")
skewness = SkewTransformer(threshold=0.6,method='quantile')
zeroth = ZeroThreshold(threshold=90.)


#2)
model_ = XGBRegressor()
#3) Crear pipeline
pipeline1 = make_pipeline(interactions,skewness,zeroth, model_)



param_grid = {
    'xgbregressor__max_depth': [5],
    'xgbregressor__gamma': [0.6],
    'xgbregressor__min_child_weight': [15],
    'xgbregressor__colsample_bytree': [0.4],
    'xgbregressor__n_estimators': [100],
    'xgbregressor__subsample':[0.6],
    'xgbregressor__learning_rate':[0.05],
    'xgbregressor__objective':["reg:linear"],
    'xgbregressor__n_jobs':[-1],
    'xgbregressor__reg_alpha':[8],
    'xgbregressor__random_state':[13]}
    
cv = RepeatedKFold(n_splits=10,n_repeats=10,random_state=13)

# Instantiate the grid search model
grid1 = GridSearchCV(pipeline1, param_grid, verbose=5, n_jobs=-1,cv=cv,scoring=['neg_mean_squared_error','r2'],
                        refit='neg_mean_squared_error',return_train_score=True)

grid1.fit(X_train,y_train)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   27.0s finished


GridSearchCV(cv=<sklearn.model_selection._split.RepeatedKFold object at 0x7fd6d806c390>,
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('featuresinteractions', FeaturesInteractions(interaction1=['clash', 'covalent', 'vdw_clash', 'vdw', 'proximal', 'hbond', 'weak_hbond', 'xbond', 'ionic', 'metal_complex', 'aromatic', 'hydrophobic', 'carbonyl', 'polar', 'weak_polar'],
           interaction2='van_der_waals_change')), ('skewtra...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'xgbregressor__max_depth': [5], 'xgbregressor__gamma': [0.6], 'xgbregressor__min_child_weight': [15], 'xgbregressor__colsample_bytree': [0.4], 'xgbregressor__n_estimators': [100], 'xgbregressor__subsample': [0.6], 'xgbregressor__learning_rate': [0.05], 'xgbregressor__objective': ['reg:linear'], 'xgbregressor__n_jobs': [-1], 'xgbregressor__reg_alpha': [8

In [54]:
# index of best scores
rmse_bestCV_test_index = grid1.cv_results_['mean_test_neg_mean_squared_error'].argmax()
rmse_bestCV_train_index = grid1.cv_results_['mean_train_neg_mean_squared_error'].argmax()
r2_bestCV_test_index = grid1.cv_results_['mean_test_r2'].argmax()
r2_bestCV_train_index = grid1.cv_results_['mean_train_r2'].argmax()
# scores
rmse_bestCV_test_score = grid1.cv_results_['mean_test_neg_mean_squared_error'][rmse_bestCV_test_index]
rmse_bestCV_test_std = grid1.cv_results_['std_test_neg_mean_squared_error'][rmse_bestCV_test_index]
rmse_bestCV_train_score = grid1.cv_results_['mean_train_neg_mean_squared_error'][rmse_bestCV_train_index]
rmse_bestCV_train_std = grid1.cv_results_['std_train_neg_mean_squared_error'][rmse_bestCV_train_index]

r2_bestCV_test_score = grid1.cv_results_['mean_test_r2'][r2_bestCV_test_index]
r2_bestCV_test_std = grid1.cv_results_['std_test_r2'][r2_bestCV_test_index]
r2_bestCV_train_score = grid1.cv_results_['mean_train_r2'][r2_bestCV_train_index]
r2_bestCV_train_std = grid1.cv_results_['std_train_r2'][r2_bestCV_train_index]

print('CV test RMSE {:f} +/- {:f}'.format(np.sqrt(-rmse_bestCV_test_score),np.sqrt(rmse_bestCV_test_std)))
print('CV train RMSE {:f} +/- {:f}'.format(np.sqrt(-rmse_bestCV_train_score),np.sqrt(rmse_bestCV_train_std)))
print('DIFF RMSE {}'.format(np.sqrt(-rmse_bestCV_test_score)-np.sqrt(-rmse_bestCV_train_score)))
print('CV test r2 {:f} +/- {:f}'.format(r2_bestCV_test_score,r2_bestCV_test_std))
print('CV train r2 {:f} +/- {:f}'.format(r2_bestCV_train_score,r2_bestCV_train_std))

print(r2_bestCV_train_score-r2_bestCV_test_score)
print("",grid1.best_params_)
y_test_pred = grid1.best_estimator_.predict(X_test)
y_train_pred = grid1.best_estimator_.predict(X_train)
print("\nRMSE for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, y_test_pred)), 2)))
print("RMSE for train dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_train, y_train_pred)), 2)))
print("pearson corr  {:f}".format(np.corrcoef(y_test_pred,y_test.values.ravel())[0][1]))
print('R2 test',r2_score(y_test,y_test_pred))
print('R2 train',r2_score(y_train,y_train_pred))

CV test RMSE 1.628222 +/- 0.793555
CV train RMSE 1.203674 +/- 0.190291
DIFF RMSE 0.4245483017836311
CV test r2 0.244153 +/- 0.076445
CV train r2 0.590098 +/- 0.006518
0.3459456755434931
 {'xgbregressor__colsample_bytree': 0.4, 'xgbregressor__gamma': 0.6, 'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 5, 'xgbregressor__min_child_weight': 15, 'xgbregressor__n_estimators': 100, 'xgbregressor__n_jobs': -1, 'xgbregressor__objective': 'reg:linear', 'xgbregressor__random_state': 13, 'xgbregressor__reg_alpha': 8, 'xgbregressor__subsample': 0.6}

RMSE for test dataset: 1.63
RMSE for train dataset: 1.2
pearson corr  0.591582
R2 test 0.32830559494034417
R2 train 0.5936342543005289


In [121]:
XGBRegressor?

In [64]:
CV test RMSE 1.650335 +/- 0.801954
CV train RMSE 1.236560 +/- 0.197516
DIFF RMSE 0.4137747464076824
CV test r2 0.223925 +/- 0.070645
CV train r2 0.567368 +/- 0.008590
0.34344367329511466
 {'xgbregressor__colsample_bytree': 0.7, 'xgbregressor__gamma': 0.3, 'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 6, 'xgbregressor__min_child_weight': 1, 'xgbregressor__n_estimators': 50, 'xgbregressor__n_jobs': -1, 'xgbregressor__objective': 'reg:linear', 'xgbregressor__random_state': 13, 'xgbregressor__reg_alpha': 5.5, 'xgbregressor__subsample': 0.5}

RMSE for test dataset: 1.68
RMSE for train dataset: 1.24
pearson corr  0.577669
R2 test 0.2887416177917681
R2 train 0.5675458846465653