In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_validate
import tqdm

### Loading data

In [3]:
X = np.load('./data/X.npy')
X_polynomial = np.load('./data/X_polynomial.npy')
y = np.load('./data/y.npy')

### Pipelines

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import PassiveAggressiveRegressor
import xgboost as xgb
from catboost import CatBoostRegressor

In [18]:
huber_reg = HuberRegressor(epsilon= 5.09, alpha= 0.0004)
ridge_reg = linear_model.Ridge(solver='saga', max_iter=4000, alpha= 0.582)
lasso_reg = linear_model.Lasso(max_iter=4000, alpha=0.0038, normalize=False)
dt_reg = tree.DecisionTreeRegressor(min_samples_split=7, min_samples_leaf=7, min_weight_fraction_leaf=0.000516, 
                                                                                             max_features='auto')
    
pa_reg = PassiveAggressiveRegressor(max_iter=1000, random_state=0, tol=1e-3)
xgb_reg = xgb.XGBRegressor(objective="reg:linear", random_state=42, verbosity=0)

In [19]:
models = [huber_reg, ridge_reg, lasso_reg, dt_reg, pa_reg, xgb_reg]

### Ensemble

In [41]:
from scipy.stats import rankdata
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from itertools import cycle
from sklearn.base import BaseEstimator, ClassifierMixin

class ShuffleVoter(BaseEstimator, ClassifierMixin):  
    """
       scikit-learn based voting aggregation ensembling.
       Using bootstrapping creates a set of models, differing only by which data sample they are fed
       """

    def __init__(self, models):
        """
        model - base model ( or a pipeline ) ( unfitted )
        """
        self.models = models
        
    def ensemble_predictions(self, predictions, weights, type_="linear"):
        """
        Combines probabilistic class estimates using a variety of strategies.
        Linear, harmonic, geometric and rank averaging are supported at this moment. 
        Insipred by well known Abhishek's kernel on Kaggle 
        model - base model ( or a pipeline ) ( unfitted )
        """
        assert np.isclose(np.sum(weights), 1.0)
        if type_ == "linear":
            res = np.average(predictions, weights=weights, axis=0)
        elif type_ == "harmonic":
            res = np.average([1 / p for p in predictions], weights=weights, axis=0)
            return 1 / res
        elif type_ == "geometric":
            numerator = np.average(
                [np.log(p) for p in predictions], weights=weights, axis=0
            )
            res = np.exp(numerator / sum(weights))
            return res
        elif type_ == "rank":
            res = np.average([rankdata(p) for p in predictions], weights=weights, axis=0)
            return res / (len(res) + 1)
        return res


    def fit( self, X, y, n_boots = 14, test_size = 100 ):
        """
        n_boots - number of bootstrapping iterations ( and respective models built)
        """
        self.clfs  = []
        for i, model in zip(range(n_boots), cycle(self.models)):
            X_tr, X_te, y_tr, y_te = train_test_split( X, y, test_size=test_size, random_state=3521 + i*10)

            pa_clf = model
            pa_clf.fit(X_tr, y_tr)

            self.clfs.append(pa_clf)

    def predict( self, X, ensemble_type = 'rank'):
        # TODO: nonuniform weights
        
        n_boots = len( self.clfs)
        preds = [ clf.predict(X) for clf in self.clfs ]
        return self.ensemble_predictions( preds, np.ones(n_boots)*(1/float(n_boots)), ensemble_type)
    
    def predict_proba( self, X, ensemble_type = 'rank' ):
        n_boots = len( self.clfs)
        preds = [ clf.predict_proba(X) for clf in self.clfs ]
        return self.ensemble_predictions( preds, np.ones(n_boots)*(1/float(n_boots)), ensemble_type)

### Count metrics

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [16]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
kf = KFold(n_splits=5)

folds_r2_scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


    my_shuffle_voter = ShuffleVoter(models)
    my_shuffle_voter.fit(X_train, y_train, n_boots=20)

    y_pred = my_shuffle_voter.predict(X_test)
    print("Fold r2-score:", r2_score(y_test, y_pred))
    print(y_test[:10])
    print(y_pred[:10])
    print()
    folds_r2_scores.append(r2_score(y_test, y_pred))
    
print("Cross-validation score:", sum(folds_r2_scores)/float(len(folds_r2_scores)))

Fold r2-score: 0.03967944226417597
[ 17.78608486  21.69208486   5.71108486   6.97408486 -20.31191514
   0.51808486  -7.59991514   4.69108486  -7.34691514   3.25408486]
[0.91963717 0.97264481 0.66053469 0.68391152 0.04154997 0.53742839
 0.32224698 0.63911521 0.3309198  0.58962444]



KeyboardInterrupt: 