In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
!pip install xgboost



In [9]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor
import xgboost as xgb
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
import time

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [5]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [6]:
from preprocess import preprocess

X_train, X_test, y_train = preprocess(df_train, df_test, use_ohe=True, use_scaling=True)

In [11]:
ridge_pipe = Pipeline([
    ('ridge', Ridge())
])

lasso_pipe = Pipeline([
    ('lasso', Lasso())
])

hb_pipe = Pipeline([
    ('hb', HuberRegressor())
])

xgb_pipe = Pipeline([
    ('xgb', xgb.XGBRegressor())
])

dt_pipe = Pipeline([
    ('dt', DecisionTreeRegressor())
])

# svr_pipe = Pipeline([
#     ('svr', SVR())
# ])

pa_pipe = Pipeline([
    ('pa', PassiveAggressiveRegressor())
])

In [12]:
pipelines = [ridge_pipe, lasso_pipe, hb_pipe, xgb_pipe, dt_pipe, pa_pipe]

In [13]:
def test_pipeline(X, y, pipeline):
    rskf = StratifiedKFold(n_splits=5, random_state=1)
    rmse_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipeline.fit(X_train, y_train)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, np.round(pipeline.predict(X_test)))))

    print("kfolds rmse: {0}, mean rmse: {1}".format(
        str([str(round(x, 3)) for x in sorted(rmse_scores)]),
        round(np.mean(rmse_scores), 3)
    ))

In [15]:
objects_to_calculate = 100000
for pipeline in pipelines:
    print("Model:", type(pipeline[0]).__name__)
    test_pipeline(X_train[:objects_to_calculate], y_train[:objects_to_calculate], pipeline)
    print('-----------------------------------------------------------------------')

Model: Ridge
kfolds rmse: ['1.162', '1.163', '1.167', '1.169', '1.17'], mean rmse: 1.166
-----------------------------------------------------------------------
Model: Lasso
kfolds rmse: ['1.392', '1.392', '1.392', '1.392', '1.392'], mean rmse: 1.392
-----------------------------------------------------------------------
Model: HuberRegressor
kfolds rmse: ['1.17', '1.172', '1.173', '1.174', '1.176'], mean rmse: 1.173
-----------------------------------------------------------------------
Model: XGBRegressor


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


kfolds rmse: ['1.033', '1.034', '1.035', '1.041', '1.043'], mean rmse: 1.037
-----------------------------------------------------------------------
Model: DecisionTreeRegressor
kfolds rmse: ['1.265', '1.268', '1.268', '1.269', '1.27'], mean rmse: 1.268
-----------------------------------------------------------------------
Model: PassiveAggressiveRegressor
kfolds rmse: ['1.549', '1.709', '1.828', '1.894', '1.938'], mean rmse: 1.784
-----------------------------------------------------------------------


In [23]:
classifiers = [
    ("bnb", bnb_pipe),
    ("dt", dt_pipe),
    ("etc", etc_pipe),
    #("gnb", gnb_pipe),
    ("kn", kn_pipe),
    #("svc", svc_pipe),
    #("lr", lr_pipe),
    ("rf", rf_pipe),
]

mixed_pipe = Pipeline([
    ("voting", VotingClassifier(classifiers, voting="soft"))
])

In [24]:
test_pipeline(X_train_transformed.values[:objects_to_calculate], y_train[:objects_to_calculate], mixed_pipe)

kfolds rmse: ['1.208', '1.214', '1.215', '1.227', '1.228'], mean rmse: 1.218


In [2]:
from scipy.stats import rankdata
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from itertools import cycle
from sklearn.base import BaseEstimator, ClassifierMixin

class ShuffleVoter(BaseEstimator, ClassifierMixin):  
    """
       scikit-learn based voting aggregation ensembling.
       Using bootstrapping creates a set of models, differing only by which data sample they are fed
       """

    def __init__(self, models):
        """
        model - base model ( or a pipeline ) ( unfitted )
        """
        self.models = models
        
    def ensemble_predictions(self, predictions, weights, type_="harmonic"):
        """
        Combines probabilistic class estimates using a variety of strategies.
        Linear, harmonic, geometric and rank averaging are supported at this moment. 
        Insipred by well known Abhishek's kernel on Kaggle 
        model - base model ( or a pipeline ) ( unfitted )
        """
        assert np.isclose(np.sum(weights), 1.0)
        if type_ == "linear":
            res = np.average(predictions, weights=weights, axis=0)
        elif type_ == "harmonic":
            res = np.average([1 / p for p in predictions], weights=weights, axis=0)
            return 1 / res
        elif type_ == "geometric":
            numerator = np.average(
                [np.log(p) for p in predictions], weights=weights, axis=0
            )
            res = np.exp(numerator / sum(weights))
            return res
        elif type_ == "rank":
            res = np.average([rankdata(p) for p in predictions], weights=weights, axis=0)
            return res / (len(res) + 1)
        return res


    def fit( self, X, y, n_boots = 14, test_size = 100 ):
        """
        n_boots - number of bootstrapping iterations ( and respective models built)
        """
        self.clfs  = []
        for i, model in zip(range(n_boots), cycle(self.models)):
            X_tr, X_te, y_tr, y_te = train_test_split( X, y, test_size=test_size, random_state=3521 + i*10)

            pa_clf = model
            pa_clf.fit(X_tr, y_tr)

            self.clfs.append(pa_clf)

    def predict( self, X, ensemble_type = 'rank'):
        # TODO: nonuniform weights
        
        n_boots = len( self.clfs)
        preds = [ clf.predict(X) for clf in self.clfs ]
        return self.ensemble_predictions( preds, np.ones(n_boots)*(1/float(n_boots)), ensemble_type)
    
    def predict_proba( self, X, ensemble_type = 'rank' ):
        n_boots = len( self.clfs)
        preds = [ clf.predict_proba(X) for clf in self.clfs ]
        return self.ensemble_predictions( preds, np.ones(n_boots)*(1/float(n_boots)), ensemble_type)
