In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold
import time
import tqdm

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [5]:
# Read the raw train / test tables from the local data directory in one pass.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv')
)

In [6]:
from preprocess import preprocess

# Turn the raw frames into model inputs via the project-local preprocess helper;
# the three flags are forwarded unchanged.
X_train, X_test, y_train = preprocess(
    df_train,
    df_test,
    use_custom_target_encoding=True,
    use_scaling=True,
    filter_features=False,
)

In [7]:
# One-hot encode the target: column k is set to 1 where the label equals k + 1.
# assumes labels are the integers 1..classes_num - TODO confirm against preprocess
classes_num = y_train.nunique()
label_values = np.asarray(y_train).astype(int)

# Vectorised, positional assignment: no per-row Python loop and no dependence
# on the index labels of y_train.
y_train_binary = np.zeros((len(label_values), classes_num))
y_train_binary[np.arange(len(label_values)), label_values - 1] = 1

HBox(children=(IntProgress(value=0, max=515937), HTML(value='')))




In [8]:
from itertools import cycle

import numpy as np
from scipy.stats import rankdata
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import train_test_split

class ShuffleVoter(BaseEstimator, ClassifierMixin):
    """
    scikit-learn style voting ensemble built from repeated random sub-samples.

    Every round of `fit` holds out a different random slice of the training data
    and fits a fresh copy of the next base model ( cycling through `models` ) on
    the remaining rows. The fitted copies are then blended with uniform weights
    by `ensemble_predictions`.
    """

    def __init__(self, models):
        """
        models - list of unfitted base models ( or pipelines ); cycled through in fit
        """
        self.models = models

    def ensemble_predictions(self, predictions, weights, type_="harmonic"):
        """
        Combines per-model estimates using one of several averaging strategies.
        Linear, harmonic, geometric and rank averaging are supported at this moment.
        Inspired by well known Abhishek's kernel on Kaggle.

        predictions - sequence of arrays, one per model, all of the same shape
        weights - one weight per model; must sum to 1
        type_ - "linear", "harmonic", "geometric" or "rank"

        Returns an array with the shape of a single model's prediction.
        Raises ValueError for an unknown type_.
        """
        assert np.isclose(np.sum(weights), 1.0)

        if type_ == "linear":
            return np.average(predictions, weights=weights, axis=0)

        if type_ == "harmonic":
            # zeros in p give inf here and therefore 0 in the result ( numpy warns )
            res = np.average([1 / p for p in predictions], weights=weights, axis=0)
            return 1 / res

        if type_ == "geometric":
            numerator = np.average(
                [np.log(p) for p in predictions], weights=weights, axis=0
            )
            return np.exp(numerator / sum(weights))

        if type_ == "rank":
            # Rank along the sample axis, separately for every column. A bare
            # rankdata(p) flattens 2-D input, which destroyed the
            # (n_samples, n_classes) layout of predict_proba outputs.
            ranks = [
                np.apply_along_axis(rankdata, 0, np.asarray(p)) for p in predictions
            ]
            res = np.average(ranks, weights=weights, axis=0)
            # len(res) is the number of samples, so scores land in (0, 1)
            return res / (len(res) + 1)

        raise ValueError(
            "Unknown ensemble type {!r}; expected 'linear', 'harmonic', "
            "'geometric' or 'rank'".format(type_)
        )

    def _uniform_weights(self):
        """Equal weight for every fitted model, summing to 1."""
        n_fitted = len(self.clfs)
        return np.full(n_fitted, 1.0 / n_fitted)

    def fit(self, X, y, n_boots=14, test_size=100):
        """
        n_boots - number of resampling iterations ( and respective models built )
        test_size - forwarded to train_test_split; the rows held out ( and not
                    trained on ) in each iteration

        Returns self.
        Raises ValueError when no base models were supplied.
        """
        if len(self.models) == 0:
            raise ValueError("ShuffleVoter needs at least one base model")

        self.clfs = []
        for i, model in zip(range(n_boots), cycle(self.models)):
            # a different seed per round so every model sees a different sample
            X_tr, _, y_tr, _ = train_test_split(
                X, y, test_size=test_size, random_state=3521 + i * 11
            )

            # Fit an independent copy. Refitting `model` itself would leave
            # self.clfs full of references to the same few objects, each
            # holding only its most recent fit.
            fitted = clone(model)
            fitted.fit(X_tr, y_tr)
            self.clfs.append(fitted)

        return self

    def predict(self, X, ensemble_type='rank', threshold=0.7):
        """
        Blend the fitted models' predict() outputs with uniform weights.

        NOTE(review): the result is the blended score produced by
        `ensemble_predictions`, not a hard class label, and `threshold` is
        accepted but currently unused - confirm the intended behaviour.
        """
        # TODO: nonuniform weights
        preds = [clf.predict(X) for clf in self.clfs]
        return self.ensemble_predictions(preds, self._uniform_weights(), ensemble_type)

    def predict_proba(self, X, ensemble_type='rank'):
        """
        Blend the fitted models' predict_proba() outputs with uniform weights.

        With the default 'rank' strategy the result keeps the
        (n_samples, n_classes) shape but holds normalised ranks rather than
        probabilities, so rows need not sum to 1.
        """
        preds = [clf.predict_proba(X) for clf in self.clfs]
        return self.ensemble_predictions(preds, self._uniform_weights(), ensemble_type)

In [9]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC

In [21]:
def get_pipelines(random_state=None):
    """
    Build the unfitted base pipelines used for ensembling, keyed by a short name.

    Only the three pipelines that are actually returned are constructed.

    random_state - optional seed forwarded to the tree-based classifiers so a
                   run can be reproduced; None keeps scikit-learn's default

    Returns a dict with the keys 'dt', 'extra_tree' and 'gaus_nb'.
    """
    return {
        'dt': Pipeline([
            ('clf', DecisionTreeClassifier(random_state=random_state)),
        ]),
        'extra_tree': Pipeline([
            ('clf', ExtraTreesClassifier(random_state=random_state)),
        ]),
        'gaus_nb': Pipeline([
            ('clf', GaussianNB()),
        ]),
    }

In [11]:
def test_pipeline(X, y, pipeline):
    """
    Cross-validate `pipeline` on 5 stratified folds and print the per-fold and mean RMSE.

    X - feature matrix ( numpy array or pandas DataFrame )
    y - target labels aligned with X ( numpy array or pandas Series )
    pipeline - estimator exposing fit / predict; it is refitted on every fold

    Prints the sorted fold scores and their mean; returns nothing.
    """
    def take_rows(data, rows):
        # positional row selection that works for pandas objects and numpy arrays
        return data.iloc[rows] if hasattr(data, "iloc") else data[rows]

    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn versions raise ValueError when a seed is given without it.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    rmse_scores = []
    for train_index, test_index in folds.split(X, y):
        pipeline.fit(take_rows(X, train_index), take_rows(y, train_index))
        fold_pred = pipeline.predict(take_rows(X, test_index))
        fold_mse = mean_squared_error(take_rows(y, test_index), fold_pred)
        rmse_scores.append(np.sqrt(fold_mse))

    print("kfolds rmse: {0}, mean rmse: {1}".format(
        str([str(round(x, 3)) for x in sorted(rmse_scores)]),
        round(np.mean(rmse_scores), 3)
    ))

In [26]:
# Name -> unfitted pipeline mapping shared by the evaluation cells below.
pipelines = get_pipelines()

In [13]:
# Hold out 20% of the rows for validation. The split is seeded so the reported
# RMSE values can be reproduced after a kernel restart.
# NOTE: this rebinds X_train / X_test / y_train_binary ( X_test no longer refers
# to the features returned by preprocess ), so the cell is not idempotent -
# re-run the preprocessing and one-hot cells before running it again.
X_train, X_test, y_train_binary, y_test_binary = train_test_split(
    X_train, y_train_binary, test_size=0.2, random_state=42
)

In [14]:
# Keep a random 20% of the remaining training rows and discard the rest
# ( presumably to keep fitting time manageable ). Seeded for reproducibility.
# NOTE: rebinds X_train / y_train_binary, so every re-run shrinks them again.
X_train, _, y_train_binary, _ = train_test_split(
    X_train, y_train_binary, test_size=0.8, random_state=42
)

In [16]:
# Rows / columns of the training matrix after the two splits above.
X_train.shape

(82549, 46)

In [17]:
# Rows / columns of the held-out validation features.
X_test.shape

(103188, 46)

In [None]:
my_shuffle_voter = ShuffleVoter(list(pipelines.values()))

# One-vs-rest: fit the ensemble on each class indicator column and keep the
# score it assigns to the positive class.
class_scores = []
for i in range(classes_num):
    print("Class:", i)
    # local name on purpose - do not overwrite the global y_train target
    y_train_cls = y_train_binary[:, i].copy()
    my_shuffle_voter.fit(X_train, y_train_cls)
    class_scores.append(my_shuffle_voter.predict_proba(X_test)[:, 1])
y_pred_binary = np.column_stack(class_scores)

# Column k of the one-hot matrices stands for label k + 1.
y_true = np.argmax(y_test_binary, axis=1) + 1
y_pred_argmax = np.argmax(y_pred_binary, axis=1) + 1
print("RMSE with argmax:", np.sqrt(mean_squared_error(y_true, y_pred_argmax)))

sums = np.sum(y_pred_binary, axis=1)
sums[sums<0.005] = 1  # avoid dividing by ~0 when every class score is near zero

y_pred_binary_norm = (y_pred_binary / sums[:, None])
# expected label under the normalised class scores
y_pred = np.dot(y_pred_binary_norm, np.arange(1, classes_num + 1))
# Score against the true labels ( previously compared against the argmax
# prediction, which is a different prediction on a 0-based scale ).
print("RMSE with weighted average:", np.sqrt(mean_squared_error(y_true, y_pred)))
print('-----------------------------------------')

Class: 0




In [None]:
# Baseline: evaluate every base pipeline on its own with the same
# one-vs-rest scheme used for the ensemble.
for pipeline in list(pipelines.values()):
    print(type(pipeline[0]).__name__)
    class_scores = []
    for i in range(classes_num):
        print("Class:", i)
        # local name on purpose - do not overwrite the global y_train target
        y_train_cls = y_train_binary[:, i].copy()
        pipeline.fit(X_train, y_train_cls)
        class_scores.append(pipeline.predict_proba(X_test)[:, 1])
    y_pred_binary = np.column_stack(class_scores)

    # Column k of the one-hot matrices stands for label k + 1.
    y_true = np.argmax(y_test_binary, axis=1) + 1
    y_pred_argmax = np.argmax(y_pred_binary, axis=1) + 1
    print("RMSE with argmax:", np.sqrt(mean_squared_error(y_true, y_pred_argmax)))

    sums = np.sum(y_pred_binary, axis=1)
    sums[sums<0.005] = 1  # avoid dividing by ~0 when every class score is near zero

    y_pred_binary_norm = (y_pred_binary / sums[:, None])
    # expected label under the normalised class scores
    y_pred = np.dot(y_pred_binary_norm, np.arange(1, classes_num + 1))
    # Score against the true labels ( previously compared against the argmax
    # prediction, which is a different prediction on a 0-based scale ).
    print("RMSE with weighted average:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print('-----------------------------------------')

In [75]:
# Boolean mask of rows whose summed per-class scores fall below 0.1.
np.sum(y_pred_binary, axis=1) < 0.1

array([False, False, False, ..., False, False, False])

In [68]:
# Count rows whose normalised class scores sum to more than 1
# ( presumably floating-point rounding only - TODO confirm ).
np.sum(np.sum(y_pred_binary_norm, axis=1) > 1)

  """Entry point for launching an IPython kernel.


10108

In [53]:
# Shape of the per-row score sums ( one value per row of y_pred_binary ).
np.sum(y_pred_binary, axis=1).shape

(103188,)

  """Entry point for launching an IPython kernel.


array([[0.        , 0.        , 0.45454545, 0.18181818, 0.36363636],
       [0.0625    , 0.0625    , 0.0625    , 0.3125    , 0.5       ],
       [0.09090909, 0.09090909, 0.45454545, 0.09090909, 0.27272727],
       ...,
       [0.09090909, 0.18181818, 0.09090909, 0.54545455, 0.09090909],
       [0.        , 0.16666667, 0.41666667, 0.33333333, 0.08333333],
       [0.        , 0.        , 0.2       , 0.        , 0.8       ]])

In [30]:
# Dimensions of the per-class score matrix.
y_pred_binary.shape

(103188, 5)

In [51]:
# Dimensions of the per-class score matrix.
y_pred_binary.shape

(103188, 5)

  """Entry point for launching an IPython kernel.


array([3.90909091, 4.125     , 3.36363636, ..., 3.36363636, 3.33333333,
       4.6       ])

array([1, 2, 3, 4, 5])

4.3

In [43]:
# Per-class scores of the first row.
y_pred_binary[0] 

array([0. , 0. , 0.5, 0.2, 0.4])

In [31]:
# Per-class scores of the first row.
y_pred_binary[0]

array([0. , 0. , 0.5, 0.2, 0.4])

In [32]:
# Per-class scores of the second row.
y_pred_binary[1]

array([0.1, 0.1, 0.1, 0.5, 0.8])

In [37]:
# Per-class scores of the first five rows.
y_pred_binary[:5]

array([[0. , 0. , 0.5, 0.2, 0.4],
       [0.1, 0.1, 0.1, 0.5, 0.8],
       [0.1, 0.1, 0.5, 0.1, 0.3],
       [0. , 0.5, 0.5, 0. , 0.4],
       [0. , 0.1, 0.2, 0.3, 0.4]])

In [36]:
# One-hot ground truth of the first five rows, for comparison with the scores.
y_test_binary[:5]

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

In [38]:
# Dimensions of the per-class score matrix.
y_pred_binary.shape

(103188, 5)

In [39]:
# Dimensions of the one-hot ground-truth matrix.
y_test_binary.shape

(103188, 5)

(103188,)

In [42]:
# RMSE between the argmax-predicted and the true class positions
# ( both are 0-based column indices, so the offset cancels out ).
np.sqrt(mean_squared_error(np.argmax(y_pred_binary, axis=1), np.argmax(y_test_binary, axis=1)))

1.205359618174216