In [111]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import datetime

In [84]:
class MyModel:
    """Trivial baseline model that ignores its training data.

    Every row receives the same score of 0.5, which makes it useful for
    smoke-testing the evaluation pipeline.
    """

    def fit(self, data):
        """Accept the training frame and do nothing with it."""
        return None

    def predict(self, X):
        """Return a 1-D float array holding 0.5 for each row of ``X``.

        Only ``X.shape[0]`` is read; the contents of ``X`` are ignored.
        """
        n_rows = X.shape[0]
        return np.full(n_rows, 0.5)

In [109]:
def auc_raw(labels, scores):
    """Compute ROC AUC for one group, or NaN when it is undefined.

    AUC needs at least one positive and one negative example. The check
    below counts positives by summing ``labels``, so it relies on labels
    being encoded as 0/1.

    Returns
    -------
    float
        ``roc_auc_score(labels, scores)`` when both classes are present,
        otherwise NaN.
    """
    n_positive = sum(labels)
    both_classes_present = len(labels) > n_positive > 0
    if not both_classes_present:
        # All-negative, all-positive, or empty group: AUC is not defined.
        return float('NaN')

    return roc_auc_score(labels, scores)

In [110]:
def auc_true(test, pred):
    """Average per-user ROC AUC over the users for which it is defined.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain the ``instanceId_userId`` and ``label`` columns.
    pred : array-like
        Predicted scores, attached to ``test`` as a ``pred`` column via
        ``DataFrame.assign``.

    Returns
    -------
    float
        Mean of ``auc_raw`` over user groups, skipping groups where it is
        NaN. NaN when no group yields a defined AUC (including an empty
        ``test``).
    """
    scored = test.assign(pred=pred)
    # Iterate over the groups restricted to the two columns actually needed.
    # This avoids ``groupby.apply`` operating on the grouping column (a
    # deprecation warning on recent pandas) and avoids ``apply``'s
    # result-shape inference on empty input.
    grouped = scored.groupby("instanceId_userId")[["label", "pred"]]
    per_user_auc = [
        auc_raw(group["label"].values, group["pred"].values)
        for _, group in grouped
    ]
    # Explicit float dtype keeps the empty case well-defined (mean -> NaN).
    return pd.Series(per_user_auc, dtype=float).dropna().mean()

In [103]:
def cross_validation(model_class, data, n_folds=5, n_iters=5, seed=2707):
    """Run repeated K-fold cross-validation with folds split by user.

    The unique values of ``data['instanceId_userId']`` are partitioned into
    folds, so all rows of a given user fall on the same side of each split.
    Progress timestamps and the running mean score are printed for every
    fold.

    Parameters
    ----------
    model_class : callable
        Called with no arguments to build a fresh model per fold. The model
        must provide ``fit(data_train)`` and ``predict(features)``.
    data : pandas.DataFrame
        Must contain the ``instanceId_userId`` and ``label`` columns.
    n_folds : int
        Number of folds per repetition.
    n_iters : int
        Number of repetitions; repetition ``i`` shuffles with ``seed + i``.
    seed : int
        Base random seed for the fold shuffling.

    Returns
    -------
    list
        The ``auc_true`` score of every fold, in evaluation order.
    """
    scores = []
    user_column = data['instanceId_userId']
    user_ids = user_column.unique()
    for iteration in range(n_iters):
        kf = KFold(n_splits=n_folds, random_state=seed + iteration, shuffle=True)
        print(kf)
        for fold, (train_index, test_index) in enumerate(kf.split(user_ids), start=1):
            print('Prepare data: %s' % datetime.datetime.now())
            data_train = data[user_column.isin(user_ids[train_index])]
            data_test = data[user_column.isin(user_ids[test_index])]

            model = model_class()
            print('Fit: %s' % datetime.datetime.now())
            model.fit(data_train)
            print('Predict: %s' % datetime.datetime.now())
            # Keyword form: the positional ``axis`` argument of
            # ``DataFrame.drop`` was removed in pandas 2.0.
            pred = model.predict(data_test.drop(columns='label'))
            print('Auc: %s' % datetime.datetime.now())
            score = auc_true(data_test, pred)
            scores.append(score)
            print('%2d -%2d : %1.4f, mean=%1.4f' % (iteration, fold, score, np.mean(scores)))
    return scores

In [104]:
# Toy dataset for exercising the pipeline: each row of X is
# (instanceId_userId, b), and y holds the matching 0/1 label.
X = np.array([
    [1, 2],
    [3, 4],
    [1, 2],
    [3, 4],
    [11, 12],
    [13, 14],
    [2, 3],
    [4, 5],
])
y = np.array([0, 1, 1, 1, 0, 1, 0, 0])
data = pd.DataFrame(X, columns=['instanceId_userId', 'b']).assign(label=y)

In [105]:
#cross_validation(MyModel, data, n_folds=3, n_iters=1)

In [106]:
#for i in KFold(n_splits=3, random_state=2707, shuffle=True).split([1,2,3,4,5,6]):
#    print(i)

In [107]:
#data[data['a'] == 3]

In [108]:
#roc_auc_score([0,0], [1, 1])