In [None]:
%run /Users/tyamgin/Projects/mlbootcamp/championship19/header.ipynb

In [2]:
class MyModel:
    """Base class for the models in this notebook.

    Subclasses are expected to override ``fit`` and ``predict``. The base
    implementation is a trivial baseline: ``fit`` does nothing and
    ``predict`` returns the constant 0.5 for every row.
    """

    # Class-level default verbosity.
    verbose = 0

    # Columns that ``get_X`` removes from a frame when they are present.
    _NON_FEATURE_COLUMNS = (
        'objectId', 'instanceId_userId', 'metadata_ownerId', 'metadata_authorId', 'instanceId_objectId',
        'label', 'membership_statusUpdateDate',
        'clicked', 'viewed', 'disliked', 'reshared', 'ignored', 'commented', 'complaint', 'unliked',
        'audit_timestamp', 'metadata_createdAt',
    )

    def __init__(self, params=None):
        """Store the parameter dict.

        ``params=None`` yields a fresh empty dict per instance, so instances
        never share (and accidentally mutate) one default dictionary.
        """
        self.params = {} if params is None else params

    def get_X(self, data):
        """Return a copy of ``data`` without the non-feature columns.

        Only the columns that actually exist in ``data`` are dropped, so the
        method works on frames that lack some of them. ``data`` itself is
        left unmodified.
        """
        present = [c for c in self._NON_FEATURE_COLUMNS if c in data.columns]
        # `columns=` keyword: the positional axis argument of DataFrame.drop
        # is not accepted by pandas >= 2.0.
        return data.drop(columns=present)

    def fit(self, data):
        """No-op for the baseline; subclasses train on ``data`` here."""
        pass

    def predict(self, X):
        """Return a constant 0.5 prediction for each row of ``X``."""
        return np.repeat(0.5, X.shape[0])

# Smoke check: build a baseline model with default parameters and display
# its verbosity attribute as the cell output.
dummy_model = MyModel()
dummy_model.verbose

0

In [7]:
class MeanModel(MyModel):
    """Linear blend of several models.

    The prediction is ``sum(coef_i * model_i.predict(X))``. The coefficients
    are used as given (they are not normalised), so pass weights that sum to
    one if a true weighted mean is wanted.
    """

    def __init__(self, models, coefs):
        """Store the sub-models and their blending coefficients.

        Raises
        ------
        ValueError
            If ``models`` and ``coefs`` differ in length; ``zip`` would
            otherwise silently ignore the surplus entries.
        """
        # Initialise the base class so inherited attributes (``params``) exist.
        super().__init__()
        if len(models) != len(coefs):
            raise ValueError('models and coefs must have the same length: %d != %d'
                             % (len(models), len(coefs)))
        self.models = models
        self.coefs = coefs

    def fit(self, data):
        """Fit every sub-model on the same ``data``."""
        for m in self.models:
            m.fit(data)

    def predict(self, X):
        """Return the coefficient-weighted sum of the sub-model predictions."""
        # Named `blended` rather than `sum` to avoid shadowing the builtin.
        blended = np.repeat(0.0, X.shape[0])
        for m, c in zip(self.models, self.coefs):
            blended += m.predict(X) * c
        return blended

In [None]:
def predict_to_submit(X, proba):
    """Turn per-row scores into a ranked object list per user.

    Rows of ``X`` sharing the same (``instanceId_userId``, ``objectId``) pair
    have their scores averaged. The result is a Series indexed by
    ``instanceId_userId`` whose values are lists of ``objectId`` ordered from
    the highest mean ``proba`` to the lowest.
    """
    pair_cols = ['instanceId_userId', 'objectId']
    # Negated score: an ascending sort then puts the best objects first.
    weighted = X.assign(weight=-proba)
    mean_weight = weighted[pair_cols + ['weight']].groupby(pair_cols).mean()
    ranked = mean_weight.sort_values(by=['instanceId_userId', 'weight'])
    ranked = ranked.reset_index()
    per_user = ranked.groupby("instanceId_userId")['objectId']
    return per_user.apply(list)

In [37]:
def auc_true(test, pred):
    """Score ``pred`` against ``test`` with ``mroc.mean_roc_auc``.

    The user ids and labels are taken from the ``instanceId_userId`` and
    ``label`` columns of ``test``; ``pred`` is passed through unchanged.
    """
    user_ids = test.instanceId_userId.values
    labels = test.label.values
    return mroc.mean_roc_auc(user_ids, labels, pred)


In [1]:
def cross_validation(model, data, n_folds=5, n_iters=5, seed=2707, verbose=1, split_by='instanceId_userId'):
    """Repeated K-fold cross-validation scored with ``auc_true``.

    Parameters
    ----------
    model : object with ``fit(data)`` and ``predict(X)``
        Re-fitted on the training part of every fold.
    data : DataFrame
        Must contain a ``label`` column (dropped before ``predict``) and,
        when ``split_by`` is set, the ``split_by`` column.
    n_folds : int
        Number of folds per iteration.
    n_iters : int
        Number of shuffled repetitions; iteration ``i`` uses
        ``random_state=seed + i``.
    seed : int
        Base random seed.
    verbose : int
        ``>= 2`` additionally prints a timestamp before each stage. The
        splitter and the per-fold score line are printed at every level.
    split_by : str or None/falsy
        Column whose unique values are split into folds, so that all rows
        sharing a value land on the same side of a split. When falsy, the
        rows themselves are split by position.

    Returns
    -------
    float
        Mean score over all folds of all iterations.
    """
    def log_stage(stage):
        # Timestamped progress line, emitted only at the detailed level.
        if verbose >= 2:
            print('%s: %s' % (stage, datetime.datetime.now()))

    if split_by:
        group_ids = data[split_by].unique()
    else:
        group_ids = np.arange(data.shape[0])

    scores = []
    # `iteration` rather than `iter` to avoid shadowing the builtin.
    for iteration in range(n_iters):
        kf = KFold(n_splits=n_folds, random_state=seed + iteration, shuffle=True)
        print(kf)
        for fold, (train_index, test_index) in enumerate(kf.split(group_ids), start=1):
            log_stage('Prepare data')
            if split_by:
                # Folds are built over group ids; map them back to rows.
                data_train = data[data[split_by].isin(group_ids[train_index])]
                data_test = data[data[split_by].isin(group_ids[test_index])]
            else:
                data_train = data.iloc[train_index, :]
                data_test = data.iloc[test_index, :]

            log_stage('Fit')
            model.fit(data_train)

            log_stage('Predict')
            # `columns=` keyword: the positional axis argument of
            # DataFrame.drop is not accepted by pandas >= 2.0.
            pred = model.predict(data_test.drop(columns='label'))

            log_stage('Auc')
            score = auc_true(data_test, pred)
            scores.append(score)

            # Printed regardless of `verbose`, with the running mean so far.
            print('%2d -%2d : %1.4f, mean=%1.4f' % (iteration, fold, score, np.mean(scores)))
    return np.mean(scores)

In [20]:
# Best feature list found so far by the latest `feature_selection` call.
# Kept at module level so the intermediate result survives an interrupted run.
_feature_selection_last_result = []
def feature_selection(train, model, start_features, required_columns):
    """Greedy backward feature elimination driven by cross-validation.

    Starting from ``start_features``, each feature is tentatively removed in
    turn; the removal is kept only when the cross-validation score strictly
    improves. ``required_columns`` are always appended to the evaluated
    frame and are never candidates for removal.

    Parameters
    ----------
    train : DataFrame
        Training data containing the feature and required columns.
    model : object
        Model passed to ``cross_validation``.
    start_features : sequence of str
        Initial feature set; also the order in which removals are tried.
    required_columns : sequence of str
        Columns that must always accompany the features.

    Returns
    -------
    list of str
        The best feature subset found. The same list is also stored in the
        module-level ``_feature_selection_last_result``.
    """
    global _feature_selection_last_result

    required = list(required_columns)

    def get_score(feats):
        # Single CV iteration with a fixed seed so all candidates are
        # compared on identical splits.
        return cross_validation(model, train[feats + required], n_iters=1, verbose=1, seed=430)

    candidates = list(start_features)
    best_features = candidates
    best_score = get_score(best_features)
    # Publish the baseline immediately so the global never holds a stale
    # result from an earlier call when no removal helps.
    _feature_selection_last_result = best_features
    print('Start with score=%.6f: %s' % (best_score, ','.join(best_features)))
    for remove_feature in candidates:
        print('Try to remove %s' % remove_feature)
        try_features = [c for c in best_features if c != remove_feature]
        try_score = get_score(try_features)
        if try_score > best_score:
            best_score = try_score
            best_features = try_features
            _feature_selection_last_result = best_features
            print('Found [%s] score=%.6f: %s' % (remove_feature, best_score, ','.join(best_features)))
    return best_features
