In [None]:
__author__ = 'Dmitry Yutkin'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import preprocessing
from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier

import xgboost as xgb

plt.style.use('ggplot')
%matplotlib inline

In [None]:
# Load the raw competition tables (paths are relative to the notebook).
# From the targets table only the user id and the pass/fail label are kept.
targets = pd.read_csv('./kaggle/targets.csv')[['user_id', 'passed']]
structure = pd.read_csv('./kaggle/structure.csv')
events = pd.read_csv('./kaggle/user_activity.csv')
events_test = pd.read_csv('./kaggle/user_activity_test.csv')

In [None]:
# Preprocessing
# Attach the course outcome (`passed`) to every training event; users that are
# missing from `targets` are dropped by the default inner join.
events = pd.merge(events, targets, on='user_id')

# `step_cost` becomes `earned_scores`: the cost counts only on rows whose
# action is 'passed'; every other action earns nothing.
# `.loc` is used because `DataFrame.ix` no longer exists in pandas >= 1.0.
events = events.rename(columns={'step_cost': 'earned_scores'})
events.loc[events['action'] != 'passed', 'earned_scores'] = 0
events = events.sort_values(by=['time', 'user_id'])

# Same treatment for the test events (there is no label to merge in).
events_test = events_test.rename(columns={'step_cost': 'earned_scores'})
events_test.loc[events_test['action'] != 'passed', 'earned_scores'] = 0
events_test = events_test.sort_values(by=['time', 'user_id'])

# Order the course structure by position inside the course.
structure = structure.sort_values(by=['module_position', 'lesson_position', 'step_position'])

# maps step id -> step cost
step2cost = dict(zip(structure['step_id'], structure['step_cost']))

In [None]:
def compute_earned_score(events, user_features):
    """Store every user's total earned score under the ``'score'`` key.

    Parameters
    ----------
    events : pandas.DataFrame
        Must contain the columns ``user_id`` and ``earned_scores``.
    user_features : mapping of user_id -> dict
        Updated in place. It has to provide a dict for every user id found in
        ``events`` (e.g. a ``defaultdict(dict)``).

    Returns
    -------
    None. A progress message is printed when done.
    """
    # Aggregate only the column that is needed: summing the whole frame would
    # also add up unrelated columns, which is wasteful and fails for column
    # types that cannot be summed.
    scores = events.groupby('user_id')['earned_scores'].sum()
    for usr_id, score in scores.items():
        user_features[usr_id]['score'] = score
    print('Earned score computed.')

In [None]:
def compute_number_of_actions(events, user_features):
    """Store, per user, how many event rows have a non-null ``action``.

    The count is written to ``user_features[user_id]['actions_count']``;
    ``user_features`` is updated in place and nothing is returned.
    """
    per_user = events.groupby('user_id')['action'].count()
    for uid, n_actions in per_user.items():
        user_features[uid]['actions_count'] = n_actions
    print('Number_of_actions computed.')

In [None]:
def compute_solving_stats(events, users):
    """Store each user's mean time between viewing and passing a scored step.

    For every step with a positive cost (looked up in the module-level
    ``step2cost``) the latest 'viewed' timestamp and the last 'passed'
    timestamp seen in ``events`` are remembered. The absolute difference,
    divided by 60, is averaged over all steps that have both timestamps and
    written, rounded to two decimals, to
    ``users[user_id]['mean_time_to_solve']``. Users without any such event on
    a scored step get ``0.0``.

    ``users`` is updated in place and must provide a dict for every user id in
    ``events`` (e.g. a ``defaultdict(dict)``).
    """
    # user_id -> step_id -> {'viewed': timestamp, 'passed': timestamp}
    marks_by_user = defaultdict(lambda: defaultdict(dict))
    for ev in events.itertuples():
        if ev.action not in ('viewed', 'passed'):
            continue
        if not step2cost[ev.step_id] > 0:
            continue  # only steps with a positive cost are tracked
        marks = marks_by_user[ev.user_id][ev.step_id]
        if ev.action == 'passed':
            marks['passed'] = ev.time
        elif 'viewed' in marks:
            marks['viewed'] = max(ev.time, marks['viewed'])
        else:
            marks['viewed'] = ev.time

    for uid in events['user_id'].unique():
        steps = marks_by_user[uid]
        if not steps:
            users[uid]['mean_time_to_solve'] = 0.0
            continue

        # A zero duration is replaced by 0.001 so it is never exactly zero.
        minutes = [
            abs(m['passed'] - m['viewed']) / 60 or 0.001
            for m in steps.values()
            if 'passed' in m and 'viewed' in m
        ]
        if not minutes:
            # Scored steps were touched but none has both timestamps.
            minutes = [10**-10]

        # Other aggregates (harmonic/geometric mean, variance, min, max, ...)
        # could be derived from `minutes` here as well.
        users[uid]['mean_time_to_solve'] = round(np.average(np.array(minutes)), 2)

    print('Solving statistics computed.')

In [None]:
def spent_time2(events, users, max_gap_min=20):
    """Estimate the time each user spent on the course from 'viewed' events.

    The gaps between consecutive 'viewed' events of a user are added up and the
    total, in minutes and rounded to two decimals, is stored in
    ``users[user_id]['spent_t']``.

    Parameters
    ----------
    events : pandas.DataFrame
        Needs the columns ``user_id``, ``action`` and ``time``.
        Assumes ``time`` is measured in seconds -- TODO confirm.
    users : mapping of user_id -> dict
        Updated in place. A value already stored under ``'spent_t'`` is added
        to, not replaced. Every entry of ``users`` without ``'spent_t'`` ends
        up with ``0``.
    max_gap_min : number, default 20
        Heuristic cut-off: a gap longer than this many minutes is treated as a
        break and is not counted.

    Returns
    -------
    None. A progress message is printed when done.
    """
    last_view = {}  # user_id -> time of that user's previous 'viewed' event
    for event in events.itertuples():
        if event.action != 'viewed':
            continue
        uid = event.user_id
        prev = last_view.get(uid)
        last_view[uid] = event.time
        if prev is None:
            continue

        gap_min = abs(event.time - prev) / 60
        # Compare minutes with minutes. Dividing by 60 once more here would
        # turn the 20 minute limit into a 20 hour one.
        if gap_min <= max_gap_min:
            users[uid]['spent_t'] = users[uid].get('spent_t', 0) + gap_min

    for feats in users.values():
        feats['spent_t'] = round(feats.get('spent_t', 0), 2)
    print('Spent time computed.')

In [None]:
def compute_max_diff_between_actions(events, users):
    """Store the longest pause between consecutive events of each user.

    For every user the largest absolute difference between the ``time`` of one
    event and the next, divided by 60 and rounded to two decimals, is written
    to ``users[user_id]['max_diff_between_actions']``. Users with a single
    event get ``0``. A value already stored under that key takes part in the
    maximum.

    ``users`` is updated in place and must provide a dict for every user id in
    ``events`` (e.g. a ``defaultdict(dict)``). Events are processed in the
    order given.
    """
    key = 'max_diff_between_actions'
    # Membership in this dict marks "user already seen"; a sentinel value such
    # as 0 would misfire for an event whose timestamp is exactly 0.
    last_seen = {}
    for event in events.itertuples():
        uid = event.user_id
        if uid in last_seen:
            gap_min = abs(event.time - last_seen[uid]) / 60
            feats = users[uid]
            feats[key] = max(feats[key], gap_min) if key in feats else gap_min
        last_seen[uid] = event.time

    for user_id in last_seen:
        feats = users[user_id]
        feats[key] = round(feats[key], 2) if key in feats else 0
    print('Max diff between actions computed.')

In [None]:
# Training feature table: one dict of features per user, seeded with the
# label and then filled in by every feature extractor in turn.
train = defaultdict(dict, {
    uid: {'passed': passed}
    for uid, passed in zip(targets['user_id'], targets['passed'])
})
for extract in (compute_solving_stats,
                compute_earned_score,
                compute_number_of_actions,
                compute_max_diff_between_actions,
                spent_time2):
    extract(events, train)

# Rows are users, columns are the label plus the computed features.
train = pd.DataFrame.from_dict(train, orient='index')
features = [col for col in train.columns if col not in ('passed', 'user_id')]

In [None]:
# Drop outliers
# 1) Users labelled as passed whose recorded activity is implausibly low are
#    removed. A row is dropped when ANY of these holds: fewer than 10 actions,
#    less than 5 units of `spent_t`, or a score of exactly 0.
passed_mask = train.passed == 1
low_activity = (
    (train['actions_count'] < 10)
    | (train['spent_t'] < 5)
    | (train['score'] == 0)
)
train = train[~(passed_mask & low_activity)]

# 2) Users labelled as not passed whose score is above the median score of the
#    remaining passed users are removed as well.
passed_median_score = train.loc[train.passed == 1, 'score'].median()
train = train[~((train.passed == 0) & (train['score'] > passed_median_score))]

In [None]:
# Feature matrix and label vector taken from the cleaned training table.
trainX, trainY = train[features], train['passed']

In [None]:
# Gradient-boosted tree classifier: 1500 shallow trees (depth 3) combined
# with a small learning rate of 0.01.
clf = xgb.XGBClassifier(n_estimators=1500, max_depth=3, learning_rate=0.01)

In [None]:
# 5-fold cross-validated F1 of the classifier on the training table.
# Prints the mean, twice the standard deviation, and the per-fold scores.
# NOTE(review): the rows of `train` were filtered using the `passed` label in
# the outlier cell, so these scores describe the filtered data -- confirm that
# this is the intended estimate.
scores = cross_val_score(clf, trainX, trainY, cv=5, scoring='f1')
print("CV F1: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
print(scores)

In [None]:
# Fit the final model on the whole training table.
clf.fit(trainX, trainY)

In [None]:
# Print the features ordered by the fitted model's importance, highest first.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for rank, idx in enumerate(indices[:trainX.shape[1]], start=1):
    print('%d. feature "%s" (%f)' % (rank, features[idx], importances[idx]))

In [None]:
# Test feature table: the same extractors as for training, run on the test
# events (there is no label to seed the dicts with).
test = defaultdict(dict)
for extract in (compute_solving_stats,
                compute_earned_score,
                compute_number_of_actions,
                compute_max_diff_between_actions,
                spent_time2):
    extract(events_test, test)

# Rows are users; columns are put in the order the model was trained on.
test = pd.DataFrame.from_dict(test, orient='index')[trainX.columns]

In [None]:
def create_submission(X, name):
    """Write ``X`` as integers to ``<name>.csv`` under a ``user_id,passed`` header."""
    out_path = '%s.csv' % name
    np.savetxt(out_path, X, fmt="%d", delimiter=',',
               header='user_id,passed', comments='')
def getX(ev):
    """Return the distinct ``user_id`` values of ``ev`` as a sorted pandas Index.

    Only the group keys are needed, so the groups are merely sized. Summing
    every column to get at the index is wasteful and fails for column types
    that cannot be summed.
    """
    return ev.groupby('user_id').size().index

In [None]:
# User ids present in the test events, as returned by getX.
# NOTE(review): these ids only label predictions correctly if their order
# matches the row order of `test` -- confirm before pairing them.
ind_test = getX(events_test)

In [None]:
# Predict for every test user and write the submission file.
ans = clf.predict(test)

# Take the ids from the very rows the predictions were computed for. An id
# list built separately (e.g. sorted by a groupby) is only aligned with `ans`
# if `test` happens to be in the same order.
result = np.column_stack((np.asarray(test.index, dtype=int),
                          np.asarray(ans, dtype=int)))
create_submission(result, 'kekmda')

# Class balance of the predictions, shown as the cell output.
Counter(ans)