In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the training datasets (submissions and events)
submissions = pd.read_csv("./datasets/submissions_data_train.zip")
events = pd.read_csv("./datasets/event_data_train.zip")

# Создание целевой переменной 'is_gone'

In [3]:
def get_target(submissions_df, threshold=39):
    """Build the binary target ``is_gone`` for every user with a correct submission.

    Parameters
    ----------
    submissions_df : DataFrame
        Must contain the columns ``user_id``, ``step_id`` and
        ``submission_status``.
    threshold : int, default 39
        A user is labelled 1 when the number of rows with
        ``submission_status == 'correct'`` is strictly greater than this value.

    Returns
    -------
    DataFrame
        Columns ``user_id`` and ``is_gone`` (integer 0/1). Users that have no
        'correct' rows at all are absent from the result.
    """
    only_correct = submissions_df.loc[submissions_df['submission_status'] == 'correct']

    # Number of 'correct' rows per user (non-null step_id values are counted)
    corrects_per_user = only_correct.groupby('user_id')['step_id'].count()

    # is_gone = 1 when the user has more than `threshold` correct rows
    is_gone = (corrects_per_user > threshold).astype('int').rename('is_gone')

    return is_gone.reset_index()

# Первые 2 дня на курсе

In [4]:
def cut_df_by_time(df, days=2):
    """Keep only the rows from each user's first ``days`` days of activity.

    For every ``user_id`` the earliest ``timestamp`` (in seconds) is taken as
    the starting point; rows whose ``timestamp`` is later than
    ``start + days * 24h`` are discarded.

    Parameters
    ----------
    df : DataFrame
        Must contain ``user_id`` and ``timestamp`` columns.
    days : number, default 2
        Length of the window in days.

    Returns
    -------
    DataFrame
        The filtered rows with the same columns as ``df``.

    Raises
    ------
    AssertionError
        If the set of users after filtering differs in size from the input.
    """
    window_seconds = 60 * 60 * 24 * days

    # Per-user cut-off: first timestamp plus the window length
    deadlines = (
        df.groupby('user_id')['timestamp']
        .min()
        .add(window_seconds)
        .rename('min_timestamp')
        .reset_index()
    )

    with_deadline = df.merge(deadlines, how='inner', on='user_id')
    inside_window = with_deadline['timestamp'] <= with_deadline['min_timestamp']
    trimmed = with_deadline[inside_window]

    assert trimmed.user_id.nunique() == df.user_id.nunique()
    return trimmed.drop(columns=['min_timestamp'])

In [5]:
# Restrict both logs to each user's first two days on the course
events_2d = cut_df_by_time(events)
submissions_2d = cut_df_by_time(submissions)

# feature engineering

In [6]:
def get_base_features(events_df, submissions_df):
    """Count, per user, how often each event action and submission status occurs.

    Parameters
    ----------
    events_df : DataFrame
        Must contain ``user_id``, ``action`` and ``step_id``.
    submissions_df : DataFrame
        Must contain ``user_id``, ``submission_status`` and ``step_id``.

    Returns
    -------
    DataFrame
        One row per user: ``user_id``, one column per distinct ``action`` value
        and one column per distinct ``submission_status`` value. Users without
        submissions get 0 in the status columns.

    Raises
    ------
    AssertionError
        If the merged frame does not contain exactly the users of ``events_df``.
    """
    def count_per_user(log_df, category_column):
        # Rows = users, columns = values of `category_column`,
        # cells = number of non-null step_id entries (0 where absent)
        return log_df.pivot_table(
            values='step_id',
            index='user_id',
            columns=category_column,
            aggfunc='count',
            fill_value=0,
        ).reset_index()

    action_counts = count_per_user(events_df, 'action')
    status_counts = count_per_user(submissions_df, 'submission_status')

    base_train_df = pd.merge(
        action_counts, status_counts, on='user_id', how='outer'
    ).fillna(0)
    assert base_train_df.user_id.nunique() == events_df.user_id.nunique()
    return base_train_df
      
    
def get_time_features(events_df, submissions_df):
    events_df['date'] = (pd.to_datetime(events_df.timestamp, unit='s')).dt.date
    events_tf = events_df.groupby('user_id').agg({
        'date': 'nunique',  # количество уникальных дней в events
        'timestamp': ['min', 'max']
    }).reset_index()
    
    # количество часов в events
    events_tf['hour_evn'] = round(
        (events_tf.timestamp['max'] - events_tf.timestamp['min']) / (60*60),
        0
    ).astype('int')
    events_tf['day'] = events_tf.date['nunique']
    # день недели начала прохождения
    events_tf['dayweek_evn'] = (pd.to_datetime(events_tf.timestamp['min'], unit='s')).dt.dayofweek

    submissions_tf = submissions_df.groupby('user_id').agg({
        'timestamp': ['min', 'max']
    }).reset_index()
    
    # количество часов в submissions
    submissions_tf['hour_sub'] = round(
        (submissions_tf.timestamp['max'] - submissions_tf.timestamp['min']) / (60*60), 
        0
    )
    
    time_features_df = events_tf.merge(submissions_tf[['user_id', 'hour_sub']], on='user_id', how='outer').fillna(-1)
    assert time_features_df.user_id.nunique() == events_df.user_id.nunique()
    return time_features_df.drop(['timestamp', 'date'], axis=1)


def get_step_count(df, name):
    """Count the distinct ``step_id`` values seen for each user.

    Parameters
    ----------
    df : DataFrame
        Must contain ``user_id`` and ``step_id``.
    name : str
        Name of the resulting count column.

    Returns
    -------
    DataFrame
        Columns ``user_id`` and ``name``.
    """
    distinct_steps = df.groupby('user_id')['step_id'].nunique()
    return distinct_steps.rename(name).reset_index()
    
    
def get_steps_ohe_features(submissions_df):
    """Per-user counts of correct submissions for a fixed list of steps.

    The output always has one ``step_id_<id>`` column for every id in the
    hard-coded ``step_ids`` list, even when a step never occurs in
    ``submissions_df``. This keeps the feature set identical between the
    training data and any other dataset passed through the same pipeline.

    Parameters
    ----------
    submissions_df : DataFrame
        Must contain ``user_id``, ``step_id`` and ``submission_status``.

    Returns
    -------
    DataFrame
        ``user_id`` followed by the ``step_id_<id>`` columns in the order of
        ``step_ids``. Each cell is the number of 'correct' submissions of that
        user for that step (so it may exceed 1). Only users with at least one
        correct submission on a listed step are present.
    """
    step_ids = [
        31971,  31972,  31976,  31977,  31978,  31979,  31981,
        31983,  31986,  31991,  32031,  32075,  32089,  32173,
        32174,  32175,  32198,  32202,  32206,  32219,  32796,
        32812,  32929,  33332,  33334,  33367,  33413,  33415,
        33418,  33420,  33480,  33481,  33482,  33487,  33488,
        33534,  33536,  33540,  33673,  33674,  33675,  33677,
        33684,  33685,  120745
    ]

    # Correct submissions on the selected steps only
    solved = submissions_df[
        (submissions_df.submission_status == 'correct')
        & (submissions_df.step_id.isin(step_ids))
    ]

    # A Categorical with explicit categories makes get_dummies emit a column
    # for every listed step, including steps absent from this dataset.
    step_categories = pd.Categorical(solved['step_id'], categories=step_ids)
    ohe_step = pd.get_dummies(step_categories, prefix='step_id').astype('int64')

    # Sum only the indicator columns, grouped by the matching user ids
    steps_features = ohe_step.groupby(solved['user_id'].values).sum()

    return steps_features.rename_axis('user_id').reset_index()
    

def get_custom_features(df):
    """Add hand-picked derived features to ``df`` in place and return it.

    Requires the columns ``correct``, ``wrong``, ``discovered``, ``passed``,
    ``started_attempt``, ``viewed`` and ``evn_steps``.

    Added columns:

    * ``dis_to_cor``  = discovered - correct
    * ``loss_step``   = started_attempt - correct
    * ``step_pas``    = evn_steps - passed
    * ``start_pas``   = started_attempt - passed
    * ``all_sum``     = sum of the six count columns
    * ``all_pass``    = all_sum - evn_steps

    Returns
    -------
    DataFrame
        The same object that was passed in, with the new columns attached.
    """
    # (new column, minuend, subtrahend)
    differences = (
        ('dis_to_cor', 'discovered', 'correct'),
        ('loss_step', 'started_attempt', 'correct'),
        ('step_pas', 'evn_steps', 'passed'),
        ('start_pas', 'started_attempt', 'passed'),
    )
    for feature, minuend, subtrahend in differences:
        df[feature] = df[minuend] - df[subtrahend]

    count_columns = ['correct', 'wrong', 'discovered', 'passed', 'started_attempt', 'viewed']
    df['all_sum'] = df[count_columns].sum(axis=1)
    df['all_pass'] = df['all_sum'] - df['evn_steps']

    return df

In [7]:
def create_work_df(events_train, submissions_train, target=None):
    '''
    Assemble the model-ready dataset from the event and submission logs.

    Combines the base count features, time features, distinct-step counts,
    per-step features and the derived custom features into one frame keyed by
    ``user_id``.

    Parameters
    ----------
    events_train : DataFrame
        Event log (``user_id``, ``step_id``, ``action``, ``timestamp``).
    submissions_train : DataFrame
        Submission log (``user_id``, ``step_id``, ``submission_status``,
        ``timestamp``).
    target : DataFrame, optional
        Frame with ``user_id`` and the target column(s). When given it is
        attached to the features; users missing from ``target`` get 0.

    Returns
    -------
    DataFrame
        All columns cast to integer.
    '''
    df = get_base_features(events_train, submissions_train)

    time_features = get_time_features(events_train, submissions_train)
    if isinstance(time_features.columns, pd.MultiIndex):
        # Time features may come back with two-level column labels such as
        # ('hour_evn', ''). Merging those with the flat base frame is rejected
        # by recent pandas, so keep only the first level of each label.
        time_features.columns = time_features.columns.get_level_values(0)
    df = df.merge(time_features, on='user_id')

    df = df.merge(get_step_count(submissions_train, 'sub_steps'), on='user_id', how='outer').fillna(0)
    df = df.merge(get_step_count(events_train, 'evn_steps'), on='user_id')
    df = df.merge(get_steps_ohe_features(submissions_train), on='user_id', how='outer').fillna(0)

    df = get_custom_features(df)

    if target is not None:
        # Attach the target with a left join: every feature row keeps its
        # place and users absent from `target` get 0. An outer join would
        # fabricate all-zero feature rows for users that appear only in
        # `target`.
        df = df.merge(target, on='user_id', how='left').fillna(0)

    return df.astype('int')

In [8]:
# Features come from the first two days; the target is computed from the full submission history
train_df = create_work_df(
    events_2d,
    submissions_2d,
    target=get_target(submissions),
)

  new_axis = axis.drop(labels, errors=errors)


# Обучение модели

In [9]:
import xgboost as xgb
import datetime as dt
import time

from sklearn.model_selection import train_test_split

In [10]:
# Split user ids (not rows) 80/20 into train and test
np.random.seed(17)
users_ids = train_df['user_id'].unique()
np.random.shuffle(users_ids)

test_sz = int(0.2 * len(users_ids))
train_sz = len(users_ids) - test_sz
train_users, test_users = users_ids[:train_sz], users_ids[-test_sz:]
# Sanity check: the two user sets must not overlap
assert np.intersect1d(train_users, test_users).size == 0

# Select the rows that belong to each user set
in_train = train_df['user_id'].isin(train_users)
in_test = train_df['user_id'].isin(test_users)
train = train_df.loc[in_train]
test = train_df.loc[in_test]

# Separate the feature matrix from the identifier and the target
non_feature_columns = ['user_id', 'is_gone']
X_train, y_train = train.drop(columns=non_feature_columns), train['is_gone']
X_test, y_test = test.drop(columns=non_feature_columns), test['is_gone']

In [11]:
def run_single(X_train, X_test, y_train, y_test, random_state=42):
    '''
    Train an XGBoost binary classifier with early stopping on AUC.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Evaluation features and labels; this set is used for early stopping.
    random_state : int, default 42
        Passed to XGBoost as ``seed``.

    Returns
    -------
    xgboost.Booster
        The booster returned by ``xgb.train``.
    '''
    eta = 0.1
    max_depth = 3
    subsample = 1
    colsample_bytree = 1
    min_child_weight = 1

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, 
                                                                                                max_depth, 
                                                                                                subsample, 
                                                                                                colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "tree_method": 'exact',
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        # NOTE(review): newer XGBoost releases replaced "silent" with
        # "verbosity" - confirm against the installed version.
        "silent": 1,
        # The key was previously misspelled as "min_chil_weight", so the
        # setting never reached XGBoost under its real name.
        "min_child_weight": min_child_weight,
        "seed": random_state
    }
    num_boost_round = 500
    early_stopping_rounds = 20
    print('Length train:', len(X_train.index))
    print('Length test:', len(X_test.index))
    # Cells equal to -99 are treated as missing values by the DMatrix
    dtrain = xgb.DMatrix(X_train, y_train, missing=-99)
    dvalid = xgb.DMatrix(X_test, y_test, missing=-99)

    # Early stopping monitors the last entry of the watchlist ('eval')
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(
        params, dtrain, num_boost_round, evals=watchlist,
        early_stopping_rounds=early_stopping_rounds, verbose_eval=False)

    return gbm

In [12]:
# Train the model, then print the best evaluation score and the elapsed time
start_time = dt.datetime.now()
features = [column for column in X_train.columns.values]
print("Building model.. ",dt.datetime.now()-start_time)

gbm = run_single(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

print(gbm.best_score)
print(dt.datetime.now()-start_time)

Building model..  0:00:00
XGBoost params. ETA: 0.1, MAX_DEPTH: 3, SUBSAMPLE: 1, COLSAMPLE_BY_TREE: 1
Length train: 15388
Length test: 3846
0.901962
0:00:01.133346


# Предсказание

In [13]:
# Load the test logs
submissions_t = pd.read_csv("./datasets/submission_data_test.zip")
events_t = pd.read_csv("./datasets/events_data_test.zip")

# Prepare the data: same feature pipeline as for training, without a target
X_pred = create_work_df(events_t, submissions_t)
dpred = xgb.DMatrix(X_pred.drop(columns=['user_id']), missing=-99)

# Predicted probability for every test user
pred_proba = gbm.predict(dpred)
X_pred['is_gone'] = pred_proba

# Save the predictions; the file name embeds digits of the best evaluation score
submission_columns = ['user_id', 'is_gone']
X_pred[submission_columns].to_csv(f'my_pdedict_{str(gbm.best_score)[2:6]}.csv', index=False)

  new_axis = axis.drop(labels, errors=errors)


## Данное решение дает ROC AUC score: 0.8937

# \#TODO

ROC AUC зависит от способа разбиения тренировочных данных на train и test. Отсюда вывод, что в тренировочных данных есть юзеры, которые вели себя неестественно. Новая гипотеза: найти и удалить тех, кто сильно отличается от представителей своего класса.