In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import tqdm_notebook
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

tqdm.pandas()

In [2]:
structure = pd.read_csv("kaggle/structure.csv")
targets = pd.read_csv("kaggle/targets.csv")
df = pd.read_csv("kaggle/user_activity.csv")

In [3]:
costed_steps = structure[structure.step_cost > 0]

In [4]:
def f(x):
    passed = x[x.action == "passed"]
    not_passed = x[~(x.step_id.isin(passed.step_id))]
    
    #lol-comprehension mean attempts per day { 
    # TODO: optimize it with only unique step
    avg_attempts_per_day = np.mean([t.time.values.size for t in [x[(x.action == "started_attempt") & (x.step_id == id_)] for id_ in x.step_id.unique()]])
    # }
    
    not_passed = not_passed.drop_duplicates("step_id")
    
    # passed steps last 10 days and skipped steps all course  {
    days = 10*3600*24
    border = passed.time.max() - days
    passed_steps = len([1 for t in passed.time if t >= border])
    skipped_steps = not_passed.time.values.size
    # }
    
    # number of stepik-points at passed and viewed steps {
    passed_balls = passed.step_cost.sum()
    viewed_balls = not_passed[not_passed.action == "viewed"].step_cost.sum()
    # }
    
    # average balls per day all course {
    this_day = x.time.min() // 86400
    last_day = x.time.max() // 86400
    t = np.zeros((last_day - this_day, ))
    i, cur_sum = 0, 0
    for _, step in x.sort_values("time").iterrows():
        if step.time // 86400 >= this_day + 1:
            t[i] = cur_sum
            i += 1
            this_day += 1
            cur_sum = 0
        cur_sum += step.step_cost
    if np.sum(t):
        median_balls_per_day = np.median(t)
    else:
        median_balls_per_day = 0
    # }
        
    
    res = [passed_steps, skipped_steps, avg_attempts_per_day, passed_balls, viewed_balls, median_balls_per_day]
    return res

In [5]:
x = df[df.step_id.isin(costed_steps.step_id)].groupby("user_id").get_group(16857)

In [6]:
f(x)

[1, 2, 4.0, 1, 1, 0]

In [7]:
X_t = df[df.step_id.isin(costed_steps.step_id)].groupby("user_id").progress_apply(f)
Y_t = targets[targets.user_id.isin(X_t.index)]

100%|██████████| 9426/9426 [03:24<00:00, 46.19it/s] 


In [8]:
X_t.values, Y_t.passed.values

(array([[0, 68, 0.0, 0, 51, 134.5], [9, 0, 2.3333333333333335, 9, 0, 1.0],
        [9, 0, 2.6666666666666665, 9, 0, 44.0], ...,
        [2, 1, 3.3333333333333335, 2, 1, 0], [0, 2, 1.0, 0, 2, 0],
        [3, 0, 6.0, 3, 0, 0]], dtype=object), array([0, 0, 0, ..., 0, 0, 0]))

In [9]:
def prepare(df, num_features):
    X = df.groupby("user_id").progress_apply(f)
    return X.index, np.array([np.array(x) for x in X.values]).reshape(-1, num_features)

In [48]:
X = np.array([np.array(x) for x in X_t.values]).reshape(-1, 6)
Y = Y_t.passed.ravel()

In [49]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [50]:
X_train.shape

(7068, 6)

In [51]:
import theano  # нейронки
import numpy as np  # наука
import lasagne  # готовые нейронки
import theano.tensor as T

In [57]:
input_X = T.matrix("X", dtype="float32")
input_shape = [None, 6]
target_y = T.vector("target Y", dtype='int32')

#АРХИТЕКТУРА нейронки:
# #входной слой (вспомогательный)
input_layer = lasagne.layers.InputLayer(shape=input_shape, input_var=input_X)

#hidden layers
dense_0 = lasagne.layers.DenseLayer(input_layer, 100, nonlinearity=lasagne.nonlinearities.sigmoid)
dense_output = lasagne.layers.DenseLayer(dense_0,
                                        num_units = 1,
                                        nonlinearity = lasagne.nonlinearities.softmax,
                                        name='output')

#предсказание нейронки (theano-преобразование)
y_predicted = lasagne.layers.get_output(dense_output)

#все веса нейронки (shared-переменные)
all_weights = lasagne.layers.get_all_params(dense_output)

#функция ошибки - средняя кроссэнтропия
loss = lasagne.objectives.binary_crossentropy(y_predicted, target_y).mean()

#сразу посчитать словарь обновлённых значений с шагом по градиенту, как раньше
updates_sgd = lasagne.updates.rmsprop(loss, all_weights,learning_rate=0.01)

#функция, которая обучает сеть на 1 шаг и возвращащет значение функции потерь и точности
train_fun = theano.function([input_X,target_y], loss, updates=updates_sgd, allow_input_downcast=True)

# вспомогательная функция, которая возвращает список мини-батчей для обучения нейронки

#на вход
# X - тензор из картинок размером (много, 1, 28, 28), например - X_train
# y - вектор из чиселок - ответов для каждой картинки из X; например - Y_train
#batch_size - одно число - желаемый размер группы

#что нужно сделать
# 1) перемешать данные
# - важно перемешать X и y одним и тем же образом, чтобы сохранить соответствие картинки ответу на неё
# 3) побить данные на подгруппы так, чтобы в каждой подгруппе было batch_size картинок и ответов
# - если число картинок не делится на batch_size, одну подгруппу можно вернуть другого размера
# 4) вернуть список (или итератор) пар:
# - (подгруппа картинок, ответы из y на эту подгруппу)

def iterate_minibatches(inputs, targets, batchsize, shuffle=True):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt] 

In [58]:
#ОБУЧЕНИЕ И ВАЛИДАЦИЯ
import time

num_epochs = 10 #количество проходов по данным

batch_size = 100 #размер мини-батча

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, Y_train, batch_size):
        inputs, targets = batch
        train_err_batch = train_fun(inputs, targets)
        train_err += train_err_batch
        train_batches += 1
        
    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))

    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))

Epoch 1 of 10 took 0.024s
  training loss (in-iteration):		nan
Epoch 2 of 10 took 0.024s
  training loss (in-iteration):		nan
Epoch 3 of 10 took 0.025s
  training loss (in-iteration):		nan
Epoch 4 of 10 took 0.027s
  training loss (in-iteration):		nan
Epoch 5 of 10 took 0.035s
  training loss (in-iteration):		nan
Epoch 6 of 10 took 0.026s
  training loss (in-iteration):		nan
Epoch 7 of 10 took 0.038s
  training loss (in-iteration):		nan
Epoch 8 of 10 took 0.022s
  training loss (in-iteration):		nan
Epoch 9 of 10 took 0.036s
  training loss (in-iteration):		nan
Epoch 10 of 10 took 0.024s
  training loss (in-iteration):		nan


In [34]:
predict_net = theano.compile.function([input_X], lasagne.layers.get_output(dense_output))

In [35]:
predict_net(np.array([X[4]]))

array([[ nan]])