- add 'prior_question_had_explanation_mean'

In [1]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
from bitarray import bitarray
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc="progress: ")

_ = np.seterr(divide='ignore', invalid='ignore')
pd.set_option("max_columns", 50)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  del sys.path[0]
  from pandas import Panel


In [3]:
def make_bitarray():
    a = bitarray(32737, endian='little')
    a.setall(True)   
    return a

In [4]:
FULL_TRAIN = False

# Preprocess

In [5]:
data_types_dict = {
    #'row_id': 'uint32',
    'timestamp': 'uint64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool',
}
target = 'answered_correctly'

In [6]:
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

train_df['prior_question_had_explanation'].fillna(False, inplace=True)
train_df['prior_question_elapsed_time'].fillna(0, inplace=True)

train_df['timestamp'] = (train_df['timestamp'] // 86400000) + 1 # days from start
train_df = train_df.astype(data_types_dict)

if FULL_TRAIN:
    train_size = 100
else:
    train_size = 24
    valid_size = 6
    
train_index = list(train_df.groupby('user_id').tail(train_size).index)

In [7]:
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 3, 4], 
    dtype={'question_id': 'int16', 'part': 'int8', 'tags': 'str'} 
)

#questions_df['tags'] = questions_df['tags'].apply(lambda ts: [int(x) for x in str(ts).split() if x != 'nan'])

def join(df):
    x = [str(e) for e in list(df)]
    return " ".join(x)

ids = ["tags"]
tfidf_svd_feats = []

for id_ in ids:
    print(id_)
    docs = questions_df.groupby("question_id")[id_].apply(join)
    max_features = int(questions_df[id_].nunique() * 0.8)
    tv = TfidfVectorizer(max_features=max_features)
    X = tv.fit_transform(docs)

    n_components = 3
    svd = TruncatedSVD(n_components=n_components, random_state=0)
    X = svd.fit_transform(X)
    df = pd.DataFrame(X, columns=[f"tfidf_{id_}_{i}" for i in range(n_components)])
    df.index = docs.index
    tfidf_svd_feats += [f"tfidf_{id_}_{i}" for i in range(n_components)]
    
kmeans = KMeans(n_clusters=20, random_state=0).fit(df.values)
df["tag_class"] = kmeans.labels_

train_df = pd.merge(train_df, questions_df[["question_id", "part"]], left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
train_df.drop(columns=['question_id'], inplace=True)

#train_df = pd.merge(train_df, df, left_on='content_id', right_on='question_id', how='left')
#train_df.drop(tfidf_svd_feats, axis=1, inplace=True)
#tag_class_array = train_df.tag_class.values
#train_df.drop("tag_class", axis=1, inplace=True)
#tag_class_array = tag_class_array[train_index]

tags


In [8]:
train_df['lag'] = train_df.groupby('user_id')[target].shift()
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
user_correctness_array = np.array(cum['cumsum'] / cum['cumcount'])
user_correctness_array = user_correctness_array[train_index]
train_df.drop(columns=['lag'], inplace=True)
del cum

In [9]:
#
print(1)
train_df["part_count"] = train_df["part"] ==1
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part1_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
part1_count_array = part1_count_array[train_index] 
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)

#
print(2)
train_df["part_count"] = train_df["part"] ==2
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part2_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
part2_count_array = part2_count_array[train_index] 
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)

#
print(3)
train_df["part_count"] = train_df["part"] ==3
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part3_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
part3_count_array = part3_count_array[train_index] 
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)

#
print(4)
train_df["part_count"] = train_df["part"] ==4
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part4_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
part4_count_array = part4_count_array[train_index] 
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)

#
print(5)
train_df["part_count"] = train_df["part"] ==5
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part5_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
part5_count_array = part5_count_array[train_index] 
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)

#
print(6)
train_df["part_count"] = train_df["part"] ==6
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part6_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
part6_count_array = part6_count_array[train_index] 
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)

#
print(7)
train_df["part_count"] = train_df["part"] ==7
train_df["part_count"] = train_df["part_count"].astype("int8")
train_df['part_count_lag'] = train_df.groupby('user_id')["part_count"].shift()
part7_count_array = train_df.groupby('user_id')['part_count_lag'].agg(['cumsum']).values
part7_count_array = part7_count_array[train_index] 
train_df.drop(columns=['part_count_lag', 'part_count'], inplace=True)

1
2
3
4
5
6
7


In [10]:
# prior_question_had_explanation_mean
train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift().astype(bool)
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
prior_question_had_explanation_mean_array = np.array(cum['cumsum'] / cum['cumcount'])
prior_question_had_explanation_mean_array = prior_question_had_explanation_mean_array[train_index]

user_prior_question_had_explanation_sum_agg = train_df.groupby('user_id')["prior_question_had_explanation"].agg(['sum'])
user_prior_question_had_explanation_sum_dict = user_prior_question_had_explanation_sum_agg['sum'].astype('int32').to_dict(defaultdict(int))
train_df.drop(columns=['lag'], inplace=True)
del cum, user_prior_question_had_explanation_sum_agg

In [11]:
first_attempt_df = pd.read_csv("../input/riiid-additional-data/content_first_attempt.csv")
first_attempt_array = first_attempt_df.first_attempt.values
train_df["first_attempt"] = first_attempt_array
unique_attempt_array= train_df.groupby("user_id")["first_attempt"].cumsum().values
train_df["unique_attempt"] = unique_attempt_array
user_unique_agg = train_df.groupby('user_id')["unique_attempt"].agg(['max'])
user_unique_dict = user_unique_agg['max'].astype('int32').to_dict(defaultdict(int))

first_attempt_array = first_attempt_array[train_index]
unique_attempt_array = unique_attempt_array[train_index]
train_df.drop(['first_attempt', 'unique_attempt'], axis=1, inplace=True)
del first_attempt_df, user_unique_agg

In [12]:
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])

user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

content_count_array = train_df['content_id'].map(content_agg['count']).astype('int32').values
content_id_array = train_df['content_id'].map(content_agg['sum'] / content_agg['count']).values
content_count_array = content_count_array[train_index]
content_id_array = content_id_array[train_index]

del user_agg, content_agg

In [13]:
%%time
user_content_agg = train_df.groupby("user_id")["content_id"].unique().reset_index().set_index("user_id")

value = []
for j in user_content_agg.index:
    a = bitarray(32737, endian='little')
    a.setall(True)
    for i in user_content_agg.loc[j][0]:
        a[i] = 0
    value.append(a)
    
user_content_agg["content_exp"] = value

user_content_dict = user_content_agg["content_exp"].to_dict(defaultdict(make_bitarray))
del user_content_agg

CPU times: user 2min 35s, sys: 2.32 s, total: 2min 37s
Wall time: 2min 37s


In [14]:
train_df["count"] = 1
count_array = train_df.groupby("user_id")["count"].cumsum().values
count_array = count_array[train_index]
train_df.drop("count", axis=1, inplace=True)

In [15]:
train_df["count_inday"] = 1
count_inday_array = train_df.groupby(["user_id","timestamp"])["count_inday"].cumsum().values
count_inday_array = count_inday_array[train_index]
train_df.drop("count_inday", axis=1, inplace=True)

In [16]:
train_df = train_df.groupby('user_id').tail(train_size).reset_index(drop=True)

In [17]:
train_df['content_count'] = content_count_array
train_df['content_id'] = content_id_array
train_df['user_correctness'] = user_correctness_array
train_df['first_attempt'] = first_attempt_array
train_df['count'] = count_array
train_df['count_inday'] = count_inday_array
train_df['unique_attempt'] = unique_attempt_array
train_df['part1_count'] = part1_count_array
train_df['part2_count'] = part2_count_array
train_df['part3_count'] = part3_count_array
train_df['part4_count'] = part4_count_array
train_df['part5_count'] = part5_count_array
train_df['part6_count'] = part6_count_array
train_df['part7_count'] = part7_count_array
train_df['prior_question_had_explanation_mean'] = prior_question_had_explanation_mean_array
#train_df['tag_class'] = tag_class_array

part1_count_agg = train_df.groupby('user_id')["part1_count"].agg(['max'])
part2_count_agg = train_df.groupby('user_id')["part2_count"].agg(['max'])
part3_count_agg = train_df.groupby('user_id')["part3_count"].agg(['max'])
part4_count_agg = train_df.groupby('user_id')["part4_count"].agg(['max'])
part5_count_agg = train_df.groupby('user_id')["part5_count"].agg(['max'])
part6_count_agg = train_df.groupby('user_id')["part6_count"].agg(['max'])
part7_count_agg = train_df.groupby('user_id')["part7_count"].agg(['max'])
part1_count_agg['max'].fillna(0,inplace=True)
part2_count_agg['max'].fillna(0,inplace=True)
part3_count_agg['max'].fillna(0,inplace=True)
part4_count_agg['max'].fillna(0,inplace=True)
part5_count_agg['max'].fillna(0,inplace=True)
part6_count_agg['max'].fillna(0,inplace=True)
part7_count_agg['max'].fillna(0,inplace=True)
part1_count_dict = part1_count_agg['max'].astype('int32').to_dict(defaultdict(int))
part2_count_dict = part2_count_agg['max'].astype('int32').to_dict(defaultdict(int))
part3_count_dict = part3_count_agg['max'].astype('int32').to_dict(defaultdict(int))
part4_count_dict = part4_count_agg['max'].astype('int32').to_dict(defaultdict(int))
part5_count_dict = part5_count_agg['max'].astype('int32').to_dict(defaultdict(int))
part6_count_dict = part6_count_agg['max'].astype('int32').to_dict(defaultdict(int))
part7_count_dict = part7_count_agg['max'].astype('int32').to_dict(defaultdict(int))

user_timestamp_dict = train_df.groupby("user_id").tail(1)["timestamp"].to_dict(defaultdict(int))
user_count_inday_dict = train_df.groupby("user_id").tail(1)["count_inday"].to_dict(defaultdict(int))

del part1_count_agg, part2_count_agg, part3_count_agg, part4_count_agg, part5_count_agg, part6_count_agg, part7_count_agg

In [18]:
features = [
    'content_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_correctness',
    'part',
    'content_count',
    'count',
    'first_attempt',
    'timestamp', #?
    'count_inday', #?
    'unique_attempt', #?
    'part1_count',
    'part2_count',
    'part3_count',
    'part4_count',
    'part5_count',
    'part6_count',
    'part7_count', 
    'prior_question_had_explanation_mean',
]

print(len(features))
drop_cols = [i for i in train_df.columns if i not in features + [target]]
print(drop_cols)

19
['user_id']


In [19]:
if FULL_TRAIN:
    train_df.drop(drop_cols, axis=1, inplace=True)
    print(train_df.shape)
else:
    valid_df = train_df.groupby('user_id').tail(valid_size).copy()
    train_df.drop(valid_df.index, inplace=True)
    train_df.drop(drop_cols, axis=1, inplace=True)
    valid_df.drop(drop_cols, axis=1, inplace=True)
    print(train_df.shape, valid_df.shape)

(6536675, 20) (2360984, 20)


In [20]:
train_df.head(10)

Unnamed: 0,timestamp,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part,content_count,user_correctness,first_attempt,count,count_inday,unique_attempt,part1_count,part2_count,part3_count,part4_count,part5_count,part6_count,part7_count,prior_question_had_explanation_mean
0,1,0.687217,0,19000.0,False,1,36674,0.727273,True,23,23,23,20.0,0.0,0.0,0.0,2.0,0.0,0.0,0.045455
1,1,0.608222,0,21000.0,False,1,47047,0.695652,True,24,24,24,21.0,0.0,0.0,0.0,2.0,0.0,0.0,0.043478
2,1,0.601824,1,22000.0,False,1,40452,0.666667,True,25,25,25,22.0,0.0,0.0,0.0,2.0,0.0,0.0,0.041667
3,1,0.418436,1,16000.0,False,1,190170,0.68,True,26,26,26,23.0,0.0,0.0,0.0,2.0,0.0,0.0,0.04
4,1,0.474545,1,20000.0,False,1,56707,0.692308,True,27,27,27,24.0,0.0,0.0,0.0,2.0,0.0,0.0,0.038462
5,1,0.461387,1,22000.0,False,1,30430,0.703704,True,28,28,28,25.0,0.0,0.0,0.0,2.0,0.0,0.0,0.037037
6,1,0.543071,1,22000.0,False,1,66146,0.714286,True,29,29,29,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.035714
7,1,0.35997,0,23000.0,False,1,195861,0.724138,True,30,30,30,27.0,0.0,0.0,0.0,2.0,0.0,0.0,0.034483
8,1,0.353568,1,20000.0,False,1,15386,0.7,True,31,31,31,28.0,0.0,0.0,0.0,2.0,0.0,0.0,0.033333
9,1,0.381249,0,15000.0,False,1,47486,0.709677,True,32,32,32,29.0,0.0,0.0,0.0,2.0,0.0,0.0,0.032258


# Train

In [21]:
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_bin': 800,
    'num_leaves': 80
}

In [22]:
if FULL_TRAIN:
    X_train = np.ndarray(shape=(train_df.shape[0], len(features)), dtype=np.float32)
    y_train = np.ndarray(shape=(train_df.shape[0], 1), dtype=np.float32)
    for idx, feature in enumerate(features):
        X_train[:,idx] = train_df[feature].values.astype(np.float32)
        train_df.drop(feature, axis=1, inplace=True)
    y_train = train_df[target].values.astype(np.float32)
    train_df.drop(target, axis=1, inplace=True)

    tr_data = lgb.Dataset(X_train, label=y_train)
    print("Full training starts")
    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round=4000,
        valid_sets=None, 
        )
else:    
    X_train = np.ndarray(shape=(train_df.shape[0], len(features)), dtype=np.float32)
    y_train = np.ndarray(shape=(train_df.shape[0], 1), dtype=np.float32)
    for idx, feature in enumerate(features):
        X_train[:,idx] = train_df[feature].values.astype(np.float32)
        train_df.drop(feature, axis=1, inplace=True)
    y_train = train_df[target].values.astype(np.float32)
    train_df.drop(target, axis=1, inplace=True)
    tr_data = lgb.Dataset(X_train, label=y_train)
    
    X_valid = np.ndarray(shape=(valid_df.shape[0], len(features)), dtype=np.float32)
    y_valid = np.ndarray(shape=(valid_df.shape[0], 1), dtype=np.float32)
    for idx, feature in enumerate(features):
        X_valid[:,idx] = valid_df[feature].values.astype(np.float32)
        valid_df.drop(feature, axis=1, inplace=True)
    y_valid = valid_df[target].values.astype(np.float32)
    valid_df.drop(target, axis=1, inplace=True)
    va_data = lgb.Dataset(X_valid, label=y_valid)

    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round=10000,
        valid_sets=[tr_data, va_data], 
        early_stopping_rounds=50,
        verbose_eval=50
        )

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.757789	valid_1's auc: 0.74255
[100]	training's auc: 0.76032	valid_1's auc: 0.745819
[150]	training's auc: 0.761515	valid_1's auc: 0.74716
[200]	training's auc: 0.762213	valid_1's auc: 0.747716
[250]	training's auc: 0.762678	valid_1's auc: 0.747978
[300]	training's auc: 0.763059	valid_1's auc: 0.748162
[350]	training's auc: 0.763403	valid_1's auc: 0.7483
[400]	training's auc: 0.763728	valid_1's auc: 0.748416
[450]	training's auc: 0.764054	valid_1's auc: 0.748544
[500]	training's auc: 0.764349	valid_1's auc: 0.748645
[550]	training's auc: 0.764642	valid_1's auc: 0.748724
[600]	training's auc: 0.764944	valid_1's auc: 0.74881
[650]	training's auc: 0.765221	valid_1's auc: 0.748888
[700]	training's auc: 0.765518	valid_1's auc: 0.748975
[750]	training's auc: 0.765788	valid_1's auc: 0.749036
[800]	training's auc: 0.766045	valid_1's auc: 0.749092
[850]	training's auc: 0.766304	valid_1's auc: 0.74914
[900]	traini

# Inference

In [23]:
env = riiideducation.make_env()
iter_test = env.iter_test()
prior_test_df = None

In [24]:
%%time

for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
         
        for user_id, content_id, answered_correctly, part, first_attempt_ornot, prior_explanation in zip(user_ids, content_ids, 
                                                                                      targets, 
                                                                                      prior_part_arrays, 
                                                                                      prior_f_attempt_arrays,
                                                                                      p_prior_question_had_explanation):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            if part == 1:
                part1_count_dict[user_id] +=1
            elif part == 2:
                part2_count_dict[user_id] +=1
            elif part == 3:
                part3_count_dict[user_id] +=1
            elif part == 4:
                part4_count_dict[user_id] +=1
            elif part == 5:
                part5_count_dict[user_id] +=1
            elif part == 6:
                part6_count_dict[user_id] +=1
            else:
                part7_count_dict[user_id] +=1
            user_unique_dict[user_id] += first_attempt_ornot
            user_prior_question_had_explanation_sum_dict[user_id] += prior_explanation

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)
    test_df['timestamp'] = (test_df['timestamp'] // 86400000) + 1
    
    prior_part_arrays = test_df['part'].values
    p_prior_question_had_explanation = test_df['prior_question_had_explanation'].values
    
    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    part1_count = np.zeros(len(test_df), dtype=np.int32)
    part2_count = np.zeros(len(test_df), dtype=np.int32)
    part3_count = np.zeros(len(test_df), dtype=np.int32)
    part4_count = np.zeros(len(test_df), dtype=np.int32)
    part5_count = np.zeros(len(test_df), dtype=np.int32)
    part6_count = np.zeros(len(test_df), dtype=np.int32)
    part7_count = np.zeros(len(test_df), dtype=np.int32)
    user_count_inday = np.zeros(len(test_df), dtype=np.int32)
    first_attempt_values = []
    user_unique_count = np.zeros(len(test_df), dtype=np.int32)
    user_prior_question_had_explanation_sum = np.zeros(len(test_df), dtype=np.int32)
    
    for i, (user_id, content_id, timestamp) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values, test_df['timestamp'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        part1_count[i] = part1_count_dict[user_id]
        part2_count[i] = part2_count_dict[user_id]
        part3_count[i] = part3_count_dict[user_id]
        part4_count[i] = part4_count_dict[user_id]
        part5_count[i] = part5_count_dict[user_id]
        part6_count[i] = part6_count_dict[user_id]
        part7_count[i] = part7_count_dict[user_id]
        first_attempt_values.append(user_content_dict[user_id][content_id])
        user_content_dict[user_id][content_id] = False 
        if user_timestamp_dict[user_id] == timestamp:
            user_count_inday_dict[user_id] += 1
        else:
            user_count_inday_dict[user_id] = 1
            user_timestamp_dict[user_id] = timestamp
        user_count_inday[i] = user_count_inday_dict[user_id]
        user_unique_count[i] = user_unique_dict[user_id]
        user_prior_question_had_explanation_sum[i] = user_prior_question_had_explanation_sum_dict[user_id]
            
    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df['count'] = 1
    test_df['count'] = test_df.groupby("user_id")["count"].cumsum()
    test_df['count'] += user_count
    test_df['part1_count'] = part1_count
    test_df['part2_count'] = part2_count
    test_df['part3_count'] = part3_count
    test_df['part4_count'] = part4_count
    test_df['part5_count'] = part5_count
    test_df['part6_count'] = part6_count
    test_df['part7_count'] = part7_count
    test_df["first_attempt"] = first_attempt_values
    test_df['count_inday'] = user_count_inday
    test_df["unique_attempt"] = test_df.groupby("user_id")["first_attempt"].cumsum()
    test_df["unique_attempt"] += user_unique_count
    test_df['prior_question_had_explanation_mean'] = user_prior_question_had_explanation_sum / user_count

    prior_f_attempt_arrays = test_df['first_attempt'].values

    test_df[target] = model.predict(test_df[features])
    env.predict(test_df[['row_id', target]])

CPU times: user 1.44 s, sys: 84.1 ms, total: 1.52 s
Wall time: 930 ms
