In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import feather
from tqdm import tqdm_notebook as tqdm
from tqdm._tqdm_notebook import tqdm_notebook
from collections import defaultdict
from bitarray import bitarray

import riiideducation
tqdm_notebook.pandas(desc="progress: ")

pd.set_option("max_rows", 1000)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  
  from pandas import Panel


In [2]:
DIR ="../input/riiid-test-answer-prediction/"
lectures = pd.read_csv(DIR+"lectures.csv")
questions = pd.read_csv(DIR+"questions.csv")
example_test = pd.read_csv(DIR+"example_test.csv")
train = pd.read_feather("../input/riid-train/train.feather")

# train
- row_id: (int64) ID code for the row.
- timestamp: (int64) the time between this user interaction and the first event completion from that user.
- user_id: (int32) ID code for the user.
- content_id: (int16) ID code for the user interaction
- content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.
- task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before  seeing the explanations for any of them. Those three would all share a task_container_id.
- user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.
- answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.
- prior_question_elapsed_time: (float32) The average time it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.
- prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

- content_type_id = 0なら、questions.csvと結合できる, content_type_id = 1なら、lectures.csvと結合できる

In [3]:
train.head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,4294967295,
1,1,56943,115,5716,0,2,2,1,37000,False
2,2,118363,115,128,0,0,0,1,55000,False
3,3,131167,115,7860,0,3,0,1,19000,False
4,4,137965,115,7922,0,4,1,1,11000,False
5,5,157063,115,156,0,5,2,1,5000,False
6,6,176092,115,51,0,6,0,1,17000,False
7,7,194190,115,50,0,7,3,1,17000,False
8,8,212463,115,7896,0,8,2,1,16000,False
9,9,230983,115,7863,0,9,0,1,16000,False


In [4]:
train.drop(["timestamp", "user_answer", "prior_question_elapsed_time", "prior_question_had_explanation",
           "content_type_id", "task_container_id"], axis=1, inplace=True)

In [5]:
train_lec = train[train["answered_correctly"] == -1].reset_index(drop=True)
train = train[train["answered_correctly"] != -1].reset_index(drop=True)

In [6]:
train.shape

(99271300, 4)

In [7]:
train.describe()

Unnamed: 0,row_id,user_id,content_id,answered_correctly
count,99271300.0,99271300.0,99271300.0,99271300.0
mean,50615790.0,1076745000.0,4994.398,0.6572355
std,29221880.0,619699500.0,3288.955,0.4746335
min,0.0,115.0,0.0,0.0
25%,25308360.0,540837000.0,2016.0,0.0
50%,50616490.0,1071807000.0,4984.0,1.0
75%,75920820.0,1615742000.0,7218.0,1.0
max,101230300.0,2147483000.0,13522.0,1.0


In [8]:
len(train.user_id.unique())

393656

In [9]:
#train[train.user_id==2746] # task_container_id doesn't monotonically increase

In [10]:
#tmp = train.groupby("user_id")['user_answer'].value_counts(normalize=True).unstack()
#tmp["max"] = tmp.max(axis=1)
#tmp[-1].max(), tmp[-1].min()
#tmp[tmp["max"]>=0.7]

In [11]:
#tmp = train[train.user_id==115].copy()
#tmp["task_container_id"] = tmp["task_container_id"].transform(lambda x: pd.factorize(x)[0]).astype('int16')
#tmp

In [12]:
train.content_id.value_counts().head()

6116    213605
6173    202106
4120    199372
175     195861
7876    190170
Name: content_id, dtype: int64

In [13]:
# null value in prior_question_had_explanation_only, but other variables include -1 as missing value in user_answer and answered_correctly
train.isnull().sum()

row_id                0
user_id               0
content_id            0
answered_correctly    0
dtype: int64

In [14]:
# data num per user_id
#train.groupby("user_id").agg("max")["task_container_id"].plot(kind="hist")

In [15]:
#train["content_type_id"].sum() / train.shape[0] # lectureはわずか2%

In [16]:
#print(train.groupby("user_id")["content_type_id"].mean().max(), train.groupby("user_id")["content_type_id"].mean().min())
#print(train.groupby("user_id")["content_type_id"].mean().mean())

In [17]:
# had explanation in almost 90%
# train["prior_question_had_explanation"].value_counts(normalize=True)

In [18]:
#train = train.merge(questions[["question_id", "part"]], how="left", left_on=['content_id'], right_on=['question_id'])
#train = train.loc[train['answered_correctly'] != -1].reset_index(drop=True)
#tmp = train.groupby('part').agg({'answered_correctly': ['mean']}).copy()
#tmp.columns = ['correct_rate']
#tmp

In [19]:
tmp = train.groupby("content_id")["answered_correctly"].mean().reset_index()

In [20]:
print(len(tmp[tmp.answered_correctly<0.2]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.2) & (tmp.answered_correctly<0.3)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.3) & (tmp.answered_correctly<0.4)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.4) & (tmp.answered_correctly<0.5)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.5) & (tmp.answered_correctly<0.6)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.5) & (tmp.answered_correctly<0.6)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.6) & (tmp.answered_correctly<0.7)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.7) & (tmp.answered_correctly<0.8)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.8) & (tmp.answered_correctly<0.9)]["content_id"]))
print(len(tmp[(tmp.answered_correctly>=0.9)]["content_id"]))

34
203
490
885
1567
1567
2520
3248
3143
1433


In [21]:
user_try_count = train.groupby("user_id")["row_id"].count().reset_index()
user_try_count.columns = ["user_id", "max_count"]
user_try_count.head()

Unnamed: 0,user_id,max_count
0,115,46
1,124,30
2,2746,19
3,5382,125
4,8623,109


In [22]:
train.content_id.max()

13522

# lecture
- lectures.csv: metadata for the lectures watched by users as they progress in their education.
- lecture_id: foreign key for the train/test content_id column, when the content type is lecture (1).
- part: top level category code for the lecture.
- tag: one tag codes for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together.
- type_of: brief description of the core purpose of the lecture

In [23]:
lectures.shape

(418, 4)

In [24]:
lectures.describe()

Unnamed: 0,lecture_id,tag,part
count,418.0,418.0,418.0
mean,16983.401914,94.480861,4.267943
std,9426.16466,53.586487,1.872424
min,89.0,0.0,1.0
25%,9026.25,50.25,2.0
50%,17161.5,94.5,5.0
75%,24906.25,140.0,6.0
max,32736.0,187.0,7.0


In [25]:
lectures.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


In [26]:
lectures[lectures.part==3]

Unnamed: 0,lecture_id,tag,part,type_of
14,1253,74,3,solving question
35,3078,74,3,concept
62,5266,157,3,concept
110,9758,136,3,concept
129,10928,136,3,solving question
149,12846,106,3,solving question
170,14059,82,3,solving question
223,18346,157,3,solving question
256,20843,161,3,solving question
258,21094,161,3,solving question


In [27]:
lectures[lectures.type_of=="solving question"]["tag"].unique()

array([ 79, 156, 134,  80, 101,  74,  43, 109, 142, 140,   1,  26, 173,
         7,  64, 141, 124, 161,   4,  94,  70, 119, 177,  54,  50, 116,
        93, 136, 127,  53, 130,  13,  36,  62, 181, 164, 163,  85,  91,
         9,  28,  98, 159,  27, 149,  61, 166,  14, 113, 182,  55,  49,
       106,  47,  60, 179,  71,  82,  44,  25, 108,  96,   5, 152,  16,
        73, 178,  42,  40,   6, 157,  33,  51, 133,  48, 167,  68,  95,
        30, 174,  20,  66,  37,  45, 105, 151, 123, 114, 180,  89, 168,
        78, 111,  52,  34,   8, 184, 176,  72, 129, 128,  65, 125, 170,
       104,  69,   0])

In [28]:
lectures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   lecture_id  418 non-null    int64 
 1   tag         418 non-null    int64 
 2   part        418 non-null    int64 
 3   type_of     418 non-null    object
dtypes: int64(3), object(1)
memory usage: 13.2+ KB


In [29]:
lectures.tag.min(), lectures.tag.max()

(0, 187)

In [30]:
lectures.tag.value_counts().head() # 一部タグに重複あり

136    7
116    6
134    6
27     6
161    6
Name: tag, dtype: int64

In [31]:
lectures.part.value_counts() # 一部タグに重複あり

5    143
6     83
2     56
1     54
7     32
4     31
3     19
Name: part, dtype: int64

In [32]:
lectures.type_of.value_counts() 

concept             222
solving question    186
intention             7
starter               3
Name: type_of, dtype: int64

In [33]:
lectures.isnull().sum() # no missing value

lecture_id    0
tag           0
part          0
type_of       0
dtype: int64

In [34]:
len(lectures.lecture_id.unique())

418

In [35]:
len(set(lectures.lecture_id.unique()) & set(train.content_id.unique()))

158

In [36]:
# lecture id 28098 isn't included in training set
# set(lectures.lecture_id.unique()) - set(set(lectures.lecture_id.unique()) & set(train.content_id.unique()))

In [37]:
lectures[lectures.lecture_id==28098]

Unnamed: 0,lecture_id,tag,part,type_of
355,28098,166,6,solving question


In [38]:
len(lectures.tag.unique()) # overlap in tag

151

In [39]:
#lectures['type_of'] = lectures['type_of'].replace('solving question', 'solving_question')
#lectures = pd.get_dummies(lectures, columns=['part', 'type_of'])

In [40]:
#part_lectures_columns = [column for column in lectures.columns if column.startswith('part')]
#types_of_lectures_columns = [column for column in lectures.columns if column.startswith('type_of_')]

In [41]:
train_lec = train_lec.merge(lectures, left_on="content_id", right_on="lecture_id", how="left")

In [42]:
train_lec.drop(["lecture_id"], axis=1, inplace=True)

In [43]:
train_lec.head()

Unnamed: 0,row_id,user_id,content_id,answered_correctly,tag,part,type_of
0,89,2746,6808,-1,129,2,intention
1,117,5382,16736,-1,40,1,concept
2,212,5382,30207,-1,43,5,concept
3,216,5382,18545,-1,58,5,concept
4,295,8623,10540,-1,99,1,concept


In [44]:
train_lec.user_id.value_counts()

372519499     397
353540103     395
105239469     389
1066383521    386
2066672014    386
             ... 
256743508       1
1578004521      1
170958837       1
2140248020      1
2111834113      1
Name: user_id, Length: 149606, dtype: int64

In [45]:
sample_df = pd.concat([train_lec[train_lec.user_id==372519499], train[train.user_id==372519499]])
sample_df = sample_df.sort_values("row_id", ascending=True).reset_index(drop=True)

In [46]:
sample_df = pd.merge(sample_df, questions, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)
sample_df.drop("question_id", axis=1, inplace=True)

In [47]:
sample_df.head(1000)

Unnamed: 0,row_id,user_id,content_id,answered_correctly,tag,part_x,type_of,bundle_id,correct_answer,part_y,tags
0,17336386,372519499,3550,1,,,,3550.0,1.0,5.0,8
1,17336387,372519499,5128,1,,,,5128.0,3.0,5.0,8
2,17336388,372519499,296,1,,,,296.0,0.0,2.0,143 114 92 29
3,17336389,372519499,3577,0,,,,3577.0,0.0,5.0,8
4,17336390,372519499,3566,0,,,,3566.0,2.0,5.0,8
5,17336391,372519499,6109,0,,,,6109.0,0.0,5.0,73
6,17336392,372519499,98,1,,,,98.0,1.0,1.0,10 178 162 81
7,17336393,372519499,6256,0,,,,6256.0,2.0,5.0,134
8,17336394,372519499,3986,0,,,,3986.0,3.0,5.0,52
9,17336395,372519499,6119,0,,,,6119.0,2.0,5.0,54


# questions
- questions.csv: metadata for the questions posed to users.
- question_id: foreign key for the train/test content_id column, when the content type is question (0).
- bundle_id: code for which questions are served together.
- correct_answer: the answer to the question. Can be compared with the train user_answer column to check if the user was right.
- part: the relevant section of the TOEIC test.
- tags: one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [48]:
questions.shape

(13523, 5)

In [49]:
questions["tags"].value_counts()

8                 738
73                617
53                523
1                 413
96                373
                 ... 
113 102 186         1
161 38 92 102       1
74 126 92           1
161 144 162 92      1
131 36 162 38       1
Name: tags, Length: 1519, dtype: int64

In [50]:
questions.correct_answer.value_counts(normalize=True) # 2 has the smallest ratio of correct answer

0    0.274791
3    0.262072
1    0.257191
2    0.205945
Name: correct_answer, dtype: float64

In [51]:
questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [52]:
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   question_id     13523 non-null  int64 
 1   bundle_id       13523 non-null  int64 
 2   correct_answer  13523 non-null  int64 
 3   part            13523 non-null  int64 
 4   tags            13522 non-null  object
dtypes: int64(4), object(1)
memory usage: 528.4+ KB


In [53]:
len(questions.question_id.unique()) # question_id is unique

13523

In [54]:
questions.question_id.unique()

array([    0,     1,     2, ..., 13520, 13521, 13522])

In [55]:
len(questions.bundle_id.unique()) # bundle_id is not unique

9765

In [56]:
questions.part.value_counts()

5    5511
2    1647
3    1562
4    1439
6    1212
7    1160
1     992
Name: part, dtype: int64

In [57]:
questions.isnull().sum()

question_id       0
bundle_id         0
correct_answer    0
part              0
tags              1
dtype: int64

In [58]:
questions.tags

0         51 131 162 38
1             131 36 81
2        131 101 162 92
3        131 149 162 29
4          131 5 162 38
              ...      
13518                14
13519                 8
13520                73
13521               125
13522                55
Name: tags, Length: 13523, dtype: object

# combi of question and lecture

In [59]:
# https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns
# https://stackoverflow.com/questions/16228248/how-can-i-get-list-of-values-from-dict
def calc(tags):
    type_count = {'concept':0, 'solving question': 0, 'intention': 0, 'starter':0}
    try:
        tag_list = tags.split(" ")
        type_list = list(lectures[lectures.tag.isin(tag_list)]["type_of"])
    except:
        ans = list(type_count.values())
        return ans[0], ans[1], ans[2], ans[3] 
    for ele in type_list:
        type_count[ele] += 1
    ans = list(type_count.values())
    #print(tag_list, type_list ,ans)
    return ans[0], ans[1], ans[2], ans[3] 
#questions['concept'], questions['solving question'], questions['intention'], questions['starter'] = zip(*questions['tags'].map(calc))
#questions.apply(lambda x: calc(x), axis=1)

# example_test
- row_id: (int64) ID code for the row.
- timestamp: (int64) the time between this user interaction and the first event completion from that user.
- user_id: (int32) ID code for the user.
- content_id: (int16) ID code for the user interaction
- content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.
- task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.
- user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.
- answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.
- prior_question_elapsed_time: (float32) The average time it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.
- prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [60]:
example_test.head()

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [61]:
#%%timeit
#test1 = pd.merge(example_test, questions, left_on='content_id', right_on='question_id', how='left')

In [62]:
#%%timeit
#test2 = pd.merge(example_test, questions, left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)

# how to make features

In [63]:
train = pd.merge(train, questions[["question_id", "part", "tags"]], left_on='content_id', right_on='question_id', how='left', right_index=True).reset_index(drop=True)

In [64]:
train.drop("question_id", axis=1, inplace=True)

In [65]:
train.head()

Unnamed: 0,row_id,user_id,content_id,answered_correctly,part,tags
0,0,115,5692,1,5,151
1,1,115,5716,1,5,168
2,2,115,128,1,1,131 149 92
3,3,115,7860,1,1,131 104 81
4,4,115,7922,1,1,131 149 92


In [66]:
train["the_part_count"] = train["part"] == 1
train["the_part_correct_count"] = (train["part"] == 1) & (train["answered_correctly"]==1)

In [67]:
#train["the_part_count"] = train[["user_id", "the_part_count"]].groupby("user_id")["the_part_count"].apply(lambda x: x.shift().cumsum())
#train["the_part_correct_count"] = train[["user_id", "the_part_correct_count"]].groupby("user_id")["the_part_correct_count"].apply(lambda x: x.shift().cumsum())

In [68]:
train.groupby("user_id")["part"].value_counts()

user_id     part
115         1        37
            3         3
            4         3
            5         2
            2         1
                   ... 
2147481750  2        24
2147482216  2       154
            5       121
2147482888  5        26
            1         1
Name: part, Length: 1824960, dtype: int64

In [69]:
#train_users = list(train.user_id.unique())
#user_check = np.isin(example_test.user_id.values, train_users)
#example_test[user_check]

#user_check = np.isin(example_test.user_id.values, train_users)
#preds = np.zeros(len(example_test)) 
#pred_existing_users = model.predict(example_test.loc[user_check,features])
#pred_new_users = model2.predict(example_test.loc[~user_check,features])
#preds[user_check] = pred_existing_users
#preds[~user_check] = pred_new_users 

In [70]:
tmp = train.groupby("user_id")["answered_correctly"].agg(["mean", "count"]).reset_index(drop=False)
tmp

Unnamed: 0,user_id,mean,count
0,115,0.695652,46
1,124,0.233333,30
2,2746,0.578947,19
3,5382,0.672000,125
4,8623,0.642202,109
...,...,...,...
393651,2147470770,0.765487,226
393652,2147470777,0.691489,752
393653,2147481750,0.760000,50
393654,2147482216,0.643636,275


In [71]:
tmp[(tmp["mean"]<=0.2) & (tmp["count"]>=50)]

Unnamed: 0,user_id,mean,count
1408,7846933,0.18,50
2882,15722883,0.196429,56
6648,36438630,0.166667,60
9878,53710648,0.18,50
9994,54332851,0.16,50
18465,100774959,0.186528,193
21369,116807220,0.196429,56
30991,169977625,0.196078,51
31674,173837648,0.2,70
32860,180627023,0.148148,756


In [72]:
tmp[(tmp["mean"]>=0.9) & (tmp["count"]>=50)]

Unnamed: 0,user_id,mean,count
2736,14944351,0.904762,63
2745,14998785,0.917647,85
3413,18600822,0.921294,4358
3491,18937567,0.9,50
3712,20180962,0.9,50
6357,34927151,0.901408,71
6481,35662893,0.989011,91
9823,53417560,0.914634,82
13190,71869317,0.911392,79
13934,75903846,0.908497,153


In [73]:
train[train.user_id==15722883]["user_answer"].value_counts()

KeyError: 'user_answer'

In [74]:
train[train.user_id==14944351]

Unnamed: 0,row_id,user_id,content_id,answered_correctly,part,tags,the_part_count,the_part_correct_count
701513,715645,14944351,4812,1,5,75,False,False
701514,715646,14944351,5889,0,5,89,False,False
701515,715647,14944351,6650,1,5,53,False,False
701516,715648,14944351,5521,1,5,14,False,False
701517,715649,14944351,4046,1,5,53,False,False
701518,715650,14944351,3968,1,5,1,False,False
701519,715651,14944351,6381,1,5,4,False,False
701520,715652,14944351,4254,1,5,96,False,False
701521,715653,14944351,5738,0,5,8,False,False
701522,715654,14944351,378,1,2,90 100 38 102,False,False


In [75]:
train.row_id.max()

101230331

In [76]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99271300 entries, 0 to 99271299
Data columns (total 8 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   row_id                  uint32
 1   user_id                 uint32
 2   content_id              uint16
 3   answered_correctly      int8  
 4   part                    int64 
 5   tags                    object
 6   the_part_count          bool  
 7   the_part_correct_count  bool  
dtypes: bool(2), int64(1), int8(1), object(1), uint16(1), uint32(2)
memory usage: 2.7+ GB


In [77]:
# how to make first_attempt
# takes about 10 minutes
train["first_attempt"] = train[["row_id", "user_id", "content_id"]].groupby("user_id").progress_apply(lambda x: 
                                                                    pd.DataFrame(np.isin(x["row_id"].values, x.drop_duplicates("content_id")["row_id"].values))).values

#for i in list(train.user_id.unique())[:10]:
#    tmp1 = np.isin(train[train.user_id==i]["row_id"].values, train[train.user_id==i].groupby("content_id").first()["row_id"].values)
#    tmp2 = np.isin(train[train.user_id==i]["row_id"], train[train.user_id==i].drop_duplicates("content_id")["row_id"])
#    print(i, np.all(tmp1 == tmp2))

train[["row_id", "first_attempt"]].to_csv("content_first_attempt.csv", index=False)

HBox(children=(FloatProgress(value=0.0, description='progress: ', max=393656.0, style=ProgressStyle(descriptio…




In [78]:
mod_train = train.iloc[:100,:].copy()
mod_train["first_attempt"] = mod_train[["row_id", "user_id", "content_id"]].groupby("user_id").progress_apply(lambda x: 
                                                                    pd.DataFrame(np.isin(x["row_id"].values, x.drop_duplicates("content_id")["row_id"].values))).values
mod_train.head()

HBox(children=(FloatProgress(value=0.0, description='progress: ', max=4.0, style=ProgressStyle(description_wid…




Unnamed: 0,row_id,user_id,content_id,answered_correctly,part,tags,the_part_count,the_part_correct_count,first_attempt
0,0,115,5692,1,5,151,False,False,True
1,1,115,5716,1,5,168,False,False,True
2,2,115,128,1,1,131 149 92,True,True,True
3,3,115,7860,1,1,131 104 81,True,True,True
4,4,115,7922,1,1,131 149 92,True,True,True


In [79]:
%%time
user_content_agg = train.groupby("user_id")["content_id"].unique().reset_index().set_index("user_id")

value = []
for j in user_content_agg.index:
    a = bitarray(32737, endian='little')
    a.setall(True)
    for i in user_content_agg.loc[j][0]:
        a[i] = 0
    value.append(a)
    
user_content_agg["content_exp"] = value

user_content_dict = user_content_agg["content_exp"].to_dict(defaultdict(int))

CPU times: user 2min 4s, sys: 2.23 s, total: 2min 6s
Wall time: 2min 6s


In [80]:
#user_content_dict = user_content_agg["content_exp"].to_dict()
#mod_first_attempt_values = []
#for i, (user_id, content_id) in enumerate(zip(example_test['user_id'].values, example_test['content_id'].values)):
#    if not user_id in user_content_dict.keys():
#        a = bitarray(32737, endian='little')
#        a.setall(True)   
#        user_content_dict[user_id] = a
#    mod_first_attempt_values.append(user_content_dict[user_id][content_id])
#    user_content_dict[user_id][content_id] = False 

In [81]:
def make_bitarray():
    a = bitarray(32737, endian='little')
    a.setall(True)   
    return a
    
user_content_dict = user_content_agg["content_exp"].to_dict(defaultdict(make_bitarray))
mod_first_attempt_values2 = []
for i, (user_id, content_id) in enumerate(zip(example_test['user_id'].values, example_test['content_id'].values)):
    mod_first_attempt_values2.append(user_content_dict[user_id][content_id])
    user_content_dict[user_id][content_id] = False 

In [82]:
#user_content_dict = user_content_agg["content_exp"].to_dict(defaultdict(int))
#first_attempt_values = []
#for i, (user_id, content_id) in enumerate(zip(example_test['user_id'].values, example_test['content_id'].values)):
#    if not user_id in user_content_agg.index:
#        a = bitarray(32737, endian='little')
#        a.setall(True)   
#        user_content_dict[user_id] = a
#    first_attempt_values.append(user_content_dict[user_id][content_id])
#    user_content_dict[user_id][content_id] = False 

In [83]:
for i, (user_id, content_id) in enumerate(zip(example_test['user_id'].values, example_test['content_id'].values)):
    print(i,train[(train["user_id"]==user_id) & (train["content_id"]==content_id)].shape, mod_first_attempt_values2[i])

0 (0, 9) True
1 (1, 9) False
2 (0, 9) True
3 (0, 9) True
4 (0, 9) True
5 (1, 9) False
6 (0, 9) True
7 (0, 9) True
8 (0, 9) True
9 (0, 9) True
10 (0, 9) True
11 (0, 9) True
12 (0, 9) True
13 (0, 9) True
14 (0, 9) True
15 (0, 9) True
16 (0, 9) True
17 (0, 9) True
18 (0, 9) True
19 (0, 9) True
20 (0, 9) True
21 (0, 9) True
22 (0, 9) True
23 (0, 9) True
24 (0, 9) True
25 (0, 9) True
26 (1, 9) False
27 (0, 9) True
28 (0, 9) True
29 (0, 9) True
30 (0, 9) True
31 (0, 9) True
32 (0, 9) True
33 (0, 9) True
34 (0, 9) True
35 (0, 9) True
36 (0, 9) True
37 (0, 9) True
38 (0, 9) True
39 (0, 9) True
40 (0, 9) True
41 (0, 9) True
42 (0, 9) True
43 (0, 9) True
44 (0, 9) True
45 (0, 9) True
46 (0, 9) True
47 (1, 9) False
48 (1, 9) False
49 (0, 9) True
50 (0, 9) True
51 (0, 9) True
52 (0, 9) True
53 (0, 9) True
54 (0, 9) True
55 (0, 9) True
56 (0, 9) True
57 (0, 9) True
58 (0, 9) True
59 (0, 9) True
60 (0, 9) True
61 (0, 9) True
62 (0, 9) True
63 (0, 9) True
64 (0, 9) True
65 (0, 9) True
66 (0, 9) True


In [84]:
#example_test["first_attempt"] = first_attempt_values

In [85]:
#from bitarray import bitarray
#sample_list = []
#for i in range(393656):
#    a = bitarray(13782, endian='little') # important to setup a fixed endianess.
#    a.setall(True)
#    sample_list.append(a)

In [86]:
#user_agg = train.groupby('user_id')['answered_correctly'].agg(['sum', 'count'])
#user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
#user_count_dict

In [87]:
#user_try = train.groupby('user_id')['row_id'].agg(["count"])
#user_try_dict = user_try['count'].astype('int16').to_dict(defaultdict(int))
#user_try_dict.head()

In [88]:
#user_count_dict == user_try_dict