In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm

In [2]:
# Load the training table. low_memory=False asks pandas to infer each column's
# dtype from the whole file rather than chunk by chunk.
train_df = pd.read_csv('../data/input/train.csv', low_memory=False)

In [3]:
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


In [4]:
# Load the question metadata table (question_id, bundle_id, correct_answer, part, tags).
qs = pd.read_csv('../data/input/questions.csv')

In [5]:
# Load the lecture metadata table (lecture_id, tag, part, type_of).
lc = pd.read_csv('../data/input/lectures_new.csv')

In [6]:
# Break the space-separated `tags` string into one column per tag position;
# questions with fewer tags get missing values in the trailing columns.
tag = qs['tags'].str.split(pat=' ', expand=True)

In [7]:
# Name the split-out tag positions tags1..tags6. This assignment only succeeds
# when the split above produced exactly six columns.
tag.columns = ['tags1','tags2','tags3','tags4','tags5','tags6']

In [8]:
# Preview the per-position tag columns.
tag.head()

Unnamed: 0,tags1,tags2,tags3,tags4,tags5,tags6
0,51,131,162,38.0,,
1,131,36,81,,,
2,131,101,162,92.0,,
3,131,149,162,29.0,,
4,131,5,162,38.0,,


In [9]:
# Attach the per-position tag columns next to the original question columns
# (rows are aligned on the shared index).
qs = pd.concat([qs, tag], axis='columns')

In [10]:
# Preview the question table with the tag columns appended.
qs.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,tags1,tags2,tags3,tags4,tags5,tags6
0,0,0,0,1,51 131 162 38,51,131,162,38.0,,
1,1,1,1,1,131 36 81,131,36,81,,,
2,2,2,0,1,131 101 162 92,131,101,162,92.0,,
3,3,3,0,1,131 149 162 29,131,149,162,29.0,,
4,4,4,3,1,131 5 162 38,131,5,162,38.0,,


In [11]:
# Preview the lecture table.
lc.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


In [12]:
def _label_encoder(data):
    """Encode a pandas Series as integer codes ordered by sorted unique value.

    Parameters
    ----------
    data : pandas.Series
        Values to encode. Equal values receive equal codes; codes follow the
        sorted order of the distinct non-missing values (smallest value -> 0).

    Returns
    -------
    numpy.ndarray
        One code per element of ``data``. Without missing values the array is
        ``int16`` when the largest code is at most 32000 and ``int32``
        otherwise. When ``data`` contains missing values the result is a
        floating-point array with ``NaN`` at the missing positions.
        An empty Series yields an empty ``int16`` array.
    """
    codes, _ = data.factorize(sort=True)

    # An empty array has no maximum (``max()`` would raise), so only inspect
    # the largest code when there is at least one element.
    needs_wide_dtype = codes.size > 0 and codes.max() > 32000
    codes = codes.astype('int32' if needs_wide_dtype else 'int16')

    if data.isnull().any():
        # factorize marks missing values with the sentinel -1; expose them as
        # NaN instead (this promotes the array to a float dtype).
        codes = np.where(codes == -1, np.nan, codes)
    return codes

In [13]:
# Add `l_type_of`: the integer code of each lecture's `type_of` label, with
# codes assigned in sorted label order by _label_encoder.
lc['l_type_of'] = _label_encoder(lc['type_of'])

In [14]:
# Preview the lecture table with the encoded `l_type_of` column.
lc.head()

Unnamed: 0,lecture_id,tag,part,type_of,l_type_of
0,89,159,5,concept,0
1,100,70,1,concept,0
2,185,45,6,concept,0
3,192,79,5,solving question,2
4,317,156,5,solving question,2


In [15]:
# Give both metadata tables the key name used by train_df (`content_id`) so
# they can be stacked and joined on it.
qs = qs.rename({'question_id': 'content_id'}, axis='columns')
lc = lc.rename({'lecture_id': 'content_id'}, axis='columns')

In [16]:
# Keep only the join key, the part and the per-position tag columns.
qs = qs.loc[:, [
    'content_id', 'part',
    'tags1', 'tags2', 'tags3', 'tags4', 'tags5', 'tags6',
]]

In [17]:
# Stack question and lecture metadata into a single lookup table; columns that
# exist in only one of the two tables are filled with missing values.
# The two frames have different columns, and pandas' default for ordering the
# combined columns differs between versions (hence the FutureWarning this cell
# used to emit). sort=True pins the alphabetical column order and silences
# the warning.
qs_lc = pd.concat([qs, lc], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [18]:
# Display the combined question + lecture lookup table.
qs_lc

Unnamed: 0,content_id,l_type_of,part,tag,tags1,tags2,tags3,tags4,tags5,tags6,type_of
0,0,,1,,51,131,162,38,,,
1,1,,1,,131,36,81,,,,
2,2,,1,,131,101,162,92,,,
3,3,,1,,131,149,162,29,,,
4,4,,1,,131,5,162,38,,,
...,...,...,...,...,...,...,...,...,...,...,...
413,32535,2.0,5,8.0,,,,,,,solving question
414,32570,2.0,3,113.0,,,,,,,solving question
415,32604,0.0,6,24.0,,,,,,,concept
416,32625,0.0,2,142.0,,,,,,,concept


In [19]:
# qs_lc holds question ids and lecture ids in the same `content_id` column, and
# nothing keeps the two id ranges disjoint. Joining on `content_id` alone can
# therefore attach lecture metadata to question rows (and vice versa) and
# duplicate the affected train_df rows. Join on (content_id, content_type_id)
# so each interaction only picks up metadata of its own kind.
# NOTE(review): assumes train_df.content_type_id is 0 for questions and 1 for
# lectures, and that every lecture row of qs_lc has a non-null `tag` (the
# column only exists in the lecture table) - confirm against the data.
train_df = pd.merge(
    train_df,
    qs_lc.assign(
        content_type_id=qs_lc['tag'].notnull().values.astype(
            train_df['content_type_id'].dtype
        )
    ),
    on=['content_id', 'content_type_id'],
    how='left',
)

In [20]:
# Preview train_df after joining the content metadata.
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,l_type_of,part,tag,tags1,tags2,tags3,tags4,tags5,tags6,type_of
0,0,0,115,5692,0,1,3,1,,,,5,,151,,,,,,
1,1,56943,115,5716,0,2,2,1,37000.0,False,,5,,168,,,,,,
2,2,118363,115,128,0,0,0,1,55000.0,False,,1,,131,149.0,92.0,,,,
3,3,131167,115,7860,0,3,0,1,19000.0,False,,1,,131,104.0,81.0,,,,
4,4,137965,115,7922,0,4,1,1,11000.0,False,,1,,131,149.0,92.0,,,,


In [26]:
# Mean of `answered_correctly` per user, restricted to rows whose `tag` is
# null (`tag` is a lecture-only column, so this keeps rows without lecture
# metadata attached).
# NOTE(review): a lecture interaction that was joined to question metadata
# would also have a null `tag` and slip through - confirm the upstream join.
user_ans_mean = (
    train_df
    .loc[train_df['tag'].isnull(), ['user_id', 'answered_correctly']]
    .groupby('user_id')
    .mean()
)

In [28]:
# Mean of `answered_correctly` per (user, part), restricted to rows whose
# `tag` is null (`tag` is a lecture-only column).
user_part_ans_mean = (
    train_df
    .loc[train_df['tag'].isnull(), ['user_id', 'part', 'answered_correctly']]
    .groupby(['user_id', 'part'])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,answered_correctly
user_id,part,Unnamed: 2_level_1
115,1,0.702703
115,2,1.0
115,3,0.666667
115,4,0.333333
115,5,1.0
