In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import seaborn as sns
pd.set_option('display.max_columns',None)

In [None]:
train_df = pd.read_feather('../data/input/train.feather')

In [None]:
def _reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
def _label_encoder(data):
    l_data,_ =data.factorize(sort=True)
    if l_data.max()>32000:
        l_data = l_data.astype('int32')
    else:
        l_data = l_data.astype('int16')

    if data.isnull().sum() > 0:
        l_data = np.where(l_data == -1,np.nan,l_data)
    return l_data

In [None]:
train_df = _reduce_mem_usage(train_df)

In [6]:
qs = pd.read_csv('../data/input/questions.csv')

In [7]:
lc = pd.read_csv('../data/input/lectures_new.csv')

In [9]:
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


In [10]:
tag = qs["tags"].str.split(" ",expand = True)
tag.columns = ['tags1','tags2','tags3','tags4','tags5','tags6']
qs = pd.concat([qs,tag],axis=1)
lc['l_type_of'] = _label_encoder(lc['type_of'])
qs = qs.rename(columns={'question_id':'content_id'})
lc = lc.rename(columns={'lecture_id':'content_id'})
qs_lc = pd.concat([qs,lc])
train_df = pd.merge(train_df,qs_lc,on='content_id',how='left')
train_df = _reduce_mem_usage(train_df)

Mem. usage decreased to 11427.83 Mb (21.1% reduction)


In [11]:
train_df = train_df[train_df['answered_correctly'] != -1]

In [12]:
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,correct_answer,part,tags,tags1,tags2,tags3,tags4,tags5,tags6,tag,type_of,l_type_of
0,0,0,115,5692,0,1,3,1,,,5692.0,3.0,5,151,151,,,,,,,,
1,1,56943,115,5716,0,2,2,1,37000.0,False,5716.0,2.0,5,168,168,,,,,,,,
2,2,118363,115,128,0,0,0,1,55000.0,False,128.0,0.0,1,131 149 92,131,149.0,92.0,,,,,,
3,3,131167,115,7860,0,3,0,1,19000.0,False,7860.0,0.0,1,131 104 81,131,104.0,81.0,,,,,,
4,4,137965,115,7922,0,4,1,1,11000.0,False,7920.0,1.0,1,131 149 92,131,149.0,92.0,,,,,,


In [13]:
few_content = train_df['content_id'].value_counts().sort_values().head(30).index.values

In [14]:
train_df[train_df['content_id'].isin(few_content)]['part'].value_counts()

7    224
5    124
6      5
3      3
Name: part, dtype: int64

In [15]:
train_df['part'].value_counts()

5    41210309
2    18926910
6    10785396
3     8694212
4     8166324
1     7635842
7     5156026
Name: part, dtype: int64

In [17]:
for i in range(1,8):
    print(f'part{i}')
    print(train_df[train_df['part'] == i]['content_id'].nunique())

part1
1015
part2
1664
part3
1568
part4
1449
part5
5547
part6
1236
part7
1173


In [18]:
train_df[train_df['content_id'].isin(few_content)]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,correct_answer,part,tags,tags1,tags2,tags3,tags4,tags5,tags6,tag,type_of,l_type_of
744545,729633,1032782533,15375317,4618,0,274,1,1,18000.0,True,4616.0,1.0,5,79,79,,,,,,,,
3256736,3190666,2046332289,68437470,7139,0,251,2,1,127250.0,True,7136.0,2.0,7,18 16 21,18,16,21,,,,,,
3256737,3190667,2046332289,68437470,7137,0,251,0,1,127250.0,True,7136.0,0.0,7,118 16 21,118,16,21,,,,,,
3256738,3190668,2046332289,68437470,7138,0,251,2,1,127250.0,True,7136.0,2.0,7,145 16 21,145,16,21,,,,,,
4585479,4492995,3610286625,94962323,4741,0,780,0,1,11000.0,True,4740.0,0.0,5,8,8,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101716399,99676695,1640088301,2116187045,3572,0,1626,0,1,15000.0,True,3572.0,0.0,5,8,8,,,,,,,,
101716621,99676913,1702711594,2116187045,4656,0,1801,0,1,7000.0,True,4656.0,0.0,5,79,79,,,,,,,,
101716912,99677201,1812182261,2116187045,4741,0,2009,0,1,20000.0,True,4740.0,0.0,5,8,8,,,,,,,,
101717142,99677431,2084254766,2116187045,6236,0,2229,0,1,20000.0,True,6236.0,0.0,5,8,8,,,,,,,,


In [19]:
train_df[train_df['content_id'].isin(few_content)]['answered_correctly'].mean()

0.8230337078651685

In [22]:
content_id_ans = train_df[['content_id','answered_correctly']].groupby('content_id').mean()

In [25]:
content_id_ans.sort_values('answered_correctly')

Unnamed: 0_level_0,answered_correctly
content_id,Unnamed: 1_level_1
1485,0.000000
10007,0.000000
1484,0.000000
1486,0.000000
10062,0.091752
...,...
10006,1.000000
12809,1.000000
12806,1.000000
7548,1.000000


In [27]:
for i in range(1,11):
    few_content = train_df['content_id'].value_counts().sort_values().head(i*10).index.values
    
    print(f'content {i}')
    print(train_df[train_df['content_id'].isin(few_content)]['answered_correctly'].mean())

content 1
0.5
content 2
0.8731343283582089
content 3
0.8230337078651685
content 4
0.832572298325723
content 5
0.8134556574923547
content 6
0.8129175946547884
content 7
0.8158338012352611
content 8
0.8139122729286664
content 9
0.8101768314687838
content 10
0.8125373134328359
