In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from  sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',None)

In [2]:
def _reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
def _label_encoder(data):
    l_data,_ =data.factorize(sort=True)
    if l_data.max()>32000:
        l_data = l_data.astype('int32')
    else:
        l_data = l_data.astype('int16')

    if data.isnull().sum() > 0:
        l_data = np.where(l_data == -1,np.nan,l_data)
    return l_data

In [4]:
train_df = pd.read_feather('../data/input/train.feather')
test_df = pd.read_csv('../data/input/example_test.csv')

train_df = _reduce_mem_usage(train_df)
test_df = _reduce_mem_usage(test_df)

Mem. usage decreased to 3378.93 Mb (56.2% reduction)
Mem. usage decreased to  0.00 Mb (45.9% reduction)


In [5]:
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


In [6]:
test_df.head()

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [7]:
drop_cols = ['row_id','user_id','user_answer','answered_correctly','l_type_of','type_of','tag','tags']

use_cols = ['answered_correctly','timestamp','content_id','content_type_id','task_container_id','prior_question_elapsed_time',\
            'prior_question_had_explanation','shift_content_type_id','past_correctly_sum','past_not_correctly_sum','past_correctly_mean',\
            'bundle_id','part','tags1','tags2','tags3',\
            'tags4','tags5','tags6','answered_correctly_user','sum',\
            'answered_correctly_content','answered_correctly_part_user']

In [8]:
qs = pd.read_csv('../data/input/questions.csv')
lc = pd.read_csv('../data/input/lectures_new.csv')
tag = qs["tags"].str.split(" ",expand = True)
tag.columns = ['tags1','tags2','tags3','tags4','tags5','tags6']
qs = pd.concat([qs,tag],axis=1)
lc['l_type_of'] = _label_encoder(lc['type_of'])
qs = qs.rename(columns={'question_id':'content_id'})
lc = lc.rename(columns={'lecture_id':'content_id'})
qs_lc = pd.concat([qs,lc])

In [9]:
train_df = pd.merge(train_df,qs_lc,on='content_id',how='left')
train_df = _reduce_mem_usage(train_df)

Mem. usage decreased to 11427.83 Mb (21.1% reduction)


In [10]:
train_df = train_df[train_df['answered_correctly'] != -1]
train_df = train_df.reset_index(drop=True)

In [11]:
import gc
gc.collect()

34

## create_features

In [12]:
# agg系統
# answered_correctly_content answered_correctly_user sum answered_correctly_part_user

content_train = train_df[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean'])
content_train.columns = ["answered_correctly_content"]

user_train = train_df[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean', 'sum'])
user_train.columns = ["answered_correctly_user", 'sum']

user_part_train = train_df[['user_id','part','answered_correctly']].groupby(['user_id','part']).mean()
user_part_train.columns = ["answered_correctly_part_user"]

In [13]:
# past系統
tmp_user_train = train_df[['user_id','answered_correctly']].groupby('user_id').agg(['mean','sum','count'])
tmp_user_train.columns = ['past_correctly_mean','past_correctly_sum','count']
tmp_user_train['past_not_correctly_sum'] = tmp_user_train['count'] - tmp_user_train['past_correctly_sum']
tmp_user_train = tmp_user_train.drop('count',axis=1)

In [14]:
gc.collect()

57

In [15]:
user_train = pd.merge(user_train,tmp_user_train,on='user_id',how='left')

In [16]:
user_train = user_train.reset_index()

In [17]:
content_train = content_train.reset_index()

In [18]:
user_part_train = user_part_train.reset_index()

In [19]:
train_df = pd.read_feather('../data/input/train.feather')
shift_content_type_id = train_df[~train_df["user_id"].duplicated(keep='last')]

shift_content_type_id = shift_content_type_id[['user_id','content_type_id']]
shift_content_type_id.columns = ['user_id','shift_content_type_id']

In [22]:
user_train = pd.merge(user_train,shift_content_type_id,on='user_id',how='left')

In [24]:
user_train.to_feather('../data/output/kaggle_kernel/user_train.feather')
content_train.to_feather('../data/output/kaggle_kernel/content_train.feather')
user_part_train.to_feather('../data/output/kaggle_kernel/user_part_train.feather')

In [20]:
t = pd.read_feather('../data/output/kaggle_kernel/user_part_train.feather')

In [21]:
t.head()

Unnamed: 0,user_id,part,answered_correctly_part_user
0,115,1,0.710526
1,115,2,1.0
2,115,3,0.666667
3,115,4,0.333333
4,115,5,1.0


In [5]:
# train_df = pd.read_feather('../data/input/train.feather')
# train_df = _reduce_mem_usage(train_df)

Mem. usage decreased to 3378.93 Mb (56.2% reduction)


In [6]:
# train_df[['user_id','content_type_id']].to_feather('../data/output/kaggle_kernel/shift_train.feather')