%%sh

conda install -y -q -c conda-forge lightgbm
conda install -y -q -c conda-forge gensim
pip install kaggle

if [ ! -e /home/ec2-user/SageMaker/TalkingDataAdTrackingFraudDetectionChallenge/input/* ]; then    
    

In [44]:
!conda list | grep gbm

lightgbm                  2.2.2            py36hf484d3e_0    conda-forge


In [13]:
import gc
import itertools
import time
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from gensim import corpora, models

DATADIR = Path('./input')

tr_path = DATADIR / 'train.csv'
test_path = DATADIR / 'test.csv'


train_cols = ['ip', 'app', 'device', 'os',
              'channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


## pandasデータ型指定によるメモリ使用量の削減

データサイズが大きいので`float64`や`int64`をなるべく使わずに最適な型を選ぶように変換。

In [3]:
def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """ Iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
        
    Args:
        df (pd.DataFrame): pd.DataFrame to be reduced memory usage.
    Regurns:
        df (pd.DataFrame): pd.DataFrame which dtypes are changed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df


def load_data(filepath: str) -> pd.DataFrame:
    """create a dataframe and optimize its memory usage
    Args:
        filepath (str): Path to csv file.
    Returns:
        df (pd.DataFrame): pd.DataFrame which dtypes are changed for memory usage reduction.
    """
    df = pd.read_csv(filepath, parse_dates=True, keep_date_col=True).head(10000)
    df = reduce_mem_usage(df)
    
    return df

In [4]:
# train_sm = load_data(tr_path)
# test_sm = load_data(test_path)

Memory usage of dataframe is 0.61 MB
Memory usage after optimization is: 0.18 MB
Decreased by 71.1%


In [10]:
# train_sm.to_csv(DATADIR / 'train_sm.csv', index=False)
# test_sm.to_csv(DATADIR / 'test_sm.csv', index=False)

## Load and bind train and test data

In [34]:
train = pd.read_csv(DATADIR / 'train_sm.csv', parse_dates=True, keep_date_col=True)
len_train = len(train)

test = pd.read_csv(DATADIR / 'test_sm.csv', parse_dates=True, keep_date_col=True)

In [15]:
def bind_tr_test(train: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
    """ Bind train and test data for features engineering.
    Args:
        train (pd.DataFrame): train data.
        test (pd.DataFrame): test data.
    Returns:
        data (pd.DataFrame): binded data.
    """
    
    len_train = len(train)
    print('The initial size of the train set is', len_train)
    print('Binding the training and test set together...')
    data = train.append(test, ignore_index=True, sort=False)

    del train, test
    gc.collect()

    return data

In [16]:
data = bind_tr_test(train, test)

The initial size of the train set is 10000
Binding the training and test set together...


## Feature Engineering

### 時間系

click_timeは`2017-11-10 04:00:00`の形なので日付と時間の特徴量を作る。

In [17]:
def create_time_features(data: pd.DataFrame):
    print("Creating new time features: 'hour' and 'day'...")
    data['hour'] = pd.to_datetime(data.click_time).dt.hour.astype('uint8')
    data['day'] = pd.to_datetime(data.click_time).dt.day.astype('uint8')

    gc.collect()
    return data

In [18]:
data = create_time_features(data)

Creating new time features: 'hour' and 'day'...


### count系

- ベーシックな処理
  - five raw categorical features (ip, os, app, channel, device)  （単純に型をカテゴリ化）
  - time categorical features (day, hour) 
  - some count features 
- web広告配信データ特有の特徴量
  - five raw categorical features (ip, os, app, channel, device) に対し、以下の特徴量を作成 (全組み合わせ2^5 -1 = 31通り)
  - click count within next one/six hours  (直後1 or 6時間以内のクリック数)
  - forward/backward click time delta  (前後クリックまでの時差)
  - average attributed ratio of past click (過去のCVレート)

In [23]:
def create_count_channels_features(data):
    print("Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...")

    print('Computing the number of channels associated with ')
    print('a given IP address within each hour...')
    print('一時間の中でIPアドレス毎のチャネル数を数えている')
    n_chans = data[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'n_channels'})
    print('Merging the channels data with the main data set...')
    data = data.merge(n_chans, on=['ip', 'day', 'hour'], how='left')
    del n_chans
    gc.collect()
    data['n_channels'].astype('uint16').to_csv(
        DATADIR/'n_channels.csv', header=False)
    print("Saving the data")
    data.drop(['n_channels'], axis=1)

    print('Computing the number of channels associated with ')
    print('a given IP address and app...')
    print('IPアドレス毎/app毎のチャネル数を数えている')
    n_chans = data[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'ip_app_count'})
    print('Merging the channels data with the main data set...')
    data = data.merge(n_chans, on=['ip', 'app'], how='left')
    del n_chans
    gc.collect()
    data['ip_app_count'].astype('uint16').to_csv(
        DATADIR/'ip_app_count.csv', header=False)
    print("Saving the data")
    data.drop(['ip_app_count'], axis=1)

    print('Computing the number of channels associated with ')
    print('a given IP address, app, and os...')
    print('IPアドレス毎/app毎/os毎のチャネル数を数えている')
    n_chans = data[['ip', 'app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'ip_app_os_count'})
    print('Merging the channels data with the main data set...')
    data = data.merge(n_chans, on=['ip', 'app', 'os'], how='left')
    del n_chans
    gc.collect()
    data['ip_app_os_count'].astype('uint16').to_csv(
        DATADIR/'ip_app_os_count.csv', header=False)
    print("Saving the data")
    data.drop(['ip_app_os_count'], axis=1)

    del data
    gc.collect()

In [24]:
create_count_channels_features(data)

Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...
Computing the number of channels associated with 
a given IP address within each hour...
一時間の中でIPアドレス毎のチャネル数を数えている
Merging the channels data with the main data set...
Saving the data
Computing the number of channels associated with 
a given IP address and app...
IPアドレス毎/app毎のチャネル数を数えている
Merging the channels data with the main data set...
Saving the data
Computing the number of channels associated with 
a given IP address, app, and os...
IPアドレス毎/app毎/os毎のチャネル数を数えている
Merging the channels data with the main data set...
Saving the data


# LDAを用いたカテゴリカルデータの埋め込み

今回のデータはipやosなど、多数のカテゴリをを抱える特徴量がある。それ単体でも特徴なり得るが、任意のカテゴリがどのような意味を持つかについて、他の特徴の各カテゴリとの共起から情報を得る。

In [27]:
def create_LDA_features(df: pd.DaraFrame, num_topics: int, column_pair: tupple)-> None:
    """ Create LDA feateures calculated with a pair of categorical features
    Args
        df:
        num_topics:
        column_pair 
    """
    col1, col2 = column_pair
    print('pair of %s & %s' % (col1, col2))
    tmp_dict = {}
    for v_col1, v_col2 in zip(data[col1], data[col2]):
        tmp_dict.setdefault(v_col1, []).append(str(v_col2))

    col1_list = list(tmp_dict.keys())
    col2s_of_col1s_list = [[' '.join(tmp_dict[tokun])] for tokun in col1_list]

    dictionary = corpora.Dictionary(col2s_of_col1s_list)
    corpus = [dictionary.doc2bow(tokens) for tokens in col2s_of_col1s_list]
    print('---Start learning LDA model---')

    model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            random_state=3655
                            )

    print('---Saving the model---')
    features = np.array(model.get_document_topics(
        corpus, minimum_probability=0))[:, :, 1]

    column_name_list = ["lda_%s_%s_" % (col1, col2) + str(i) for i in range(5)]

    df_features = pd.DataFrame(features, columns=column_name_list)
    df_features[col1] = col1_list

    print("---Merging data---")
    data = pd.merge(data, df_features, on=col1, how='left')
    del df_features
    gc.collect()

    datapath = "lda_" + col1 + "_" + col2 + ".csv"
    data[column_name_list].to_csv(DATADIR/datapath)

    print("Shape of merged data is %s %s " % data[column_name_list].shape)


def get_column_pairs(columns):
    return [(col1, col2) for col1, col2 in itertools.product(columns, repeat=2) if col1 != col2]

In [29]:
columns = ['ip', 'app', 'os', 'channel']
column_pairs = get_column_pairs(columns)

for pair in column_pairs:
    create_LDA_features(data, pair)

pair of ip & app
Start learning LDA model
Saving the model
---merging data---
   lda_ip_app_0  lda_ip_app_1  lda_ip_app_2  lda_ip_app_3  lda_ip_app_4  \
0      0.599627      0.100010      0.100359      0.100001      0.100002   
1      0.100068      0.599767      0.100004      0.100156      0.100004   
2      0.599604      0.100010      0.100383      0.100001      0.100002   
3      0.100351      0.100358      0.100371      0.100342      0.598577   
4      0.599627      0.100010      0.100359      0.100001      0.100002   

       ip  
0   83230  
1   17357  
2   35810  
3   45745  
4  161007  
shape of merged data is 20000 5 
pair of ip & os
Start learning LDA model
Saving the model
---merging data---
   lda_ip_os_0  lda_ip_os_1  lda_ip_os_2  lda_ip_os_3  lda_ip_os_4      ip
0     0.100000     0.100023     0.599975     0.100000     0.100000   83230
1     0.599369     0.100582     0.100004     0.100042     0.100004   17357
2     0.100000     0.100023     0.599975     0.100000     0.1000

# 不均衡データに対するNegative donwsampling

サンプルサイズの削減とクラス不均衡な二値分類への対応としてNegativeDownSamplingを使用した。

In [32]:
def negative_down_sampling(data, random_state, target_variable):
    positive_data = data[data[target_variable] == 1]
    positive_ratio = float(len(positive_data)) / len(data)
    negative_data = data[data[target_variable] == 0].sample(
        frac=positive_ratio / (1 - positive_ratio), random_state=random_state)
    return pd.concat([positive_data, negative_data])

def train_test_split(data, len_train):
    train = data[:len_train]
    test = data[len_train:]
    return train, test

In [35]:
train, test = train_test_split(data, len_train)
sampled_train = negative_down_sampling(train, target_variable='is_attributed', random_state=3655)

del train
gc.collect()

23830

In [36]:
print(sampled_train.head())
print(sampled_train.shape)
print("="*80)
print(test.head())
print(test.shape)

          ip  app  device  os  channel           click_time  \
103   204158   35       1  13       21  2017-11-06 15:41:07   
1504   29692    9       1  22      215  2017-11-06 16:00:02   
1798   64516   35       1  13       21  2017-11-06 16:00:02   
2102  172429   35       1  46      274  2017-11-06 16:00:03   
3056  199085   35       1  13      274  2017-11-06 16:00:04   

          attributed_time  is_attributed  click_id  hour  day  
103   2017-11-07 08:17:19            1.0       NaN    15    6  
1504  2017-11-07 10:05:22            1.0       NaN    16    6  
1798  2017-11-06 23:40:50            1.0       NaN    16    6  
2102  2017-11-07 00:55:29            1.0       NaN    16    6  
3056  2017-11-06 23:04:54            1.0       NaN    16    6  
(46, 11)
           ip  app  device  os  channel           click_time attributed_time  \
10000    5744    9       1   3      107  2017-11-10 04:00:00             NaN   
10001  119901    9       1   3      466  2017-11-10 04:00:00        

# 学習

In [40]:
val = sampled_train[(len_train-25000):len_train]
train = sampled_train[:(len_train-25000)]

print("train size: ", len(train))
print("valid size: ", len(val))
print("test size : ", len(test))

train size:  46
valid size:  0
test size :  10000


In [None]:
target = 'is_attributed'

# TODO:全特徴量を使う
predictors = ['app', 'device', 'os', 'channel', 'hour', 'day','ip_app_count', 'ip_app_os_count',
              'lda_ip_app_0', 'lda_ip_app_1', 'lda_ip_app_2', 'lda_ip_app_3', 'lda_ip_app_4',
              'lda_ip_os_0', 'lda_ip_os_1', 'lda_ip_os_2', 'lda_ip_os_3', 'lda_ip_os_4',
              'lda_ip_channel_0', 'lda_ip_channel_1', 'lda_ip_channel_2', 'lda_ip_channel_3', 'lda_ip_channel_4']

categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']

In [41]:
sub = pd.DataFrame()
test_id = pd.read_csv(DATADIR/'test.csv')
sub['click_id'] = test_id['click_id'].astype('int')
del test_id
gc.collect()

print(sub.head())

   click_id
0         0
1         1
2         2
3         3
4         4


In [42]:
print("Training...")
start_time = time.time()

params = {
    'learning_rate': 0.15,
    # 'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 7,  # 2^max_depth - 1
    'max_depth': 3,  # -1 means no limit
    # Minimum number of data need in a child(min_data_in_leaf)
    'min_child_samples': 100,
    'max_bin': 100,  # Number of bucketed bin for feature values
    'subsample': 0.7,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    # Subsample ratio of columns when constructing each tree.
    'colsample_bytree': 0.9,
    # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'min_child_weight': 0,
    'scale_pos_weight': 99  # because training data is extremely unbalanced
    }

bst = lgb_modelfit_nocv(params,
                        train,
                        val,
                        predictors,
                        target,
                        objective='binary',
                        metrics='auc',
                        early_stopping_rounds=30,
                        verbose_eval=True,
                        num_boost_round=30,
                        categorical_features=categorical)

print('[{}]: model training time'.format(time.time() - start_time))
del train_df
del val_df
gc.collect()

Training...


NameError: name 'lgb_modelfit_nocv' is not defined