In [None]:
!conda install --channel conda-forge --yes --quiet --file requirements.txt

Collecting package metadata: ...working... done
Solving environment: ...working... 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::blaze==0.11.3=py36h4e06776_0
  - defaults/linux-64::jupyter==1.0.0=py36_4
  - defaults/linux-64::seaborn==0.8.1=py36hfad7ec4_0
  - defaults/linux-64::scikit-image==0.13.1=py36h14c3975_1


In [1]:
import gc
import itertools
import time
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from gensim import corpora, models

from models import LightGBM, Model

DATADIR = Path('./input')

tr_path = DATADIR / 'train.csv'
test_path = DATADIR / 'test.csv'


train_cols = ['ip', 'app', 'device', 'os',
              'channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']



## pandasデータ型指定によるメモリ使用量の削減

データサイズが大きいので`float64`や`int64`をなるべく使わずに最適な型を選ぶように変換。

In [2]:
def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """ Iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
        
    Args:
        df (pd.DataFrame): pd.DataFrame to be reduced memory usage.
    Regurns:
        df (pd.DataFrame): pd.DataFrame which dtypes are changed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df


def load_data(filepath: str) -> pd.DataFrame:
    """create a dataframe and optimize its memory usage
    Args:
        filepath (str): Path to csv file.
    Returns:
        df (pd.DataFrame): pd.DataFrame which dtypes are changed for memory usage reduction.
    """
    df = pd.read_csv(filepath, parse_dates=True, keep_date_col=True).head(10000)
    df = reduce_mem_usage(df)
    
    return df

In [3]:
# train_sm = load_data(tr_path)
# test_sm = load_data(test_path)

In [4]:
# train_sm.to_csv(DATADIR / 'train_sm.csv', index=False)
# test_sm.to_csv(DATADIR / 'test_sm.csv', index=False)

## Load and bind train and test data

In [5]:
train = pd.read_csv(DATADIR / 'train_sm.csv', parse_dates=True, keep_date_col=True)
len_train = len(train)

test = pd.read_csv(DATADIR / 'test_sm.csv', parse_dates=True, keep_date_col=True)

In [6]:
def bind_tr_test(train: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
    """ Bind train and test data for features engineering.
    Args:
        train (pd.DataFrame): train data.
        test (pd.DataFrame): test data.
    Returns:
        data (pd.DataFrame): binded data.
    """
    
    len_train = len(train)
    print('The initial size of the train set is', len_train)
    print('Binding the training and test set together...')
    data = train.append(test, ignore_index=True, sort=False)

    del train, test
    gc.collect()

    return data

In [7]:
data = bind_tr_test(train, test)

The initial size of the train set is 10000
Binding the training and test set together...


## Feature Engineering

### 時間系

click_timeは`2017-11-10 04:00:00`の形なので日付と時間の特徴量を作る。

In [8]:
def create_time_features(data: pd.DataFrame):
    print("Creating new time features: 'hour' and 'day'...")
    data['hour'] = pd.to_datetime(data.click_time).dt.hour.astype('uint8')
    data['day'] = pd.to_datetime(data.click_time).dt.day.astype('uint8')

    gc.collect()
    return data

In [9]:
data = create_time_features(data)

Creating new time features: 'hour' and 'day'...


### count系

- ベーシックな処理
  - five raw categorical features (ip, os, app, channel, device)  （単純に型をカテゴリ化）
  - time categorical features (day, hour) 
  - some count features 
- web広告配信データ特有の特徴量
  - five raw categorical features (ip, os, app, channel, device) に対し、以下の特徴量を作成 (全組み合わせ2^5 -1 = 31通り)
  - click count within next one/six hours  (直後1 or 6時間以内のクリック数)
  - forward/backward click time delta  (前後クリックまでの時差)
  - average attributed ratio of past click (過去のCVレート)

In [10]:
def create_count_channels_features(data):
    print("Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...")

    print('Computing the number of channels associated with ')
    print('a given IP address within each hour...')
    print('一時間の中でIPアドレス毎のチャネル数を数えている')
    n_chans = data[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'n_channels'})
    print('Merging the channels data with the main data set...')
    data = data.merge(n_chans, on=['ip', 'day', 'hour'], how='left')
    del n_chans
    gc.collect()
    data['n_channels'].astype('uint16').to_csv(
        DATADIR/'n_channels.csv', header=True, index=False)
    print("Saving the data")
    data.drop(['n_channels'], axis=1)

    print('Computing the number of channels associated with ')
    print('a given IP address and app...')
    print('IPアドレス毎/app毎のチャネル数を数えている')
    n_chans = data[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'ip_app_count'})
    print('Merging the channels data with the main data set...')
    data = data.merge(n_chans, on=['ip', 'app'], how='left')
    del n_chans
    gc.collect()
    data['ip_app_count'].astype('uint16').to_csv(
        DATADIR/'ip_app_count.csv', header=True, index=False)
    print("Saving the data")
    data.drop(['ip_app_count'], axis=1)

    print('Computing the number of channels associated with ')
    print('a given IP address, app, and os...')
    print('IPアドレス毎/app毎/os毎のチャネル数を数えている')
    n_chans = data[['ip', 'app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'ip_app_os_count'})
    print('Merging the channels data with the main data set...')
    data = data.merge(n_chans, on=['ip', 'app', 'os'], how='left')
    del n_chans
    gc.collect()
    data['ip_app_os_count'].astype('uint16').to_csv(
        DATADIR/'ip_app_os_count.csv', header=True, index=False)
    print("Saving the data")
    data.drop(['ip_app_os_count'], axis=1)

    del data
    gc.collect()

In [11]:
create_count_channels_features(data)

Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...
Computing the number of channels associated with 
a given IP address within each hour...
一時間の中でIPアドレス毎のチャネル数を数えている
Merging the channels data with the main data set...
Saving the data
Computing the number of channels associated with 
a given IP address and app...
IPアドレス毎/app毎のチャネル数を数えている
Merging the channels data with the main data set...
Saving the data
Computing the number of channels associated with 
a given IP address, app, and os...
IPアドレス毎/app毎/os毎のチャネル数を数えている
Merging the channels data with the main data set...
Saving the data


# LDAを用いたカテゴリカルデータの埋め込み

今回のデータはipやosなど、多数のカテゴリをを抱える特徴量がある。それ単体でも特徴なり得るが、任意のカテゴリがどのような意味を持つかについて、他の特徴の各カテゴリとの共起から情報を得る。

In [12]:
def create_LDA_features(df: pd.DataFrame, num_topics: int, column_pair: tuple)-> None:
    """ Create LDA feateures calculated with a pair of categorical features
    Args
        df:
        num_topics:
        column_pair 
    """
    col1, col2 = column_pair
    print('pair of %s & %s' % (col1, col2))
    tmp_dict = {}
    for v_col1, v_col2 in zip(df[col1], df[col2]):
        tmp_dict.setdefault(v_col1, []).append(str(v_col2))

    col1_list = list(tmp_dict.keys())
    col2s_of_col1s_list = [[' '.join(tmp_dict[tokun])] for tokun in col1_list]

    dictionary = corpora.Dictionary(col2s_of_col1s_list)
    corpus = [dictionary.doc2bow(tokens) for tokens in col2s_of_col1s_list]

    model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            random_state=3655
                            )

    features = np.array(model.get_document_topics(
        corpus, minimum_probability=0))[:, :, 1]

    column_name_list = ["lda_%s_%s_" % (col1, col2) + str(i) for i in range(5)]

    df_features = pd.DataFrame(features, columns=column_name_list)
    df_features[col1] = col1_list

    df = pd.merge(df, df_features, on=col1, how='left')
    del df_features
    gc.collect()

    datapath = "lda_" + col1 + "_" + col2 + ".csv"
    df[column_name_list].to_csv(DATADIR/datapath, header=True, index=False)

    print("Shape of merged data is %s %s " % df[column_name_list].shape)


def get_column_pairs(columns):
    return [(col1, col2) for col1, col2 in itertools.product(columns, repeat=2) if col1 != col2]

In [13]:
columns = ['ip', 'app', 'os', 'channel']
column_pairs = get_column_pairs(columns)

for pair in column_pairs:
    create_LDA_features(data, num_topics=5, column_pair=pair)

pair of ip & app
Shape of merged data is 20000 5 
pair of ip & os
Shape of merged data is 20000 5 
pair of ip & channel
Shape of merged data is 20000 5 
pair of app & ip
Shape of merged data is 20000 5 
pair of app & os
Shape of merged data is 20000 5 
pair of app & channel
Shape of merged data is 20000 5 
pair of os & ip
Shape of merged data is 20000 5 
pair of os & app
Shape of merged data is 20000 5 
pair of os & channel
Shape of merged data is 20000 5 
pair of channel & ip
Shape of merged data is 20000 5 
pair of channel & app
Shape of merged data is 20000 5 
pair of channel & os
Shape of merged data is 20000 5 


# 不均衡データに対するNegative donwsampling

これまで作成した特徴量をロードし、一つのデータマートとしてマージ。

In [27]:
features = ["n_channels", "ip_app_count", "ip_app_os_count"]
lda_features = ["lda_" + pair[0] + "_" + pair[1] for pair in column_pairs]

features.extend(lda_features)

for feature in features:
    print("merging %s" % feature)
    featurepath = feature + '.csv'
    df_feature = pd.read_csv(DATADIR/featurepath)
    print(df_feature.head())
    data = pd.concat([data, df_feature], axis=1)
    del df_feature
    gc.collect()
    print("shape of data is %s %s" % (data.shape))

merging n_channels
   n_channels
0           1
1           1
2           1
3           1
4           1
shape of data is 20000 33
merging ip_app_count
   ip_app_count
0             1
1             2
2             1
3             5
4             1
shape of data is 20000 34
merging ip_app_os_count
   ip_app_os_count
0                1
1                1
2                1
3                2
4                1
shape of data is 20000 35
merging lda_ip_app
   lda_ip_app_0  lda_ip_app_1  lda_ip_app_2  lda_ip_app_3  lda_ip_app_4
0      0.599627      0.100010      0.100359      0.100001      0.100002
1      0.100068      0.599767      0.100004      0.100156      0.100004
2      0.599604      0.100010      0.100383      0.100001      0.100002
3      0.100351      0.100358      0.100371      0.100342      0.598577
4      0.599627      0.100010      0.100359      0.100001      0.100002
shape of data is 20000 40
merging lda_ip_os
   lda_ip_os_0  lda_ip_os_1  lda_ip_os_2  lda_ip_os_3  lda_ip_os_4
0 

In [29]:
data.tail()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,click_id,hour,...,lda_channel_app_0,lda_channel_app_1,lda_channel_app_2,lda_channel_app_3,lda_channel_app_4,lda_channel_os_0,lda_channel_os_1,lda_channel_os_2,lda_channel_os_3,lda_channel_os_4
19995,6481,3,1,13,173,2017-11-10 04:00:11,,,9995.0,4,...,0.100032,0.599875,0.10003,0.100027,0.100036,0.100032,0.599875,0.100029,0.100028,0.100037
19996,10770,23,1,19,153,2017-11-10 04:00:11,,,9996.0,4,...,0.100033,0.100029,0.10003,0.599871,0.100036,0.100031,0.100028,0.100028,0.599877,0.100036
19997,93325,12,1,19,140,2017-11-10 04:00:11,,,9997.0,4,...,0.100033,0.10003,0.100031,0.59987,0.100035,0.100032,0.100029,0.100029,0.599876,0.100035
19998,17142,21,1,3,232,2017-11-10 04:00:11,,,9998.0,4,...,0.100034,0.599869,0.100031,0.100028,0.100037,0.100034,0.599869,0.10003,0.100029,0.100038
19999,116535,12,1,13,265,2017-11-10 04:00:11,,,9999.0,4,...,0.100031,0.10003,0.599879,0.100027,0.100033,0.100033,0.100031,0.59987,0.10003,0.100036


In [28]:
def train_test_split(data, len_train):
    train = data[:len_train]
    test = data[len_train:]
    return train, test

train, test = train_test_split(data, len_train)
test.to_csv(DATADIR/'test_features.csv')

サンプルサイズの削減とクラス不均衡な二値分類への対応として、学習データへNegativeDownSamplingを使用した。

In [16]:
def negative_down_sampling(data, random_state, target_variable):
    positive_data = data[data[target_variable] == 1]
    positive_ratio = float(len(positive_data)) / len(data)
    negative_data = data[data[target_variable] == 0].sample(
        frac=positive_ratio / (1 - positive_ratio), random_state=random_state)
    return pd.concat([positive_data, negative_data])

In [17]:
sampled_train = negative_down_sampling(train, target_variable='is_attributed', random_state=3655)

del train
gc.collect()

14

In [18]:
print(sampled_train.head())
print(sampled_train.shape)
print("="*80)
print(test.head())
print(test.shape)

          ip  app  device  os  channel           click_time  \
103   204158   35       1  13       21  2017-11-06 15:41:07   
1504   29692    9       1  22      215  2017-11-06 16:00:02   
1798   64516   35       1  13       21  2017-11-06 16:00:02   
2102  172429   35       1  46      274  2017-11-06 16:00:03   
3056  199085   35       1  13      274  2017-11-06 16:00:04   

          attributed_time  is_attributed  click_id  hour  ...  \
103   2017-11-07 08:17:19            1.0       NaN    15  ...   
1504  2017-11-07 10:05:22            1.0       NaN    16  ...   
1798  2017-11-06 23:40:50            1.0       NaN    16  ...   
2102  2017-11-07 00:55:29            1.0       NaN    16  ...   
3056  2017-11-06 23:04:54            1.0       NaN    16  ...   

      lda_ip_channel_1  lda_ip_channel_2  lda_ip_channel_3  lda_ip_channel_4  \
103           0.100004          0.100004          0.100004          0.599983   
1504          0.100055          0.100001          0.599840          0.

# 学習

In [None]:
val = sampled_train[(len_train-25000):len_train]
train = sampled_train[:(len_train-25000)]

print("train size: ", len(train))
print("valid size: ", len(val))
print("test size : ", len(test))

In [None]:
target = 'is_attributed'
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']

# TODO:全特徴量を使う
predictors = ['app', 'device', 'os', 'channel', 'hour', 'day','ip_app_count', 'ip_app_os_count',
              'lda_ip_app_0', 'lda_ip_app_1', 'lda_ip_app_2', 'lda_ip_app_3', 'lda_ip_app_4',
              'lda_ip_os_0', 'lda_ip_os_1', 'lda_ip_os_2', 'lda_ip_os_3', 'lda_ip_os_4',
              'lda_ip_channel_0', 'lda_ip_channel_1', 'lda_ip_channel_2', 'lda_ip_channel_3', 'lda_ip_channel_4']

params  = {
    
        "model_params": {
            "boosting_type": "gbdt",
            "objective": "binary",
            "metric": ["auc"],
            "learning_rate": 0.2,
            "num_leaves": 50,
            "max_depth": 5,
            "max_bin": 100,
            "subsample": 0.7,
            "subsample_freq": 1,
            "min_child_samples": 100,
            "min_child_weight": 0,
            "validation_ratio": 0.1,
            "verbose": 0
        },
    
        "train_params": {
            "num_boost_round": 250,
            "early_stopping_rounds": 10
        }
}

In [None]:
booster, result = model.train_and_predict(train=train, 
                                          valid=val,
                                          categorical_features=categorical_features,
                                          target=target,
                                          params=params)

best_iteration = booster.best_iteration

In [None]:
data = pd.concat([train, val])
booster = model.train_without_validation(train=train,
                                         weight=None,
                                         categorical_features=categorical_features,
                                         target=target_variable,
                                         params=params,
                                         best_iteration=best_iteration)

# 予測

提出用データの作成

In [7]:
sub = pd.DataFrame()
test_id = pd.read_csv(DATADIR/'test.csv')
sub['click_id'] = test_id['click_id'].astype('int')
del test_id
gc.collect()

NameError: name 'DATADIR' is not defined

In [None]:
test = pd.read_csv(DATADIR/'test_features.csv')

In [None]:
prediction = booster.predict(test_data[predictors])