In [1]:
!conda install --channel conda-forge --yes --quiet --file requirements.txt

Solving environment: ...working... done

## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/python3

  added / updated specs: 
    - gensim==3.7.1
    - lightgbm==2.2.2
    - pandas==0.24.2


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    smart_open-1.8.2           |             py_0          49 KB  conda-forge
    bz2file-0.98               |             py_0           9 KB  conda-forge
    gensim-3.7.1               |   py36he1b5a44_0        22.7 MB  conda-forge
    lightgbm-2.2.2             |   py36hf484d3e_0         990 KB  conda-forge
    ------------------------------------------------------------
                                           Total:        23.7 MB

The following NEW packages will be INSTALLED:

    bz2file:         0.98-py_0             conda-forge
    gensim:          3.7.1-py36he1b5a44_0  conda-forge
    lightgbm:        2.2.2-py36hf484d3e_0

In [1]:
import gc
import itertools
import time
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import lightgbm as lgb
from gensim import corpora, models

from models import LightGBM, Model

DATADIR = Path('./input')

tr_path = DATADIR / 'train.csv'
test_path = DATADIR / 'test.csv'


train_cols = ['ip', 'app', 'device', 'os','channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']



## pandasデータ型指定によるメモリ使用量の削減

データサイズが大きいので`float64`や`int64`をなるべく使わずに最適な型を選ぶように変換。

In [2]:
def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """ Iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
        
    Args:
        df (pd.DataFrame): pd.DataFrame to be reduced memory usage.
    Regurns:
        df (pd.DataFrame): pd.DataFrame which dtypes are changed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df


def load_data(filepath: str) -> pd.DataFrame:
    """create a dataframe and optimize its memory usage
    Args:
        filepath (str): Path to csv file.
    Returns:
        df (pd.DataFrame): pd.DataFrame which dtypes are changed for memory usage reduction.
    """
    df = pd.read_csv(filepath, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    
    return df

In [3]:
train_sm = load_data(tr_path)
test_sm = load_data(test_path)

Memory usage of dataframe is 11285.64 MB
Memory usage after optimization is: 3721.47 MB
Decreased by 67.0%
Memory usage of dataframe is 1003.52 MB
Memory usage after optimization is: 323.35 MB
Decreased by 67.8%


In [None]:
train_sm.to_csv(DATADIR / 'train_sm.csv', index=False)
test_sm.to_csv(DATADIR / 'test_sm.csv', index=False)

In [None]:
del train_sm, test_sm
gc.collect()

## Load and bind train and test data

In [None]:
train = pd.read_csv(DATADIR / 'train_sm.csv', usecols=train_cols, parse_dates=True, keep_date_col=True)
len_train = len(train)

test = pd.read_csv(DATADIR / 'test_sm.csv', usecols=test_cols, parse_dates=True, keep_date_col=True)

In [None]:
def bind_tr_test(train: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
    """ Bind train and test data for features engineering.
    Args:
        train (pd.DataFrame): train data.
        test (pd.DataFrame): test data.
    Returns:
        data (pd.DataFrame): binded data.
    """
    
    len_train = len(train)
    print('The initial size of the train set is', len_train)
    print('Binding the training and test set together...')
    data = train.append(test, ignore_index=True, sort=False)

    del train, test
    gc.collect()

    return data

In [None]:
data = bind_tr_test(train, test)

In [None]:
data.head()

## Feature Engineering

### 時間系

click_timeは`2017-11-10 04:00:00`の形なので日付と時間の特徴量を作る。

In [None]:
def create_time_features(data: pd.DataFrame) -> pd.DataFrame:
    """ create datatime-based features 'hour' and 'day' from 'click_time' strings.
    Args:
        data (pd.DataFrame): data concatinated train and test datasets.
    Returns:
        data (pd.DataFrame): data datatime-based featuers are converted from 'click_time'
    """
    data['hour'] = pd.to_datetime(data.click_time).dt.hour.astype('uint8')
    data['day'] = pd.to_datetime(data.click_time).dt.day.astype('uint8')
    data = data.drop(['click_time'], axis=1)
    
    return data

In [None]:
data = create_time_features(data)

### count系

- ベーシックな処理
  - five raw categorical features (ip, os, app, channel, device)  （単純に型をカテゴリ化）
  - time categorical features (day, hour) 
  - some count features 
- web広告配信データ特有の特徴量
  - five raw categorical features (ip, os, app, channel, device) に対し、以下の特徴量を作成 (全組み合わせ2^5 -1 = 31通り)
  - click count within next one/six hours  (直後1 or 6時間以内のクリック数)
  - forward/backward click time delta  (前後クリックまでの時差)
  - average attributed ratio of past click (過去のCVレート)

In [None]:
def create_count_channels_features(data: pd.DataFrame) -> None:
    """Create and save count-based features.
    Args:
        data (pd.DataFrame): data concatinated train and test datasets.
    """
    print("Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...")

    # Create "n_channels" feature
    print('Computing the number of channels associated with a given IP address within each hour...')
    n_chans = data[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'n_channels'})
    data = data.merge(n_chans, on=['ip', 'day', 'hour'], how='left')
    del n_chans
    gc.collect()
    data['n_channels'].astype('uint16').to_csv(
        DATADIR/'n_channels.csv', header=True, index=False)
    data = data.drop(['n_channels'], axis=1)

    # Create "ip_app_count" feature
    print('Computing the number of channels associated with a given IP address and app...')
    n_chans = data[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'ip_app_count'})
    data = data.merge(n_chans, on=['ip', 'app'], how='left')
    del n_chans
    gc.collect()
    data['ip_app_count'].astype('uint16').to_csv(DATADIR/'ip_app_count.csv', header=True, index=False)
    data = data.drop(['ip_app_count'], axis=1)

    # Create "ip_app_os_count"
    print('Computing the number of channels associated with a given IP address, app, and os...')
    n_chans = data[['ip', 'app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[
        ['channel']].count().reset_index().rename(columns={'channel': 'ip_app_os_count'})
    data = data.merge(n_chans, on=['ip', 'app', 'os'], how='left')
    del n_chans
    gc.collect()
    data['ip_app_os_count'].astype('uint16').to_csv(DATADIR/'ip_app_os_count.csv', header=True, index=False)
    data = data.drop(['ip_app_os_count'], axis=1)

    del data
    gc.collect()

In [None]:
create_count_channels_features(data)

# LDAを用いたカテゴリカルデータの埋め込み

今回のデータはipやosなど、多数のカテゴリをを抱える特徴量がある。それ単体でも特徴なり得るが、任意のカテゴリがどのような意味を持つかについて、他の特徴の各カテゴリとの共起から情報を得る。

In [None]:
def create_LDA_features(df: pd.DataFrame, num_topics: int, column_pair: Tuple[str, str])-> None:
    """ Create and save LDA feateures calculated with a pair of categorical features.
    Args:
        df (pd.DataFrame): data.
        num_topics (int): num of topics for LDA.
        column_pair (tuple): tuple of column name of df, e.g. (ip, app).
    """
    col1, col2 = column_pair
    print('pair of %s & %s count data is converting to LDA topics' % (col1, col2))
    tmp_dict = {}
    for v_col1, v_col2 in zip(df[col1], df[col2]):
        tmp_dict.setdefault(v_col1, []).append(str(v_col2))

    col1_list = list(tmp_dict.keys())
    col2s_of_col1s_list = [[' '.join(tmp_dict[tokun])] for tokun in col1_list]

    dictionary = corpora.Dictionary(col2s_of_col1s_list)
    corpus = [dictionary.doc2bow(tokens) for tokens in col2s_of_col1s_list]

    model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            random_state=3655
                            )

    features = np.array(model.get_document_topics(
        corpus, minimum_probability=0))[:, :, 1]

    column_name_list = ["lda_%s_%s_" % (col1, col2) + str(i) for i in range(5)]

    df_features = pd.DataFrame(features, columns=column_name_list)
    df_features[col1] = col1_list

    df = pd.merge(df, df_features, on=col1, how='left')
    del df_features
    gc.collect()

    datapath = "lda_" + col1 + "_" + col2 + ".csv"
    df[column_name_list].to_csv(DATADIR/datapath, header=True, index=False)

def get_column_pairs(columns: List[str]) -> List[Tuple[str, str]]:
    """Get pairs of column names from given column name list.
    Args:
        columns (List[str]): column names.
    Returns: 
        List[Tuple[str,str]]: list of tuples of peirs of column names.
    """
    return [(col1, col2) for col1, col2 in itertools.product(columns, repeat=2) if col1 != col2]

In [None]:
columns = ['ip', 'app', 'os', 'channel']
column_pairs = get_column_pairs(columns)

for pair in column_pairs:
    create_LDA_features(data, num_topics=5, column_pair=pair)

# 不均衡データに対するNegative donwsampling

これまで作成した特徴量をロードし、一つのデータマートとしてマージ。

In [None]:
features = ["n_channels", "ip_app_count", "ip_app_os_count"]
lda_features = ["lda_" + pair[0] + "_" + pair[1] for pair in column_pairs]

features.extend(lda_features)

for feature in features:
    featurepath = feature + '.csv'
    df_feature = pd.read_csv(DATADIR/featurepath)
    data = pd.concat([data, df_feature], axis=1)
    del df_feature
    gc.collect()
    print("merged feature: %s" % feature)

In [None]:
def train_test_split(data, len_train):
    train = data[:len_train]
    test = data[len_train:]
    return train, test

train, test = train_test_split(data, len_train)
test.to_csv(DATADIR/'test_features.csv', header=True, index=False)

サンプルサイズの削減とクラス不均衡な二値分類への対応として、学習データへNegativeDownSamplingを使用した。

In [None]:
def negative_down_sampling(data: pd.DataFrame, random_state: int, target_variable: str) -> pd.DataFrame:
    """Create balanced dataset by matching the number of samples in the minority class with a random sampling.
    Args: 
        data (pd.DataFrame): inbalanced data.
        random_state (int): random state for sampling.
        target_bariable (str): target variable for balancing.
    Returns:
        pd.DataFrame: balanced dataset. 
    """
    positive_data = data[data[target_variable] == 1]
    positive_ratio = float(len(positive_data)) / len(data)
    negative_data = data[data[target_variable] == 0].sample(
        frac=positive_ratio / (1 - positive_ratio), random_state=random_state)
    return pd.concat([positive_data, negative_data])

In [None]:
sampled_train = negative_down_sampling(train, target_variable='is_attributed', random_state=3655)

del train
gc.collect()

In [None]:
sampled_train

# 学習

In [None]:
# val = sampled_train[(len_train-25000):len_train]
# train = sampled_train[:(len_train-25000)]

len_train = len(sampled_train)

val = sampled_train[(len_train-25):len_train]
train = sampled_train[:(len_train-25)]

print("train size: ", len(train))
print("valid size: ", len(val))
print("test size : ", len(test))

In [None]:
target = 'is_attributed'
categorical_features = ['app', 'device', 'os', 'channel', 'hour', 'day']

# TODO:全特徴量を使う
predictors = ['app', 'device', 'os', 'channel', 'hour', 'day','ip_app_count', 'ip_app_os_count',
              'lda_ip_app_0', 'lda_ip_app_1', 'lda_ip_app_2', 'lda_ip_app_3', 'lda_ip_app_4',
              'lda_ip_os_0', 'lda_ip_os_1', 'lda_ip_os_2', 'lda_ip_os_3', 'lda_ip_os_4',
              'lda_ip_channel_0', 'lda_ip_channel_1', 'lda_ip_channel_2', 'lda_ip_channel_3', 'lda_ip_channel_4']

params  = {
    
        "model_params": {
            "boosting_type": "gbdt",
            "objective": "binary",
            "metric": ["auc"],
            "learning_rate": 0.2,
            "num_leaves": 50,
            "max_depth": 5,
            "max_bin": 100,
            "subsample": 0.7,
            "subsample_freq": 1,
            "min_child_samples": 100,
            "min_child_weight": 0,
            "validation_ratio": 0.1,
            "verbose": 0
        },
    
        "train_params": {
            "num_boost_round": 250,
            "early_stopping_rounds": 10
        }
}

In [None]:
model = LightGBM()
booster, result = model.train_and_predict(train=train, 
                                          valid=val,
                                          categorical_features=categorical_features,
                                          target=target,
                                          params=params)

best_iteration = booster.best_iteration

In [None]:
data = pd.concat([train, val])
booster = model.train_without_validation(train=train,
                                         categorical_features=categorical_features,
                                         target=target,
                                         params=params,
                                         best_iteration=best_iteration)

# 予測

提出用データの作成

In [None]:
sub = pd.DataFrame()
test_id = pd.read_csv(DATADIR/'test.csv')
sub['click_id'] = test_id['click_id'].astype('int')
del test_id
gc.collect()

In [41]:
test = pd.read_csv(DATADIR/'test_features.csv')

In [42]:
prediction = booster.predict(test[predictors])

In [43]:
sub['is_attributed'] = [1 if i >  else 0 for i in prediction]

ValueError: Length of values does not match length of index

In [34]:
sub.head()