<a href="https://colab.research.google.com/github/coffeemountain/kaggle_otto/blob/main/notebooks/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
#メモリ削減関数

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [15]:
from google.colab import drive
drive.mount('/content/drive')

import os

OTTO_DIR = '/content/drive/MyDrive/otto'
KAGGLE_JSON = OTTO_DIR + '/kaggle.json'
! mkdir -p ~/.kaggle && cp $KAGGLE_JSON ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

with open(OTTO_DIR + '/access_token.txt', 'r') as f:
  access_token = f.read()

GITHUB_DIR = OTTO_DIR + '/kaggle_otto'
! rm -rf $GITHUB_DIR
! git clone -b gcs-func-add https://$access_token@github.com/coffeemountain/kaggle_otto.git $GITHUB_DIR

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cloning into '/content/drive/MyDrive/otto/kaggle_otto'...
remote: Enumerating objects: 310, done.[K
remote: Counting objects: 100% (218/218), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 310 (delta 134), reused 132 (delta 75), pack-reused 92[K
Receiving objects: 100% (310/310), 339.08 KiB | 3.90 MiB/s, done.
Resolving deltas: 100% (167/167), done.


In [16]:
import numpy as np
import pandas as pd
import collections
from collections import Counter
import lightgbm as lgb
import pickle
import glob
import gc
import sys
from tqdm.notebook import tqdm

In [17]:
class Dataset:

  def _load(self, filename):
    return reduce_mem_usage(pd.read_parquet(filename))

  def _prepare_features(self, candidates, features_dir, features):

    def _check(df):
      """merge後のデータに問題がないか。軽く確認"""

      if df.isnull().any():
        message = 'データにnullが存在'
        raise Exception(message)

    def _collect_target_feature_files():
      target_files = []
      for feature_file in glob.glob(features_dir + '/*'):

        # ファイル名から特徴名を取得
        feature_name = feature_file.split('/')[-1].split('.')[0]

        # 含めたい特徴かチェック
        if feature_name in features:
          target_files.append(feature_file)
      
      # ほしい特徴量のファイルすべて取得できているかチェック
      if len(target_files) != len(features):
        message = f"見つからない特徴量があります。\nexcept: {features}.\nactual: {target_files}"
        raise Exception(message)

      return target_files

    target_files = _collect_target_feature_files()
    for file_ in tqdm(target_files):
        # FIXME: 特徴作成側で対処後、drop_duplicatesは消す
        feature_df = self._load(file_).drop_duplicates()
        candidates = candidates.merge(feature_df, on=['session', 'aid'], how='left', copy=False)

        print('check')
        # _check(candidates)
        del feature_df

    return candidates

  def _prepare_labels(self, candidates, labels_file, TARGET):
    labels = pd.read_parquet(labels_file)
    labels = labels[labels['type'] == TARGET]
    labels['aid'] = labels['ground_truth'].explode().astype('int32')
    labels[TARGET] = 1

    candidates = candidates.merge(labels[['session', 'aid', TARGET]], on=['session', 'aid'], how='left', copy=False).fillna(0)
    return candidates

  def prepare_dataset(self, candidates_file, features_dir, features, labels_file, target):
    print('1. prepare candidates')
    candidates = self._load(candidates_file)

    print('2. prepare features')
    # 特徴量をロードして、candidatesとmerge
    candidates_with_features = self._prepare_features(candidates, features_dir, features)

    print('3. prepare labels')
    # 正解ラベルをロードして、candidates_with_featureとmerge
    candidates_with_labels = self._prepare_labels(candidates_with_features, labels_file, target)

    return candidates_with_labels

In [13]:
TRAIN_CANDIDATES_FILE = OTTO_DIR + '/share/example_train2/candidates/click_train_v1.qpt'
TRAIN_LABELS_FILE = OTTO_DIR + '/share/example_train2/test_labels.parquet'
TEST_CANDIDATES_FILE = OTTO_DIR + '/share/example_train2/candidates/click_test_v1.qpt'
TEST_LABELS_FILE = OTTO_DIR + '/share/example_train2/test_labels.parquet'
FEATURES_DIR = OTTO_DIR + '/share/example_train2/features'
print(glob.glob(FEATURES_DIR + '/*'))
FEATURES = [
    'item_item_count',
    'item_user_count',
    'item_buy_ratio',
    'user_user_count',
    'user_item_count',
    # 'user_buy_ratio',
]

TARGET = 'clicks'

train_data = Dataset().prepare_dataset(
    candidates_file=TRAIN_CANDIDATES_FILE,
    features_dir=FEATURES_DIR,
    features=FEATURES,
    labels_file=TRAIN_LABELS_FILE,
    target=TARGET,
)

['/content/drive/MyDrive/otto/share/example_train2/features/item_item_count.parquet', '/content/drive/MyDrive/otto/share/example_train2/features/item_user_count.parquet', '/content/drive/MyDrive/otto/share/example_train2/features/item_buy_ratio.parquet', '/content/drive/MyDrive/otto/share/example_train2/features/user_user_count.parquet', '/content/drive/MyDrive/otto/share/example_train2/features/user_item_count.parquet']
1. prepare candidates
Memory usage of dataframe is 215.04 MB
Memory usage after optimization is: 215.04 MB
Decreased by 0.0%
2. prepare features


  0%|          | 0/5 [00:00<?, ?it/s]

Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 105.59 MB
Decreased by 16.7%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 89.75 MB
Decreased by 29.2%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 95.03 MB
Decreased by 25.0%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 95.03 MB
Decreased by 25.0%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 95.03 MB
Decreased by 25.0%
check
3. prepare labels


# train

In [18]:
import lightgbm as lgb

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg', #
    'lambdarank_truncation_level': 10,
    'ndcg_eval_at': 20,
    'n_estimators': 10,
    'boosting_type': 'dart',
    'random_state': 0,
}

X_train = train_data[FEATURES]
y_train = train_data[TARGET]
group_train = train_data.groupby('session')['aid'].agg('count').values

dtrain = lgb.Dataset(X_train, y_train, group=group_train)
model = lgb.train(params, dtrain)

pickle.dump(model, open(OTTO_DIR + f'/share/models/v1.pkl','wb'))

gc.collect()

115

## predict

In [19]:
test_candidates = Dataset().prepare_dataset(
    candidates_file=TEST_CANDIDATES_FILE,
    features_dir=FEATURES_DIR,
    features=FEATURES,
    labels_file=TEST_LABELS_FILE,
    target=TARGET,
)

X_test = test_candidates[FEATURES]

# 各candidateに対して、scoreを計算
test_candidates['score'] = model.predict(X_test)

pred_df = test_candidates[['session', 'aid', 'score']]

# session(user)毎に、scoreが高い順に並べて、ランキング情報を付与
pred_df['rank'] = pred_df.sort_values(['session','score'], ascending=[True, False]).groupby('session').cumcount().astype('int8') + 1

# 各session 1~20位までのデータをlist化
pred_df = pred_df.groupby('session')['aid'].apply(list).to_frame().reset_index()

pred_df['labels'] = pred_df['aid'].apply(lambda x: " ".join(map(str,x)))
pred_labels = pred_df[['session', 'labels']]

1. prepare candidates
Memory usage of dataframe is 215.04 MB
Memory usage after optimization is: 215.04 MB
Decreased by 0.0%
2. prepare features


  0%|          | 0/5 [00:00<?, ?it/s]

Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 105.59 MB
Decreased by 16.7%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 89.75 MB
Decreased by 29.2%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 95.03 MB
Decreased by 25.0%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 95.03 MB
Decreased by 25.0%
check
Memory usage of dataframe is 126.71 MB
Memory usage after optimization is: 95.03 MB
Decreased by 25.0%
check
3. prepare labels


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['rank'] = pred_df.sort_values(['session','score'], ascending=[True, False]).groupby('session').cumcount().astype('int8') + 1


In [20]:
def calc_recall(test_labels, pred_labels):
  test_labels = test_labels.merge(pred_labels, how='left', on='session')

  if test_labels['labels'].isnull().any():
    message = '推論できていないデータがあります。'
    raise Exception(message)

  test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1) 
  test_labels['gt_count'] = test_labels['ground_truth'].str.len().clip(0, 20)
  recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()

  return recall

def evaluate(test_labels_file, pred_labels, target):
  recall = calc_recall(test_labels_target, pred_labels)
  return recall

In [23]:
test_labels = pd.read_parquet(TEST_LABELS_FILE)
test_labels_target = test_labels[test_labels['type'] == TARGET]
# test_labels_target = test_labels_target[test_labels_target['session'].isin(pred_labels['session'])]
evaluate(test_labels_target, pred_labels, TARGET)

0.0