### Trying out LightGBM


In [None]:
import numpy as np
import pandas as pd
import gc
import os
import time
import random
from tqdm.auto import tqdm
import datetime
import dask.dataframe as dd

In [None]:
def visualize_df(df):
    """Print the shape of ``df`` and show its first rows via ``display``."""
    shape = df.shape
    print(shape)
    preview = df.head()
    display(preview)

## Read Data

In [None]:
# Explicit dtype for each column, handed to read_csv via ``dtype=``.
input_type = {
    # identifiers and target
    'customer_id': 'int64',
    'article_id': 'int32',
    'label': 'bool',
    # customer columns
    'FN': 'bool',
    'Active': 'bool',
    'club_member_status': 'int8',
    'fashion_news_frequency': 'int8',
    'age': 'int8',
    'postal_code': 'int32',
    # article columns
    'product_code': 'int32',
    'product_type_no': 'int32',
    'graphical_appearance_no': 'int32',
    'colour_group_code': 'int32',
    'perceived_colour_value_id': 'int32',
    'perceived_colour_master_id': 'int32',
    'department_no': 'int16',
    'index_code': 'int16',
    'index_group_no': 'int8',
    'section_no': 'int8',
    'garment_group_no': 'int16',
}

In [None]:
# Training data: parsed with dask, then materialized by .compute().
train = dd.read_csv(
    '../input/h-m-training-and-testing-data/train.csv',
    dtype=input_type,
).compute()
train.head()

In [None]:
# Validation data: parsed with dask, then materialized by .compute().
valid = dd.read_csv(
    '../input/h-m-training-and-testing-data/valid.csv',
    dtype=input_type,
).compute()
valid.head()

## Preprocess

In [None]:
# The ranking objective receives per-customer group sizes ordered by ascending
# customer_id, so each customer's rows must be contiguous and in that same
# order.  Sorting by index does not provide this: a frame loaded through dask
# keeps a per-partition index that restarts at 0, so sort_index() interleaves
# rows from different partitions.  Sort by customer_id instead (mergesort is
# stable, keeping the original within-customer row order) and rebuild a
# unique index.
train = train.sort_values('customer_id', kind='mergesort').reset_index(drop=True)
valid = valid.sort_values('customer_id', kind='mergesort').reset_index(drop=True)
train.head()

In [None]:
# Query preparation for LightGBM: number of rows per customer, ordered by
# ascending customer_id.
query_list_train = train['customer_id'].value_counts().sort_index()
query_list_valid = valid['customer_id'].value_counts().sort_index()

In [None]:
# Preview the first per-customer row counts.
query_list_train.head()

In [None]:
# Split each data set into features (everything except 'label') and target.
train_x, train_y = train.drop(columns=['label']), train['label']
valid_x, valid_y = valid.drop(columns=['label']), valid['label']

In [None]:
# Feature preparation: remove the identifier columns from the feature frames.
train_x = train_x.drop(columns=['customer_id', 'article_id'])
valid_x = valid_x.drop(columns=['customer_id', 'article_id'])

In [None]:
# Label encoding was already applied in an earlier stage.

In [None]:
# Display the training feature frame.
train_x

## Training

In [None]:
# LightGBM
import lightgbm as lgb

# Training configuration: lambdarank objective scored with the 'map' metric.
# NOTE(review): 'ndcg_eval_at' is understood to be an alias of LightGBM's
# 'eval_at' cut-off positions; confirm against the parameter docs.
params = dict(
    objective='lambdarank',
    metric='map',
    ndcg_eval_at=[5, 12],
    boosting_type='gbdt',
)
# Number of boosting rounds used for training.
num_round = 100

In [None]:
# Convert features and target into LightGBM Datasets; `group` carries the
# per-customer row counts that delimit the ranking queries.
lgb_train = lgb.Dataset(train_x, train_y, group=query_list_train)
# The validation Dataset references the training Dataset, as the LightGBM docs
# require, so both share the same feature binning.
lgb_eval = lgb.Dataset(valid_x, valid_y, group=query_list_valid,
                       reference=lgb_train)

In [None]:
# Display the training Dataset object.
lgb_train

In [None]:
# Run training.  The validation data is passed as well, so the score can be
# monitored on both sets as training progresses.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_round,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'valid'],
)

In [None]:
# Call the method: without parentheses the cell only shows the bound method
# object instead of the importance values.
model.feature_importance()

In [None]:
# Show the booster's best_iteration attribute.
# NOTE(review): the training call configures no early stopping, so this value
# may not be informative; confirm.
model.best_iteration

In [None]:
# List the feature names stored in the trained model.
model.feature_name()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# Pair every importance value with its feature name, ordered by
# (importance, name), and load the pairs into a two-column frame.
importance_pairs = sorted(zip(model.feature_importance(), model.feature_name()))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])

# Horizontal bar chart, most important feature first.
# NOTE(review): the title mentions folds, but a single model is plotted here.
plt.figure(figsize=(20, 10))
sns.barplot(
    x="Value",
    y="Feature",
    data=feature_imp.sort_values(by="Value", ascending=False),
)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

## Test

In [None]:
# Save memory: the training and validation frames are no longer needed.
del train, train_x, train_y
del valid, valid_x, valid_y
gc.collect()

In [None]:
# Test data: parsed with dask, then materialized by .compute().
test = dd.read_csv(
    '../input/h-m-training-and-testing-data/test.csv',
    dtype=input_type,
).compute()
test

In [None]:
# Order the test rows by their index labels.
test = test.sort_index()

In [None]:
# Keep the identifiers so the scores can be mapped back to
# (customer, article) pairs later.
test_id = test[['customer_id', 'article_id']]
# drop() returns a new frame, so the result must be assigned; otherwise the id
# columns stay in the matrix handed to the model, which was trained without
# them.
test = test.drop(['customer_id', 'article_id'], axis=1)

### Prediction

In [None]:
# Release the training-time objects before predicting.
del query_list_train, query_list_valid
del lgb_train, lgb_eval
del params, feature_imp
gc.collect()

In [None]:
# Score every test row.  num_round is reused instead of repeating the literal
# 100 so the prediction always matches the number of rounds trained.
pred = model.predict(test, num_iteration=num_round)
# Both shapes should report the same number of rows.
pred.shape, test.shape

In [None]:
# Release the test frame; its identifier columns were saved in test_id.
del test
gc.collect()

In [None]:
# `test` has already been deleted, so resetting its index raised NameError.
# The frame that needs a fresh 0..n-1 index is test_id: it is concatenated
# column-wise with the scores, and concat aligns on index labels.
test_id = test_id.reset_index(drop=True)
pred = pd.Series(pred, name='score')

In [None]:
# Attach the score column to the (customer_id, article_id) pairs.
sub1 = pd.concat([test_id, pred], axis='columns')

In [None]:
# Keep only the identifier and score columns, in this order.
sub1 = sub1.loc[:, ['customer_id', 'article_id', 'score']]

In [None]:
# Display the scored candidate table.
sub1

In [None]:
# Order rows by customer and, within a customer, by score (both descending),
# so the highest-scored articles come first.
sub1 = sub1.sort_values(by=['customer_id', 'score'], ascending=[False, False])

In [None]:
# Collapse to one row per customer holding the list of its article ids.
articles_by_customer = sub1.groupby('customer_id')['article_id']
sub1 = articles_by_customer.apply(list).reset_index(name='prediction')

In [None]:
# Show how many candidate articles each customer has.
sub1['prediction'].apply(len)

In [None]:
# Keep at most the first 12 articles per customer, then show the new lengths.
sub1['prediction'] = sub1['prediction'].map(lambda article_ids: article_ids[:12])
sub1['prediction'].apply(len)

In [None]:
# Restore the submission format.  Each cell holds a list of integer article
# ids, so the leading '0' must be added to every id and the ids joined with
# spaces.  Applying '0' + str(x) to the whole list produced strings such as
# '0[1, 2]'.
sub1['prediction'] = sub1['prediction'].map(
    lambda article_ids: ' '.join('0' + str(article_id) for article_id in article_ids)
)

## Submit

In [None]:
# Load the sample submission and discard its placeholder predictions; only
# the customer list is needed.
sub = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)
del sub['prediction']
gc.collect()
visualize_df(sub)

In [None]:
# Integer key for each customer: the last 16 hex characters of the id string
# parsed as a base-16 number and stored as int64.
sub['customer_id2'] = (
    sub['customer_id'].str[-16:].apply(lambda hex_tail: int(hex_tail, 16)).astype('int64')
)

In [None]:
# Compare the number of customers in the submission template with the number
# of customers that received predictions.
print(sub.shape, sub1.shape)

In [None]:
# sub1 identifies customers by the integer id only (its 'customer_id' column),
# while sub holds the original string id in 'customer_id' and the integer form
# in 'customer_id2'.  Rename sub1's key so the join runs on the integer id
# alone; joining on both names raised KeyError because sub1 has no
# 'customer_id2'.
sub = pd.merge(
    sub,
    sub1.rename(columns={'customer_id': 'customer_id2'}),
    on='customer_id2',
    how='left',
)
# The integer key is only needed for the join.
sub = sub.drop(['customer_id2'], axis=1)

In [None]:
# Keep exactly the two submission columns.  The prediction column is named
# 'prediction'; no 'valid_pred' column is ever created, so selecting it raised
# KeyError.
sub = sub[['customer_id', 'prediction']].copy()
print(sub.shape)

sub.to_csv('submission.csv', index=False)