# 特徴量エンジニアリング
アイテム特徴量とユーザー特徴量を用いてランク付けするのが良さそう？  
（元コンペのランカー達も使っている「LGB Ranking」での学習にもっていきたい）

In [62]:
# ライブラリ
import pandas as pd
import numpy as np
from tqdm import tqdm
import lightgbm
from abc import ABC, abstractmethod
import pickle
from collections import defaultdict
from typing import List, Dict, Any, Union
#!pip install pyarrow      # メモリ容量対策でparquetフォーマットで保存
#!pip install fastparquet  # メモリ容量対策でparquetフォーマットで保存

In [63]:
# transactionデータ読み込み
transactions = pd.read_csv("/Users/yutennnnn/★コンペ/kaggle_h_m/transactions_rakus_train.csv")
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

In [64]:
# articleデータ読み込み
articles = pd.read_csv("/Users/yutennnnn/★コンペ/kaggle_h_m/articles.csv")

In [65]:
# customerデータ読み込み
customers = pd.read_csv("/Users/yutennnnn/★コンペ/kaggle_h_m/customers_rakus.csv")

In [111]:
# sample_submissionデータ読み込み
sample_submission = pd.read_csv("/Users/yutennnnn/★コンペ/kaggle_h_m/sample_submission_rakus_latest.csv")

# ユーザー特徴量を生成する

In [66]:
# 抽象クラスの定義
class UserFeatures(ABC):
    @abstractmethod
    def get(self) -> pd.DataFrame:
        pass

【参照】抽象クラスを使うメリット  
https://qiita.com/bluepost59/items/eef6f48fdd322b0b9791  
クラスを複数扱うようなときに便利らしい

In [67]:
# 基本集計機能関数の作成（mean、max、minなど）
class AggrFeatures(UserFeatures):
    def __init__(self, transactions):
        self.groupby_df = transactions.groupby('customer_id', as_index = False)

    def get(self):
        output_df = (
            self.groupby_df['price']
            .agg({
                'mean_transactions': 'mean',
                'max_transactions': 'max',
                'min_transactions': 'min',
                'median_transactions': 'median',
                'sum_transactions': 'sum',
                'max_minus_min_transactions': lambda x: x.max()-x.min()
            })
            .set_index('customer_id')
            .astype('float32')
        )
        return output_df

In [68]:
# 取引に関わる基本機能関数の作成
class CountFeatures(UserFeatures):
    def __init__(self, transactions, topk = 10):
        self.transactions = transactions
        self.topk = topk

    def get(self):
        grouped = self.transactions.groupby('customer_id', as_index = False)
        # 取引件数、オンラインの商品数
        # 平均取引価格より大きい取引の数
        a = (
            grouped
            .agg({
                'article_id': 'count',
                'price': lambda x: sum(np.array(x) > x.mean()),
                'sales_channel_id': lambda x: sum(x == 2),
            })
            .rename(columns = {
                'article_id': 'n_transactions',
                'price': 'n_transactions_bigger_mean',
                'sales_channel_id': 'n_online_articles'
            })
            .set_index('customer_id')
            .astype('int8')
        )
        # ユニークな商品数、店舗の商品数
        b = (
            grouped
            .agg({
                'article_id': 'nunique',
                'sales_channel_id': lambda x: sum(x == 1),
            })
            .rename(columns = {
                'article_id': 'n_unique_articles',
                'sales_channel_id': 'n_store_articles',
            })
            .set_index('customer_id')
            .astype('int8')
        )
        # 上位取引数
        topk_articles = self.transactions['article_id'].value_counts()[:self.topk].index
        c = (
            grouped['article_id']
            .agg({
               f'top_article_{i}':  lambda x: sum(x == k) for i, k in enumerate(topk_articles)
            }
            )
            .set_index('customer_id')
            .astype('int8')
        )
        
        output_df = a.merge(b, on = ('customer_id')).merge(c, on = ('customer_id'))
        return output_df

In [69]:
class CustomerFeatures(UserFeatures):
    """
    All columns from customers dataframe
    """
    def __init__(self, customers):
        self.customers = self._prepare_customers(customers)
    
    def _prepare_customers(self, customers):
        customers['FN'] = customers['FN'].fillna(0).astype('int8')
        customers['Active'] = customers['Active'].fillna(0).astype('int8')
        customers['club_member_status'] = customers['club_member_status'].fillna('UNKNOWN')
        customers['age'] = customers['age'].fillna(customers['age'].mean()).astype('int8')
        customers['fashion_news_frequency'] = (
            customers['fashion_news_frequency']
            .replace('None', 'NONE')
            .replace(np.nan, 'NONE')
        )
        return customers

    def get(self):
        output = (
            self.customers[filter(lambda x: x != 'postal_code', customers.columns)]
            .set_index('customer_id')
        )
        return output

In [70]:
class ArticlesFeatures(UserFeatures):
    """
    returns article features: whether category appears in top categories
    """
    def __init__(self, transactions, articles, topk = 10):
        self.merged_df = transactions.merge(articles, on = ('article_id'))
        self.articles = articles
        self.topk = topk
    
    def get(self):
        output_df = None

        for col in tqdm(self.articles.columns, desc = 'extracting features'):
            if 'name' in col:
                if output_df is None:
                    output_df = self.aggregate_topk(self.merged_df, col, self.topk)
                else:
                    intermediate_out = self.aggregate_topk(self.merged_df, col, self.topk)
                    output_df = output_df.merge(intermediate_out, on = ('customer_id'))
        return output_df

    def return_value_counts(self, df, column_name, k):
        value_counts = df[column_name].value_counts()[:k].index
        value_counts = list(map(lambda x: x[1], value_counts))
        return value_counts

    def aggregate_topk(self, merged_df, column_name, k):
        grouped_df_indx = merged_df.groupby('customer_id')
        grouped_df = merged_df.groupby('customer_id', as_index = False)
        
        topk_values = self.return_value_counts(grouped_df_indx, column_name, k)
        # トップカテゴリに表示されるトランザクション数
        n_top_k = (
            grouped_df[column_name]
            .agg({
                f'top_{column_name}_{i}': lambda x: sum(x == k) for i, k in enumerate(topk_values)
            })
            .set_index('customer_id')
            .astype('int16')
        )
        return n_top_k

In [71]:
class UserFeaturesCollector:
    """
    collect all features and aggregate them
    """
    @staticmethod
    def collect(features: Union[List[UserFeatures], List[str]], **kwargs) -> pd.DataFrame:
        output_df = None

        for feature in tqdm(features):
            if isinstance(feature, UserFeatures):
                feature_out = feature.get(**kwargs)
            if isinstance(feature, str):
                try:
                    feature_out = pd.read_csv(feature)
                except:
                    feature_out = pd.read_parquet(feature)

            if output_df is None:
                output_df = feature_out
            else:
                output_df = output_df.merge(feature_out, on = ('customer_id'))
        return output_df

In [72]:
# 最初の1万件の取引だけ取り上げる
user_features = UserFeaturesCollector.collect([
    AggrFeatures(transactions.iloc[:10000]),
    CountFeatures(transactions.iloc[:10000], 3),
    CustomerFeatures(customers),
    ArticlesFeatures(transactions.iloc[:10000], articles, 3)])

 75%|█████████████████████████████████▊           | 3/4 [00:01<00:00,  2.42it/s]
extracting features:   0%|                               | 0/25 [00:00<?, ?it/s][A
extracting features:  12%|██▊                    | 3/25 [00:00<00:02,  8.35it/s][A
extracting features:  20%|████▌                  | 5/25 [00:00<00:02,  6.83it/s][A
extracting features:  24%|█████▌                 | 6/25 [00:01<00:03,  4.93it/s][A
extracting features:  32%|███████▎               | 8/25 [00:01<00:03,  5.26it/s][A
extracting features:  40%|████████▊             | 10/25 [00:01<00:02,  5.44it/s][A
extracting features:  48%|██████████▌           | 12/25 [00:02<00:02,  5.55it/s][A
extracting features:  56%|████████████▎         | 14/25 [00:02<00:01,  5.61it/s][A
extracting features:  64%|██████████████        | 16/25 [00:02<00:01,  5.62it/s][A
extracting features:  72%|███████████████▊      | 18/25 [00:03<00:01,  5.62it/s][A
extracting features:  80%|█████████████████▌    | 20/25 [00:03<00:00,  5.66it/s

In [73]:
user_features.head()

Unnamed: 0_level_0,mean_transactions,max_transactions,min_transactions,median_transactions,sum_transactions,max_minus_min_transactions,n_transactions,n_transactions_bigger_mean,n_online_articles,n_unique_articles,...,top_index_name_2,top_index_group_name_0,top_index_group_name_1,top_index_group_name_2,top_section_name_0,top_section_name_1,top_section_name_2,top_garment_group_name_0,top_garment_group_name_1,top_garment_group_name_2
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0.040661,0.050831,0.030492,0.040661,0.081322,0.020339,2,1,2,2,...,0,0,0,0,0,0,0,0,0,0
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0.017271,0.020322,0.015237,0.016932,0.086356,0.005085,5,1,5,5,...,2,2,2,2,2,2,2,2,2,2
00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,0.038119,0.053373,0.030492,0.030492,0.190593,0.022881,5,2,1,5,...,0,0,0,0,0,0,0,0,0,0
0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,0.021424,0.022525,0.020322,0.021424,0.042847,0.002203,2,1,2,2,...,2,2,2,2,0,0,0,2,2,2
000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,0.023768,0.042356,0.008458,0.016932,0.713051,0.033898,30,14,30,15,...,18,18,18,18,2,2,2,2,2,2


In [74]:
# メモリ容量対策でparquet形式で保存
user_features.to_parquet('user_features.parquet')

▼Parquetの特徴  
https://parquet.apache.org/documentation/latest/
  
・列指向フォーマットのため、行指向と比較して、圧縮効率や列に対する集計処理などにおいてアドバンテージを持つ  
・ネストしたカラムもエンコード可能  
  
▼Parquetからロード  
変数 = pd.read_parquet("./ファイル名.parquet")

# アイテム特徴量を生成する

In [75]:
# 抽象クラスの定義
class ItemFeatures(ABC):
    @abstractmethod
    def get(self, *args, **kwargs) -> pd.DataFrame:
        """
        article_id -> features
        """
        pass

In [76]:
class CategoryTransform(ItemFeatures):
    """
    factorize all articles columns
    """
    def __init__(self, articles: pd.DataFrame):
        self.articles = articles

    def get(self):
        self.__feature_columns = list(filter(lambda x: 'name' in x, self.articles.columns))[1:]
        filtered_articles = self.articles[self.__feature_columns]
        filtered_articles = filtered_articles.apply(lambda x: pd.factorize(x)[0])
        filtered_articles['article_id'] = self.articles['article_id']

        features = filtered_articles.set_index('article_id').astype('int8')
        return features

    def get_columns(self):
        return self.__feature_columns

In [77]:
class AggrTransform(ItemFeatures):
    """
    aggregation transactions features : mean, max and etc...
    """
    def __init__(self, articles: pd.DataFrame, transactions: pd.DataFrame):
        self.articles = articles
        self.transactions = transactions

    def get(self):
        stats = self._get_stats()
        return stats

    def _get_stats(self):
        transactions_more = self.transactions.merge(self.articles, on = ('article_id'))
        grouped = (
            transactions_more.
            groupby('article_id')
        )

        counts = (
            grouped['article_id']
            .count()
            .to_frame()
            .rename(columns = {'article_id': 'count'})
            .astype('int16')
            .reset_index()
            .set_index('article_id')
        )
        sums = (
            grouped['price']
            .sum()
            .to_frame()
            .astype('float32')
            .rename(columns = {
                'price': 'sum_price'
            })
        )
        means = (
            grouped['price']
            .mean()
            .to_frame()
            .astype('float32')
            .rename(columns = {
                'price': 'mean_price'
            })
        )
        mins = (
            grouped['price']
            .min()
            .to_frame()
            .astype('float32')
            .rename(columns = {
               'price': 'min_price' 
            })
        )
        maxs = (
            grouped['price']
            .max()
            .to_frame()
            .astype('float32')
            .rename(columns = {
                'price': 'max_price'
            })
        )
        
        output_df = (
            counts
            .merge(sums, on = ('article_id'))
            .merge(means, on = ('article_id'))
            .merge(mins, on = ('article_id'))
            .merge(maxs, on = ('article_id'))
        )
        return output_df

In [78]:
class TopTransforms(ItemFeatures):
    """
    whether category appears in top categories
    """
    def __init__(self, articles: pd.DataFrame, topk = 3):
        self.articles = articles
        self.topk = topk
    
    def get(self):
        name_cols = list(filter(lambda x: 'name' in x, self.articles.columns))  
        
        value_counts = self._get_value_counts(name_cols)
        value_counts = {
            f'{k}_{self.topk}': self.articles[k].isin(v).astype('int8') for k, v in value_counts.items()
        }
        
        output_df = self.articles.assign(**value_counts)
        output_df = output_df[['article_id'] + list(value_counts.keys())].set_index('article_id')
        return output_df
    
    def _get_value_counts(self, name_cols: List[str]):
        value_counts = self.articles[name_cols].apply(pd.Series.value_counts)
        get_index = lambda x: value_counts.sort_values(x, ascending = False)[x][:self.topk].index  
        value_counts = dict(zip(name_cols, map(lambda x: get_index(x), name_cols)))
        return value_counts

In [79]:
class ItemFeaturesCollector:
    @staticmethod
    def collect(features: Union[List[ItemFeatures], List[str]], **kwargs) -> pd.DataFrame:
        output_df = None

        for feature in tqdm(features):
            if isinstance(feature, ItemFeatures):
                feature_out = feature.get(**kwargs)
            if isinstance(feature, str):
                try:
                    feature_out = pd.read_csv(feature)
                except:
                    feature_out = pd.read_parquet(feature)

            if output_df is None:
                output_df = feature_out
            else:
                output_df = output_df.merge(feature_out, on = ('article_id'))
        return output_df

In [80]:
item_features = ItemFeaturesCollector.collect([
    CategoryTransform(articles),
    AggrTransform(articles, transactions.iloc[:10000]),
    TopTransforms(articles)
])

100%|█████████████████████████████████████████████| 3/3 [00:00<00:00,  8.54it/s]


In [81]:
item_features.head()

Unnamed: 0_level_0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,...,product_group_name_3,graphical_appearance_name_3,colour_group_name_3,perceived_colour_value_name_3,perceived_colour_master_name_3,department_name_3,index_name_3,index_group_name_3,section_name_3,garment_group_name_3
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108775015,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,0,1
108775051,0,0,1,2,2,1,0,0,0,0,...,1,0,0,1,1,0,1,1,0,1
110065001,1,1,0,0,0,0,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0
110065002,1,1,0,1,1,1,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0
111586001,4,3,0,0,0,0,2,1,0,2,...,1,1,1,1,1,0,0,1,0,0


In [82]:
# メモリ容量対策でparquet形式で保存
item_features.to_parquet('item_features.parquet')

# ↓↓↓GBM Rankingで分析↓↓↓  

【参照】  
https://www.kaggle.com/code/kimurayut/gbm-ranking  
https://yolo-kiyoshi.com/2021/02/08/post-2606/  
ランク学習は検索クエリ(情報検索なら検索ワード、レコメンドならユーザー)に対するアイテムのランク付けを目的としているため、情報検索やレコメンドにも活用される。

In [83]:
# ユーザー特徴量
user_features.head()

Unnamed: 0_level_0,mean_transactions,max_transactions,min_transactions,median_transactions,sum_transactions,max_minus_min_transactions,n_transactions,n_transactions_bigger_mean,n_online_articles,n_unique_articles,...,top_index_name_2,top_index_group_name_0,top_index_group_name_1,top_index_group_name_2,top_section_name_0,top_section_name_1,top_section_name_2,top_garment_group_name_0,top_garment_group_name_1,top_garment_group_name_2
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0.040661,0.050831,0.030492,0.040661,0.081322,0.020339,2,1,2,2,...,0,0,0,0,0,0,0,0,0,0
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0.017271,0.020322,0.015237,0.016932,0.086356,0.005085,5,1,5,5,...,2,2,2,2,2,2,2,2,2,2
00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,0.038119,0.053373,0.030492,0.030492,0.190593,0.022881,5,2,1,5,...,0,0,0,0,0,0,0,0,0,0
0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,0.021424,0.022525,0.020322,0.021424,0.042847,0.002203,2,1,2,2,...,2,2,2,2,0,0,0,2,2,2
000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,0.023768,0.042356,0.008458,0.016932,0.713051,0.033898,30,14,30,15,...,18,18,18,18,2,2,2,2,2,2


In [84]:
# アイテム特徴量
item_features.head()

Unnamed: 0_level_0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,...,product_group_name_3,graphical_appearance_name_3,colour_group_name_3,perceived_colour_value_name_3,perceived_colour_master_name_3,department_name_3,index_name_3,index_group_name_3,section_name_3,garment_group_name_3
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108775015,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,0,1
108775051,0,0,1,2,2,1,0,0,0,0,...,1,0,0,1,1,0,1,1,0,1
110065001,1,1,0,0,0,0,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0
110065002,1,1,0,1,1,1,1,1,0,1,...,0,1,1,1,1,0,0,1,0,0
111586001,4,3,0,0,0,0,2,1,0,2,...,1,1,1,1,1,0,0,1,0,0


In [85]:
# 過去4週間のトランザクションをベースラインとして使用
df_4w = transactions[transactions['t_dat'] >= pd.to_datetime('2020-08-10')].copy()
df_3w = transactions[transactions['t_dat'] >= pd.to_datetime('2020-08-17')].copy()
df_2w = transactions[transactions['t_dat'] >= pd.to_datetime('2020-08-24')].copy()
df_1w = transactions[transactions['t_dat'] >= pd.to_datetime('2020-08-31')].copy()

↑容量の許す限りいろんな期間で試してもいいかも

In [86]:
# 質的カラムを量的カラムに置き換えていく
# とりあえず質的カラム確認
user_features[['club_member_status', 'fashion_news_frequency']]

Unnamed: 0_level_0,club_member_status,fashion_news_frequency
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,ACTIVE,NONE
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,ACTIVE,Regularly
00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,ACTIVE,NONE
0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,ACTIVE,Regularly
000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,ACTIVE,NONE
...,...,...
365bfd22ce4b8aabeb84c847266a80a225b04f188629bf3a7f912e8545ac7643,ACTIVE,Regularly
365d866c4dcfcdf9c7abe4ccd9a0c3beb8e145db7a952424469b865a82da1eb3,ACTIVE,Regularly
36689f5d8b67b19a42fff664ad5e39ca4288bc264e72f7603c5b764b4b3f0b2b,ACTIVE,Regularly
366b89e56112858fd747d2b8ec9519b8c7856d34e546eb827e46ebc37613dab1,ACTIVE,NONE


In [87]:
# 実際に置き換え
user_features[['club_member_status', 'fashion_news_frequency']] = (
                   user_features[['club_member_status', 'fashion_news_frequency']]
                   .apply(lambda x: pd.factorize(x)[0])
).astype('int8')

In [88]:
# 置き換えできたか確認
user_features[['club_member_status', 'fashion_news_frequency']]

Unnamed: 0_level_0,club_member_status,fashion_news_frequency
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0,0
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0,1
00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,0,0
0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,0,1
000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,0,0
...,...,...
365bfd22ce4b8aabeb84c847266a80a225b04f188629bf3a7f912e8545ac7643,0,1
365d866c4dcfcdf9c7abe4ccd9a0c3beb8e145db7a952424469b865a82da1eb3,0,1
36689f5d8b67b19a42fff664ad5e39ca4288bc264e72f7603c5b764b4b3f0b2b,0,1
366b89e56112858fd747d2b8ec9519b8c7856d34e546eb827e46ebc37613dab1,0,0


In [89]:
# ここでユーザー＆アイテム特徴量をトランザクションにマージ
transactions = (
    transactions
    .merge(user_features, on = ('customer_id'))
    .merge(item_features, on = ('article_id'))
)
transactions.sort_values(['t_dat', 'customer_id'], inplace=True)

In [90]:
# トランザクションの中身確認
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36568 entries, 0 to 7790
Data columns (total 88 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   t_dat                               36568 non-null  datetime64[ns]
 1   customer_id                         36568 non-null  object        
 2   article_id                          36568 non-null  int64         
 3   price                               36568 non-null  float64       
 4   sales_channel_id                    36568 non-null  int64         
 5   mean_transactions                   36568 non-null  float32       
 6   max_transactions                    36568 non-null  float32       
 7   min_transactions                    36568 non-null  float32       
 8   median_transactions                 36568 non-null  float32       
 9   sum_transactions                    36568 non-null  float32       
 10  max_minus_min_transacti

In [91]:
# メモリ対策で一旦1万行のみ抽出
n_rows = 10000

train = transactions.loc[transactions.t_dat <= pd.to_datetime('2020-08-31')].iloc[:n_rows]
valid = transactions.loc[transactions.t_dat >= pd.to_datetime('2020-09-01')]

↑容量の許す限り抽出行を増やしていいかも

In [92]:
# 学習データ、検証データの形確認
train.shape, valid.shape

((10000, 88), (39, 88))

↓↓↓ここから候補の準備↓↓↓

In [93]:
#検証用アルゴリズム
names = ['Alice', 'Bob', 'Charlie','Alice']
ages = [24, 50, 18,24]
test_dict = {}

for i, (name, age) in enumerate(zip(names, ages)):
    print(i, name, age)
    if name not in test_dict:
        test_dict[name] = {}
    
    if age not in test_dict[name]:
        test_dict[name][age] = 0
    
    test_dict[name][age] += 1
test_dict

0 Alice 24
1 Bob 50
2 Charlie 18
3 Alice 24


{'Alice': {24: 2}, 'Bob': {50: 1}, 'Charlie': {18: 1}}

In [94]:
# 4週間
purchase_dict_4w = {}

for i,x in enumerate(zip(df_4w['customer_id'], df_4w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_4w:
        purchase_dict_4w[cust_id] = {}
    
    if art_id not in purchase_dict_4w[cust_id]:
        purchase_dict_4w[cust_id][art_id] = 0
    
    purchase_dict_4w[cust_id][art_id] += 1

dummy_list_4w = list((df_4w['article_id'].value_counts()).index)[:12]

In [95]:
# 3週間
purchase_dict_3w = {}

for i,x in enumerate(zip(df_3w['customer_id'], df_3w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_3w:
        purchase_dict_3w[cust_id] = {}
    
    if art_id not in purchase_dict_3w[cust_id]:
        purchase_dict_3w[cust_id][art_id] = 0
    
    purchase_dict_3w[cust_id][art_id] += 1

dummy_list_3w = list((df_3w['article_id'].value_counts()).index)[:12]

In [96]:
# 2週間
purchase_dict_2w = {}

for i,x in enumerate(zip(df_2w['customer_id'], df_2w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_2w:
        purchase_dict_2w[cust_id] = {}
    
    if art_id not in purchase_dict_2w[cust_id]:
        purchase_dict_2w[cust_id][art_id] = 0
    
    purchase_dict_2w[cust_id][art_id] += 1

dummy_list_2w = list((df_2w['article_id'].value_counts()).index)[:12]

In [97]:
# 1週間
purchase_dict_1w = {}

for i,x in enumerate(zip(df_1w['customer_id'], df_1w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_1w:
        purchase_dict_1w[cust_id] = {}
    
    if art_id not in purchase_dict_1w[cust_id]:
        purchase_dict_1w[cust_id][art_id] = 0
    
    purchase_dict_1w[cust_id][art_id] += 1

dummy_list_1w = list((df_1w['article_id'].value_counts()).index)[:12]

顧客が特定の週で最も購入している商品(特定顧客ベース)上位12をトレーニングデータに設定。  
12の商品がなかった場合、特定の週で最も購入された商品(特定の週の全取引情報ベース)上位12のデータで不足を保管。

In [98]:
def prepare_candidates(customers_id, n_candidates = 12):
  prediction_dict = {}
  dummy_list = list((df_2w['article_id'].value_counts()).index)[:n_candidates]

  for i, cust_id in tqdm(enumerate(customers_id)):
    # comment this for validation
    if cust_id in purchase_dict_1w:
        # 顧客が購入したアイテムの回数のデータを参照して、降順に並び替える
        l = sorted((purchase_dict_1w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        # 降順に並び替えたリストから、アイテムIDを配列で取得
        l = [y[0] for y in l]
        # 予測アイテム数の上限よりもアイテムID数が多かった場合、予測アイテム数の上限までのアイテムIDのリスト要素を取得
        if len(l)>n_candidates:
            s = l[:n_candidates]
            # 予測アイテム数の上限よりもアイテムID数が少なかった場合、ダミーの値で保管
            # ダミーの値の中身は、その週に最も購入された上位12の商品
        else:
            s = l+dummy_list_1w[:(n_candidates-len(l))]
    elif cust_id in purchase_dict_2w:
        l = sorted((purchase_dict_2w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>n_candidates:
            s = l[:n_candidates]
        else:
            s = l+dummy_list_2w[:(n_candidates-len(l))]
    elif cust_id in purchase_dict_3w:
        l = sorted((purchase_dict_3w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>n_candidates:
            s = l[:n_candidates]
        else:
            s = l+dummy_list_3w[:(n_candidates-len(l))]
    elif cust_id in purchase_dict_4w:
        l = sorted((purchase_dict_4w[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>n_candidates:
            s = l[:n_candidates]
        else:
            s = l+dummy_list_4w[:(n_candidates-len(l))]
    else:
        s = dummy_list
    prediction_dict[cust_id] = s

  k = list(map(lambda x: x[0], prediction_dict.items()))
  v = list(map(lambda x: x[1], prediction_dict.items()))
  negatives_df = pd.DataFrame({'customer_id': k, 'negatives': v})
  negatives_df = (
      negatives_df
      .explode('negatives')
      .rename(columns = {'negatives': 'article_id'})
  )
  return negatives_df

In [99]:
# trainモデル
train['rank'] = range(len(train))
train.assign(rn = train.groupby(['customer_id'])['rank'].rank(method='first', ascending=False))

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,mean_transactions,max_transactions,min_transactions,median_transactions,sum_transactions,...,colour_group_name_3,perceived_colour_value_name_3,perceived_colour_master_name_3,department_name_3,index_name_3,index_group_name_3,section_name_3,garment_group_name_3,rank,rn
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,0.040661,0.050831,0.030492,0.040661,0.081322,...,1,1,1,0,0,1,0,0,0,2.0
4,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,0.040661,0.050831,0.030492,0.040661,0.081322,...,0,1,0,0,0,1,0,0,1,1.0
108,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,0.017271,0.020322,0.015237,0.016932,0.086356,...,0,0,0,0,1,1,0,0,2,5.0
111,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,0.017271,0.020322,0.015237,0.016932,0.086356,...,0,0,0,0,1,1,1,0,3,4.0
169,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,0.017271,0.020322,0.015237,0.016932,0.086356,...,0,1,0,0,1,1,1,0,4,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5977,2018-09-20,366b89e56112858fd747d2b8ec9519b8c7856d34e546eb...,671057002,0.003373,2,0.003373,0.003373,0.003373,0.003373,0.003373,...,0,0,0,0,1,1,1,1,9995,1.0
14967,2018-09-20,366d7effce9489934c3317a8e4e1828622d51f30cee29c...,305304008,0.011847,2,0.017356,0.027102,0.006763,0.017780,0.069424,...,0,1,1,0,1,1,1,0,9996,4.0
28830,2018-09-20,366d7effce9489934c3317a8e4e1828622d51f30cee29c...,587782001,0.027102,2,0.017356,0.027102,0.006763,0.017780,0.069424,...,0,0,0,0,1,1,1,0,9997,3.0
35592,2018-09-20,366d7effce9489934c3317a8e4e1828622d51f30cee29c...,671777002,0.006763,2,0.017356,0.027102,0.006763,0.017780,0.069424,...,1,1,1,0,1,0,0,1,9998,2.0


In [100]:
#take only last 15 transactions
#トレーニングデータの長さ分の数値を格納
train['rank'] = range(len(train))
#カスタマーごとに最新の15のトランザクションをトレーニングデータとして扱う
train = (
    train
    .assign(
        rn = train.groupby(['customer_id'])['rank']
                  .rank(method='first', ascending=False))
    .query("rn <= 15")
    .drop(columns = ['price', 'sales_channel_id'])
    .sort_values(['t_dat', 'customer_id'])
)
train['label'] = 1

del train['rank']
del train['rn']

valid.sort_values(['t_dat', 'customer_id'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [101]:
#カスタマーごとに最新の購入日を取得
last_dates = (
    train
    .groupby('customer_id')['t_dat']
    .max()
    .to_dict()
)

negatives = prepare_candidates(train['customer_id'].unique(), 15)
negatives['t_dat'] = negatives['customer_id'].map(last_dates)

negatives = (
    negatives
    .merge(user_features, on = ('customer_id'))
    .merge(item_features, on = ('article_id'))
)
negatives['label'] = 0

2954it [00:00, 576170.67it/s]


In [102]:
negatives

Unnamed: 0,customer_id,article_id,t_dat,mean_transactions,max_transactions,min_transactions,median_transactions,sum_transactions,max_minus_min_transactions,n_transactions,...,graphical_appearance_name_3,colour_group_name_3,perceived_colour_value_name_3,perceived_colour_master_name_3,department_name_3,index_name_3,index_group_name_3,section_name_3,garment_group_name_3,label
0,001127bffdda108579e6cb16080440e89bf1250a776c6e...,610776002,2018-09-20,0.033881,0.033881,0.033881,0.033881,0.033881,0.000000,1,...,1,1,1,1,0,1,1,0,1,0
1,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,610776002,2018-09-20,0.011000,0.011847,0.010153,0.011000,0.022000,0.001695,2,...,1,1,1,1,0,1,1,0,1,0
2,0074c5948b6c96e7522f6f5c034b46cf08875b9a81c557...,610776002,2018-09-20,0.042356,0.042356,0.042356,0.042356,0.042356,0.000000,1,...,1,1,1,1,0,1,1,0,1,0
3,00796ce0bc561897e7047a7b059867aa6424f63ec597e1...,610776002,2018-09-20,0.059305,0.059305,0.059305,0.059305,0.059305,0.000000,1,...,1,1,1,1,0,1,1,0,1,0
4,00b85712d5e677f7ae3bfa9f9b5780a6eb328e99aeea63...,610776002,2018-09-20,0.022017,0.022017,0.022017,0.022017,0.022017,0.000000,1,...,1,1,1,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,31450ad0f1788b63e44094b51b57558d22365ef4efb247...,399256001,2018-09-20,0.013564,0.028966,0.006763,0.010153,0.108508,0.022203,8,...,1,1,1,1,0,1,1,0,0,0
306,34bb5eff98255234fc9b543583cb88fab242ae4a938790...,400285006,2018-09-20,0.038401,0.047441,0.033881,0.033881,0.115203,0.013559,3,...,0,1,0,1,0,1,1,0,0,0
307,35164d04b16f43155283f3af6572195f9ba4be2e53ae98...,372860024,2018-09-20,0.023797,0.034305,0.010153,0.030492,0.118983,0.024153,5,...,1,0,0,0,0,0,1,0,0,0
308,35460dd0cc3e0d5c435db9895476a0b4b9289979fcd8ba...,554598001,2018-09-20,0.025407,0.025407,0.025407,0.025407,0.025407,0.000000,1,...,1,1,1,1,0,1,1,0,1,0


In [103]:
train = pd.concat([train, negatives])
train.sort_values(['customer_id', 't_dat'], inplace = True)

【メモ】  
LGBMRankerは、groupプロパティに「どこからどこまでの配列が一人の顧客がどの商品購入したトランザクションデータなのか」を伝える必要があるので上記で、カスタマーIDでソートして、以下の処理で各カスタマーIDがどの商品を何回購入したかの回数を取得する。  
その回数を配列にすることにより、「どこからどこまでの配列が一人の顧客がどの商品購入したトランザクションデータなのか」のデータ形式を満たすことができる。

In [104]:
train_baskets = train.groupby(['customer_id'])['article_id'].count().values

In [105]:
train_baskets

array([2, 5, 5, ..., 9, 1, 4])

↓↓↓学習↓↓↓

In [106]:
# LGBMRankerモデルで学習
ranker = lightgbm.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    max_depth=7,
    n_estimators=300,
    importance_type='gain',
    verbose=10
)

In [107]:
ranker = ranker.fit(
    train.drop(columns = ['t_dat', 'customer_id', 'article_id', 'label']),
    train.pop('label'),
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.842996
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.458185
[LightGBM] [Debug] init for col-wise cost 0.001434 seconds, init for row-wise cost 0.003263 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 3324
[LightGBM] [Info] Number of data points in the train set: 10060, number of used features: 82
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and dep

In [109]:
ranker

In [112]:
# 予測
candidates = prepare_candidates(sample_submission.customer_id.unique(), 12)
candidates = (
    candidates
    .merge(user_features, on = ('customer_id'))
    .merge(item_features, on = ('article_id'))
)

1362281it [00:05, 234402.62it/s]


In [113]:
preds = []
batch_size = 10000
# 10000行ごとにcandidatesを取り出し予測
# 予測結果はpredsに格納
for bucket in tqdm(range(0, len(candidates), batch_size)):
  outputs = ranker.predict(
      candidates.iloc[bucket: bucket+batch_size]
      .drop(columns = ['customer_id', 'article_id'])
      )
  preds.append(outputs)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 45.21it/s]


In [114]:
preds

[array([-2.53203175, -1.85842746, -4.89109683, -5.78492282, -5.84393092,
        -5.5286658 , -5.72628309, -5.55064068, -5.03735725, -5.34002309,
        -5.87968882, -5.47222629, -5.63930273, -5.57595375, -5.11878375,
        -5.11055765, -5.02671245, -5.48758535, -5.41404797, -5.06232197,
        -5.74845228, -5.59489236, -5.49338316, -5.08540016, -5.76909866,
        -5.22848878, -4.60982083, -4.90839129, -5.93075759, -5.54999631,
        -5.14447902, -5.10001797, -5.69475145, -5.27870112, -5.72157189,
        -5.92260856, -5.66227815, -5.79329436, -5.7577173 , -5.78348376,
        -5.88044145, -5.05458948, -5.81746295, -5.20467462, -5.44396964,
        -5.81872769, -4.78540403, -5.75219541, -5.94704971, -4.79584572,
        -5.0536797 , -5.79412599, -5.6009118 , -5.85035152, -5.15199679,
        -5.20257039, -5.84024797, -5.35195364, -5.09709712, -5.05696651,
        -5.82652603, -5.17008148, -4.90839858, -5.87542957, -5.68548869,
        -5.62624258, -5.60179202, -5.49896377, -5.6

In [115]:
preds = np.concatenate(preds)
preds

array([-2.53203175, -1.85842746, -4.89109683, -5.78492282, -5.84393092,
       -5.5286658 , -5.72628309, -5.55064068, -5.03735725, -5.34002309,
       -5.87968882, -5.47222629, -5.63930273, -5.57595375, -5.11878375,
       -5.11055765, -5.02671245, -5.48758535, -5.41404797, -5.06232197,
       -5.74845228, -5.59489236, -5.49338316, -5.08540016, -5.76909866,
       -5.22848878, -4.60982083, -4.90839129, -5.93075759, -5.54999631,
       -5.14447902, -5.10001797, -5.69475145, -5.27870112, -5.72157189,
       -5.92260856, -5.66227815, -5.79329436, -5.7577173 , -5.78348376,
       -5.88044145, -5.05458948, -5.81746295, -5.20467462, -5.44396964,
       -5.81872769, -4.78540403, -5.75219541, -5.94704971, -4.79584572,
       -5.0536797 , -5.79412599, -5.6009118 , -5.85035152, -5.15199679,
       -5.20257039, -5.84024797, -5.35195364, -5.09709712, -5.05696651,
       -5.82652603, -5.17008148, -4.90839858, -5.87542957, -5.68548869,
       -5.62624258, -5.60179202, -5.49896377, -5.68891883, -4.68

In [116]:
candidates['preds'] = preds
candidates['preds']

0     -2.532032
1     -1.858427
2     -4.891097
3     -5.784923
4     -5.843931
         ...   
161   -2.013736
162   -1.738727
163   -2.311533
164   -2.789873
165   -4.577373
Name: preds, Length: 166, dtype: float64

In [117]:
preds = candidates[['customer_id', 'article_id', 'preds']]
preds

Unnamed: 0,customer_id,article_id,preds
0,00b85712d5e677f7ae3bfa9f9b5780a6eb328e99aeea63...,372860001,-2.532032
1,053070ffd27651239c41ad9011888e0b7d9fc0892de672...,372860001,-1.858427
2,0137b87739a796f65396d8483173f66318039d19a2583f...,610776002,-4.891097
3,01393ee2d1e9494965b723a22d3b09d992a1b450d17621...,610776002,-5.784923
4,0218014619321c250395c5cb9bc9f5323745113f61f3a1...,610776002,-5.843931
...,...,...,...
161,31450ad0f1788b63e44094b51b57558d22365ef4efb247...,399256001,-2.013736
162,34bb5eff98255234fc9b543583cb88fab242ae4a938790...,400285006,-1.738727
163,35164d04b16f43155283f3af6572195f9ba4be2e53ae98...,372860024,-2.311533
164,35460dd0cc3e0d5c435db9895476a0b4b9289979fcd8ba...,554598001,-2.789873


In [118]:
preds.sort_values(['customer_id', 'preds'], ascending=False, inplace = True)
preds

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,customer_id,article_id,preds
77,36689f5d8b67b19a42fff664ad5e39ca4288bc264e72f7...,610776002,-4.862863
76,363c61c833d01887ce6f01ec8433591aee518dc76fd7a8...,610776002,-5.052750
165,35fbc75b992051d5fae144bff02808cb415450441f9a63...,573937001,-4.577373
75,35f4fdf1a87470f173f166e9b1e21f5b97a0db98eace57...,610776002,-5.505200
108,3591ef5a968c84c0d0820ba6db16b5e36559974fa777fc...,664074001,-2.371792
...,...,...,...
4,0218014619321c250395c5cb9bc9f5323745113f61f3a1...,610776002,-5.843931
78,013a3d7d6d974a818bcb8c5181b4f5016f8973550f42e1...,153115019,-2.638458
3,01393ee2d1e9494965b723a22d3b09d992a1b450d17621...,610776002,-5.784923
2,0137b87739a796f65396d8483173f66318039d19a2583f...,610776002,-4.891097


In [119]:
preds = (
    preds
    .groupby('customer_id')[['article_id']]
    .aggregate(lambda x: x.tolist())
)
preds

Unnamed: 0_level_0,article_id
customer_id,Unnamed: 1_level_1
00b85712d5e677f7ae3bfa9f9b5780a6eb328e99aeea633f6e8a3df5589d6486,[372860001]
0137b87739a796f65396d8483173f66318039d19a2583f33f9014a6b6f38719e,[610776002]
01393ee2d1e9494965b723a22d3b09d992a1b450d17621c4c970ef7faf4d0d8c,[610776002]
013a3d7d6d974a818bcb8c5181b4f5016f8973550f42e1049b86053643ace6e2,[153115019]
0218014619321c250395c5cb9bc9f5323745113f61f3a136da2f3762df5930c9,[610776002]
...,...
3591ef5a968c84c0d0820ba6db16b5e36559974fa777fcda230baeb586bb9d22,[664074001]
35f4fdf1a87470f173f166e9b1e21f5b97a0db98eace577b1ecdccd7ff77b3a6,[610776002]
35fbc75b992051d5fae144bff02808cb415450441f9a637c45a0dc4c9efecc3f,[573937001]
363c61c833d01887ce6f01ec8433591aee518dc76fd7a863d8e7dbbca3f1f693,[610776002]


In [120]:
preds['article_id'] = preds['article_id'].apply(lambda x: ' '.join(['0'+str(k) for k in x]))
preds['article_id'] 

customer_id
00b85712d5e677f7ae3bfa9f9b5780a6eb328e99aeea633f6e8a3df5589d6486    0372860001
0137b87739a796f65396d8483173f66318039d19a2583f33f9014a6b6f38719e    0610776002
01393ee2d1e9494965b723a22d3b09d992a1b450d17621c4c970ef7faf4d0d8c    0610776002
013a3d7d6d974a818bcb8c5181b4f5016f8973550f42e1049b86053643ace6e2    0153115019
0218014619321c250395c5cb9bc9f5323745113f61f3a136da2f3762df5930c9    0610776002
                                                                       ...    
3591ef5a968c84c0d0820ba6db16b5e36559974fa777fcda230baeb586bb9d22    0664074001
35f4fdf1a87470f173f166e9b1e21f5b97a0db98eace577b1ecdccd7ff77b3a6    0610776002
35fbc75b992051d5fae144bff02808cb415450441f9a637c45a0dc4c9efecc3f    0573937001
363c61c833d01887ce6f01ec8433591aee518dc76fd7a863d8e7dbbca3f1f693    0610776002
36689f5d8b67b19a42fff664ad5e39ca4288bc264e72f7603c5b764b4b3f0b2b    0610776002
Name: article_id, Length: 152, dtype: object

In [121]:
preds = sample_submission[['customer_id']].merge(
    preds
    .reset_index()
    .rename(columns = {'article_id': 'prediction'}), how = 'left')
preds['prediction'].fillna(' '.join(['0'+str(art) for art in dummy_list_2w]), inplace = True)

In [122]:
preds.to_csv('submisssion_ranking.csv', index = False)