In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root folder of the competition input files.
DATA_DIR = "/kaggle/input/h-and-m-personalized-fashion-recommendations/"

# Read the three raw tables in one pass: transactions, article metadata and
# customer metadata (in that order).
df, article_df, customer_df = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in ("transactions_train.csv", "articles.csv", "customers.csv")
)

In [3]:
# Parse the transaction dates so date arithmetic and the `.dt` accessor work.
df["t_dat"] = pd.to_datetime(df["t_dat"])

# Build a date -> week_no lookup with one row per distinct transaction date.
# The dates are sorted explicitly because the change detection below compares
# every date with the row before it and is only meaningful in time order.
date_week_df = (
    df.drop_duplicates("t_dat")[["t_dat"]]
    .sort_values("t_dat")
    .reset_index(drop=True)
)

# Adding 5 days before taking the ISO week (Monday-based) makes each week run
# from Wednesday to Tuesday. `Series.dt.week` is deprecated (removed in
# pandas 2.0), so `dt.isocalendar().week` is used instead; the cast to int64
# lets the difference go negative at a year rollover.
shifted_iso_week = (
    (date_week_df["t_dat"] + pd.DateOffset(days=5))
    .dt.isocalendar()
    .week.astype("int64")
)

# week_no starts at 0 on the first date and increases by one every time the
# shifted ISO week differs from that of the previous date.
date_week_df["week_no"] = shifted_iso_week.diff(1).fillna(0).ne(0).cumsum()

# Attach week_no to every transaction and order rows by date, then customer.
df = pd.merge(df, date_week_df, on="t_dat", how="left")
df = df.sort_values(['t_dat', 'customer_id'])

# Keep only the 100 most recent weeks.
df = df[df.week_no > df.week_no.max() - 100].reset_index(drop=True)

# NOTE(review): test_week is the last week present in the data, not the week
# after it - confirm this is the intended label for "no later purchase".
test_week = df.week_no.max()

In [4]:
# Article attributes that are copied onto every transaction row.
article_columns = [
    "garment_group_no",
    "product_code",
    "product_type_no",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_group_no",
    "section_no",
]

# Left join keeps every transaction even if its article has no metadata row.
df = df.merge(
    article_df[["article_id", *article_columns]],
    on='article_id',
    how='left',
)

In [5]:
# Weeks in which each customer bought something (distinct values, in the
# order they first appear in df).
customer2weeks = df.groupby('customer_id')['week_no'].unique()

# For every customer, pair each purchase week (the week the features come
# from) with that customer's next purchase week (the week the features are
# labelled with). The customer's last purchase week is paired with test_week.
customer2weekpairs = {
    c_id: {**dict(zip(weeks[:-1], weeks[1:])), weeks[-1]: test_week}
    for c_id, weeks in customer2weeks.items()
}

# feature engineering

In [6]:
def join(df):
    """Return the elements of ``df`` as one space-separated string.

    Every element is converted with ``str`` first, so any iterable of values
    (for example a pandas Series of ids) can be turned into a "document".
    """
    return " ".join(map(str, df))

def label_prev_week(input_df, week_pairs=None):
    """Replace each row's ``week_no`` with the week paired to it for that customer.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``customer_id`` and ``week_no`` columns. The frame is
        modified in place.
    week_pairs : dict, optional
        Nested mapping ``{customer_id: {week_no: paired_week}}``. When omitted,
        the module-level ``customer2weekpairs`` is used.

    Returns
    -------
    pandas.DataFrame
        The same ``input_df`` object, with ``week_no`` overwritten.

    Raises
    ------
    KeyError
        If a (customer_id, week_no) combination is missing from the mapping.
    """
    if week_pairs is None:
        # Looked up at call time so the notebook-level mapping is picked up.
        week_pairs = customer2weekpairs

    # Item assignment is used instead of attribute assignment so the write
    # always targets the column.
    input_df["week_no"] = [
        week_pairs[c_id][week]
        for c_id, week in zip(input_df["customer_id"], input_df["week_no"])
    ]
    return input_df

In [7]:
class AbstractBaseBlock:
    """Base class for feature blocks.

    Subclasses must implement ``transform``; the default ``fit`` simply
    forwards its input to ``transform`` and returns the result.
    """

    def fit(self, input_df, y=None):
        """Run ``transform`` on ``input_df`` and return its output (``y`` is unused)."""
        transformed = self.transform(input_df)
        return transformed

    def transform(self, input_df):
        """Produce the features for ``input_df``; must be overridden."""
        raise NotImplementedError
        
class UserHistoryEveryBlock(AbstractBaseBlock):
    """Weekly per-customer statistics of one numeric column (per week, not cumulative).

    ``fit`` groups the input by ``customer_id`` and ``week_no``, computes the
    sum, standard deviation, maximum and minimum of the column named by
    ``item``, and passes the result through ``label_prev_week`` so that
    ``week_no`` is replaced by the week the features are attached to.
    Output columns are named ``u_prev_<item>_<stat>``.
    """

    def __init__(self, item):
        # Name of the numeric column to aggregate.
        self.item = item

    def fit(self, input_df, y=None):
        weekly_values = input_df.groupby(['customer_id', 'week_no'])[self.item]
        stats = weekly_values.agg(["sum", "std", "max", "min"])
        stats = stats.add_prefix("u_prev_" + self.item + "_")
        relabelled = label_prev_week(stats.reset_index())
        return self.transform(relabelled)

    def transform(self, input_df):
        # The features are fully built in ``fit``; nothing left to do here.
        return input_df
    
class UserHistoryVectorEveryBlock(AbstractBaseBlock):
    """Weekly per-customer embedding of a categorical column (per week, not cumulative).

    For every (customer_id, week_no) group the values of ``item`` are joined
    into a space-separated "document". The documents are TF-IDF vectorised and
    reduced with TruncatedSVD to ``n_components`` dimensions. The result is
    passed through ``label_prev_week`` so that ``week_no`` is replaced by the
    week the features are attached to. Output columns are named
    ``u_<item>_<i>``.

    Parameters
    ----------
    item : str
        Name of the column whose values form the documents.
    n_components : int, default 5
        Number of SVD components (output feature columns).
    """

    def __init__(self, item, n_components = 5):
        self.item = item
        self.n_components = n_components

    def fit(self, input_df, y=None):
        # One document per customer and week: the purchased values of `item`.
        docs = input_df.groupby(["customer_id", "week_no"])[self.item].apply(join)

        # Vocabulary is capped at 80% of the distinct values; TfidfVectorizer
        # requires a positive max_features, hence the lower bound of 1.
        max_features = max(1, int(input_df[self.item].nunique() * 0.8))

        # The documents are whitespace-delimited ids, so every non-space run is
        # a token. The default token_pattern only keeps tokens of two or more
        # word characters and would silently drop single-character ids.
        tv = TfidfVectorizer(max_features=max_features, token_pattern=r"(?u)\S+")
        X = tv.fit_transform(docs)

        svd = TruncatedSVD(n_components=self.n_components, random_state=0)
        X = svd.fit_transform(X)

        # Re-attach the (customer_id, week_no) keys, then turn them into columns.
        result = pd.DataFrame(
            X,
            columns=[f"u_{self.item}_{i}" for i in range(self.n_components)],
            index=docs.index,
        ).reset_index()
        result = label_prev_week(result)
        return self.transform(result)

    def transform(self, input_df):
        # The features are fully built in ``fit``; nothing left to do here.
        return input_df
    
class UserItemHistoryEveryBlock(AbstractBaseBlock):
    """Weekly price statistics per customer and item group (per week, not cumulative).

    ``fit`` groups the input by ``customer_id``, ``week_no`` and the column
    named by ``item``, computes the sum, standard deviation, maximum and
    minimum of ``price``, and passes the result through ``label_prev_week`` so
    that ``week_no`` is replaced by the week the features are attached to.
    Output columns are named ``ui_prev_<item>_<stat>``.
    """

    def __init__(self, item):
        # Name of the item-level grouping column (e.g. a garment group id).
        self.item = item

    def fit(self, input_df, y=None):
        group_keys = ['customer_id', 'week_no', self.item]
        price_stats = input_df.groupby(group_keys)["price"].agg(["sum", "std", "max", "min"])
        price_stats = price_stats.add_prefix("ui_prev_" + self.item + "_")
        relabelled = label_prev_week(price_stats.reset_index())
        return self.transform(relabelled)

    def transform(self, input_df):
        # The features are fully built in ``fit``; nothing left to do here.
        return input_df

# generate and output

In [8]:
# user_features
# Every block yields one row per (customer_id, week_no); the first block's
# output is the base table and the remaining blocks are left-joined onto it.
feature_blocks = [
    UserHistoryEveryBlock("price"),
    UserHistoryVectorEveryBlock("section_no"),
    UserHistoryVectorEveryBlock("garment_group_no"),
]

first_block, *other_blocks = feature_blocks
user_features_df = first_block.fit(df)
for block in other_blocks:
    user_features_df = user_features_df.merge(
        block.fit(df), on=["customer_id", "week_no"], how="left"
    )

# Persist the table, then drop the reference to the large frame.
user_features_df.to_csv("user_features.csv", index=False)
del user_features_df

In [9]:
# item_features

In [10]:
# user_item_features
# Weekly price statistics per customer and garment group, written straight to
# disk without keeping a reference to the resulting frame.
UserItemHistoryEveryBlock("garment_group_no").fit(df).to_csv(
    "user_item_features_garment_group.csv", index=False
)