- consider only item features
- modify functions for predictions of cold users
- pickle als model and user, item mapping
- remove cold prediction

In [1]:
!pip install similaripy

Collecting similaripy
  Downloading similaripy-0.1.2.tar.gz (331 kB)
     |████████████████████████████████| 331 kB 281 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: similaripy
  Building wheel for similaripy (setup.py) ... [?25l- \ | / - \ | / - done
[?25h  Created wheel for similaripy: filename=similaripy-0.1.2-cp37-cp37m-linux_x86_64.whl size=2040891 sha256=340225fc67b23764ab8969bbd67ac16a612b6494c0cfe00ace461461d950621b
  Stored in directory: /root/.cache/pip/wheels/2e/81/4b/7a396ebc45534553ddee1eb106d99023df4d9d09b53b13cc66
Successfully built similaripy
Installing collected packages: similaripy
Successfully installed similaripy-0.1.2


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import preprocessing
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import implicit
import similaripy
import pickle

import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# deprecation warnings raised by the libraries used below; narrow the filter if
# those warnings matter.
warnings.filterwarnings('ignore')

# Register tqdm's progress-bar variants of apply/map on pandas objects.
tqdm.pandas()

In [3]:
# Location of the competition input files; every CSV below is read from here.
DATA_DIR = "/kaggle/input/h-and-m-personalized-fashion-recommendations/"

# Raw inputs: transaction log, article metadata, customer metadata and the
# sample submission file.
df = pd.read_csv(f"{DATA_DIR}transactions_train.csv")
article_df = pd.read_csv(f"{DATA_DIR}articles.csv")
customer_df = pd.read_csv(f"{DATA_DIR}customers.csv")
sub = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

In [4]:
# Parse the transaction date so date arithmetic and the `.dt` accessor work.
df["t_dat"] = pd.to_datetime(df["t_dat"])

# Build a lookup from calendar date to a sequential, zero-based week index.
# The unique dates are sorted explicitly: the boundary detection below compares
# each date with its predecessor and is only correct in chronological order.
date_week_df = (
    df[["t_dat"]]
    .drop_duplicates()
    .sort_values("t_dat")
    .reset_index(drop=True)
)

# Shifting by 5 days before taking the ISO week (which starts on Monday) moves
# the week boundary to Wednesday, so each week_no covers Wednesday..Tuesday.
# `.dt.isocalendar().week` replaces the deprecated `.dt.week`; it is cast to a
# signed integer so the year-end wrap (e.g. 52 -> 1) yields a plain negative diff.
shifted_iso_week = (
    (date_week_df["t_dat"] + pd.DateOffset(days=5))
    .dt.isocalendar()
    .week
    .astype("int64")
)

# A new week starts wherever the shifted ISO week differs from the previous
# date's value. The first date has no predecessor (diff is NaN -> 0), so it
# stays in week 0. The running count of boundaries is the week index.
date_week_df["week_no"] = shifted_iso_week.diff().fillna(0).ne(0).cumsum()

# Attach week_no to every transaction.
df = pd.merge(df, date_week_df, on="t_dat", how="left")

# Sorted in place to avoid holding a second copy of the transaction frame.
df.sort_values(['t_dat', 'customer_id'], inplace=True)

# The most recent week in the data and the most recent transaction date.
test_week = df.week_no.max()
last_ts = df['t_dat'].max()

# als model

In [5]:
# for validation
# Train ALS only on the weeks strictly before `test_week`.
tmp = df.loc[df["week_no"] < test_week].reset_index(drop=True)

# als model
# Assign dense zero-based indices to customers and articles in order of first
# appearance; the maps translate raw ids -> matrix row / column positions.
all_users = tmp["customer_id"].unique().tolist()
all_items = tmp["article_id"].unique().tolist()

user_map = {customer: position for position, customer in enumerate(all_users)}
item_map = {article: position for position, article in enumerate(all_items)}

# One entry of weight 1.0 per transaction row, laid out as (users x items).
row = tmp["customer_id"].map(user_map).to_numpy()
col = tmp["article_id"].map(item_map).to_numpy()
data = np.ones(len(tmp))
coo = coo_matrix((data, (row, col)), shape=(len(all_users), len(all_items)))

# NOTE(review): the matrix is users x items; which orientation `fit` expects
# depends on the installed `implicit` version - confirm against its docs.
n_factors = 5
als_model = implicit.als.AlternatingLeastSquares(
    factors=n_factors,
    iterations=5,
    regularization=0.1,
    random_state=2240,
)
als_model.fit(coo)
item_factor_columns = [f"article_id_{i}" for i in range(n_factors)]

# Release the large intermediates; only the model and the id maps are kept.
del row, col, tmp, data, coo

# Persist the validation model together with the id -> index mappings.
for filename, payload in (
    ('als_model_val.pickle', als_model),
    ('user_map_val.pickle', user_map),
    ('item_map_val.pickle', item_map),
):
    with open(filename, mode='wb') as f:
        pickle.dump(payload, f)

  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# for full train
# Same procedure as the validation model, but fitted on every transaction in `df`.

# als model
# Dense zero-based indices in order of first appearance (raw id -> position).
all_users = df["customer_id"].unique().tolist()
all_items = df["article_id"].unique().tolist()

user_map = {customer: position for position, customer in enumerate(all_users)}
item_map = {article: position for position, article in enumerate(all_items)}

# One entry of weight 1.0 per transaction row, laid out as (users x items).
row = df["customer_id"].map(user_map).to_numpy()
col = df["article_id"].map(item_map).to_numpy()
data = np.ones(df.shape[0])
coo = coo_matrix((data, (row, col)), shape=(len(all_users), len(all_items)))

# NOTE(review): the matrix is users x items; which orientation `fit` expects
# depends on the installed `implicit` version - confirm against its docs.
n_factors = 5
als_model = implicit.als.AlternatingLeastSquares(
    factors=n_factors,
    iterations=5,
    regularization=0.1,
    random_state=2240,
)
als_model.fit(coo)
item_factor_columns = [f"article_id_{i}" for i in range(n_factors)]

# Release the large intermediates; only the model and the id maps are kept.
del row, col, data, coo

# Persist the full-data model together with the id -> index mappings.
for filename, payload in (
    ('als_model_full.pickle', als_model),
    ('user_map_full.pickle', user_map),
    ('item_map_full.pickle', item_map),
):
    with open(filename, mode='wb') as f:
        pickle.dump(payload, f)

  0%|          | 0/5 [00:00<?, ?it/s]

# first appearance week of article_id

In [7]:
# For every article, record the first ("min") and last ("max") week_no in which
# it appears in the transactions, and export the table without the index.
article_week_range = (
    df.groupby("article_id")["week_no"]
    .agg(["min", "max"])
    .reset_index()
)
article_week_range.to_csv("article_week_range.csv", index=False)

# features by similaripy

In [8]:
# normalize matrix with bm25
#coo_bm = similaripy.normalization.bm25(coo)

# train the model with 50 knn per item 
#model = similaripy.cosine(coo_bm.T, k=50)

# recommend 100 items to users 1, 14 and 8, filtering out the items each user has already seen
#user_recommendations = similaripy.dot_product(coo_bm, model.T, k=100, target_rows=[1,14,8], filter_cols=coo_bm)

# features by abstraction block

In [9]:
# Keep only the 50 most recent weeks of transactions for the feature blocks below.
df = df.loc[df["week_no"] > df["week_no"].max() - 50].reset_index(drop=True)

In [10]:
# Article attributes copied onto every transaction row.
article_columns = [
    "garment_group_no",
    "product_code",
    "product_type_no",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_group_no",
    "section_no",
]

# Left join so that every transaction is kept even without a matching article row.
df = df.merge(
    article_df[["article_id", *article_columns]],
    on='article_id',
    how='left',
)

In [11]:
# Weeks in which each customer bought something (in order of appearance in df).
customer2weeks = df.groupby('customer_id')['week_no'].unique()

# For every customer, pair each purchase week (the week the features come from)
# with the week that will serve as its label: the customer's next purchase week.
# The customer's last purchase week is paired with `test_week`.
# NOTE(review): if a customer's last purchase week is itself `test_week`, two
# feature weeks end up mapped to `test_week` - confirm this is intended.
customer2weekpairs = {}

for c_id, weeks in customer2weeks.items():
    next_purchase_week = dict(zip(weeks[:-1], weeks[1:]))
    next_purchase_week[weeks[-1]] = test_week
    customer2weekpairs[c_id] = next_purchase_week

In [12]:
def join(df):
    """Return the elements of `df`, each converted with `str`, joined by single spaces.

    `df` may be any iterable (e.g. the values of one group of a pandas Series);
    an empty iterable yields the empty string.
    """
    return " ".join(map(str, df))

def label_prev_week(input_df):
    """Replace `week_no` with the week paired to it in `customer2weekpairs`.

    Each row's (customer_id, week_no) is looked up in the module-level
    `customer2weekpairs` mapping and `week_no` is overwritten with the result.
    The frame is modified in place and also returned. A KeyError is raised when
    a (customer_id, week_no) combination is missing from the mapping.
    """
    input_df["week_no"] = [
        customer2weekpairs[customer][week]
        for customer, week in zip(input_df['customer_id'], input_df['week_no'])
    ]
    return input_df

In [13]:
class AbstractBaseBlock:
    """Base class for feature blocks.

    Subclasses must implement `transform`; the default `fit` simply delegates
    to it and ignores `y`.
    """

    def fit(self, input_df, y=None):
        """Return `self.transform(input_df)`; `y` is accepted but unused."""
        transformed = self.transform(input_df)
        return transformed

    def transform(self, input_df):
        """Produce the block's features; must be overridden by subclasses."""
        raise NotImplementedError()
        
class UserHistoryEveryBlock(AbstractBaseBlock):
    """Block that adds per-customer weekly aggregates of a numeric column (not cumulative).

    `item` is the name of the column to aggregate.
    """

    def __init__(self, item):
        self.item = item

    def fit(self, input_df, y=None):
        """Aggregate `item` per (customer_id, week_no) and relabel the week.

        The result has the columns `u_prev_<item>_sum/std/max/min` plus
        `customer_id` and `week_no`, where `week_no` has been rewritten by
        `label_prev_week`. `y` is unused.
        """
        column_prefix = "u_prev_" + self.item + "_"
        weekly_stats = input_df.groupby(['customer_id', 'week_no'])[self.item].agg(
            ["sum", "std", "max", "min"]
        )
        relabelled = label_prev_week(weekly_stats.add_prefix(column_prefix).reset_index())
        return self.transform(relabelled)

    def transform(self, input_df):
        """Identity: the features are fully built in `fit`."""
        return input_df
    
class UserHistoryVectorEveryBlock(AbstractBaseBlock):
    """Block that vectorises each customer's weekly history of a categorical column (not cumulative).

    For every (customer_id, week_no) the values of `item` are joined into one
    space-separated "document", weighted with TF-IDF and reduced to
    `n_components` dimensions with truncated SVD.

    Parameters
    ----------
    item : str
        Name of the column whose values form the documents.
    n_components : int, default 5
        Number of SVD components, i.e. of output feature columns.
    """
    def __init__(self, item, n_components = 5):
        self.item = item
        self.n_components = n_components
        
    def fit(self, input_df, y=None):
        """Build the `u_<item>_<i>` features and relabel `week_no`.

        Returns a frame with `customer_id`, `week_no` (rewritten by
        `label_prev_week`) and `n_components` feature columns. `y` is unused.
        """
        # One space-separated document per (customer, week).
        docs = input_df.groupby(["customer_id", "week_no"])[self.item].apply(join)

        # Vocabulary is capped at 80% of the distinct values; the floor of 1
        # keeps TfidfVectorizer from rejecting a non-positive max_features when
        # the column has a single distinct value.
        max_features = max(1, int(input_df[self.item].nunique() * 0.8))

        # The default token pattern only matches tokens of two or more
        # characters, which would silently drop single-character codes
        # (e.g. "5"). Accept tokens of any length instead.
        tv = TfidfVectorizer(max_features=max_features, token_pattern=r"(?u)\b\w+\b")
        X = tv.fit_transform(docs)
        
        # Fixed random_state keeps the decomposition reproducible across runs.
        svd = TruncatedSVD(n_components=self.n_components, random_state=0)
        X = svd.fit_transform(X)

        result = pd.DataFrame(X, columns=[f"u_{self.item}_{i}" for i in range(self.n_components)])
        # Rows of X follow the order of `docs`, so its index restores the keys.
        result.index = docs.index
        result = result.reset_index()
        result = label_prev_week(result)
        return self.transform(result)
  
    def transform(self, input_df):
        """Identity: the features are fully built in `fit`."""
        return input_df  

class ItemHistoryEveryBlock(AbstractBaseBlock):
    """Block that counts, per week, how often each value of a column occurs (not cumulative).

    `item` is the name of the column whose values are counted.
    """

    def __init__(self, item):
        self.item = item

    def fit(self, input_df, y=None):
        """Return one row per (week_no, item value) with its count in `sale_count`.

        `y` is unused.
        """
        weekly_counts = input_df.groupby(['week_no'])[self.item].value_counts()
        counts_frame = weekly_counts.to_frame("sale_count").reset_index()
        return self.transform(counts_frame)

    def transform(self, input_df):
        """Identity: the features are fully built in `fit`."""
        return input_df
    
class ItemHistorySumBlock(AbstractBaseBlock):
    """Block that sums `price` per week and per value of a column (not cumulative).

    `item` is the name of the grouping column used alongside `week_no`.
    """

    def __init__(self, item):
        self.item = item

    def fit(self, input_df, y=None):
        """Return one row per (week_no, item value) with the summed price in `sale_sum`.

        `y` is unused.
        """
        grouped_price = input_df.groupby(['week_no', self.item])['price']
        weekly_sales = grouped_price.agg(["sum"]).add_prefix("sale_")
        return self.transform(weekly_sales.reset_index())

    def transform(self, input_df):
        """Identity: the features are fully built in `fit`."""
        return input_df

In [14]:
# user_features
# Every block returns a frame keyed by (customer_id, week_no); the frames are
# left-joined onto the first block's output in list order.
feature_blocks = [
#    UserHistoryEveryBlock("price"),
    UserHistoryVectorEveryBlock("section_no"),
    UserHistoryVectorEveryBlock("garment_group_no"),
]

user_features_df = None
for block in feature_blocks:
    block_features = block.fit(df)
    if user_features_df is None:
        # The first block provides the base set of rows.
        user_features_df = block_features
    else:
        user_features_df = user_features_df.merge(
            block_features,
            on=["customer_id", "week_no"],
            how="left",
        )

# Export, then drop the frame to free memory.
user_features_df.to_csv("user_features.csv", index=False)
del user_features_df

In [15]:
# item_features
# Weekly number of transaction rows per article.
article_count = ItemHistoryEveryBlock(item="article_id").fit(input_df=df)
article_count.to_csv("article_count.csv", index=False)

# Weekly summed price per article.
article_sale = ItemHistorySumBlock(item="article_id").fit(input_df=df)
article_sale.to_csv("article_sale.csv", index=False)