- consider only item features
- modify functions for predictions of cold users
- pickle als model

In [1]:
!pip install similaripy

Collecting similaripy
  Downloading similaripy-0.1.2.tar.gz (331 kB)
     |████████████████████████████████| 331 kB 2.0 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: similaripy
  Building wheel for similaripy (setup.py) ... [?25l- \ | / - \ | / - done
[?25h  Created wheel for similaripy: filename=similaripy-0.1.2-cp37-cp37m-linux_x86_64.whl size=2040867 sha256=11de8692a8a2608e087bdfda38ca902fb1f44a610333c7199bfbb89ac47c760d
  Stored in directory: /root/.cache/pip/wheels/2e/81/4b/7a396ebc45534553ddee1eb106d99023df4d9d09b53b13cc66
Successfully built similaripy
Installing collected packages: similaripy
Successfully installed similaripy-0.1.2


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import preprocessing
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import implicit
import similaripy
import pickle

import warnings
warnings.filterwarnings('ignore')

tqdm.pandas()

In [3]:
DATA_DIR = "/kaggle/input/h-and-m-personalized-fashion-recommendations/"
df = pd.read_csv(DATA_DIR + "transactions_train.csv")
article_df = pd.read_csv(DATA_DIR + "articles.csv")
customer_df = pd.read_csv(DATA_DIR + "customers.csv")
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [4]:
df["t_dat"] = pd.to_datetime(df["t_dat"])

date_week_df = df.drop_duplicates("t_dat")[["t_dat"]].reset_index(drop=True)
date_week_df['week_no'] = (date_week_df['t_dat'] + pd.DateOffset(days=5)).dt.week
date_week_df["week_no"] = date_week_df["week_no"].diff(1)
date_week_df["week_no"].fillna(0, inplace=True)
date_week_df["week_no"] = date_week_df["week_no"] != 0
date_week_df["week_no"] = date_week_df["week_no"].cumsum()

df = pd.merge(df, date_week_df, on="t_dat", how="left")

df.sort_values(['t_dat', 'customer_id'], inplace=True)

test_week = df.week_no.max()
last_ts = df['t_dat'].max()

# als model

In [5]:
# for validation
tmp = df[df.week_no < test_week].reset_index(drop=True).copy()
    
# als model
all_users = tmp["customer_id"].unique().tolist()
all_items = tmp["article_id"].unique().tolist()

user_ids = dict(list(enumerate(all_users)))
item_ids = dict(list(enumerate(all_items)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {u: iidx for iidx, u in item_ids.items()}

tmp["customer_id"] = tmp["customer_id"].map(user_map)
tmp["article_id"] = tmp["article_id"].map(item_map)

row = tmp["customer_id"].values
col = tmp["article_id"].values

data = np.ones(tmp.shape[0])
coo = coo_matrix((data, (row, col)), shape=(len(all_users), len(all_items)))

n_factors = 5
als_model = implicit.als.AlternatingLeastSquares(factors=n_factors, iterations = 5, regularization=0.1, random_state=2240)
als_model.fit(coo)
item_factor_columns = ["article_id_"+str(i) for i in range(n_factors)]

del row, col, tmp, data, coo, user_ids, item_ids

with open('als_model_val.pickle', mode='wb') as f:
    pickle.dump(als_model, f)

  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# for full train
tmp = df.copy()
    
# als model
all_users = tmp["customer_id"].unique().tolist()
all_items = tmp["article_id"].unique().tolist()

user_ids = dict(list(enumerate(all_users)))
item_ids = dict(list(enumerate(all_items)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {u: iidx for iidx, u in item_ids.items()}

tmp["customer_id"] = tmp["customer_id"].map(user_map)
tmp["article_id"] = tmp["article_id"].map(item_map)

row = tmp["customer_id"].values
col = tmp["article_id"].values

data = np.ones(tmp.shape[0])
coo = coo_matrix((data, (row, col)), shape=(len(all_users), len(all_items)))

n_factors = 5
als_model = implicit.als.AlternatingLeastSquares(factors=n_factors, iterations = 5, regularization=0.1, random_state=2240)
als_model.fit(coo)
item_factor_columns = ["article_id_"+str(i) for i in range(n_factors)]

del row, col, tmp, data, coo, user_ids, item_ids

with open('als_model_full.pickle', mode='wb') as f:
    pickle.dump(als_model, f)

  0%|          | 0/5 [00:00<?, ?it/s]

# first appperance week of article_id

In [7]:
article_week_range = df.groupby("article_id").week_no.agg(["min", "max"]).reset_index()
article_week_range.to_csv('article_week_range.csv', index=None)

# preds for cold users
- https://www.kaggle.com/byfone/h-m-trending-products-weekly

In [8]:
df = df[df.week_no > df.week_no.max() - 50].reset_index(drop=True)

In [9]:
def make_cold_predictions(input_df, sub):
    # Count the number of transactions per week
    weekly_sales = input_df.drop('customer_id', axis=1).groupby(['week_no', 'article_id'])['t_dat'].count().reset_index()
    weekly_sales = weekly_sales.rename(columns={'t_dat': 'count'})
    input_df = pd.merge(input_df, weekly_sales, on=['week_no', 'article_id'], how="left")
    
    #Let's assume that in the target week sales will be similar to the last week of the training data
    weekly_sales = weekly_sales.set_index('article_id')

    input_df = input_df.join(
        weekly_sales.loc[weekly_sales['week_no']==input_df.week_no.max(), ['count']],
        on='article_id', rsuffix="_targ")

    input_df['count_targ'].fillna(0, inplace=True)
    del weekly_sales
    
    # Calculate sales rate adjusted for changes in product popularity¶
    input_df['quotient'] = input_df['count_targ'] / input_df['count']
    
    N = 12
    # Take supposedly popular products¶
    target_sales = input_df.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
    general_pred = target_sales.nlargest(N).index.tolist()
    del target_sales
    
    # Fill in purchase dictionary¶
    purchase_dict = {}

    for i in tqdm(input_df.index):
        cust_id = input_df.at[i, 'customer_id']
        art_id = input_df.at[i, 'article_id']
        t_dat = input_df.at[i, 't_dat']

        if cust_id not in purchase_dict:
            purchase_dict[cust_id] = {}

        if art_id not in purchase_dict[cust_id]:
            purchase_dict[cust_id][art_id] = 0
    
        x = max(1, (last_ts - t_dat).days)

        a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
        y = a / np.sqrt(x) + b * np.exp(-c*x) - d

        value = input_df.at[i, 'quotient'] * max(0, y)
        purchase_dict[cust_id][art_id] += value
        

    pred_list = []
    for cust_id in tqdm(sub['customer_id']):
        if cust_id in purchase_dict:
            series = pd.Series(purchase_dict[cust_id])
            series = series[series > 0]
            l = series.nlargest(N).index.tolist()
            if len(l) < N:
                l = l + general_pred[:(N-len(l))]
        else:
            l = general_pred
        l = ['0'+str(ele) for ele in l]
        pred_list.append(' '.join(l))

    sub['prediction'] = pred_list
    return sub

In [10]:
tmp1 = df[(df.week_no > test_week-30) & (df.week_no < test_week)].reset_index(drop=True).copy()
sub1 = make_cold_predictions(tmp1, sub)
sub1.to_csv('cold_user_predictions1.csv', index=None)
del tmp1

100%|██████████| 8817060/8817060 [16:41<00:00, 8800.76it/s]
100%|██████████| 1371980/1371980 [10:13<00:00, 2237.61it/s]


In [11]:
tmp = df[(df.week_no > test_week-30)].reset_index(drop=True).copy()
sub = make_cold_predictions(tmp, sub)
sub.to_csv('cold_user_predictions_for_test.csv', index=None)
del tmp

100%|██████████| 9057371/9057371 [17:10<00:00, 8791.44it/s]
100%|██████████| 1371980/1371980 [10:17<00:00, 2221.62it/s]


# features by implicit

In [12]:
#all_users = df["customer_id"].unique().tolist()
#all_items = df["article_id"].unique().tolist()

#user_ids = dict(list(enumerate(all_users)))
#item_ids = dict(list(enumerate(all_items)))

#user_map = {u: uidx for uidx, u in user_ids.items()}
#item_map = {u: iidx for iidx, u in item_ids.items()}

#df["customer_id"] = df["customer_id"].map(user_map)
#df["article_id"] = df["article_id"].map(item_map)

#row = df["customer_id"].values
#col = df["article_id"].values

#data = np.ones(df.shape[0])
#coo = coo_matrix((data, (row, col)), shape=(len(all_users), len(all_items)))

#model = implicit.als.AlternatingLeastSquares(factors=10, iterations = 5, regularization=0.01)
#model.fit(coo)

#model.user_factors, model.user_factors.shape
#model.item_factors, model.item_factors.shape

# features by similaripy

In [13]:
# normalize matrix with bm25
#coo_bm = similaripy.normalization.bm25(coo)

# train the model with 50 knn per item 
#model = similaripy.cosine(coo_bm.T, k=50)

# recommend 100 items to users 1, 14 and 8 filtering the items already seen by each users
#user_recommendations = sim.dot_product(coo_bm, model.T, k=100, target_rows=[1,14,8], filter_cols=coo_bm)

# features by abstraction block

In [14]:
article_columns = ["garment_group_no",
                "product_code", "product_type_no", 
                "graphical_appearance_no", "colour_group_code", "perceived_colour_value_id",
                "perceived_colour_master_id", "department_no", "index_group_no", 
                "section_no"]

df = pd.merge(df, article_df[["article_id"] + article_columns], on='article_id', how='left')

In [15]:
# customerごとに何か購入したweek
customer2weeks = df.groupby('customer_id')['week_no'].unique()

# customerごとに学習データの特徴量となるweekとラベルとなるweekのペアを作る。
customer2weekpairs = {}

for c_id, weeks in customer2weeks.items():
    customer2weekpairs[c_id] = {}
    for i in range(weeks.shape[0]-1):
        customer2weekpairs[c_id][weeks[i]] = weeks[i+1]
    customer2weekpairs[c_id][weeks[-1]] = test_week

In [16]:
def join(df):
    x = [str(e) for e in list(df)]
    return " ".join(x)

def label_prev_week(input_df):
    weeks = []
    for i, (c_id, week) in enumerate(zip(input_df['customer_id'], input_df['week_no'])):
        weeks.append(customer2weekpairs[c_id][week])
    input_df.week_no=weeks
    return input_df

In [17]:
class AbstractBaseBlock:
    def fit(self, input_df, y=None):
        return self.transform(input_df)

    def transform(self, input_df):
        raise NotImplementedError()
        
class UserHistoryEveryBlock(AbstractBaseBlock):
    """ユーザーの数値履歴を週間で集計した特徴量を付与する block 累積ではない"""
    def __init__(self, item):
        self.item = item

    def fit(self, input_df, y=None):
        out = input_df \
        .groupby(['customer_id', 'week_no'])[self.item].agg(["sum", "std", "max", "min"]).add_prefix("u_prev_"+self.item+"_").reset_index()
        out = label_prev_week(out)
        return self.transform(out)

    def transform(self, input_df):
        return input_df
    
class UserHistoryVectorEveryBlock(AbstractBaseBlock):
    """ユーザーの文字履歴を週間で集計しベクトル化した特徴量を付与する block 累積ではない"""
    def __init__(self, item, n_components = 5):
        self.item = item
        self.n_components = n_components
        
    def fit(self, input_df, y=None):
        docs = input_df.groupby(["customer_id", "week_no"])[self.item].apply(join)
        max_features = int(input_df[self.item].nunique() * 0.8)
        tv = TfidfVectorizer(max_features=max_features)
        X = tv.fit_transform(docs)
        
        svd = TruncatedSVD(n_components=self.n_components, random_state=0)
        X = svd.fit_transform(X)
        result = pd.DataFrame(X, columns=[f"u_{self.item}_{i}" for i in range(self.n_components)])
        result.index = docs.index
        result = result.reset_index()
        result = label_prev_week(result)
        return self.transform(result)
  
    def transform(self, input_df):
        return input_df  

class ItemHistoryEveryBlock(AbstractBaseBlock):
    """アイテムの数値履歴を週間で集計した特徴量を付与する block 累積ではない"""
    def __init__(self, item):
        self.item = item

    def fit(self, input_df, y=None):
        out = input_df \
        .groupby(['week_no'])[self.item].value_counts()
        out = out.to_frame("sale_count").reset_index()
        return self.transform(out)

    def transform(self, input_df):
        return input_df
    
class ItemHistorySumBlock(AbstractBaseBlock):
    """アイテムの数値履歴を週間で集計した特徴量を付与する block 累積ではない"""
    def __init__(self, item):
        self.item = item

    def fit(self, input_df, y=None):
        out = input_df \
        .groupby(['week_no', self.item])['price'].agg(["sum"]).add_prefix("sale_").reset_index()
        return self.transform(out)

    def transform(self, input_df):
        return input_df

In [18]:
# user_features
#feature_blocks = [
#    UserHistoryEveryBlock("price"),
#    UserHistoryVectorEveryBlock("section_no"),
#    UserHistoryVectorEveryBlock("garment_group_no"),
#]

#for i, block in enumerate(feature_blocks):
#    if i == 0:
#        user_features_df = block.fit(df)
#    else:
#        out_i = block.fit(df)
#        user_features_df = pd.merge(user_features_df, out_i, on = ["customer_id", "week_no"], how="left")
        
#user_features_df.to_csv("user_features.csv", index=False)
#del user_features_df

In [19]:
# item_features
article_count = ItemHistoryEveryBlock("article_id").fit(df)
article_count.to_csv("article_count.csv", index=False)

article_sale = ItemHistorySumBlock("article_id").fit(df)
article_sale.to_csv("article_sale.csv", index=False)