In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 63 bytes


In [2]:
# Download the H&M competition archive with the Kaggle CLI (28.7G according to the recorded output).
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations

Downloading h-and-m-personalized-fashion-recommendations.zip to /content
100% 28.7G/28.7G [06:47<00:00, 83.8MB/s]
100% 28.7G/28.7G [06:47<00:00, 75.6MB/s]


In [3]:
# Download the "post-hm-features" dataset archive; article2vec.pickle is extracted from it further down.
# NOTE(review): the Kaggle CLI usually expects "<owner>/<dataset>" after -d — confirm the slug is complete.
!kaggle datasets download -d post-hm-features

Downloading post-hm-features.zip to /content
 98% 921M/936M [00:30<00:00, 66.7MB/s]
100% 936M/936M [00:30<00:00, 32.5MB/s]


In [4]:
# Extract only the four CSV files this notebook reads from the competition archive.
# NOTE(review): unzip normally asks before overwriting existing files, so re-running
# this cell may stall on a prompt — consider adding -n or -o; confirm in your environment.
!unzip h-and-m-personalized-fashion-recommendations.zip articles.csv
!unzip h-and-m-personalized-fashion-recommendations.zip customers.csv
!unzip h-and-m-personalized-fashion-recommendations.zip transactions_train.csv
!unzip h-and-m-personalized-fashion-recommendations.zip sample_submission.csv

Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: articles.csv            
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: customers.csv           
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: transactions_train.csv  
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: sample_submission.csv   


In [5]:
# Extract only the pickled article vectors from the features archive.
!unzip post-hm-features.zip article2vec.pickle

Archive:  post-hm-features.zip
  inflating: article2vec.pickle      


In [6]:
# When True, the most recent week is removed from the training frame so it can
# be used to score the recommendations.
VALID = True
# Neighbourhood size; get_similar_users fetches N_SIMILAR_USERS + 1 rows so the
# user can optionally be removed from their own neighbourhood.
N_SIMILAR_USERS = 20
# Users need at least this many distinct purchased items in the time window
# to receive recommendations.
MINIMUM_PURCHASES = 3
# Default start_date of uucf(): only transactions strictly after it are used.
START_DATE = '2020-08-01'
# When True, get_items filters items the user already purchased out of the
# recommendations.
DROP_PURCHASED_ITEMS = False
# When True, get_similar_users removes the user from their own neighbour list.
DROP_USER_FROM_HIS_NEIGHBORHOOD = False
# Maximum number of items returned per user by get_items.
RECOMMEND_ITEMS = 100
# Number of users whose similarity rows are computed per batch in
# get_recommendations.
USER_BATCH_SIZE = 10000

In [7]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

In [8]:
# File names of the four competition CSVs extracted into the working directory.
csv_train, csv_sub = 'transactions_train.csv', 'sample_submission.csv'
csv_users, csv_items = 'customers.csv', 'articles.csv'

# Transactions: keep article ids as strings and parse the purchase date column.
df = pd.read_csv(
    csv_train,
    parse_dates=['t_dat'],
    dtype={'article_id': str},
)
# Sample submission, read with default settings.
df_sub = pd.read_csv(csv_sub)

In [9]:
# Give every transaction a running week index (0 = first week in the data) and
# split off the most recent week as the validation set.
df["t_dat"] = pd.to_datetime(df["t_dat"])

# One row per calendar day in chronological order. Sorting explicitly keeps the
# change detection below correct even if the CSV rows are not ordered by date.
date_week_df = (
    df[["t_dat"]]
    .drop_duplicates()
    .sort_values("t_dat")
    .reset_index(drop=True)
)

# Shifting each date by 5 days before taking the ISO week makes every "week"
# run from Wednesday to Tuesday. `Series.dt.week` is deprecated (and removed in
# pandas 2.0); `dt.isocalendar().week` is its replacement. The cast to int64
# avoids unsigned arithmetic in the difference below.
shifted_week = (
    (date_week_df["t_dat"] + pd.DateOffset(days=5))
    .dt.isocalendar()
    .week
    .astype("int64")
)

# A new week starts wherever the shifted week number changes from one day to
# the next; counting those changes cumulatively yields the running week index.
week_changed = shifted_week.diff(1).fillna(0) != 0
date_week_df["week_no"] = week_changed.cumsum()

df = pd.merge(df, date_week_df, on="t_dat", how="left")

df.sort_values(['t_dat', 'customer_id'], inplace=True)

# The last week present in the data is the validation week.
test_week = df.week_no.max()

valid = df[df.week_no == test_week].reset_index(drop=True)

if VALID:
    # Keep only the weeks before the validation week for training.
    # NOTE: re-running this cell on the already-merged frame is not supported.
    df = df[df.week_no < test_week].reset_index(drop=True)

  after removing the cwd from sys.path.


# Create mappings from ids to incremental integers and vice versa

In [10]:
# Master tables for articles and customers; article ids are read as strings.
dfi = pd.read_csv(csv_items, dtype={'article_id': str})
dfu = pd.read_csv(csv_users)

# Distinct ids in order of first appearance; the list position is the integer id.
ALL_USERS = dfu['customer_id'].unique().tolist()
ALL_ITEMS = dfi['article_id'].unique().tolist()

# Integer id -> original id, and the inverse lookup, for customers.
user_to_customer_map = dict(enumerate(ALL_USERS))
customer_to_user_map = {
    customer_id: user_id for user_id, customer_id in user_to_customer_map.items()
}

# Integer id -> original id, and the inverse lookup, for articles.
item_to_article_map = dict(enumerate(ALL_ITEMS))
article_to_item_map = {
    article_id: item_id for item_id, article_id in item_to_article_map.items()
}

In [11]:
# Add integer ids to the transactions using the lookup dicts built from the
# customer and article tables. Ids missing from a dict become NaN (Series.map).
df['user_id'] = df['customer_id'].map(customer_to_user_map)
df['item_id'] = df['article_id'].map(article_to_item_map)

# item vector

In [12]:
#article_columns = []
#for i in dfi.columns:
#    if "int" in str(dfi[i].dtype) and i != "article_id" and i != "product_code":
#        article_columns.append(i)
#article_columns += ["article_id"]      
#item_vector_df = dfi[article_columns]
#item_vector_df['item_id'] = item_vector_df['article_id'].map(article_to_item_map)
#item_vector_df.drop("article_id", axis=1, inplace=True)
#item_vector_df = item_vector_df.set_index("item_id")

#del dfu, dfi

#cols = [i for i in article_columns if i != "item_id" and i != "article_id"]
#item_vector_df = pd.get_dummies(item_vector_df, columns=cols)
#item_cols = list(item_vector_df.columns)

In [13]:
# SECURITY: pickle.load can execute arbitrary code; only load this file from a
# source you trust.
with open('article2vec.pickle', mode='rb') as f:
    V = pickle.load(f)

# One column per vector dimension, named ivec0, ivec1, ...
item_cols = [f"ivec{i}" for i in range(V.shape[1])]
item_vector_df = pd.DataFrame(V)
item_vector_df.columns = item_cols

# Rows of V are matched to the rows of the articles table by position
# (assumes both are in the same order — TODO confirm), then re-keyed by the
# integer item_id.
item_vector_df["article_id"] = dfi["article_id"]
item_vector_df['item_id'] = item_vector_df['article_id'].map(article_to_item_map)
item_vector_df = item_vector_df.drop(columns="article_id").set_index("item_id")

# The master tables are no longer needed from here on.
del dfu, dfi

# build model

In [15]:
def flatten(l):
    """Concatenate the elements of every inner iterable into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def compare_vectors(v1, v2):
    """Similarity between the purchase lists of two users.

    Each list is treated as a set of items; the score is the number of shared
    items divided by the geometric mean of the two set sizes (the cosine
    similarity of the corresponding binary purchase vectors).

    Arguments:
        v1: items purchased by the first user
        v2: items purchased by the second user

    Returns:
        A value between 0 and 1. 0.0 is returned when either list is empty,
        instead of dividing by zero.
    """
    items_1, items_2 = set(v1), set(v2)
    if not items_1 or not items_2:
        return 0.0
    # Use the set sizes in the denominator as well, so repeated purchases of
    # the same item do not push the score of identical baskets below 1.
    shared = len(items_1 & items_2)
    return shared / np.sqrt(len(items_1) * len(items_2))

def sub_mat_cosine_similarity(v1, v2):
    """Cosine similarity between every vector in v1 and every vector in v2.

    Arguments:
        v1: sequence of equal-length vectors (stacked into the rows of a matrix)
        v2: sequence of equal-length vectors (stacked into the rows of a matrix)

    Returns:
        Array of shape (len(v1), len(v2)); entry [i, j] is the cosine similarity
        of v1[i] and v2[j].
    """
    left = np.vstack(v1)
    right = np.vstack(v2)
    # Column vectors holding the Euclidean norm of each row.
    left_norms = np.linalg.norm(left, axis=1, keepdims=True)
    right_norms = np.linalg.norm(right, axis=1, keepdims=True)
    dot_products = left @ right.T
    norm_products = left_norms @ right_norms.T
    return dot_products / norm_products

def all_cosine_similarity(v2):
    """Pairwise cosine similarity between all vectors in v2.

    Arguments:
        v2: sequence of equal-length vectors (stacked into the rows of a matrix)

    Returns:
        Square array of shape (len(v2), len(v2)), returned transposed.
    """
    vectors = np.vstack(v2)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    similarity = (vectors @ vectors.T) / (norms @ norms.T)
    return similarity.T

def get_similar_users(u, index_, dfh, matrix):
    """
    Get the most similar users to the given one, with their similarity scores.

    The N_SIMILAR_USERS + 1 highest-scoring rows are taken; when
    DROP_USER_FROM_HIS_NEIGHBORHOOD is set, the user is then removed from
    that selection.

    Arguments:
        u: the user_id
        index_: row of `matrix` that holds the similarities of user u
        dfh: the "history of transactions" dataframe; its index supplies the
            user_id label for every column of `matrix`
        matrix: similarity matrix with one value per row of dfh in each row

    Returns:
        tuple of lists ([similar user_id], [similarity scores]), best first
    """
    scores = pd.DataFrame(matrix[index_], columns=["cosine_values"])
    scores.index = dfh.index

    ranked = scores.sort_values("cosine_values", ascending=False)
    neighbours = ranked.head(N_SIMILAR_USERS + 1)

    if DROP_USER_FROM_HIS_NEIGHBORHOOD:
        keep = neighbours.index != u
        neighbours = neighbours[keep]

    return neighbours.index.tolist(), neighbours.cosine_values.tolist()

def get_items(u, index_, dfh, matrix):
    """Recommend items for one user from the purchases of their most similar users.

    It will:
        1) Get the similar users for the given user, with similarity scores
        2) Collect all the items those users purchased
        3) Score each item by the summed similarity of the users that bought it
        4) Return the RECOMMEND_ITEMS best-scored items

    Arguments:
        u: the user_id (a label of dfh's index)
        index_: row of `matrix` that holds the similarities of user u
        dfh: the "history of transactions" dataframe, indexed by user_id, whose
            "item_id" column holds each user's list of purchased items
        matrix: similarity matrix

    Returns:
        list of item_id of length at most RECOMMEND_ITEMS, best first
    """
    users, scores = get_similar_users(u, index_, dfh, matrix)

    purchases = dfh["item_id"]
    # One (item, neighbour similarity) pair per purchase of every neighbour.
    weighted_items = [
        (item, score)
        for user, score in zip(users, scores)
        for item in purchases.loc[user]
    ]

    recs = (
        pd.DataFrame(weighted_items, columns=['item', 'score'])
        .groupby('item')['score']
        .sum()
        .sort_values(ascending=False)
    )
    if DROP_PURCHASED_ITEMS:
        # The original code referenced an undefined name `v` here; the items to
        # exclude are the user's own purchases.
        recs = recs[~recs.index.isin(purchases.loc[u])]
    # Keep top n item_ids
    return recs.head(RECOMMEND_ITEMS).index.tolist()

def get_items_chunk(user_ids: np.array, start: int, end: int, dfh: pd.DataFrame):
    """Compute recommendations for one batch of users.

    Arguments:
        user_ids: the user_id values of the batch
        start: position in dfh of the first user of the batch
        end: position in dfh one past the last user of the batch
        dfh: the "history of transactions" dataframe

    Returns:
        pd.DataFrame indexed by user_id with the columns "user_index" and
        "recs" (the list of recommended item_id)
    """
    # Similarities of the batch rows [start:end) against every user in dfh.
    all_vectors = dfh.vector.values
    similarity_matrix = sub_mat_cosine_similarity(all_vectors[start:end], all_vectors)

    batch = pd.DataFrame(dfh.loc[user_ids]).reset_index()
    # Row of similarity_matrix that belongs to each user of the batch.
    batch["user_index"] = np.arange(end - start)

    def recommend(row):
        return get_items(row.user_id, row.user_index, dfh, similarity_matrix)

    batch['recs'] = batch.apply(recommend, axis=1)
    return batch.set_index('user_id').drop(["vector", "item_id"], axis=1)

def get_recommendations(users: list, dfh: pd.DataFrame, n_users: int):
    """
    Obtain recommendations for the users, processing them batch by batch.

    The users are handled sequentially in batches of USER_BATCH_SIZE; each
    batch is passed to get_items_chunk and the results are concatenated.

    Arguments:
        users: list of user_id, in the same order as the rows of dfh
        dfh: the "history of transactions" dataframe
        n_users: number of users to process

    Returns:
        pd.DataFrame with index user_id and list of item_id (recommendations)
        in the "recs" column. An empty frame with that layout is returned when
        there are no users.
    """
    if n_users <= 0:
        # pd.concat raises on an empty list, so return an empty result directly.
        return pd.DataFrame(columns=['recs'], index=pd.Index([], name='user_id'))

    time_start = time.time()

    batches = []

    # mini-batch * all-elements
    for start in range(0, n_users, USER_BATCH_SIZE):
        end = min(start + USER_BATCH_SIZE, n_users)
        print(start, end)
        batches.append(get_items_chunk(users[start:end], start, end, dfh))
    df_recs = pd.concat(batches)

    elapsed = (time.time() - time_start) / 60
    print(f"Finished get_recommendations({len(users)}). It took {elapsed:5.2f} mins")
    return df_recs


def uucf(df, start_date=START_DATE):
    """ Entry point for the UUCF model. 
    
    Receives the transactions dataframe and a start_date and returns UUCF recommendations.
    
    The model will not cover the full list of users, but just a subset of them.
    
    It provides recommendations for users with at least MINIMUM_PURCHASES distinct
    items purchased strictly after start_date.
    It might return fewer than RECOMMEND_ITEMS recs per user.
    
    An ad-hoc function for filling these gaps should be used downstream.
    
    
    Arguments:
        df: the transactions dataframe; it must contain the columns
            't_dat', 'user_id' and 'item_id'
        start_date: only rows with t_dat greater than this date are used
        
    Returns:
        a submission-like pd.DataFrame with columns [customer_id, prediction]
        'prediction' is a list and not a string though
    
    """
    # Every item-vector column is averaged when aggregating per user.
    operation_dict = {i : "mean" for i in item_cols}  
    # Strict comparison: rows dated exactly start_date are excluded.
    df_small = df[df['t_dat'] > start_date]
    print(f"Kept data from {start_date} on. Total rows: {len(df_small)}")
    
    # H stands for "Transaction history"
    # dfh is a series of user_id => list of distinct item_id (built through a set,
    # so the purchase order is not preserved)
    dfh = df_small.groupby("user_id")['item_id'].apply(lambda items: list(set(items)))
    dfh = dfh[dfh.str.len() >= MINIMUM_PURCHASES]
    # Keep the per-user item lists; they are re-attached by position further down.
    item_lists = dfh.values
    # One row per (user, item), joined with the item vectors, then averaged per user.
    dfh = dfh.reset_index(drop=False).explode('item_id')
    dfh = pd.merge(dfh, item_vector_df, on = "item_id")
    dfh = dfh.groupby("user_id").agg(operation_dict)
    
    # transform to user * item_vector list
    index = dfh.index
    values = dfh.values.tolist()
    dfh = pd.DataFrame(index, columns=["user_id"])
    dfh["vector"] = values
    # NOTE(review): positional assignment — assumes the merge above dropped no user
    # and that both group-bys order users identically; confirm.
    dfh["item_id"] = item_lists
    dfh = dfh.set_index("user_id")
    
    users = dfh.index.tolist()
    n_users = len(users)
    print(f"Total users in the time frame with at least {MINIMUM_PURCHASES}: {n_users}")
    
    df_rec = get_recommendations(users, dfh, n_users)
    # Translate the integer ids back to the original customer and article ids.
    df_rec['customer_id'] = df_rec.index.map(user_to_customer_map)
    df_rec['prediction'] = df_rec['recs'].map(lambda l: [item_to_article_map[i] for i in l])
    
    # Submission ready dataframe
    df_rec = df_rec.reset_index(drop=True)[['customer_id', 'prediction']]
    return df_rec 

In [16]:
# Run the UUCF model on the transactions frame (the recorded run took 156.58 minutes).
df_recs = uucf(df)

Kept data from 2020-08-01 on. Total rows: 1753192
Total users in the time frame with at least 3: 197706
0 10000
10000 20000
20000 30000
30000 40000
40000 50000
50000 60000
60000 70000
70000 80000
80000 90000
90000 100000
100000 110000
110000 120000
120000 130000
130000 140000
140000 150000
150000 160000
160000 170000
170000 180000
180000 190000
190000 197706
Finished get_recommendations(197706). It took 156.58 mins


In [17]:
# Preview the first rows of the recommendations frame.
df_recs.head()

Unnamed: 0,customer_id,prediction
0,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0905518001, 0730683050, 0804992014, 085258400..."
1,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,"[0751471001, 0751471043, 0797892001, 088965200..."
2,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,"[0706016015, 0706016001, 0706016038, 080724100..."
3,0001b0127d3e5ff8dadcfc6e5043682dba2070f2667081...,"[0739590038, 0559630029, 0762143001, 082350500..."
4,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,"[0850244002, 0865699001, 0817361007, 088937000..."


In [18]:
def recall_check(input_df, cons_df):
    """Measure how many of the actual purchases are covered by the recommendations.

    Arguments:
        input_df: actual transactions, with columns 'customer_id' and 'article_id'
        cons_df: recommendations, with columns 'customer_id' and 'prediction'
            (a list of article ids per customer)

    Returns:
        tuple (number of distinct articles that were hit,
               percentage of the rows of input_df whose (customer, article)
               pair appears in the recommendations).
        (0, 0.0) is returned when input_df has no rows.
    """
    total_rows = input_df.shape[0]
    if total_rows == 0:
        # Nothing to recall; avoids a division by zero below.
        return 0, 0.0

    # One row per (customer, recommended article). The columns are selected by
    # name so extra columns or a different column order in cons_df do no harm,
    # and repeated predictions are dropped so one purchase cannot match twice.
    cons = (
        cons_df[["customer_id", "prediction"]]
        .explode("prediction")
        .rename(columns={"prediction": "article_id"})
        .drop_duplicates()
        .reset_index(drop=True)
    )

    hits = pd.merge(input_df, cons, on=["customer_id", "article_id"], how="inner")
    return hits["article_id"].nunique(), hits.shape[0] * 100 / total_rows

# Score the recommendations against the held-out week; only meaningful when that
# week was removed from the training frame.
if VALID:
    print(recall_check(valid, df_recs))

(3300, 4.508740756769353)


(3266, 4.589053351698424)