This notebook creates a very simple model for generating outfits. This is sligth better than a random model. It computes the most popular category combinations in the outfits dataset and sort the candidate items, by this category combination pupularity.

This script takes less than 5 minutes to run

# Train - Computation of the category combinations

In [1]:
import datetime
print(datetime.datetime.now())

2022-05-22 21:08:00.057456


In [2]:
import pandas as pd

full_outfits = pd.read_parquet("../data/manual_outfits.parquet")
full_outfits.head()

Unnamed: 0,products,outfit_id
0,"[15360881, 15379678, 15781925, 16204075, 16260...",0
1,"[13893589, 13893721, 15426616, 16035469, 17173...",1
2,"[13508028, 14161732, 16160567, 17484491, 17503...",2
3,"[16127776, 16756133, 17040752, 18203427, 18205...",3
4,"[14480467, 15487690, 17257765]",4


In [3]:
items_metadata = pd.read_parquet("../data/products.parquet")
items_metadata = items_metadata[["product_id", "product_category"]]
items_metadata.set_index("product_id", inplace=True)
items_metadata.head()

Unnamed: 0_level_0,product_category
product_id,Unnamed: 1_level_1
17073270,Knitwear
17674562,Knitwear
17678603,Knitwear
17179699,Knitwear
15907453,Sweaters & Knitwear


In [4]:
outfits_category = full_outfits.explode(column="products")
outfits_category = outfits_category.merge(
    items_metadata,
    left_on="products",
    right_on=items_metadata.index,
    how="inner"
)
outfits_category.rename(columns={"product_category": "categories"}, inplace=True)
outfits_category.head()

Unnamed: 0,products,outfit_id,categories
0,15360881,0,Trainers
1,15360881,716,Trainers
2,15360881,977,Trainers
3,15360881,1069,Trainers
4,15360881,1186,Trainers


In [5]:
outfits_category = outfits_category.groupby("outfit_id").agg({"categories": list, "products": list})
outfits_category.reset_index(inplace=True)

outfits_category["categories_sorted"] = outfits_category.apply(lambda row: frozenset(row["categories"]), axis=1)
outfits_category.head()

Unnamed: 0,outfit_id,categories,products,categories_sorted
0,0,"[Trainers, Shirts, Jackets, Clutch Bags, Trous...","[15360881, 15379678, 15781925, 16204075, 16260...","(Shirts, Clutch Bags, Trousers, Jackets, Train..."
1,1,"[Necklaces, Necklaces, Necklaces, Earrings, Sa...","[13893589, 13893721, 15426616, 16035469, 17173...","(Necklaces, Sandals, Earrings, Jackets)"
2,2,"[Earrings, Trainers, Trousers, Tops, Satchels ...","[13508028, 14161732, 16160567, 17484491, 17503...","(Tops, Earrings, Trousers, Satchels & Cross Bo..."
3,3,"[Denim, Earrings, Pumps, All in One, Coats]","[16127776, 16756133, 17040752, 18203427, 18205...","(Coats, Denim, All in One, Earrings, Pumps)"
4,4,"[Trainers, Shoulder Bags, Sweaters & Knitwear]","[14480467, 15487690, 17257765]","(Shoulder Bags, Trainers, Sweaters & Knitwear)"


In [7]:
from collections import Counter, defaultdict

categories_sets = Counter(outfits_category.categories_sorted.values)
toy_model = {
    "items_metadata": items_metadata,
    "categories_sets": categories_sets
}
toy_model

{'items_metadata':                product_category
 product_id                     
 17073270               Knitwear
 17674562               Knitwear
 17678603               Knitwear
 17179699               Knitwear
 15907453    Sweaters & Knitwear
 ...                         ...
 18020113                  Denim
 18151251                  Denim
 18161372                  Denim
 18164968                  Denim
 18250540                  Denim
 
 [398670 rows x 1 columns],
 'categories_sets': Counter({frozenset({'Clutch Bags',
                     'Jackets',
                     'Shirts',
                     'Trainers',
                     'Trousers'}): 32,
          frozenset({'Earrings', 'Jackets', 'Necklaces', 'Sandals'}): 1,
          frozenset({'Earrings',
                     'Satchels & Cross Body Bags',
                     'Tops',
                     'Trainers',
                     'Trousers'}): 79,
          frozenset({'All in One', 'Coats', 'Denim', 'Earrings', 'Pumps'}):

# Predict - Using the category sets to rank the candidates

### Use the file name generatad by evaluation/simple_split_dataset.ipynb here

In [8]:
incomplete_outfits_input = "../data/manual_outfits_testinput_1651692691.parquet"

In [9]:
test_outfits = pd.read_parquet(incomplete_outfits_input)
test_outfits.head()

Unnamed: 0,outfit_id,incomplete_outfit,candidates
0,0,"[16260894, 15360881, 15781925, 16204075]","[16953540, 15843277, 16639054, 18203407, 17243..."
1,1,"[15426616, 16035469, 18218977, 13893721, 13893...","[16359488, 18006145, 17565154, 17236069, 16666..."
2,2,"[17484491, 16160567, 17503108, 13508028]","[16674563, 15180164, 16938764, 16917773, 18144..."
3,3,"[17040752, 18205465, 16127776, 18203427]","[17425028, 16709508, 17220615, 16115468, 16631..."
7,7,"[13990742, 14371345, 16487493]","[18149250, 16299400, 17524105, 17168907, 17305..."


In [10]:
def predict(model, incomplete_outfit, candidates):
    """
    This is the core of your model. In our example, we are going to use the toy model we build in the first part 
    of this notebook, but you are free to create your amazing model and use it here.
    Arguments:
        model: the outfits model
        incomplete_outfit: a list of product_id containing the outfit we want to complete
        candidates: a list of product_id that you are tasked to select the rigth product to complete the outfit
    Return
        A list of product_id sorted according your model
    """
    scores = []
    for candidate in candidates:
        outfit = incomplete_outfit + [candidate]
        try:
            categories = model["items_metadata"].loc[outfit]["product_category"].tolist()
            score = model["categories_sets"].get(frozenset(categories), 0)
        except KeyError:
            score = 0
        scores.append((score, candidate))
    return [
        candidate
        for _, candidate in sorted(scores, reverse=True)
    ]

test_outfits["predicted_products"] = test_outfits.apply(lambda row: predict(toy_model, row["incomplete_outfit"], row["candidates"]), axis=1)
test_outfits["predicted_product"] = test_outfits.apply(lambda row: row["predicted_products"][0], axis=1)
test_outfits.head()

Unnamed: 0,outfit_id,incomplete_outfit,candidates,predicted_products,predicted_product
0,0,"[16260894, 15360881, 15781925, 16204075]","[16953540, 15843277, 16639054, 18203407, 17243...","[18203407, 18057462, 18012827, 17803125, 17534...",18203407
1,1,"[15426616, 16035469, 18218977, 13893721, 13893...","[16359488, 18006145, 17565154, 17236069, 16666...","[18292694, 18217074, 18006145, 17793979, 17715...",18292694
2,2,"[17484491, 16160567, 17503108, 13508028]","[16674563, 15180164, 16938764, 16917773, 18144...","[18361594, 18325923, 18219660, 18144908, 18111...",18361594
3,3,"[17040752, 18205465, 16127776, 18203427]","[17425028, 16709508, 17220615, 16115468, 16631...","[17784236, 17754878, 17538910, 17425028, 17220...",17784236
7,7,"[13990742, 14371345, 16487493]","[18149250, 16299400, 17524105, 17168907, 17305...","[18149250, 17996819, 17524105, 17449640, 17389...",18149250


In [11]:
output_name = incomplete_outfits_input.replace(".parquet", "_predictions.csv")
output_columns = ["outfit_id", "predicted_product"]
test_outfits[output_columns].to_csv(output_name, header=True, index=False)

In [12]:
print(datetime.datetime.now())

2022-05-22 21:21:57.874392
