## Training Multi-class classifier

In [2]:
import pandas as pd
import numpy as np
import os
import random
import re
from sklearn.preprocessing import OneHotEncoder
import pickle
from tqdm import tqdm

### Importing and cleaning data

In [2]:
# to get the data of foundations from directory
# the first dataset
foundation_data = pd.read_json('data_full_review_cleaned/Anastasia_Beverly_Hills_Stick_Foundation.json', lines=True)
foundation_data['brand_product'] = 'Anastasia_Beverly_Hills_Stick_Foundation'
for file in os.listdir('data_full_review_cleaned/'):
   if file.endswith(".json"):
      new_data = pd.read_json(f'data_full_review_cleaned/{file}', lines=True)
      new_data['brand_product'] = file.replace('.json', '')
      foundation_data = pd.concat([foundation_data, new_data])
foundation_data.reset_index(drop=True, inplace=True)

In [3]:
# adding foundation features: coverage and finish
foundation_features = pd.read_csv('foundation_features_parsed.csv')
foundation_data = pd.merge(foundation_data, foundation_features, on='brand_product', how='left')

In [4]:
len(foundation_data)

121527

In [5]:
foundation_data.head()

Unnamed: 0,reviewer_id,rating,recommended,review_subject,review_content,reviewer_feature,purchased_shade,date_of_review,eye_color,hair_color,...,skin_type_combo,skin_type_combination,skin_type_normal,natural_finish,matte_finish,radiant_finish,medium_coverage,full_coverage,sheer_coverage,light_coverage
0,VanW,2,0,Very Dry - There Are Better Alternatives,I wanted to try a stick foundation and figured...,", Dry skin",,1618531200000,Brown,Brunette,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,camy2770,5,1,Game changer,I’ve had this foundation in my collection for ...,", Oily skin",,1617753600000,Blue,Blonde,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,MakittyMarie,5,1,highly recommend for contouring!,never used this as a foundation but I love it ...,", Dry skin",,1581379200000,Green,Red,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,essencie,5,1,Love it,Love it even with translucent powder on top be...,", Oily skin",,1580860800000,Green,Black,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,sehrishm,5,1,great for contouring!,"Love it, I use this to contour with a bronzer ...",", Oily skin",,1579564800000,Green,Black,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [6]:
foundation_data.tail()

Unnamed: 0,reviewer_id,rating,recommended,review_subject,review_content,reviewer_feature,purchased_shade,date_of_review,eye_color,hair_color,...,skin_type_combo,skin_type_combination,skin_type_normal,natural_finish,matte_finish,radiant_finish,medium_coverage,full_coverage,sheer_coverage,light_coverage
121522,Jeninthemtns,4,1,,This is my one product I CANNOT live without. ...,", Oily skin",Medium-Tan,1504310400000,Brown,Black,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
121523,rosered2384,5,1,Love this!,"This is my FAV product. It lasts all day, prot...",", Dry skin",Fair,1504224000000,Hazel,Black,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
121524,LargeMocha,5,1,,"This is my FAV product. It lasts all day, prot...",", Dry skin",Medium,1504224000000,Hazel,Black,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
121525,lanart,1,0,,"This is my FAV product. It lasts all day, prot...",", Combination skin",Light,1504051200000,Blue,Brunette,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
121526,acn2010,5,1,First Time at Sephora!,Purchased this for my 12 year old Granddaughte...,", Combination skin",Fair,1504051200000,Green,Brunette,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [7]:
foundation_data.columns

Index(['reviewer_id', 'rating', 'recommended', 'review_subject',
       'review_content', 'reviewer_feature', 'purchased_shade',
       'date_of_review', 'eye_color', 'hair_color', 'skin_tone', 'skin_type',
       'skin_tone_bin', 'skin_tone_cat', 'days_since_launch',
       'days_since_launch_scaled', 'month_of_purchase', 'finish', 'coverage',
       'shade_match', 'gifted', 'brand_product', 'product_features',
       'product_description', 'skin_type_dry', 'skin_type_oily',
       'skin_type_combo', 'skin_type_combination', 'skin_type_normal',
       'natural_finish', 'matte_finish', 'radiant_finish', 'medium_coverage',
       'full_coverage', 'sheer_coverage', 'light_coverage'],
      dtype='object')

In [8]:
foundation_data = foundation_data[foundation_data['light_coverage'].notna()].reset_index(drop=True)
# excluding foundations that are either no longer available on Sephora or do not have more than 200 reviews

In [9]:
# finish and coverage
def mapping_preference_for_foundations(data: pd.DataFrame, col: str = 'finish'):
    new_col = f'{col}_preference'
    foundation_features = [
        re.findall(f'.+{col}', i)[0] for i in foundation_data.columns if re.findall(f'.+{col}', i) != []
    ]
    data[new_col] = None
    for feature in foundation_features:
        for j in tqdm(range(len(data))):
            if data.loc[j, feature] == 1:
                data.loc[j, new_col] = feature
    return data

In [10]:
foundation_data = mapping_preference_for_foundations(foundation_data, col='finish')

100%|██████████| 119268/119268 [01:46<00:00, 1116.98it/s]
100%|██████████| 119268/119268 [01:18<00:00, 1527.37it/s]
100%|██████████| 119268/119268 [00:26<00:00, 4502.57it/s]


In [11]:
foundation_data = mapping_preference_for_foundations(foundation_data, col='coverage')

100%|██████████| 119268/119268 [01:37<00:00, 1223.96it/s]
100%|██████████| 119268/119268 [02:07<00:00, 937.22it/s] 
100%|██████████| 119268/119268 [00:24<00:00, 4931.47it/s]
100%|██████████| 119268/119268 [02:04<00:00, 958.78it/s] 


In [12]:
# skin_type
foundation_data['skin_type_match'] = 0
skin_types = [re.findall(f'^skin_type_.+', i)[0] for i in foundation_data.columns if re.findall(f'^skin_type_.+', i) != []]
for skin_type in skin_types:
    for i in tqdm(range(len(foundation_data))):
        if foundation_data.loc[i, skin_type] == 1:
            if foundation_data.loc[i, 'skin_type'] == skin_type.replace('skin_type_', '').capitalize():
                foundation_data.loc[i, 'skin_type_match'] = 1

100%|██████████| 119268/119268 [00:03<00:00, 37054.76it/s]
100%|██████████| 119268/119268 [00:03<00:00, 36499.34it/s]
100%|██████████| 119268/119268 [00:00<00:00, 228394.47it/s]
100%|██████████| 119268/119268 [00:08<00:00, 13889.14it/s]
100%|██████████| 119268/119268 [00:02<00:00, 46332.71it/s]
100%|██████████| 119268/119268 [00:00<00:00, 174312.15it/s]


In [13]:
foundation_data.head()

Unnamed: 0,reviewer_id,rating,recommended,review_subject,review_content,reviewer_feature,purchased_shade,date_of_review,eye_color,hair_color,...,natural_finish,matte_finish,radiant_finish,medium_coverage,full_coverage,sheer_coverage,light_coverage,finish_preference,coverage_preference,skin_type_match
0,VanW,2,0,Very Dry - There Are Better Alternatives,I wanted to try a stick foundation and figured...,", Dry skin",,1618531200000,Brown,Brunette,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,,light_coverage,0
1,camy2770,5,1,Game changer,I’ve had this foundation in my collection for ...,", Oily skin",,1617753600000,Blue,Blonde,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,,light_coverage,1
2,MakittyMarie,5,1,highly recommend for contouring!,never used this as a foundation but I love it ...,", Dry skin",,1581379200000,Green,Red,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,,light_coverage,0
3,essencie,5,1,Love it,Love it even with translucent powder on top be...,", Oily skin",,1580860800000,Green,Black,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,,light_coverage,1
4,sehrishm,5,1,great for contouring!,"Love it, I use this to contour with a bronzer ...",", Oily skin",,1579564800000,Green,Black,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,,light_coverage,1


In [15]:
foundation_data.to_json('data_full_review_cleaned/0_foundation_data_all_cleaned.json', orient='records', lines=True)

In [16]:
len(foundation_data)

119268

In [3]:
foundation_data = pd.read_json('data_full_review_cleaned/0_foundation_data_all_cleaned.json', lines=True)

### Encoding Multi-class labels into integers

In [4]:
labels = foundation_data['brand_product'].unique()
i = 0
idx_to_class = dict()
class_to_idx = dict()
for product in labels:
   idx_to_class[i] = product
   class_to_idx[product] = i
   i += 1

In [5]:
idx_to_class

{0: 'Anastasia_Beverly_Hills_Stick_Foundation',
 1: 'Dr._Jart+_Premium_Beauty_Balm_SPF_40',
 2: 'KVD_Vegan_Beauty_Lock-It_Powder_Foundation',
 3: 'Laura_Mercier_Flawless_Lumière_Radiance-Perfecting_Foundation',
 4: 'Laura_Mercier_Flawless_Fusion_Ultra-Longwear_Foundation',
 5: 'tarte_babassu_foundcealer™_skincare_foundation_SPF_20',
 6: 'bareMinerals_Matte_Loose_Powder_Mineral_Foundation_SPF_15\xa0',
 7: "FENTY_BEAUTY_by_Rihanna_Pro_Filt'r_Soft_Matte_Longwear_Foundation",
 8: 'SEPHORA_COLLECTION_Make_No_Mistake_Foundation_&_Concealer_Stick',
 9: 'Smashbox_Studio_Skin_24_Hour_Oil-Free_Hydra_Foundation',
 10: 'ILIA_Super_Serum_Skin_Tint_SPF_40_Foundation',
 11: 'Dior_BACKSTAGE_Face_&_Body_Foundation',
 12: 'Laura_Mercier_Tinted_Moisturizer_Natural_Skin_Perfector_Broad_Spectrum_SPF_30',
 13: 'SEPHORA_COLLECTION_Perfection_Mist_Airbrush_Foundation',
 14: 'Too_Faced_Born_This_Way_Foundation',
 15: 'rms_beauty_"Un"_Cover-up_Concealer',
 16: 'Kosas_Tinted_Face_Oil_Foundation',
 17: 'Hourglass

In [6]:
class_to_idx

{'Anastasia_Beverly_Hills_Stick_Foundation': 0,
 'Dr._Jart+_Premium_Beauty_Balm_SPF_40': 1,
 'KVD_Vegan_Beauty_Lock-It_Powder_Foundation': 2,
 'Laura_Mercier_Flawless_Lumière_Radiance-Perfecting_Foundation': 3,
 'Laura_Mercier_Flawless_Fusion_Ultra-Longwear_Foundation': 4,
 'tarte_babassu_foundcealer™_skincare_foundation_SPF_20': 5,
 'bareMinerals_Matte_Loose_Powder_Mineral_Foundation_SPF_15\xa0': 6,
 "FENTY_BEAUTY_by_Rihanna_Pro_Filt'r_Soft_Matte_Longwear_Foundation": 7,
 'SEPHORA_COLLECTION_Make_No_Mistake_Foundation_&_Concealer_Stick': 8,
 'Smashbox_Studio_Skin_24_Hour_Oil-Free_Hydra_Foundation': 9,
 'ILIA_Super_Serum_Skin_Tint_SPF_40_Foundation': 10,
 'Dior_BACKSTAGE_Face_&_Body_Foundation': 11,
 'Laura_Mercier_Tinted_Moisturizer_Natural_Skin_Perfector_Broad_Spectrum_SPF_30': 12,
 'SEPHORA_COLLECTION_Perfection_Mist_Airbrush_Foundation': 13,
 'Too_Faced_Born_This_Way_Foundation': 14,
 'rms_beauty_"Un"_Cover-up_Concealer': 15,
 'Kosas_Tinted_Face_Oil_Foundation': 16,
 'Hourglass_Van

In [7]:
foundation_data['label'] = foundation_data['brand_product'].replace(class_to_idx)

In [8]:
foundation_data.head()

Unnamed: 0,reviewer_id,rating,recommended,review_subject,review_content,reviewer_feature,purchased_shade,date_of_review,eye_color,hair_color,...,matte_finish,radiant_finish,medium_coverage,full_coverage,sheer_coverage,light_coverage,finish_preference,coverage_preference,skin_type_match,label
0,VanW,2,0,Very Dry - There Are Better Alternatives,I wanted to try a stick foundation and figured...,", Dry skin",,1618531200000,Brown,Brunette,...,0,0,0,1,0,1,,light_coverage,0,0
1,camy2770,5,1,Game changer,I’ve had this foundation in my collection for ...,", Oily skin",,1617753600000,Blue,Blonde,...,0,0,0,1,0,1,,light_coverage,1,0
2,MakittyMarie,5,1,highly recommend for contouring!,never used this as a foundation but I love it ...,", Dry skin",,1581379200000,Green,Red,...,0,0,0,1,0,1,,light_coverage,0,0
3,essencie,5,1,Love it,Love it even with translucent powder on top be...,", Oily skin",,1580860800000,Green,Black,...,0,0,0,1,0,1,,light_coverage,1,0
4,sehrishm,5,1,great for contouring!,"Love it, I use this to contour with a bronzer ...",", Oily skin",,1579564800000,Green,Black,...,0,0,0,1,0,1,,light_coverage,1,0


In [9]:
foundation_data.tail()

Unnamed: 0,reviewer_id,rating,recommended,review_subject,review_content,reviewer_feature,purchased_shade,date_of_review,eye_color,hair_color,...,matte_finish,radiant_finish,medium_coverage,full_coverage,sheer_coverage,light_coverage,finish_preference,coverage_preference,skin_type_match,label
119263,Jeninthemtns,4,1,,This is my one product I CANNOT live without. ...,", Oily skin",Medium-Tan,1504310400000,Brown,Black,...,1,0,1,0,1,1,matte_finish,light_coverage,1,87
119264,rosered2384,5,1,Love this!,"This is my FAV product. It lasts all day, prot...",", Dry skin",Fair,1504224000000,Hazel,Black,...,1,0,1,0,1,1,matte_finish,light_coverage,1,87
119265,LargeMocha,5,1,,"This is my FAV product. It lasts all day, prot...",", Dry skin",Medium,1504224000000,Hazel,Black,...,1,0,1,0,1,1,matte_finish,light_coverage,1,87
119266,lanart,1,0,,"This is my FAV product. It lasts all day, prot...",", Combination skin",Light,1504051200000,Blue,Brunette,...,1,0,1,0,1,1,matte_finish,light_coverage,1,87
119267,acn2010,5,1,First Time at Sephora!,Purchased this for my 12 year old Granddaughte...,", Combination skin",Fair,1504051200000,Green,Brunette,...,1,0,1,0,1,1,matte_finish,light_coverage,1,87


### Preparing for Train-test split

In [10]:
foundation_data.columns

Index(['reviewer_id', 'rating', 'recommended', 'review_subject',
       'review_content', 'reviewer_feature', 'purchased_shade',
       'date_of_review', 'eye_color', 'hair_color', 'skin_tone', 'skin_type',
       'skin_tone_bin', 'skin_tone_cat', 'days_since_launch',
       'days_since_launch_scaled', 'month_of_purchase', 'finish', 'coverage',
       'shade_match', 'gifted', 'brand_product', 'product_features',
       'product_description', 'skin_type_dry', 'skin_type_oily',
       'skin_type_combo', 'skin_type_combination', 'skin_type_normal',
       'natural_finish', 'matte_finish', 'radiant_finish', 'medium_coverage',
       'full_coverage', 'sheer_coverage', 'light_coverage',
       'finish_preference', 'coverage_preference', 'skin_type_match', 'label'],
      dtype='object')

In [11]:
def stratified_split(dataframe, target, val_percent = 0.3, random_seed_ = 0):
   """
   split a dataframe into train and validation sets, while preserving the ratio of the labels in the target variable
    Inputs:
    - dataframe, the dataframe
    - target, the target variable
    - val_percent, the percentage of validation samples, default 0.3
    Outputs:
    - train_idxs, the indices of the training dataset
    - val_idxs, the indices of the validation dataset
   """
   random.seed(random_seed_)
   classes = dataframe[target].unique().tolist()
   train_idxs = []
   val_idxs = []
   for c in classes:
      idx = dataframe[dataframe[target]==c].index.tolist()
      np.random.shuffle(idx)
      val_size =  int(len(idx) * val_percent)
      val_idxs += idx[:val_size]
      train_idxs += idx[val_size:]
   return train_idxs, val_idxs

In [12]:
train_idxs, val_idxs = stratified_split(dataframe=foundation_data, target='label')
val_idxs, test_idxs = stratified_split(dataframe=foundation_data[foundation_data.index.isin(val_idxs)],
                                       target='label', val_percent = 0.5)

In [13]:
total_length = len(foundation_data)
print('----Number of samples in training, validation, and test sets----')
print(f'Training set: {len(train_idxs)} / {total_length} ({round((len(train_idxs)/total_length)*100,2)}%)')
print(f'Validation set: {len(val_idxs)} / {total_length} ({round((len(val_idxs)/total_length)*100,2)}%)')
print(f'Test set: {len(test_idxs)} / {total_length} ({round(len(test_idxs)/total_length*100,2)}%)')

----Number of samples in training, validation, and test sets----
Training set: 83523 / 119268 (70.03%)
Validation set: 17891 / 119268 (15.0%)
Test set: 17854 / 119268 (14.97%)


In [14]:
def test_stratified(dataframe, target):
    """
    Analyzes the ratio of different classes in a categorical variable within a dataframe
    Inputs:
    - dataframe
    - categorical column to be analyzed
    Returns: None
    """
    classes = dataframe[target].unique().tolist()
    data_length = len(dataframe)
    for c in classes:
        print(f'Proportion of records with {c}: {len(dataframe[dataframe[target]==c])*1./data_length:0.2} ({len(dataframe[dataframe[target]==c])} / {data_length}))')

In [15]:
print('------------------Stratified Sampling Report------------------')
print('-----------Proportions of labels in FULL DATA-----------------')
test_stratified(foundation_data, 'label')

------------------Stratified Sampling Report------------------
-----------Proportions of labels in FULL DATA-----------------
Proportion of records with 0: 0.01 (1188 / 119268))
Proportion of records with 1: 0.0074 (888 / 119268))
Proportion of records with 2: 0.0085 (1019 / 119268))
Proportion of records with 3: 0.01 (1187 / 119268))
Proportion of records with 4: 0.0081 (969 / 119268))
Proportion of records with 5: 0.0064 (764 / 119268))
Proportion of records with 6: 0.0019 (231 / 119268))
Proportion of records with 7: 0.044 (5300 / 119268))
Proportion of records with 8: 0.0049 (580 / 119268))
Proportion of records with 9: 0.0086 (1025 / 119268))
Proportion of records with 10: 0.02 (2344 / 119268))
Proportion of records with 11: 0.028 (3300 / 119268))
Proportion of records with 12: 0.022 (2638 / 119268))
Proportion of records with 13: 0.0047 (562 / 119268))
Proportion of records with 14: 0.04 (4796 / 119268))
Proportion of records with 15: 0.0053 (634 / 119268))
Proportion of records 

In [16]:
print('-----------Proportions of labels in TRAINING SET-----------------')
test_stratified(foundation_data[foundation_data.index.isin(train_idxs)], 'label')

-----------Proportions of labels in TRAINING SET-----------------
Proportion of records with 0: 0.01 (832 / 83523))
Proportion of records with 1: 0.0074 (622 / 83523))
Proportion of records with 2: 0.0085 (714 / 83523))
Proportion of records with 3: 0.0099 (831 / 83523))
Proportion of records with 4: 0.0081 (679 / 83523))
Proportion of records with 5: 0.0064 (535 / 83523))
Proportion of records with 6: 0.0019 (162 / 83523))
Proportion of records with 7: 0.044 (3710 / 83523))
Proportion of records with 8: 0.0049 (406 / 83523))
Proportion of records with 9: 0.0086 (718 / 83523))
Proportion of records with 10: 0.02 (1641 / 83523))
Proportion of records with 11: 0.028 (2310 / 83523))
Proportion of records with 12: 0.022 (1847 / 83523))
Proportion of records with 13: 0.0047 (394 / 83523))
Proportion of records with 14: 0.04 (3358 / 83523))
Proportion of records with 15: 0.0053 (444 / 83523))
Proportion of records with 16: 0.013 (1083 / 83523))
Proportion of records with 17: 0.024 (2031 / 83

In [17]:
print('-----------Proportions of labels in VALIDATION SET-----------------')
test_stratified(foundation_data[foundation_data.index.isin(val_idxs)], 'label')

-----------Proportions of labels in VALIDATION SET-----------------
Proportion of records with 0: 0.0099 (178 / 17891))
Proportion of records with 1: 0.0074 (133 / 17891))
Proportion of records with 2: 0.0086 (153 / 17891))
Proportion of records with 3: 0.0099 (178 / 17891))
Proportion of records with 4: 0.0081 (145 / 17891))
Proportion of records with 5: 0.0064 (115 / 17891))
Proportion of records with 6: 0.002 (35 / 17891))
Proportion of records with 7: 0.044 (795 / 17891))
Proportion of records with 8: 0.0049 (87 / 17891))
Proportion of records with 9: 0.0086 (154 / 17891))
Proportion of records with 10: 0.02 (352 / 17891))
Proportion of records with 11: 0.028 (495 / 17891))
Proportion of records with 12: 0.022 (396 / 17891))
Proportion of records with 13: 0.0047 (84 / 17891))
Proportion of records with 14: 0.04 (719 / 17891))
Proportion of records with 15: 0.0053 (95 / 17891))
Proportion of records with 16: 0.013 (232 / 17891))
Proportion of records with 17: 0.024 (435 / 17891))
Pr

In [18]:
print('-----------Proportions of labels in TEST SET-----------------')
test_stratified(foundation_data[foundation_data.index.isin(test_idxs)], 'label')

-----------Proportions of labels in TEST SET-----------------
Proportion of records with 0: 0.01 (178 / 17854))
Proportion of records with 1: 0.0074 (133 / 17854))
Proportion of records with 2: 0.0085 (152 / 17854))
Proportion of records with 3: 0.01 (178 / 17854))
Proportion of records with 4: 0.0081 (145 / 17854))
Proportion of records with 5: 0.0064 (114 / 17854))
Proportion of records with 6: 0.0019 (34 / 17854))
Proportion of records with 7: 0.045 (795 / 17854))
Proportion of records with 8: 0.0049 (87 / 17854))
Proportion of records with 9: 0.0086 (153 / 17854))
Proportion of records with 10: 0.02 (351 / 17854))
Proportion of records with 11: 0.028 (495 / 17854))
Proportion of records with 12: 0.022 (395 / 17854))
Proportion of records with 13: 0.0047 (84 / 17854))
Proportion of records with 14: 0.04 (719 / 17854))
Proportion of records with 15: 0.0053 (95 / 17854))
Proportion of records with 16: 0.013 (232 / 17854))
Proportion of records with 17: 0.024 (435 / 17854))
Proportion 

### Split the dataframe into training, validation, and test sets

In [19]:
features = ['date_of_review', 'eye_color', 'hair_color', 'skin_tone', 'skin_type',
       'skin_tone_bin', 'skin_tone_cat', 'days_since_launch',
       'days_since_launch_scaled', 'month_of_purchase', 'finish', 'coverage',
       'shade_match', 'gifted', 'finish_preference', 'coverage_preference', 'skin_type_match']

train_data = foundation_data[foundation_data.index.isin(train_idxs)]
X_train = train_data[features]
y_train = train_data[['label']]

val_data = foundation_data[foundation_data.index.isin(val_idxs)]
X_val = val_data[features]
y_val = val_data[['label']]

test_data = foundation_data[foundation_data.index.isin(test_idxs)]
X_test = test_data[features]
y_test = test_data[['label']]

### One-hot Encoding

In [44]:
def one_hot_encoding(data, train_X, val_X, test_X, col: str):
    """
    one-hot encode a feature given col,
    rename the columns after one-hot encoding given the categories in col,
    and return training and validation sets as pd.DataFrame
    """
    cols_cat = data.groupby([col], as_index=False).count()[col]
    if re.match("^.*_color$", col):
        color = re.split("_color", col)[0]
        cols_cat = cols_cat.str.cat(pd.Series([color] * len(data)), sep='_')
    # for column names after one-hot encoding
    if re.findall('month', col):
        cols_cat = pd.Series(['month_' + str(i) for i in list(range(1, 13))])

    enc_rest = OneHotEncoder(sparse=False, handle_unknown='ignore')
    enc_rest = enc_rest.fit(train_X[[col]])
    train_X_transform = enc_rest.transform(train_X[[col]])
    # saving the encoder for prediction
    with open(f'models/multiclass_encoder_{col}.pickle', 'wb') as f:
        pickle.dump(enc_rest, f)

    val_X_transform = enc_rest.transform(val_X[[col]])
    test_X_transform = enc_rest.transform(test_X[[col]])

    train_X_transform = pd.DataFrame(train_X_transform)
    col_names_dict = dict()
    for col_idx in train_X_transform.columns:
        col_names_dict[col_idx] = cols_cat[col_idx]
    # saving the one-hot encoded col names
    with open(f'models/multiclass_col_names_{col}.pickle', 'wb') as f:
        pickle.dump(col_names_dict, f)
    train_X_transform.rename(columns=col_names_dict, inplace=True)

    val_X_transform = pd.DataFrame(val_X_transform)
    col_names_dict = dict()
    for col_idx in val_X_transform.columns:
        col_names_dict[col_idx] = cols_cat[col_idx]
    val_X_transform.rename(columns=col_names_dict, inplace=True)

    test_X_transform = pd.DataFrame(test_X_transform)
    col_names_dict = dict()
    for col_idx in test_X_transform.columns:
        col_names_dict[col_idx] = cols_cat[col_idx]
    test_X_transform.rename(columns=col_names_dict, inplace=True)

    return train_X_transform, val_X_transform, test_X_transform

In [21]:
# skin tone
skin_tone_one_hot_train, skin_tone_one_hot_val, skin_tone_one_hot_test = one_hot_encoding(
    data = foundation_data, train_X=X_train, val_X=X_val, test_X=X_test, col='skin_tone'
)

In [22]:
# skin type
skin_type_one_hot_train, skin_type_one_hot_val, skin_type_one_hot_test = one_hot_encoding(
    data = foundation_data, train_X=X_train, val_X=X_val, test_X=X_test, col='skin_type'
)

In [23]:
# hair color
hair_one_hot_train, hair_one_hot_val, hair_one_hot_test = one_hot_encoding(
    data = foundation_data, train_X=X_train, val_X=X_val, test_X=X_test, col='hair_color'
)

In [24]:
# eye color
eye_one_hot_train, eye_one_hot_val, eye_one_hot_test = one_hot_encoding(
    data = foundation_data, train_X=X_train, val_X=X_val, test_X=X_test, col='eye_color'
)

In [34]:
# preference about foundation features
# coverage
foundation_data['coverage_preference'].fillna('no_coverage_preference', inplace=True)
coverage_pref_one_hot_train, coverage_pref_one_hot_val, coverage_pref_one_hot_test = one_hot_encoding(
    data = foundation_data, train_X=X_train, val_X=X_val, test_X=X_test, col='coverage_preference'
)

In [35]:
# finish
foundation_data['finish_preference'].fillna('no_finish_preference', inplace=True)
finish_pref_one_hot_train, finish_pref_one_hot_val, finish_pref_one_hot_test = one_hot_encoding(
    data = foundation_data, train_X=X_train, val_X=X_val, test_X=X_test, col='finish_preference'
)

In [45]:
# month of purchase
foundation_data['month_of_purchase'].fillna(value=round(foundation_data['month_of_purchase'].mean(),0), inplace=True)
month_one_hot_train, month_one_hot_val, month_one_hot_test = one_hot_encoding(
    data = foundation_data, train_X=X_train, val_X=X_val, test_X=X_test, col='month_of_purchase'
)

In [36]:
# crossing one-hot encoded features
def cross_one_hot_features(
        one_hot_col1: pd.DataFrame,
        one_hot_col2: pd.DataFrame
):
    """
    given one-hot encoded feature 1 (one_hot_col1) and one-hot encoded feature 2 (one_hot_col1),
    returns a dataframe with crossed feature b/w one_hot_col1 and one_hot_col2
    columns of the returned dataframe are named by each of the crossed categories in col1 and col2 as "col1_col2"
    """
    total_col1_cat = one_hot_col1.columns.to_list()
    total_col2_cat = one_hot_col2.columns.to_list()
    # total_col1_cat = data.groupby([col1], as_index=False).count()[col1]
    # total_col2_cat = data.groupby([col2], as_index=False).count()[col2]

    data_cross = pd.DataFrame()

    i = 0  # col1

    while i <= len(total_col1_cat) - 1:
        j = 0  # col2
        while j <= len(total_col2_cat) - 1:
            col1_cat = total_col1_cat[i]
            col2_cat = total_col2_cat[j]
            new_cross = one_hot_col1[col1_cat] * one_hot_col2[col2_cat]
            new_cross = pd.Series(new_cross)
            data_cross[f'{col1_cat}_{col2_cat}'] = new_cross
            j += 1
        i += 1

    return data_cross.dropna(axis=0)

In [37]:
type_tone_cross_train = cross_one_hot_features(skin_tone_one_hot_train, skin_type_one_hot_train)
type_tone_cross_val = cross_one_hot_features(skin_tone_one_hot_val, skin_type_one_hot_val)
type_tone_cross_test = cross_one_hot_features(skin_tone_one_hot_test, skin_type_one_hot_test)

In [50]:
other_features = [
    'skin_tone_cat', 'finish', 'coverage', 'shade_match',
    'gifted', 'days_since_launch_scaled', 'skin_type_match'
]

In [51]:
train_X_transformed = pd.concat([
    skin_tone_one_hot_train,
    skin_type_one_hot_train,
    type_tone_cross_train,
    hair_one_hot_train,
    eye_one_hot_train,
    finish_pref_one_hot_train,
    coverage_pref_one_hot_train,
    month_one_hot_train
], axis=1)

val_X_transformed = pd.concat([
    skin_tone_one_hot_val,
    skin_type_one_hot_val,
    type_tone_cross_val,
    hair_one_hot_val,
    eye_one_hot_val,
    finish_pref_one_hot_val,
    coverage_pref_one_hot_val,
    month_one_hot_val
], axis=1)

test_X_transformed = pd.concat([
    skin_tone_one_hot_test,
    skin_type_one_hot_test,
    type_tone_cross_test,
    hair_one_hot_test,
    eye_one_hot_test,
    finish_pref_one_hot_test,
    coverage_pref_one_hot_test,
    month_one_hot_test
], axis=1)

In [52]:
for i in other_features:
    train_X_transformed[i] = X_train[i].reset_index(drop=True)
    val_X_transformed[i] = X_val[i].reset_index(drop=True)
    test_X_transformed[i] = X_test[i].reset_index(drop=True)

In [55]:
list(train_X_transformed.columns)

['Dark',
 'Deep',
 'Ebony',
 'Fair',
 'Light',
 'Medium',
 'Olive',
 'Porcelain',
 'Tan',
 'Combination',
 'Dry',
 'Normal',
 'Oily',
 'Dark_Combination',
 'Dark_Dry',
 'Dark_Normal',
 'Dark_Oily',
 'Deep_Combination',
 'Deep_Dry',
 'Deep_Normal',
 'Deep_Oily',
 'Ebony_Combination',
 'Ebony_Dry',
 'Ebony_Normal',
 'Ebony_Oily',
 'Fair_Combination',
 'Fair_Dry',
 'Fair_Normal',
 'Fair_Oily',
 'Light_Combination',
 'Light_Dry',
 'Light_Normal',
 'Light_Oily',
 'Medium_Combination',
 'Medium_Dry',
 'Medium_Normal',
 'Medium_Oily',
 'Olive_Combination',
 'Olive_Dry',
 'Olive_Normal',
 'Olive_Oily',
 'Porcelain_Combination',
 'Porcelain_Dry',
 'Porcelain_Normal',
 'Porcelain_Oily',
 'Tan_Combination',
 'Tan_Dry',
 'Tan_Normal',
 'Tan_Oily',
 'Auburn_hair',
 'Black_hair',
 'Blonde_hair',
 'Brunette_hair',
 'Gray_hair',
 'Red_hair',
 'Blue_eye',
 'Brown_eye',
 'Gray_eye',
 'Green_eye',
 'Hazel_eye',
 'matte_finish',
 'natural_finish',
 'no_preference',
 'radiant_finish',
 'full_coverage',
 'l

### Model building

In [185]:
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score
import seaborn as sns

#allow logloss and classification error plots for each iteraetion of xgb model
def plot_compare(metrics,eval_results,epochs):
    for m in metrics:
        test_score = eval_results[0][m]
        train_score = eval_results[1][m]
        rang = range(0, epochs)
        plt.rcParams["figure.figsize"] = [6,6]
        plt.plot(rang, test_score,"c", label="Val")
        plt.plot(rang, train_score,"orange", label="Train")
        title_name = m + " plot"
        plt.title(title_name)
        plt.xlabel('Iterations')
        plt.ylabel(m)
        lgd = plt.legend()
        plt.show()

def fitXgb(
        sk_model, train_X=X_train, train_y=y_train,
        val_X=X_val, val_y=y_val
):
    train_X=train_X.to_numpy()
    train_y=train_y.to_numpy()
    print('Fitting model...')
    sk_model.fit(train_X, train_y.reshape(train_y.shape[0],))
    print('Fitting done!')
    return sk_model

def predictXgb(
        sk_model,
        train_X=X_train, train_y=y_train,
        val_X=X_val, val_y=y_val,
        features=list(train_X_transformed.columns), epochs=100
):
    params = sk_model.get_xgb_params()
    metrics = ['mlogloss','merror']
    params['eval_metric'] = metrics
    store = {}
    predict_labels = sk_model.predict_proba(val_X_transformed)
    # xgb_model = xgb.train(params, train, epochs, evallist,evals_result=store,verbose_eval=100)
    print('-- Model Report --')
    print('XGBoost Accuracy: '+str(accuracy_score(sk_model.predict(val_X), val_y)))
    print('XGBoost F1-Score (Micro): '+str(f1_score(sk_model.predict(val_X),val_y, average='weighted')))
    # plot_compare(metrics,store,epochs)
    # f, ax = plt.subplots(figsize=(10,5))
    # plot = sns.barplot(x=features, y=sk_model.feature_importances_)
    # ax.set_title('Feature Importance')
    # plot.set_xticklabels(plot.get_xticklabels(),rotation='vertical')
    # plt.show()
    return predict_labels

In [121]:
from xgboost.sklearn import XGBClassifier
#initial model
xgb1 = XGBClassifier(learning_rate=0.1,
                    n_estimators=50,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=len(y_train['label'].unique()),
                     use_label_encoder=False,
                    seed=0)

In [128]:
fit_model = fitXgb(xgb1, train_X=train_X_transformed, train_y=y_train, val_X=val_X_transformed, val_y=y_val)

Fitting model...
Fitting done!


In [186]:
predict_labels = predictXgb(fit_model, train_X=train_X_transformed, train_y=y_train, val_X=val_X_transformed, val_y=y_val)

-- Model Report --
XGBoost Accuracy: 0.6624000894304399
XGBoost F1-Score (Micro): 0.6878593387620657


In [136]:
list(labels)

['Anastasia_Beverly_Hills_Stick_Foundation',
 'Dr._Jart+_Premium_Beauty_Balm_SPF_40',
 'KVD_Vegan_Beauty_Lock-It_Powder_Foundation',
 'Laura_Mercier_Flawless_Lumière_Radiance-Perfecting_Foundation',
 'Laura_Mercier_Flawless_Fusion_Ultra-Longwear_Foundation',
 'tarte_babassu_foundcealer™_skincare_foundation_SPF_20',
 'bareMinerals_Matte_Loose_Powder_Mineral_Foundation_SPF_15\xa0',
 "FENTY_BEAUTY_by_Rihanna_Pro_Filt'r_Soft_Matte_Longwear_Foundation",
 'SEPHORA_COLLECTION_Make_No_Mistake_Foundation_&_Concealer_Stick',
 'Smashbox_Studio_Skin_24_Hour_Oil-Free_Hydra_Foundation',
 'ILIA_Super_Serum_Skin_Tint_SPF_40_Foundation',
 'Dior_BACKSTAGE_Face_&_Body_Foundation',
 'Laura_Mercier_Tinted_Moisturizer_Natural_Skin_Perfector_Broad_Spectrum_SPF_30',
 'SEPHORA_COLLECTION_Perfection_Mist_Airbrush_Foundation',
 'Too_Faced_Born_This_Way_Foundation',
 'rms_beauty_"Un"_Cover-up_Concealer',
 'Kosas_Tinted_Face_Oil_Foundation',
 'Hourglass_Vanish™_Seamless_Finish_Foundation_Stick',
 'Too_Faced_Peach_

In [137]:
# for plotting AUC-ROC
classes_combinations = []
class_list = list(labels)
for i in range(len(class_list)):
    for j in range(i+1, len(class_list)):
        classes_combinations.append([class_list[i], class_list[j]])
        classes_combinations.append([class_list[j], class_list[i]])

In [184]:
len(classes_combinations)

7656

In [198]:
from sklearn.metrics import confusion_matrix, roc_auc_score
def calculate_tpr_fpr(y_real, y_pred):
    '''
    Calculates the True Positive Rate (tpr) and the True Negative Rate (fpr) based on real and predicted observations

    Args:
        y_real: The list or series with the real classes
        y_pred: The list or series with the predicted classes

    Returns:
        tpr: The True Positive Rate of the classifier
        fpr: The False Positive Rate of the classifier
    '''

    # Calculates the confusion matrix and recover each element
    cm = confusion_matrix(y_real, y_pred)
    TN = cm[0, 0]
    FP = cm[0, 1]
    FN = cm[1, 0]
    TP = cm[1, 1]

    # Calculates tpr and fpr
    tpr =  TP/(TP + FN) # sensitivity - true positive rate
    fpr = 1 - TN/(TN+FP) # 1-specificity - false positive rate

    return tpr, fpr

def get_all_roc_coordinates(y_real, y_proba):
    '''
    Calculates all the ROC Curve coordinates (tpr and fpr) by considering each point as a treshold for the predicion of the class.

    Args:
        y_real: The list or series with the real classes.
        y_proba: The array with the probabilities for each class, obtained by using the `.predict_proba()` method.

    Returns:
        tpr_list: The list of TPRs representing each threshold.
        fpr_list: The list of FPRs representing each threshold.
    '''
    tpr_list = [0]
    fpr_list = [0]
    for i in range(len(y_proba)):
        threshold = y_proba[i]
        y_pred = y_proba >= threshold
        tpr, fpr = calculate_tpr_fpr(y_real, y_pred)
        tpr_list.append(tpr)
        fpr_list.append(fpr)
    return tpr_list, fpr_list

def plot_roc_curve(tpr, fpr, scatter = True, ax = None):
    '''
    Plots the ROC Curve by using the list of coordinates (tpr and fpr).

    Args:
        tpr: The list of TPRs representing each coordinate.
        fpr: The list of FPRs representing each coordinate.
        scatter: When True, the points used on the calculation will be plotted with the line (default = True).
    '''
    if ax == None:
        plt.figure(figsize = (5, 5))
        ax = plt.axes()

    if scatter:
        sns.scatterplot(x = fpr, y = tpr, ax = ax)
    sns.lineplot(x = fpr, y = tpr, ax = ax)
    sns.lineplot(x = [0, 1], y = [0, 1], color = 'green', ax = ax)
    plt.xlim(-0.05, 1.05)
    plt.ylim(-0.05, 1.05)
    # plt.xlabel("False Positive Rate")
    # plt.ylabel("True Positive Rate")

In [203]:
plt.figure(figsize = (20, 7))
bins = [i/20 for i in range(20)] + [1]
roc_auc_ovo = {}
for i in range(len(classes_combinations)):
    # Gets the class
    comb = classes_combinations[i]
    c1 = class_to_idx.get(comb[0])
    c2 = class_to_idx.get(comb[1])
    title = comb[0] + " vs " + comb[1]

    # Prepares an auxiliar dataframe to help with the plots
    df_aux = val_X_transformed.copy()
    df_aux['class'] = y_val
    df_aux['prob'] = predict_labels[:, c1]

    # Slices only the subset with both classes
    df_aux = df_aux[(df_aux['class'] == c1) | (df_aux['class'] == c2)]
    df_aux['class'] = [1 if y_val == c1 else 0 for y_val in df_aux['class']]
    df_aux = df_aux.reset_index(drop = True)

    # Plots the probability distribution for the class and the rest
    # ax = plt.subplot(2, 6, i+1)
    # sns.histplot(x = "prob", data = df_aux, hue = 'class', color = 'b', ax = ax, bins = bins)
    # ax.set_title(title)
    # ax.legend([f"Class 1: {comb[0]}", f"Class 0: {comb[1]}"])
    # ax.set_xlabel(f"P(x = {comb[0]})")

    # Calculates the ROC Coordinates and plots the ROC Curves
    ax_bottom = plt.subplot(8, 11, i+1)
    if len(df_aux['class'].unique()) == 2:
        tpr, fpr = get_all_roc_coordinates(df_aux['class'], df_aux['prob'])
        plot_roc_curve(tpr, fpr, scatter = False, ax = ax_bottom)
    # ax_bottom.set_title("ROC Curve OvO")

    if len(df_aux['class'].unique()) == 2:
        # Calculates the ROC AUC OvO
        roc_auc_ovo[title] = roc_auc_score(df_aux['class'], df_aux['prob'])
        print(f'for {comb}, ROC_AUC (OvO) = {roc_auc_ovo[title]}')

ValueError: num must be 1 <= num <= 88, not 0

<Figure size 1440x504 with 0 Axes>

In [206]:
roc_auc_score(y_val['label'], predict_labels, labels = list(range(88)), multi_class = 'ovr', average = 'weighted')

0.98923733924867

In [180]:
labels

array(['Anastasia_Beverly_Hills_Stick_Foundation',
       'Dr._Jart+_Premium_Beauty_Balm_SPF_40',
       'KVD_Vegan_Beauty_Lock-It_Powder_Foundation',
       'Laura_Mercier_Flawless_Lumière_Radiance-Perfecting_Foundation',
       'Laura_Mercier_Flawless_Fusion_Ultra-Longwear_Foundation',
       'tarte_babassu_foundcealer™_skincare_foundation_SPF_20',
       'bareMinerals_Matte_Loose_Powder_Mineral_Foundation_SPF_15\xa0',
       "FENTY_BEAUTY_by_Rihanna_Pro_Filt'r_Soft_Matte_Longwear_Foundation",
       'SEPHORA_COLLECTION_Make_No_Mistake_Foundation_&_Concealer_Stick',
       'Smashbox_Studio_Skin_24_Hour_Oil-Free_Hydra_Foundation',
       'ILIA_Super_Serum_Skin_Tint_SPF_40_Foundation',
       'Dior_BACKSTAGE_Face_&_Body_Foundation',
       'Laura_Mercier_Tinted_Moisturizer_Natural_Skin_Perfector_Broad_Spectrum_SPF_30',
       'SEPHORA_COLLECTION_Perfection_Mist_Airbrush_Foundation',
       'Too_Faced_Born_This_Way_Foundation',
       'rms_beauty_"Un"_Cover-up_Concealer',
       'Kosas_

In [134]:
def plot_auc_roc(
        predict_y,
        val_y,
        model: str,
        filename: str = "auc_roc"):
    """
    given predicted probability (predict_y) and label (val_y), computes AUC-ROC across threshold and saves the figure by product and model
    """

    thresholds = np.linspace(0, 1, 10)
    fpr = []
    tpr = []

    for threshold in thresholds:
        predict_y_binary = np.where(predict_y >= threshold, 1, 0)

        fp = np.sum((val_y == 0) & (predict_y_binary == 1))  # true value is 0 but predict to be 1
        tp = np.sum((val_y == 1) & (predict_y_binary == 1))  # true value is 1 & predict to be 1

        fn = np.sum((val_y == 1) & (predict_y_binary == 0))  # true value is 1 but predict to be 0
        tn = np.sum((val_y == 0) & (predict_y_binary == 0))  # true value is 0 but predict to be 0

        fpr.append(fp / (fp + tn))
        tpr.append(tp / (tp + fn))

        ## ROC & ROC-AUC
    roc_auc_ = round(auc(fpr, tpr), 5)
    fig3, ax3 = plt.subplots(1, 1)
    ax3.plot(fpr, tpr, label="ROC")
    ax3.plot([0, 1], [0, 1], 'k--')
    ax3.set_xlabel("False Positive Rate")
    ax3.set_ylabel("True Positive Rate")
    ax3.text(0.55, 0.2, 'AUC = {}'.format(roc_auc_))
    product_title = product.replace('_', ' ')
    plt.title(f'{model} with {product_title}')

    plt.savefig(f'plots/{model}_{filename}.jpeg')

In [None]:
plot_auc_roc

In [88]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(multi_class="multinomial", max_iter=1000)
logit.fit(train_X_transformed, y_train.to_numpy().reshape(y_train.shape[0],))

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [102]:
predict = logit.predict_proba(val_X_transformed)

In [103]:
logit.score(val_X_transformed, y_val.to_numpy().reshape(y_val.shape[0],))

0.5404393270359399

In [97]:
logit.predict_proba(val_X_transformed)[0]

array([6.41192210e-01, 2.84959322e-06, 8.06615462e-10, 2.11860887e-09,
       1.94251707e-06, 3.67972499e-09, 4.62797349e-06, 2.75671937e-04,
       2.31023877e-03, 1.79186603e-05, 1.40756523e-02, 4.63032155e-06,
       2.78577564e-04, 5.41512487e-07, 7.67571741e-03, 6.22555620e-03,
       4.38067487e-05, 3.14930664e-06, 4.68143065e-07, 2.31269496e-08,
       1.39428307e-06, 9.37000150e-08, 9.03190959e-09, 2.28412209e-05,
       2.16663884e-03, 4.13536238e-05, 3.27203930e-04, 5.76002904e-04,
       2.41752659e-09, 4.68675238e-10, 1.48802068e-04, 4.03832878e-07,
       2.54153968e-09, 1.03909752e-05, 7.60463292e-02, 3.08691256e-10,
       9.50632188e-08, 7.81570568e-05, 3.18105910e-06, 1.82970814e-05,
       1.28074430e-07, 1.04816833e-06, 7.24259566e-08, 9.80560596e-06,
       2.24778841e-06, 1.67643505e-05, 1.69203590e-06, 6.13955405e-06,
       6.26280973e-09, 3.11175943e-08, 1.52396937e-05, 7.94620302e-05,
       9.82116516e-06, 8.38491850e-08, 9.30753343e-06, 2.08920217e-02,
      