In [None]:
import pandas as pd
import sklearn.metrics

pd.options.mode.chained_assignment = None
import numpy as np
from tqdm import tqdm

import xgboost as xgb

import optuna
from optuna.samplers import RandomSampler, TPESampler
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb

import os

from training import (
    plot_auc_roc, plot_recision_recall_f1, 
    plot_countbar, plot_line,
    plot_diff_in_means, plot_cross_tab_heatmap, 
    plot_predictions_by_scores,
    model_training
)

from datetime import datetime
import logging
# Run timestamp truncated to seconds (YYYY-MM-DDTHH:MM:SS); it gives every run
# its own log file.
# NOTE: the timestamp contains ':' characters, which are not valid in Windows
# file names.
NOW = datetime.today().isoformat()[:19]

# logging.basicConfig opens the log file right away and raises
# FileNotFoundError if the target directory is missing, so create it first.
os.makedirs('./logs', exist_ok=True)

logging.basicConfig(
    filemode='a',
    filename=f'./logs/{NOW}_outputlog.log',
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO
)

# Notebook-wide logger; it is also handed to `model_training` in later cells.
logger = logging.getLogger(__name__)
# Visual separator marking the start of a run in the log file.
logger.info("++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [None]:
# Collect the names (not full paths) of the cleaned review JSON files.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the processing order, and any positional index into this list, reproducible
# across runs and machines.
list_of_file_names = sorted(
    file_name
    for file_name in os.listdir('data_full_review_cleaned/')
    if file_name.endswith(".json")
)

In [None]:
# Load the foundation catalogue and add the bookkeeping columns that the
# training cells fill in per product:
#   model     - file name of the trained model (None until set)
#   threshold - best decision threshold (0.0 until set)
#   auc       - ROC-AUC of the model (0.0 until set)
list_of_foundations = pd.read_csv('foundation_from_sephora_with_url.csv').assign(
    model=None,
    threshold=0.0,
    auc=0.0,
)
list_of_foundations.head()

In [None]:
# Train one model per cleaned review file and record the model file name, best
# threshold and ROC-AUC in `list_of_foundations`.
# `i` is deliberately kept as the loop index: later cells read it to see which
# file the loop reached last.
for i, file_name in enumerate(list_of_file_names):
    product = model_training(file_name=file_name, random_seeds=0, logger=logger)

    # Products with 200 reviews or fewer are skipped.
    if len(product.data) <= 200:
        logger.info(f'N = {len(product.data)} (<= 200), dropped for training')
        continue

    product.train_test_split()
    product.dropping_outlier_reviewers()
    product.feature_engineering()  # one-hot encoding and feature crossing

    product.grid_search()
    product.train_xgb_classifier()
    product.thresholding()

    # Locate this product's row in the catalogue. A product without a row used
    # to raise IndexError here, after the expensive training, and abort the
    # whole loop; now it is reported in the log and the loop carries on.
    matching_rows = list_of_foundations.index[
        list_of_foundations['brand_product'] == product.product_name
    ].tolist()
    if matching_rows:
        index = matching_rows[0]  # first match, as before
        list_of_foundations.loc[index, 'model'] = f'{product.product_name}_xgb.model'
        list_of_foundations.loc[index, 'threshold'] = product.best_threshold
        list_of_foundations.loc[index, 'auc'] = roc_auc_score(product.val_y, product.predict_y)
    else:
        logger.warning(
            f'{product.product_name} ({file_name}) has no row in list_of_foundations; '
            'model, threshold and auc were not recorded'
        )

    # Diagnostic plots for the validation split.
    product.plot_predictions_by_scores(val_y=product.val_y, product=product.product_name, model='xgb')
    product.plot_precision_recall_f1(val_y=product.val_y, product=product.product_name, model='xgb')
    product.plot_auc_roc(val_y=product.val_y, product=product.product_name, model='xgb')

In [None]:
# Debug check: index of the last file the training loop reached
# (`i` is left over from that loop, so this cell only works after it has run).
i

In [None]:
# Debug check: total number of cleaned review files (presumably to compare against `i`).
len(list_of_file_names)

In [None]:
# Rebuild the trainer for the file at position `i` (the index left over from the
# training loop) and split its data, so that this one product can be inspected.
# NOTE: unlike the training loop, this cell does not skip products with
# `len(product.data) <= 200`.
debug_file_name = list_of_file_names[i]
product = model_training(
    file_name=debug_file_name,
    random_seeds=0,
    logger=logger,
)
product.train_test_split()

In [None]:
# Share of validation rows per hair colour: the number of non-null `coverage`
# values in each `hair_color` group divided by the total number of rows in val_X.
product.val_X.groupby(['hair_color'])['coverage'].count() / len(product.val_X)

In [None]:
# Run the remaining pipeline stages for the single `product` prepared above;
# this is the same sequence as the body of the training loop.
product.dropping_outlier_reviewers()
product.feature_engineering()  # one-hot encoding and feature crossing

product.grid_search()
product.train_xgb_classifier()
product.thresholding()

# Record the results on the first catalogue row whose brand_product matches.
is_current_product = list_of_foundations['brand_product'] == product.product_name
index = list_of_foundations.index[is_current_product].tolist()[0]

model_file_name = f'{product.product_name}_xgb.model'
list_of_foundations.loc[index, 'model'] = model_file_name
list_of_foundations.loc[index, 'threshold'] = product.best_threshold
validation_auc = roc_auc_score(product.val_y, product.predict_y)
list_of_foundations.loc[index, 'auc'] = validation_auc

# Diagnostic plots, produced in the same order as in the training loop.
for plot_method_name in (
    'plot_predictions_by_scores',
    'plot_precision_recall_f1',
    'plot_auc_roc',
):
    getattr(product, plot_method_name)(
        val_y=product.val_y,
        product=product.product_name,
        model='xgb',
    )