In [1]:
import pandas as pd
import os
import csv
import json
import lightgbm as lgb
from sklearn.datasets import load_svmlight_file
from collections import OrderedDict, deque, Counter
import numpy as np
import scipy
import sklearn
from tabulate import tabulate
import pickle
import itertools
from sklearn import preprocessing
from bayes_opt import BayesianOptimization
import warnings
from datetime import datetime

In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D

# Show floats in DataFrame output with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the old names, so pick whichever name this
# installation actually provides instead of hard-coding "seaborn".
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Marker symbols available for telling plotted series apart.
markers = [".", "*", ">", "o", "v", "^", "<", "s", "p", "h", "H", "D", "d", "|", "_"]
font = {'weight': 'normal',
        'size': 14}

# Font type 42 selects TrueType embedding for PDF/PS output
# (see the Matplotlib rcParams documentation).
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# The 'size' entry of `font` is applied first and then overridden by the
# explicit font.size update below, so the effective base font size is 28.
matplotlib.rc('font', **font)
matplotlib.rcParams.update({'font.size': 28})
matplotlib.rcParams.update({'legend.fontsize': 20})
matplotlib.rcParams.update({'xtick.labelsize': 28, 'ytick.labelsize': 28, })

In [3]:
def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Compute the discounted cumulative gain (DCG) of the top-k items.

    Items are ordered by descending ``y_score``; the relevance labels of the
    first ``k`` of them are turned into gains and divided by a log2 discount
    that grows with the rank position.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores used to rank the items.
    k : int
        Number of top-ranked items taken into account.
    gains : str
        "exponential" (default) uses ``2 ** label - 1`` as gain,
        "linear" uses the label itself.

    Returns
    -------
    DCG @k : float

    Raises
    ------
    ValueError
        If ``gains`` is neither "exponential" nor "linear".
    """
    top_positions = np.argsort(y_score)[::-1][:k]
    top_labels = np.take(y_true, top_positions)

    if gains not in ("exponential", "linear"):
        raise ValueError("Invalid gains option.")
    gain_values = 2 ** top_labels - 1 if gains == "exponential" else top_labels

    # The first rank is 1 and log2(1) == 0, so the discount starts at log2(2).
    rank_discounts = np.log2(np.arange(2, len(top_labels) + 2))
    return np.sum(gain_values / rank_discounts)


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k.

    The DCG obtained by ranking with ``y_score`` is divided by the DCG of the
    ideal ranking, i.e. the one obtained by ranking with ``y_true`` itself.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".

    Returns
    -------
    NDCG @k : float
        0.0 when the ideal DCG is zero (for example when every label is 0),
        instead of the NaN that a 0 / 0 division would produce.
    """
    best = dcg_score(y_true, y_true, k, gains)
    actual = dcg_score(y_true, y_score, k, gains)
    # With no gain to collect in the ideal ranking the ratio is undefined;
    # report 0.0 rather than propagating NaN and a RuntimeWarning.
    if best == 0:
        return 0.0
    return actual / best


# Alternative API.

def dcg_from_ranking(y_true, ranking):
    """Discounted cumulative gain (DCG) of an explicit ranking.

    The cut-off is implied by the ranking itself: every index listed in
    ``ranking`` contributes, so the result is the DCG at ``len(ranking)``.
    Gains are exponential (``2 ** label - 1``).

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    ranking : array-like, shape = [k]
        Document indices, i.e.,
            ranking[0] is the index of top-ranked document,
            ranking[1] is the index of second-ranked document,
            ...

    Returns
    -------
    DCG @k : float
    """
    labels = np.asarray(y_true)
    ranked_indices = np.asarray(ranking)
    ranked_gains = 2 ** labels[ranked_indices] - 1
    # Position i (0-based) is discounted by log2(i + 2).
    rank_discounts = np.log2(np.arange(2, len(ranked_indices) + 2))
    return np.sum(ranked_gains / rank_discounts)


def ndcg_from_ranking(y_true, ranking):
    """Normalized discounted cumulative gain (NDCG) of an explicit ranking.

    The cut-off ``k`` is not a parameter; it is taken from ``len(ranking)``.
    The DCG of ``ranking`` is divided by the DCG of the ``k`` documents with
    the highest true relevance.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    ranking : array-like, shape = [k]
        Document indices, i.e.,
            ranking[0] is the index of top-ranked document,
            ranking[1] is the index of second-ranked document,
            ...

    Returns
    -------
    NDCG @k : float
        0.0 when the ideal DCG is zero (for example when every label is 0),
        instead of the NaN that a 0 / 0 division would produce.
    """
    k = len(ranking)
    best_ranking = np.argsort(y_true)[::-1]
    best = dcg_from_ranking(y_true, best_ranking[:k])
    actual = dcg_from_ranking(y_true, ranking)
    # With no gain to collect in the ideal ranking the ratio is undefined;
    # report 0.0 rather than propagating NaN and a RuntimeWarning.
    if best == 0:
        return 0.0
    return actual / best

In [4]:
# Date stamp (year_month_day) used as the suffix of the daily workbook names.
today = format(datetime.now(), "%Y_%m_%d")

In [5]:
# Locations of today's promotion workbook and of the "hot" promotions workbook.
promotions_xlsx = "D:/Onedrive/Private/descuentos_ninja/promotions/amazon/amazon_promotions_" + today + ".xlsx"
hot_promotions_xlsx = "D:/Onedrive/Private/descuentos_ninja/promotions/amazon/amazon_promotions_hot_" + today + ".xlsx"

# sheet_name=None loads every sheet of the workbook into a dict keyed by
# sheet name; the two sheets used below are "categories" and "promotions".
promotion_sheets = pd.read_excel(promotions_xlsx, sheet_name=None)
df_nodes = promotion_sheets["categories"]
df_promotions = promotion_sheets["promotions"]

# The hot promotions workbook is read with the default (first) sheet.
df_promotions_hot = pd.read_excel(hot_promotions_xlsx)

In [6]:
# Hot list: keep only rows of the 'bestsellers' feed that are available now.
df_promotions_hot = df_promotions_hot.query("rss_type == 'bestsellers' and availability_type == 'now'")

# Promotions: keep rows available now, the first row of every
# (asin, root_node) pair, and only rows that have both a sales rank and a
# root node. Building the result as one chain yields a fresh DataFrame, so the
# integer cast no longer assigns into a filtered slice (which triggers
# pandas' SettingWithCopyWarning).
df_promotions = (
    df_promotions
    .query("availability_type == 'now'")
    .groupby(["asin", "root_node"]).head(1)
    .dropna(subset=["sales_rank", "root_node"])
    .astype({"sales_rank": int})
)

In [7]:
# ASINs that appear in the hot (best-seller) list.
hot_asins = set(df_promotions_hot.asin.unique())

# Binary relevance label: 1 when the promotion's ASIN is in the hot list,
# otherwise 0. `isin` does the membership test in one vectorized pass instead
# of calling a Python lambda per row, and `assign` returns a new frame rather
# than writing a column into a frame that may be a filtered slice.
# NOTE(review): a missing (NaN) asin is matched by `isin` when the hot list
# also contains NaN - confirm asin is never missing in either sheet.
df_promotions = df_promotions.assign(
    relevance=df_promotions["asin"].isin(hot_asins).astype("int64")
)

In [14]:
# For every root_node keep the 20 rows with the smallest sales_rank.
# NOTE: the variable name says "top10" for historical reasons, but it holds
# the top 20 here; the name is kept so existing references keep working.
df_promotions_top10 = df_promotions.sort_values(["root_node","sales_rank"], ascending=True).groupby("root_node").head(20)

# Use the writer as a context manager: ExcelWriter.save() was deprecated in
# pandas 1.5 and removed in 2.0, and the `with` block also guarantees the
# workbook is closed if to_excel raises.
with pd.ExcelWriter('D:/Onedrive/Private/descuentos_ninja/promotions/amazon/amazon_promotions_top20_' + today +'.xlsx') as writer:
    df_promotions_top10.sort_values(["root_node","sales_rank"], ascending=True).to_excel(writer)

In [15]:
# For every root_node keep the single row with the smallest sales_rank.
# NOTE: the variable name says "top10" for historical reasons, but it holds
# the top 1 here; the name is kept so existing references keep working.
df_promotions_top10 = df_promotions.sort_values(["root_node","sales_rank"], ascending=True).groupby("root_node").head(1)

# Use the writer as a context manager: ExcelWriter.save() was deprecated in
# pandas 1.5 and removed in 2.0, and the `with` block also guarantees the
# workbook is closed if to_excel raises.
with pd.ExcelWriter('D:/Onedrive/Private/descuentos_ninja/promotions/amazon/amazon_promotions_top1_' + today +'.xlsx') as writer:
    df_promotions_top10.sort_values(["root_node","sales_rank"], ascending=True).to_excel(writer)