In [292]:
from src.containers.evaluation_container import EvaluationContainer
from src.services.config.config_interface import DirConfig

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import math

In [4]:
# Build the evaluation container for the 'train001' run and initialise it;
# the log lines recorded below this cell show the directories it resolves.
container = EvaluationContainer('train001')
container.initialize()

[32m[EvaluationContainer 2022-12-30 22:11:18][0m [1mINFO Setup [EvaluationContainer][0m
[32m[ConfigService 2022-12-30 22:11:18][0m [1mINFO Setup [ConfigService][0m
[32m[ConfigService 2022-12-30 22:11:18][0m [1mINFO {'root_dir': PosixPath('/Users/tsukuba-yuuki-uehara/Desktop/projects/graduation_research')}[0m
[32m[ConfigService 2022-12-30 22:11:18][0m [1mINFO {'input_dir': PosixPath('/Users/tsukuba-yuuki-uehara/Desktop/projects/graduation_research/input')}[0m
[32m[ConfigService 2022-12-30 22:11:18][0m [1mINFO {'output_root': PosixPath('/Users/tsukuba-yuuki-uehara/Desktop/projects/graduation_research/output/train001')}[0m
[32m[ConfigService 2022-12-30 22:11:18][0m [1mINFO {'output_dir': PosixPath('/Users/tsukuba-yuuki-uehara/Desktop/projects/graduation_research/output/train001/outputs')}[0m
[32m[ConfigService 2022-12-30 22:11:18][0m [1mINFO {'output_prediction_dir': PosixPath('/Users/tsukuba-yuuki-uehara/Desktop/projects/graduation_research/output/train001/pred

In [179]:
from sklearn.mixture import GaussianMixture
import scipy

In [355]:
import cvxpy as cp
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from src.services.base.base_service import BaseService
from src.utils import save_as_pickle

class ClusterWithUncertaintyPostprocess(BaseService):
    """Allocate treatments to clusters of rows with a risk-penalised program.

    Rows of an uplift matrix are clustered, the per-cluster mean uplift and a
    bootstrap covariance of those means are estimated, and a quadratic program
    chooses how many rows of each cluster receive each treatment under a
    budget. Rows are then sampled inside each cluster to realise the counts.
    """

    def __init__(self, base_config, postprocess_config):
        """Store the configs and read the per-treatment cost and cluster count.

        Args:
            base_config: Object exposing ``dir_config`` and ``seed``.
            postprocess_config: Object exposing ``variant_no_to_cost`` (a
                mapping) and ``params`` (a mapping with ``n_cluster``,
                ``clustering`` and ``lambda``).
        """
        super().__init__()
        self.base_config = base_config
        self.postprocess_config = postprocess_config
        # The first entry of variant_no_to_cost is skipped
        # (presumably the no-treatment variant -- TODO confirm).
        self.cost = np.array(
            list(self.postprocess_config.variant_no_to_cost.values())[1:]
        )
        self.n_cluster = self.postprocess_config.params["n_cluster"]

    def postprocess(self, uplift_mat, budget_constraint: float, seed: int):
        """Cluster, optimise and sample a treatment assignment for every row.

        Side effects: pickles the fitted clustering model into
        ``output_model_dir`` and saves the cluster labels into
        ``output_optimize_dir``; both file names embed ``seed`` and
        ``budget_constraint``.

        Args:
            uplift_mat: 2-D array-like; columns are treatments.
            budget_constraint: Upper bound on the total cost of the allocation.
            seed: Used in the output file names and to seed the bootstrap.

        Returns:
            1-D numpy array with one entry per row: 0 for no treatment,
            otherwise ``column index + 1`` of the assigned treatment.
        """
        # clustering
        self.n_treatment = uplift_mat.shape[1]
        clustering_model = self.get_clustering()
        clustering_model.fit(uplift_mat)
        save_clustering_model_path = (
            self.base_config.dir_config.output_model_dir
            / f"clustering_model_{seed}_{budget_constraint}.pkl"
        )
        save_as_pickle(clustering_model, save_clustering_model_path)

        # get cluster labels
        labels = np.asarray(clustering_model.predict(uplift_mat))
        np.save(
            self.base_config.dir_config.output_optimize_dir
            / f"cluster_{seed}_{budget_constraint}.npy",
            labels,
        )

        uplift_df = pd.DataFrame(uplift_mat)
        # A cluster with no members has no mean; 0 is a harmless placeholder
        # because its size constraint forces its allocation to zero anyway.
        pi_bar = np.nan_to_num(self._cluster_means(uplift_df, labels)).flatten()
        cov_mat = self.get_cov_mat(uplift_df, labels, seed=seed)

        # optimize
        x = self.optimize(
            pi_bar, cov_mat, labels, budget_constraint
        )  # x.shape = (n_cluster, n_treatment)

        # sampling each cluster
        assign_df = pd.DataFrame(index=uplift_df.index, columns=['assign'])
        assign_df['assign'] = 0
        assign_df['cluster'] = labels

        for cluster in range(self.n_cluster):
            for coupon in range(self.n_treatment):
                # Only rows of this cluster that have not received a treatment yet.
                _assign_df = assign_df.query(f'cluster == {cluster} and assign == 0')
                # Round the fractional solution down and never exceed what is left.
                n_assign = min(
                    int(np.floor(x[cluster, coupon])),
                    _assign_df.shape[0]
                )
                if n_assign > 0:
                    _sample_idx = _assign_df.sample(n_assign, random_state=0).index
                    assign_df.loc[_sample_idx, 'assign'] = coupon + 1

        return assign_df["assign"].to_numpy()

    def get_clustering(self):
        """Return an unfitted clustering model chosen by ``params["clustering"]``.

        Raises:
            ValueError: If the configured method is neither ``"kmeans"`` nor
                ``"gmm"``.
        """
        method = self.postprocess_config.params["clustering"]
        if method == "kmeans":
            return KMeans(
                n_clusters=self.n_cluster,
                random_state=self.base_config.seed,
            )
        elif method == "gmm":
            return GaussianMixture(
                n_components=self.n_cluster,
                random_state=self.base_config.seed,
            )
        raise ValueError(
            f"Unsupported clustering method: {method!r} (expected 'kmeans' or 'gmm')"
        )

    def _cluster_means(self, uplift_df: pd.DataFrame, labels):
        """Per-cluster column means as an (n_cluster, n_columns) array.

        Row ``k`` belongs to cluster ``k``; clusters without any member get a
        row of NaN so the layout never depends on which clusters are present.
        """
        return (
            uplift_df.groupby(np.asarray(labels))
            .mean()
            .reindex(range(self.n_cluster))
            .to_numpy()
        )

    def get_cov_mat(self, uplift_df: pd.DataFrame, labels, n_bootstrap: int = 1000, seed=None):
        """Bootstrap the covariance of the flattened per-cluster mean uplifts.

        Args:
            uplift_df: One row per unit, one column per treatment.
            labels: Cluster id of every row of ``uplift_df`` (same order).
            n_bootstrap: Number of bootstrap resamples.
            seed: Seed for the resampling generator; ``None`` gives a fresh,
                non-reproducible stream.

        Returns:
            Square numpy array of side ``n_cluster * n_columns``.
        """
        labels = np.asarray(labels)
        rng = np.random.default_rng(seed)
        n_rows = len(uplift_df)

        bootstrap_samples = []
        for _ in tqdm(range(n_bootstrap)):
            sample_idx = rng.integers(0, n_rows, size=n_rows)
            # The labels must be resampled together with the rows: grouping by
            # an array is positional, so the un-resampled labels would put
            # every drawn row into an unrelated cluster.
            sample_means = self._cluster_means(
                uplift_df.iloc[sample_idx], labels[sample_idx]
            )
            bootstrap_samples.append(sample_means.flatten())

        # DataFrame.cov ignores NaN pairwise (ddof=1, like np.cov(bias=False)),
        # so a draw that happens to miss a cluster does not poison the estimate.
        # Entries that can never be estimated (empty cluster) become 0.
        cov_mat = pd.DataFrame(np.stack(bootstrap_samples)).cov().fillna(0.0).to_numpy()

        return cov_mat

    def optimize(self, pi_bar, cov_mat, labels, budget_constraint):
        """Solve the allocation program and return it per cluster and treatment.

        Maximises ``(1 - lambda) * pi_bar @ gamma - lambda * gamma' cov gamma``
        over non-negative ``gamma``, subject to the total cost staying within
        ``budget_constraint`` and each cluster's allocation staying within the
        number of rows in that cluster.

        Args:
            pi_bar: Flattened per-cluster mean uplift, cluster-major order.
            cov_mat: Covariance of ``pi_bar``; wrapped as PSD without a check.
            labels: Cluster id of every row.
            budget_constraint: Upper bound on the total cost.

        Returns:
            numpy array of shape (n_cluster, n_treatment).

        Raises:
            RuntimeError: If the solver finishes without a solution.
        """
        gamma = cp.Variable(shape=(self.n_cluster*self.n_treatment, 1), nonneg=True)
        cov_mat = cp.atoms.affine.wraps.psd_wrap(cov_mat)

        # Objective (Return & Risk)
        ret = pi_bar @ gamma
        risk = cp.quad_form(gamma, cov_mat)
        lam = self.postprocess_config.params["lambda"]
        objective = cp.Maximize((1-lam)*ret - lam*risk)

        # Constraints
        constraints = []

        ## Budget Constraint
        # self.cost is repeated once per cluster to line up with gamma's layout.
        costs = np.tile(self.cost, self.n_cluster)
        constraints.append(costs @ gamma <= budget_constraint)

        ## Cluster size Constraint
        # bincount keeps a zero for empty clusters, so position i is always
        # cluster i (np.unique would drop empty clusters and shift the rest).
        cluster_sizes = np.bincount(np.asarray(labels), minlength=self.n_cluster)
        for i in range(self.n_cluster):
            size = cluster_sizes[i]
            _gamma = gamma[i*self.n_treatment:(i+1)*self.n_treatment]
            constraints.append(cp.sum(_gamma) <= size)

        # solve
        prob = cp.Problem(objective, constraints)
        prob.solve(solver=cp.SCS, verbose=True)

        if gamma.value is None:
            raise RuntimeError(
                f"Allocation problem was not solved (solver status: {prob.status})"
            )

        return gamma.value.reshape(self.n_cluster, self.n_treatment)

In [359]:
# Scratch: 20% of (rows * 16) shown next to the full (rows * 16).
# NOTE(review): the factor 16 is not explained in this notebook -- confirm what it stands for.
len(uplift_pred) * 16 * 0.2, len(uplift_pred) * 16

(422400.0, 2112000)

In [364]:
# Scratch arithmetic: width of the interval from 400000 to five times that value.
400000 * 5 - 400000

1600000

In [368]:
# Five evenly spaced values from 400000 to 2000000
# (presumably the candidate budget levels -- TODO confirm).
np.linspace(400000, 400000 * 5, 5)

array([ 400000.,  800000., 1200000., 1600000., 2000000.])

In [321]:
import cvxpy as cp

In [350]:
# Stand-alone prototype of the risk-penalised allocation program.
# Depends on kernel state that is not defined in this cell: `uplift_pred`
# (frame of predicted uplifts), `pred_cluster` (cluster id per row) and
# `bootstrap_samples` (one row per bootstrap draw, one column per
# cluster/treatment pair, as implied by rowvar=False below).
n_cluster = 10
n_treatment = uplift_pred.shape[1]
lam = 0.1  # weight of the risk term; (1 - lam) weights the return term

# One non-negative decision variable per (cluster, treatment) pair, cluster-major.
gamma = cp.Variable(shape=(n_cluster*n_treatment, 1), nonneg=True)
pi_bar = uplift_pred.groupby(pred_cluster).mean().sort_index().values.flatten()
cov_mat = np.cov(bootstrap_samples, rowvar=False, bias=False)
# psd_wrap tells CVXPY to treat the matrix as PSD without verifying it.
cov_mat = cp.atoms.affine.wraps.psd_wrap(cov_mat)

ret = pi_bar @ gamma
risk = cp.quad_form(gamma, cov_mat)

objective = cp.Maximize((1-lam)*ret - lam*risk)
constraints = []

# budget constraint
# NOTE(review): this budget is far larger than any achievable total cost, so the
# constraint presumably never binds here -- confirm that is intended.
budget_constraint = 100000000000
# Hard-coded per-treatment costs; the length must equal n_treatment for the tiling to line up.
coupon_cost = np.array([5, 5, 13, 10, 10, 15])
costs = np.tile(coupon_cost, n_cluster)
constraints.append(costs @ gamma <= budget_constraint)

# cluster size constraint
# np.unique returns counts only for labels that occur, in sorted label order;
# position i is assumed to be cluster i (holds when no cluster is empty).
cluster_sizes = np.unique(pred_cluster, return_counts=True)[1]
for i in range(cluster_sizes.shape[0]):
    size = cluster_sizes[i]
    _gamma = gamma[i*n_treatment:(i+1)*n_treatment]
    constraints.append(cp.sum(_gamma) <= size)

# solve
prob = cp.Problem(objective, constraints)
prob.solve(solver=cp.SCS, verbose=True)

# gamma.value is None if the solver did not produce a solution; reshape would then fail.
x = gamma.value.reshape(n_cluster, n_treatment)

                                     CVXPY                                     
                                     v1.2.2                                    
(CVXPY) Dec 31 12:07:13 AM: Your problem has 60 variables, 11 constraints, and 0 parameters.
(CVXPY) Dec 31 12:07:13 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Dec 31 12:07:13 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Dec 31 12:07:13 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Dec 31 12:07:13 AM: Compiling problem (target solver=SCS).
(CVXPY) Dec 31 12:07:13 AM: Reduction chain: FlipObjective -> Dcp2Cone -> CvxAttr2Constr -> Cone



In [352]:
# Realise the fractional allocation `x` as one coupon per row.
# 'assign' starts at 0 (nothing assigned) and ends up as coupon index + 1.
assign_df = pd.DataFrame({'assign': 0}, index=uplift_pred.index)
assign_df['cluster'] = pred_cluster

for cluster in range(n_cluster):
    # Cluster membership never changes, so the mask is built once per cluster.
    in_cluster = assign_df['cluster'] == cluster
    for coupon in range(n_treatment):
        # Rows of this cluster that are still without a coupon.
        still_free = assign_df[in_cluster & (assign_df['assign'] == 0)]
        # Floor the fractional quota and cap it at the rows that remain.
        quota = math.floor(x[cluster, coupon])
        n_assign = min(quota, len(still_free))
        if n_assign <= 0:
            continue
        chosen_idx = still_free.sample(n_assign, random_state=0).index
        assign_df.loc[chosen_idx, 'assign'] = coupon + 1

In [353]:
# Column sums of the solution: fractional units allocated to each treatment over all clusters.
x.sum(axis=0)

array([ 3265.37275005,  3429.3673902 , 81546.66233254,  1300.70148678,
       37134.46084405,   606.13345687])

In [354]:
# Rows per assigned coupon (0 = none). Each x entry is floored before sampling,
# so these counts cannot exceed the corresponding column sums of x.
assign_df['assign'].value_counts().sort_index()

0     5531
1     3111
2     2968
3    81500
4     1232
5    37087
6      571
Name: assign, dtype: int64