# Simulation to generate data set for DRM paper backtest

In [31]:
import sys
sys.path.append('/Users/shuyangduuber.com/Desktop/sapphire_optimization/')
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.utils import resample
from xgboost import XGBRegressor
import scipy.stats as st
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

from sapphire_optimization.models.models_targeting.core.cost_curve import CostCurve
from sapphire_optimization.models.models_targeting.models.drm.drm_gradient import DRM_Gradient
from sapphire_optimization.models.models_targeting.core.data_transform_flow_targeting import DataTransformFlowTargeting
from sapphire_optimization.models.models_targeting.core.model_flow import ModelFlow

from sapphire_optimization.models.models_cte.models.r_learner import RLearnerRidge, RLearnerXGBT
from sapphire_optimization.models.models_cte.core.data_transform_flow_cte import DataTransformFlowCTE
from sapphire_optimization.models.models_cte.core.model_flow_cte import ModelFlowCTE

from sapphire_optimization.models.core.data_transform_flow import DataTransformFlow
from sapphire_optimization.models.core.model_flow_base import ModelFlowBase

from research.models.core.backtest import BackTest
from research.models.core.backtest_space import BackTestSpace
from sapphire_optimization.models.models_targeting.data.schema.rxgy_adrm import schema

from IPython.display import display
from copy import deepcopy

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load autoreload and set mode 2 so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Ask the inline backend for 'retina' figure output.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default plot styling.
sns.set()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [23]:
# Load the raw training data. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk; the warning tail saved in
# this cell's output looks like pandas' mixed-type DtypeWarning, which is what
# chunked inference produces when a column's values disagree across chunks.
df_all = pd.read_csv(
    '/Users/shuyangduuber.com/Desktop/data/advanced_promo/rxgy_adrm_train_multimetric_US_20190602_20190804.csv',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Show the column names of the loaded frame.
df_all.columns

Index([u'rider_uuid', u'promo_city_id', u'cohort', u'proposal_start_datestr',
       u'rating_2driver_min_avg_84d', u'trip_incomplete_total_84d',
       u'days_active_84d', u'days_since_trip_first_lifetime',
       u'days_since_last_hard_churn_lifetime',
       u'days_since_last_soft_churn_lifetime', u'fare_max_sd_84d',
       u'churns_hard_lifetime', u'trips_lifetime', u'fare_max_p50_84d',
       u'duration_session_pre_request_max_p50_84d', u'trip_pool_per_x_84d',
       u'fare_total_win7d_sd_84d', u'trip_complete_win7d_sd_84d',
       u'session_per_days_active_84d', u'churns_soft_lifetime',
       u'trip_complete_per_days_active_84d', u'trip_pool_prc_84d',
       u'session_background_pre_request_prc_84d', u'session_lt_1m_prc_84d',
       u'session_request_prc_84d', u'duration_session_outside_total_prc_84d',
       u'trip_x_prc_84d', u'days_since_trip_last_lifetime',
       u'channel_signup_lifetime', u'device_os_primary_lifetime',
       u'promo_used_84d', u'has_session_request_84d',

In [25]:
# Names of the value (outcome) label and the cost label used throughout the notebook.
value_col, cost_col = 'label_trip_28d', 'label_cost_28d'

# Use all data to train a simulator

In [26]:
# Summary statistics (incremental cost / value, CPIV) on the observed labels.
observed_cols = [value_col, cost_col, 'cohort']
CostCurve.sample_stats(df_all[observed_cols].values)

Unnamed: 0,count,Treatment Prct,Inc Cost Sum,Inc Cost P-val,Inc Value Sum,Inc Value P-val,CPIV
0,1972139.0,0.79957,15358380.0,0.0,1048864.0,0.0,14.64287


# Simulator
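$Y_i = b(X_i) + W_i\,\tau(X_i) + \epsilon_i$, where $b$ is the baseline outcome, $\tau$ is the treatment effect, $W_i \in \{0, 1\}$ is the treatment indicator, and $\epsilon_i \sim \mathcal{N}(0, \sigma^2)$ with $\sigma$ estimated from the residuals of the fitted models.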

In [36]:
class SimulatedDataGenerator(object):
    """Simulate outcomes from a fitted treatment-effect model and baseline model.

    `generate_df` fits one model for the treatment effect tau(X) and one for the
    baseline b(X) on the observed data, then draws

        y = b(X) + W * tau(X) + eps,    eps ~ Normal(0, sigma)

    where W is 1 for rows whose `w_col` equals 'treatment' (0 otherwise) and
    sigma is the standard deviation of the residuals of the two fitted models.

    Attributes set by `generate_df` (all None until it has been called):
        sigma:  empirical residual standard deviation.
        mf_tau: ModelFlowCTE wrapping `model_tau`.
        mf_b:   ModelFlow wrapping `model_b`.

    Note: the model objects are handed to the flows as-is (no copy), so two
    generators constructed from the same model objects share them.
    """

    def __init__(self, model_tau=None, model_b=None):
        """Store the (unfitted) models used for tau and b.

        Args:
            model_tau: model passed to ModelFlowCTE to estimate tau.
            model_b: model passed to ModelFlow to estimate b.
        """
        # model for true tau and b
        self.model_tau = model_tau
        self.model_b = model_b
        # populated by generate_df
        self.sigma = None
        self.mf_tau = None
        self.mf_b = None

    @staticmethod
    def _draw_noise(sigma, size, random_state=None):
        """Draw `size` samples from Normal(0, sigma).

        `random_state` may be None (use numpy's global RNG, as before), an
        existing np.random.RandomState, or a seed for a new RandomState.
        """
        if random_state is None:
            rng = np.random
        elif isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)
        return rng.normal(scale=sigma, size=size)

    @staticmethod
    def _assemble_output(tau, b, y, y_col, index):
        """Pack tau, b and y into a frame with columns '<name>_<y_col>' on `index`."""
        return pd.DataFrame(
            np.stack([tau, b, y], axis=1),
            index=index,
            columns=['{0}_{1}'.format(s, y_col) for s in ['tau', 'b', 'y']]
        )

    def generate_df(self, df=None, y_col=None, w_col='cohort', random_state=None):
        """Fit the tau and b models on `df` and return simulated outcomes.

        Args:
            df: input DataFrame containing `y_col`, `w_col` and the feature
                columns named by the imported `schema`.
            y_col: name of the observed outcome column to model.
            w_col: name of the cohort column; rows equal to 'treatment' are
                treated (W = 1).
            random_state: optional seed or np.random.RandomState for the noise
                draw. None keeps the previous behaviour (numpy's global RNG).
                Passing the same integer seed to two calls yields identical
                noise vectors, so use different seeds (or one shared
                RandomState) when simulating several labels.

        Returns:
            DataFrame indexed like `df` with columns 'tau_<y_col>',
            'b_<y_col>' and 'y_<y_col>'.

        Raises:
            ValueError: if `df` or `y_col` is None.
        """
        if df is None or y_col is None:
            raise ValueError('generate_df requires both `df` and `y_col`')

        dtf_tau = DataTransformFlowCTE(
            label_cols=[y_col, w_col],
            numerical_cols=schema.CONTINUOUS_COLS,
            categorical_cols=schema.CATEGORICAL_COLS,
            log_cols=schema.LOG_COLS,
            use_scale=True,
            use_pca=False,
            use_dummy=True,
        )

        dtf_b = DataTransformFlow(
            label_cols=[y_col],
            numerical_cols=schema.RIDER_CONTINUOUS_COLS,
            categorical_cols=schema.RIDER_CATEGORICAL_COLS,
            log_cols=schema.LOG_COLS,
            use_scale=True,
            use_pca=False,
            use_dummy=True,
        )

        # NOTE(review): ModelFlow (models_targeting) is paired here with the core
        # DataTransformFlow while ModelFlowBase is imported but unused - confirm
        # this pairing is intended.
        self.mf_tau = ModelFlowCTE(model=self.model_tau, data_transform_flow=dtf_tau)
        self.mf_b = ModelFlow(model=self.model_b, data_transform_flow=dtf_b)

        # presumably 1-D per-row predictions aligned with df's row order - TODO confirm
        tau = self.mf_tau.fit_predict(df)
        b = self.mf_b.fit_predict(df)

        # calculate empirical sigma from the residuals of the fitted models
        w = (df[w_col] == 'treatment').astype(int)
        e = df[y_col] - tau * w - b
        self.sigma = e.std()

        # add simulated y: fitted mean plus Gaussian noise at the empirical scale
        y = b + tau * w + self._draw_noise(self.sigma, len(df), random_state)

        # Keep df's index so the result aligns row-for-row when concatenated
        # back onto df with axis=1, whatever index df carries.
        return self._assemble_output(tau, b, y, y_col, df.index)

# Model candidates for simulation

In [37]:
# Candidate estimators for the simulator, grouped as (tau model, b model) pairs:
# a linear pair and an XGBoost pair.
model_tau_linear, model_b_linear = RLearnerRidge(), Lasso()
model_tau_xgboost, model_b_xgboost = RLearnerXGBT(), XGBRegressor()

# Generate simulated data for trip count

In [38]:
# Simulator for the trip-count label, built from the linear candidates.
# To use the XGBoost candidates instead:
# simulator_trip = SimulatedDataGenerator(model_tau=model_tau_xgboost, model_b=model_b_xgboost)
simulator_trip = SimulatedDataGenerator(
    model_b=model_b_linear,
    model_tau=model_tau_linear,
)

In [39]:
# Fit the trip simulator on all rows and get the tau / b / simulated-y columns
# for the value label.
df_sim_trip = simulator_trip.generate_df(
    y_col=value_col,
    df=df_all,
)

In [41]:
# Give the cost simulator its own copies of the linear candidates. The trip
# simulator was built from the same two objects, and SimulatedDataGenerator
# hands its models to the model flows without copying, so reusing them here
# would leave both simulators pointing at one shared pair of estimators
# (presumably refit on the cost label by the next cell).
simulator_cost = SimulatedDataGenerator(
    model_tau=deepcopy(model_tau_linear),
    model_b=deepcopy(model_b_linear),
)
# can also use xgboost model as simulator
# simulator_cost = SimulatedDataGenerator(model_tau=deepcopy(model_tau_xgboost), model_b=deepcopy(model_b_xgboost))

In [42]:
# Fit the cost simulator on all rows and get the tau / b / simulated-y columns
# for the cost label.
df_sim_cost = simulator_cost.generate_df(
    y_col=cost_col,
    df=df_all,
)

In [43]:
# Attach the simulated columns to the original rows. concat(axis=1) aligns on
# the index, so frames with different indexes would silently produce extra
# NaN-padded rows; fail loudly if the row count changes.
df_all_sim = pd.concat([df_all, df_sim_trip, df_sim_cost], axis=1)
assert len(df_all_sim) == len(df_all), 'index mismatch between df_all and the simulated frames'

# Sanity check for simulated y

In [45]:
# Reference point: summary statistics on the observed labels in the combined frame.
observed_stats_cols = [value_col, cost_col, 'cohort']
CostCurve.sample_stats(df_all_sim[observed_stats_cols].values)

Unnamed: 0,count,Treatment Prct,Inc Cost Sum,Inc Cost P-val,Inc Value Sum,Inc Value P-val,CPIV
0,1972139.0,0.79957,15358380.0,0.0,1048864.0,0.0,14.64287


In [46]:
# Same summary on the simulated outcomes, for comparison with the observed labels.
simulated_stats_cols = ['y_{}'.format(label) for label in (value_col, cost_col)] + ['cohort']
CostCurve.sample_stats(df_all_sim[simulated_stats_cols].values)

Unnamed: 0,count,Treatment Prct,Inc Cost Sum,Inc Cost P-val,Inc Value Sum,Inc Value P-val,CPIV
0,1972139.0,0.79957,15489020.0,0.0,1048940.0,0.0,14.76635


In [47]:
# Persist the augmented frame. index=False keeps the row index out of the file,
# so its columns are the input columns plus the simulated ones, with no extra
# unnamed leading column when the file is read back.
df_all_sim.to_csv(
    '/Users/shuyangduuber.com/Desktop/data/advanced_promo/rxgy_adrm_train_multimetric_US_20190602_20190804_simulated.csv',
    index=False,
)