In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

[?25l[K     |▎                               | 10kB 15.7MB/s eta 0:00:01[K     |▋                               | 20kB 3.2MB/s eta 0:00:01[K     |█                               | 30kB 4.6MB/s eta 0:00:01[K     |█▎                              | 40kB 3.0MB/s eta 0:00:01[K     |█▋                              | 51kB 3.7MB/s eta 0:00:01[K     |██                              | 61kB 4.4MB/s eta 0:00:01[K     |██▎                             | 71kB 5.0MB/s eta 0:00:01[K     |██▋                             | 81kB 5.6MB/s eta 0:00:01[K     |███                             | 92kB 6.2MB/s eta 0:00:01[K     |███▎                            | 102kB 4.9MB/s eta 0:00:01[K     |███▋                            | 112kB 4.9MB/s eta 0:00:01[K     |████                            | 122kB 4.9MB/s eta 0:00:01[K     |████▎                           | 133kB 4.9MB/s eta 0:00:01[K     |████▋                           | 143kB 4.9MB/s eta 0:00:01[K     |█████                     

In [0]:
import pandas as pd
link_hilstrom = 'https://drive.google.com/open?id=15osyN4c5z1pSo1JkxwL_N8bZTksRvQuU'
fluff, id = link_hilstrom.split('=')
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('Hillstrom.csv')
hillstrom_df = pd.read_csv('Hillstrom.csv')

In [0]:
import pandas as pd
link_ = 'https://drive.google.com/open?id=1b8N7WtwIe2WmQJD1KL5UAy70K13MxwKj'
fluff, id = link_.split('=')
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('Lalonde.csv')
lalonde_df = pd.read_csv('Lalonde.csv')

In [0]:
print(hillstrom_df[:5])
print(lalonde_df[:5])

   recency history_segment  history  ...  visit  conversion spend
0       10  2) $100 - $200   142.44  ...      0           0   0.0
1        6  3) $200 - $350   329.08  ...      0           0   0.0
2        7  2) $100 - $200   180.65  ...      0           0   0.0
3        9  5) $500 - $750   675.83  ...      0           0   0.0
4        2    1) $0 - $100    45.34  ...      0           0   0.0

[5 rows x 12 columns]
   treatment   age  education  black  ...  married  nodegree  RE75        RE78
0        1.0  37.0       11.0    1.0  ...      1.0       1.0   0.0   9930.0460
1        1.0  22.0        9.0    0.0  ...      0.0       1.0   0.0   3595.8940
2        1.0  30.0       12.0    1.0  ...      0.0       0.0   0.0  24909.4500
3        1.0  27.0       11.0    1.0  ...      0.0       1.0   0.0   7506.1460
4        1.0  33.0        8.0    1.0  ...      0.0       1.0   0.0    289.7899

[5 rows x 9 columns]


In [0]:
import csv
import json
import os
from os.path import isfile, join
from sklearn.model_selection import KFold, StratifiedKFold


def preprocess_data(df, dataset='hillstrom', verbose=True):
    """
    Preprocessing the dataset
     - Use one-hot encoding for categorical features
     - Check the name of the target variable and treatment variable
     - Drop the unused columns
     - Delete the unused data
    
    Args:
        df: A pandas.DataFrame which have all data of the dataset
        dataset: the name of the dataset
    Return:
        # I recommend to split into the dataframes of predictor variables, the 
        # target variable, and the treatment varaible
        # df_x: the dataframes of predictor variables
        # df_y: target variables
        # df_t: treatment variables
    """
    if dataset in ['hillstrom', 'email']:
        # For Hillstrom dataset, the ‘‘visit’’ target variable was selected
        #   as the target variable of interest and the selected treatment is 
        #   the e-mail campaign for women’s merchandise [1]
        # [1] Kane K, Lo VSY, Zheng J. True-lift modeling: Comparison of methods. 
        #    J Market Anal. 2014;2:218–238
    
        # Delete unused data: men's email cases should be removed
        df_x = df[df.segment != 'Mens E-Mail']
        
        # Assign Y for target (visit: 0, 1)
        df_y = df_x['visit']
        
        # Assign T for treatment (segment: Womens E-Mail, Mens E-Mail (not used), No E-Mail)
        df_t = (df_x['segment'] == 'Womens E-Mail').astype('int64')

        # Drop unused columns from X
        df_x = df_x.drop(columns=['conversion', 'spend', 'visit', 'segment'])
        
        # One-hot encoding for categorical features
        df_x = pd.get_dummies(df_x)

    elif dataset in ['criteo', 'ad']:
        raise NotImplementedError
    elif dataset in ['lalonde', 'job']:
        # Delete unused data: None
        df_x = df

        # Target variables (RE78: earnings in 1978)
        df_y = df_x['RE78']
        
        # Treatment variables (treatment: 0, 1)
        df_t = df_x['treatment']

        # Drop unused columns
        df_x = df_x.drop(columns=['treatment', 'RE78'])
        
        # One-hot encoding for categorical features
        df_x = pd.get_dummies(df_x)

    else:
        raise NotImplementedError

    return df_x, df_y, df_t

In [82]:
def print_preproccessing_data(df_x, df_y, df_t):
    print(df_x[:5])
    print(df_x.columns.values)
    print(df_x.shape)
    print(df_y[:5])
    print(df_t[:5])


hillstrom_df_x, hillstrom_df_y, hillstrom_df_t = preprocess_data(hillstrom_df, 'hillstrom')
print_preproccessing_data(hillstrom_df_x, hillstrom_df_y, hillstrom_df_t)

lalonde_df_x, lalonde_df_y, lalonde_df_t = preprocess_data(lalonde_df, 'lalonde')
print_preproccessing_data(lalonde_df_x, lalonde_df_y, lalonde_df_t)

   recency  history  mens  ...  channel_Multichannel  channel_Phone  channel_Web
0       10   142.44     1  ...                     0              1            0
1        6   329.08     1  ...                     0              0            1
2        7   180.65     0  ...                     0              0            1
4        2    45.34     1  ...                     0              0            1
5        6   134.83     0  ...                     0              1            0

[5 rows x 18 columns]
['recency' 'history' 'mens' 'womens' 'newbie'
 'history_segment_1) $0 - $100' 'history_segment_2) $100 - $200'
 'history_segment_3) $200 - $350' 'history_segment_4) $350 - $500'
 'history_segment_5) $500 - $750' 'history_segment_6) $750 - $1,000'
 'history_segment_7) $1,000 +' 'zip_code_Rural' 'zip_code_Surburban'
 'zip_code_Urban' 'channel_Multichannel' 'channel_Phone' 'channel_Web']
(42693, 18)
0    0
1    0
2    0
4    0
5    1
Name: visit, dtype: int64
0    1
1    0
2    1
4    1
5 

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def performance(pr_y1_ct1, pr_y1_ct0, y, ct, groups=10):
    """
    1. Split the total customers into the given number of groups
    2. Calculate the statistics of each segment
    
    Args:
        pr_y1_ct1: the series (list) of the customer's expected return
        pr_y1_ct0: the expected return when a customer is not treated
        y: the observed return of customers
        ct: whther each customer is treated or not
        groups: the number of groups (segments). Should be 5, 10, or 20
    Return:
        DataFrame:
            columns:
                'n_y1_ct1': the number of treated responders
                'n_y1_ct0': the number of not treated responders
                'r_y1_ct1': the average return of treated customers
                'r_y1_ct0': the average return of not treated customers
                'n_ct1': the number of treated customers
                'n_ct0': the number of not treated customers
                'uplift': the average uplift (the average treatment effect)
            rows: the index of groups
    """

    ### check valid arguments
    if groups not in [5, 10, 20]:
        raise Exception("uplift: groups must be either 5, 10 or 20")

    ### check for NAs.
    if pr_y1_ct1.isnull().values.any():
        raise Exception("uplift: NA not permitted in pr_y1_ct1")
    if pr_y1_ct0.isnull().values.any():
        raise Exception("uplift: NA not permitted in pr_y1_ct0")
    if y.isnull().values.any():
        raise Exception("uplift: NA not permitted in y")
    if ct.isnull().values.any():
        raise Exception("uplift: NA not permitted in ct")

    ### check valid values for ct
    if set(ct) != {0, 1}:
        raise Exception("uplift: ct must be either 0 or 1")

    ### check length of arguments
    if not (len(pr_y1_ct1) == len(pr_y1_ct0) == len(y) == len(ct)):
        raise Exception("uplift: arguments pr_y1_ct1, pr_y1_ct0, y and ct must all have the same length")

    ###############################
    ###     Do it yourself!     ###
    ###############################
    # Make group id to split
    group_id = (pd.Series(range(0, len(ct))) / (len(ct) / groups)).astype('int32').set_index(ct.index)
    
    # Split input data frame by group id
    # tr: treated responder
    # cr: controlled responder
    df = pd.DataFrame(data={'tr': y & ct, 'cr': y & (ct == 0), 'ct': ct,
                            'exp_tr': pr_y1_ct1 & ct, 'exp_cr': pr_y1_ct0 & ct, 'group_id': group_id})
    print(df)
    df_group = df.groupby('group_id')
    
    # Get group data
    n_ct1 = df['ct'].aggregate('sum')
    n_ct0 = (df['ct'] == 0).aggregate('sum')

    n_y1_ct1 = df['tr'].aggregate('sum')
    print(df['tr'])
    print(n_y1_ct1)
    n_y1_ct0 = df['cr'].aggregate('sum')
    print(df['cr'])
    print(n_y1_ct0)
    
    r_y1_ct1 = df['tr'].aggregate('mean')
    print(r_y1_ct1)
    
    r_y1_ct0 = df['cr'].aggregate('mean')
    
    uplift = r_y1_ct1 - r_y1_ct0
    
    # Create output data frame by grouped input data frame
    return pd.DataFrame(data={'n_y1_ct1': n_y1_ct1, 'n_y1_ct0': n_y1_ct0, 'r_y1_ct1': r_y1_ct1, 'r_y1_ct0': r_y1_ct0,
                              'n_ct1': n_ct1, 'n_ct0': n_ct0, 'uplift': uplift})


def qini(perf, plotit=True):
    """
    Calculating the incremental gains (y-axis of Qini curve)
     - First, the cumulitative sum of the treated and the control groups are
      calculated with respect to the total population in each group at the
      specified decile
     - Afterwards we calculate the percentage of the total amount of people
      (both treatment and control) are present in each decile
    Args:
        perf: A return of the performance function (above)
        plotit: whether draw a plot or not
    Return:
        1. Qini value
        2. return or save the plot if plotit is True
    """
    
    ###############################
    ###     Do it yourself!     ###
    ###############################
    return ##


In [88]:
df = performance(hillstrom_df_y, hillstrom_df_y, hillstrom_df_y, hillstrom_df_t, 10)
print(df)

AttributeError: ignored

In [0]:
import itertools
import numpy as np

def parameter_tuning(fit_mdl, pred_mdl, data, search_space):
    """
    Given a model, search all combination of parameter sets and find
    the best parameter set
    
    Args:
        fit_mdl: model function
        pred_mdl: predict function of fit_mdl
        data:
            {
                "x_train": predictor variables of training dataset,
                "y_train": target variables of training dataset,
                "ct_train": treatment variables of training dataset,
                "x_test": predictor variables of test (usually, validation) dataset,
                "y_test": target variables of test (usually, validation) dataset,
                "ct_test": treatment variables of test (usually, validation) dataset,
            }
        search_space:
            {
                parameter_name: [search values]
            }
    Return:
        The best parameter set
    """
    
    ###############################
    ###     Do it yourself!     ###
    ###############################
    return ##

  
def wrapper(fit_mdl, pred_mdl, data)
    """
    General wrapper approach
    
    Args:
        fit_mdl: model function
        pred_mdl: predict function of fit_mdl
        data:
            {
                "x_train": predictor variables of training dataset,
                "y_train": target variables of training dataset,
                "ct_train": treatment variables of training dataset,
                "x_test": predictor variables of test (usually, validation) dataset,
                "y_test": target variables of test (usually, validation) dataset,
                "ct_test": treatment variables of test (usually, validation) dataset,
            }
    Return:
        (A list of best models, The list of dropped variables)
    """
    
    ###############################
    ###     Do it yourself!     ###
    ###############################
    return ##


In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


def tma(x, y, ct, method=LogisticRegression, **kwargs):
    """Training a model according to the "Two Model Approach" 
    (a.k.a. "Separate Model Approach")
    The default model is General Linear Model (GLM)
    
    Source: "Incremental Value Modeling" (Hansotia, 2002)

    Args:
        x: A data frame of predictors.
        y: A binary response (numeric) vector.
        ct: A binary response (numeric) representing the treatment assignment
            (coded as 0/1).
        method: A sklearn model specifying which classification or regression
            model to use. This should be a method that can handle a 
            multinominal class variable.

    Return:
        Dictionary: A dictionary of two models. One for the treatment group, 
            one for the control group.

            {
                'model_treat': a model for the treatment group,
                'model_control': a model for the control group
            }

    """
    
    ###############################
    ###     Do it yourself!     ###
    ###############################
    return {
        'model_treat': method(random_state=1234, solver='newton-cg').fit(x[ct == 1], y[ct == 1]),
        'model_control': method(random_state=1234, solver='newton-cg').fit(x[ct == 0], y[ct == 0])
    }


def predict_tma(obj, newdata, **kwargs):
    """Predictions according to the "Two Model Approach" 
    (a.k.a. "Separate Model Approach")
    
    For each instance in newdata two predictions are made:
    1) What is the probability of a person responding when treated?
    2) What is the probability of a person responding when not treated
      (i.e. part of control group)?

    Source: "Incremental Value Modeling" (Hansotia, 2002)

    Args:
        obj: A dictionary of two models. 
            One for the treatment group, one for the control group.
        newdata: A data frame containing the values at which predictions
            are required.
    
    Return:
        DataFrame: A dataframe with predicted returns for when the customers
            are treated and for when they are not treated.
    """

    ###############################
    ###     Do it yourself!     ###
    ###############################
    df_x = newdata.drop(['X', 'Y'])
    s_y = newdata['Y']
    s_t = newdata['T']
    
    return obj['model_treat'](df_x, s_y, s_t) - obj['model_control'](df_x, s_y, s_t)

In [0]:

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

seed = 1234

def main():
    ### Load data ###
    hillstrom_df_pre = preprocess_data(hillstrom_df, 'hillstrom')
    # lalonde_df_pre = preprocess_data(lalonde_df, 'lalonde')
    
    print(hillstrom_df_pre)
    data_train, data_test = train_test_split(hillstrom_df_pre)
    print(data_train)
    print(data_test)
    assert()
    
    for model in models:
        ### Cross validation ###
        x_train, y_train, x_test, y_test = train_test_split(hillstrom_df_pre)
        
        ### Variable selection (General wrapper approach) ###

        ### Parameter tuning ###

        print("Model: {}\n".format(model))
        print("Tuning space: \n")
        for key, val in search_space.items():
            print("    '{}': {}\n".format(key, val))
        print("Seed: {}\n".format(seed))
        print("Qini value: mean = {}, std = {}\n\n".format(mean_qini, std_qini))


In [58]:
main()

       recency  history  mens  womens  ...  channel_Phone  channel_Web  Y      T
0           10   142.44     1       0  ...              1            0  0   True
1            6   329.08     1       1  ...              0            1  0  False
2            7   180.65     0       1  ...              0            1  0   True
4            2    45.34     1       0  ...              0            1  0   True
5            6   134.83     0       1  ...              1            0  1   True
6            9   280.20     1       0  ...              1            0  0   True
7            9    46.42     0       1  ...              1            0  0   True
9           10    32.84     0       1  ...              0            1  0   True
10           7   548.91     0       1  ...              1            0  1   True
11           1   211.45     0       1  ...              1            0  0   True
12           5   642.90     0       1  ...              0            0  0   True
14           4   241.42     

AssertionError: ignored