Uplift Modeling to predict and target the right customers

In [1]:
#Import libraries
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Notebook-wide choice: silence every warning so cell output stays readable.
warnings.filterwarnings('ignore')
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names. Prefer the new name when it is available
# and fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
#Import data
# Read the raw CSV from a path relative to the notebook's working directory.
data = pd.read_csv('../input/customer-retention/data.csv')
# Preview the first rows to see which columns are available.
data.head()

In [3]:
# Count the missing values in each column.
data.isnull().sum()

There are no null values in this dataset.

In [4]:
# Frequency of each value in the `offer` column (renamed to `treatment` in the next cell).
data.offer.value_counts()

In [5]:
#Rename the outcome and offer columns to the names used by the rest of the notebook
data = data.rename(columns={'conversion': 'target', 'offer': 'treatment'})
#Label encode treatment: no offer (control) = 0, Discount = 1, Buy One Get One = -1
#Any value missing from the mapping becomes NaN
data['treatment'] = data['treatment'].map(
    {'No Offer': 0, 'Discount': 1, 'Buy One Get One': -1}
)
data.head()

In [6]:
# Frequency of each `zip_code` category; these spellings are the keys used for encoding below.
data.zip_code.value_counts()

In [7]:
#Label encode zip_code. The keys must match the raw category strings exactly
#('Surburban' is presumably the spelling present in the data - check the
#value_counts() output before "correcting" it); unmatched values become NaN.
data['zip_code'] = data['zip_code'].map(
    {'Surburban': 0, 'Urban': 1, 'Rural': 2}
)
data.head()

In [8]:
# Frequency of each `channel` category; these spellings are the keys used for encoding below.
data.channel.value_counts()

In [9]:
#Label encode channel; values missing from the mapping become NaN
data['channel'] = data['channel'].map(
    {'Web': 0, 'Phone': 1, 'Multichannel': 2}
)
data.head()

In [10]:
#Build one treatment-vs-control dataset per offer
#treatment <= 0 keeps Buy One Get One (-1) together with the no-offer control (0)
data_off = data[data['treatment'] <= 0].copy().reset_index(drop=True)
#treatment >= 0 keeps Discount (1) together with the no-offer control (0)
data_discount = data[data['treatment'] >= 0].copy().reset_index(drop=True)
data_discount.head()

Currently, the label only indicates whether a customer converted (1) or not (0). We need to create four classes: TR, TN, CR, and CN.

Target Class Declaration
* Control Non-Responders(CN):
Customers that don't make a purchase without an offer (value = 0)
* Control Responders(CR):
Customers that make a purchase without an offer (value = 1)
* Treatment Non-Responders(TN):
Customers that receive an offer but don't make a purchase (value = 2)
* Treatment Responders(TR):
Customers that make a purchase and receive an offer (value = 3)

In [11]:
#Function to declare Target Class
def tc(df):
    """Add the four-way uplift ``target_class`` column to ``df``.

    The class combines whether the row converted (``target != 0``) with
    whether it received an offer (``treatment != 0``):

    * 0 - CN: no offer, no conversion
    * 1 - CR: no offer, conversion
    * 2 - TN: offer, no conversion
    * 3 - TR: offer, conversion

    The column is written onto ``df`` itself and the same object is returned.
    """
    responded = df['target'] != 0
    treated = df['treatment'] != 0
    # Encode the 2x2 grid as a single integer: +1 if responded, +2 if treated.
    df['target_class'] = responded.astype('int64') + 2 * treated.astype('int64')

    return df

We need to target Treatment Responders (TR) and Control Non-Responders (CN). Since these customers won't purchase unless we give them an offer, they are the groups that drive uplift in promotional campaigns.

In [12]:
#Execute the functions for each treatment
# tc() writes the `target_class` column onto the frame it receives and returns that same frame.
data_off = tc(data_off)
data_discount = tc(data_discount)
data_discount.head()

* The uplift model uses Lai's Generalized Weighted Uplift Method (LGWUM) to calculate the uplift score.
* Uplift Score = P(TR)/P(T) + P(CN)/P(C) - P(TN)/P(T) - P(CR)/P(C)
* P denotes a probability score, T denotes the total treated population (TR + TN), and C denotes the total untreated population (CR + CN).
* The higher score means higher uplift.

In [13]:
def data_split(df, test_size=0.3, random_state=None):
    """Split ``df`` into train and test features and ``target_class`` labels.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the ``target``, ``target_class`` and ``treatment`` columns.
    test_size : float or int, default 0.3
        Passed through to ``train_test_split``.
    random_state : int or None, default None
        Seed passed through to ``train_test_split``. The default keeps the
        original unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``. The feature frames still contain
        the ``treatment`` column; ``target`` and ``target_class`` are removed.
    """
    #Train-Test data split
    x = df.drop(columns=['target', 'target_class'])
    y = df['target_class']
    # Stratify on treatment so both splits keep the same treated/control ratio.
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y,
        test_size=test_size,
        stratify=df['treatment'],
        random_state=random_state,
    )

    return xtrain, xtest, ytrain, ytest

def uplift_model(xtrain,xtest,ytrain,ytest):
    """Fit a multi-class XGBoost model and score the test rows for uplift.

    The classifier is trained on ``xtrain`` without its ``treatment`` column
    to predict ``ytrain``. Its per-class probabilities for ``xtest`` are read
    by position as classes 0-3 (CN, CR, TN, TR) and combined into
    ``uplift_score``.

    Returns a copy of ``xtest`` (``treatment`` included) extended with
    ``proba_CN``, ``proba_CR``, ``proba_TN``, ``proba_TR``, ``uplift_score``
    and ``target_class`` (taken from ``ytest``, aligned on the index).
    The raw probabilities and the scores are also printed.
    """
    #XGB Classifier to get uplift score
    scored = pd.DataFrame(xtest).copy()
    # The treatment flag is not given to the classifier as a feature.
    train_features = xtrain.drop(columns='treatment')
    test_features = xtest.drop(columns='treatment')
    classifier = xgb.XGBClassifier()
    classifier.fit(train_features, ytrain)
    class_proba = classifier.predict_proba(test_features)
    print(class_proba)

    # Column k of the probability matrix is stored as the probability of target_class k.
    for position, column in enumerate(('proba_CN', 'proba_CR', 'proba_TN', 'proba_TR')):
        scored[column] = class_proba[:, position]
    # Each probability is normalised by its own group (control: CN+CR, treated: TN+TR).
    scored['uplift_score'] = scored.eval('proba_CN/(proba_CN+proba_CR) + proba_TR/(proba_TN+proba_TR) - proba_TN/(proba_TN+proba_TR) - proba_CR/(proba_CN+proba_CR)')
    scored['target_class'] = ytest
    print(scored['uplift_score'])
    print('-'*150)
    return scored

def uplift(df):
    """Split ``df`` with ``data_split`` and return the ``uplift_model`` scores for the test rows."""
    #Combine the data split and Modeling
    return uplift_model(*data_split(df))

In [14]:
# Split, fit and score each offer's dataset independently.
# data_off holds Buy One Get One vs. control; data_discount holds Discount vs. control.
data_off_uplift = uplift(data_off)
data_discount_uplift = uplift(data_discount)

The Qini curve is a generalization of the Gini curve for uplift models. \
QINI = (TR/T) - (CR/C)

In [15]:
#Functions to build the Uplift model and visualize the QINI Curve
def qini_rank(uplift:pd.DataFrame):
    """Rank scored rows from highest to lowest uplift score.

    Reads the ``target_class`` and ``uplift_score`` columns of ``uplift`` and
    returns a new frame with columns ``n``, ``target_class`` and
    ``uplift_score``. ``n`` is the descending percentile rank of the score
    (the best-scored row has the smallest ``n``); rows are sorted by ``n``
    and re-indexed from 0.
    """
    scores = uplift['uplift_score']
    ordered = pd.DataFrame({
        # Proportion of rows with a score at least as high as this one
        'n': scores.rank(pct=True, ascending=False),
        'target_class': uplift['target_class'],
        'uplift_score': scores,
    })

    return ordered.sort_values(by='n').reset_index(drop=True)


def qini_eval(ranked:pd.DataFrame):
    """Build the Qini curve of the model and of a random baseline.

    Parameters
    ----------
    ranked : pd.DataFrame
        Rows ordered from best to worst score, with columns ``n`` (cumulative
        proportion targeted) and ``target_class`` (0=CN, 1=CR, 2=TN, 3=TR).
        The frame is not modified.

    Returns
    -------
    pd.DataFrame
        The rows of ``ranked`` twice - once with ``model == 'Uplift model'``
        and once with ``model == 'Random model'`` - each with an ``uplift``
        column and an extra origin row (``n = 0``, ``uplift = 0``), sorted
        by ``n``.

    Raises
    ------
    IndexError
        If ``ranked`` is empty (the final uplift value cannot be read).
    """
    #Evaluate the uplift value with the QINI criterion
    # Using Treatment and Control Group to calculate the uplift (Incremental gain)
    classes = ranked['target_class']
    control_size = sum(classes <= 1)
    treated_size = sum(classes >= 2)
    # Running share of responders in each group, computed on temporary Series
    # so that the caller's frame is left untouched.
    control_rate = (classes == 1).cumsum() / control_size
    treated_rate = (classes == 3).cumsum() / treated_size

    # Calculate and put the uplift and random value into dataframe
    model_curve, random_curve = ranked.copy(), ranked.copy()
    model_curve['uplift'] = round(treated_rate - control_rate, 5)
    # The random baseline is the straight line from the origin to the model's final uplift.
    random_curve['uplift'] = round(ranked['n'] * model_curve['uplift'].iloc[-1], 5)

    # Add q0
    q0 = pd.DataFrame({'n':0, 'uplift':0, 'target_class': None}, index =[0])
    model_curve = pd.concat([q0, model_curve]).reset_index(drop = True)
    random_curve = pd.concat([q0, random_curve]).reset_index(drop = True)
    # Add model name & concat
    model_curve['model'] = 'Uplift model'
    random_curve['model'] = 'Random model'
    merged = pd.concat([model_curve, random_curve]).sort_values(by='n').reset_index(drop = True)

    return merged


def qini_plot(merged:pd.DataFrame):
    """Draw the Qini curves contained in ``merged`` and return the axes.

    ``merged`` must provide the columns ``n`` (x axis), ``uplift`` (y axis)
    and ``model`` (one line per distinct value, used for both colour and
    line style).
    """
    #Plot the QINI plot
    ax = sns.lineplot(x='n', y='uplift', hue='model', data=merged,
                      style='model', palette=['red','grey'])
    plt.xlabel('Proportion targeted',fontsize=15)
    plt.ylabel('Uplift',fontsize=15)
    plt.subplots_adjust(right=1)
    plt.subplots_adjust(top=1)
    ax.tick_params(labelsize=15)

    handles, labels = ax.get_legend_handles_labels()
    # Some seaborn versions prepend a pseudo entry labelled with the hue variable
    # name ('model'); others put it in the legend title instead. Filter by label
    # rather than slicing off the first entry so no real line is ever dropped.
    entries = [(handle, label) for handle, label in zip(handles, labels)
               if label != 'model']
    # Font size is set here because building a new legend discards earlier settings.
    ax.legend(handles=[handle for handle, _ in entries],
              labels=[label for _, label in entries],
              loc='upper right', fontsize=12)

    return ax


def qini(uplift:pd.DataFrame):
    """Rank the scored rows, evaluate the Qini curve and plot it; returns the plot axes."""
    return qini_plot(qini_eval(qini_rank(uplift)))

In [16]:
#Qini curve for BOGO treatment
# data_off_uplift holds the scored test rows for Buy One Get One vs. no offer.
qini(data_off_uplift)
plt.title('Qini Curve - Buy One Get One',fontsize=20)

In [17]:
#Qini curve for Discount treatment
# data_discount_uplift holds the scored test rows for Discount vs. no offer.
qini(data_discount_uplift)
plt.title('Qini Curve - Discount',fontsize=20)

The QINI curves shown above visualize the performance of the uplift model in targeting the right customers in the test data. The results show that the uplift model can deliver more uplift than a random model (i.e., targeting without a model). \
By using this model, we can easily make our campaign more efficient by:
* Targeting specific segments based on the uplift score
* Trying different offers based on customer’s uplift score