# Pentathlon-III: Next Product to Buy Models

* Team-lead GitLab userid:
* Group name:
* Team member names:

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyrsm as rsm
import statsmodels.formula.api as smf
from sklearn import preprocessing
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.families.links import logit
import xgboost as xgb
from sklearn import metrics
# increase plot resolution
# mpl.rcParams["figure.dpi"] = 150

In [2]:
## loading the data - this dataset must NOT be changed
pentathlon_nptb = pd.read_pickle("data/pentathlon_nptb.pkl")
pentathlon_nptb["buyer_yes"] = (pentathlon_nptb["buyer"] == "yes").astype(int)
#pentathlon_nptb.head()

Unnamed: 0,custid,buyer,total_os,message,age,gender,income,education,children,freq_endurance,...,endurance_os,strength_os,water_os,team_os,backcountry_os,winter_os,racquet_os,training,representative,buyer_yes
0,U45198803,no,0.0,endurance,30 to 44,M,25000,14,1.3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2147483648,1,0
1,U22197752,no,0.0,backcountry,45 to 59,F,40000,44,0.4,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2147483648,1,0
2,U83874832,no,0.0,backcountry,45 to 59,M,50000,24,0.8,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
3,U19423462,no,0.0,winter,45 to 59,F,50000,26,1.1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2147483648,1,0
4,U23888305,no,0.0,winter,30 to 44,M,40000,22,1.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2147483648,1,0


In [3]:
pent = pentathlon_nptb

In [4]:
## Question answers

In [5]:
# pentathlon_nptb['gender'].get_dummies()
pent["gender"] = pd.get_dummies(pent["gender"])
pent[
    ["under_thirty", "thirty_to_fortyfour", "fortyfive_to_fiftynine", "sixty_and_above"]
] = pd.get_dummies(pent["age"])
pent[[    "message_endurance",
    "message_strength",
    "message_water",
    "message_team",
    "message_backcountry",
    "message_winter",
    "message_racquet"]] = pd.get_dummies(pent["message"])

In [6]:
train = pent[pent['training'] == 1]
test = pent[pent['training'] == 0]
rep = pent[pent['representative'] == 1]

# XGBoost

In [7]:
rvar = "buyer_yes"
evar = [
    "income",
    "education",
    "children",
    "gender",
    "under_thirty",
    "thirty_to_fortyfour",
    "fortyfive_to_fiftynine",
    "sixty_and_above",
    "freq_endurance",
    "freq_strength",
    "freq_water",
    "freq_team",
    "freq_backcountry",
    "freq_winter",
    "freq_racquet",
    "message_endurance",
    "message_strength",
    "message_water",
    "message_team",
    "message_backcountry",
    "message_winter",
    "message_racquet",
]
idvar = "custid"
lev = 1

In [8]:
eval_dat = pd.concat([train, test, rep], axis = 0)
eval_dat = eval_dat[[idvar, rvar, "training", "representative"]]
X_train = train[evar]
y_train = train[rvar]
X_test = test[evar]
y_test = test[rvar]
X_rep = rep[evar]
y_rep = rep[rvar]

In [9]:
model = xgb.XGBClassifier(
    min_child_weight=5,
    colsample_bylevel=1,
    max_delta_step=1,
    scale_pos_weight=1,
    alpha=1,
    reg_lambda=1,
    learning_rate=0.01,
    subsample=.1,
    colsample_bytree=1,
    gamma=1,
    max_depth=15,
    n_estimators=1000,
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="auc",
    random_state=1234,
).fit(X_train, y_train.values, verbose=True)

In [10]:
pred_2_tr = model.predict_proba(X_train)
fpr, tpr, thresholds = metrics.roc_curve(y_train, pred_2_tr[:, 1])

pred_2_test = model.predict_proba(X_test)
fpr_t, tpr_t, thresholds_t = metrics.roc_curve(y_test, pred_2_test[:, 1])

print(f"AUC training data for the XGBoost model: {(metrics.auc(fpr, tpr) * 100).round(2)}%")
print(f"AUC testing data for the XGBoost model: {(metrics.auc(fpr_t, tpr_t) * 100).round(2)}%\n")

AUC training data for intial model: 0.9076593391836734
AUC testing data for intial model: 0.8886451555555556



In [11]:
dfs = [X_train, X_test, X_rep]
Xs = pd.concat(dfs).reset_index(drop=True)

In [12]:
# Predictions on final model
# strength
strength_set=Xs.copy()
strength_set["message_strength"]=1
strength_set[["message_endurance", "message_water", "message_team","message_backcountry","message_winter","message_racquet"]]=0
eval_dat["strength_p"] = 0  
eval_dat["strength_p"] = model.predict_proba(strength_set)[:, 1]

In [13]:
# Predictions on final model
# endurance
endurance_set=Xs.copy()
endurance_set["message_endurance"]=1
strength_set[["message_strength", "message_water", "message_team","message_backcountry","message_winter","message_racquet"]]=0
eval_dat["endurance_p"] = 0  
eval_dat["endurance_p"] = model.predict_proba(endurance_set)[:, 1]

In [14]:
# Predictions on final model
# water
water_set=Xs.copy()
water_set["message_water"]=1
strength_set[["message_strength", "message_endurance", "message_team","message_backcountry","message_winter","message_racquet"]]=0
eval_dat["water_p"] = 0  
eval_dat["water_p"] = model.predict_proba(water_set)[:, 1]

In [15]:
# Predictions on final model
# team
team_set=Xs.copy()
team_set["message_team"]=1
strength_set[["message_strength", "message_endurance", "message_water","message_backcountry","message_winter","message_racquet"]]=0
eval_dat["team_p"] = 0  
eval_dat["team_p"] = model.predict_proba(team_set)[:, 1]

In [16]:
# Predictions on final model
# backcountry
backcountry_set=Xs.copy()
backcountry_set["message_backcountry"]=1
strength_set[["message_strength", "message_endurance", "message_water","message_team","message_winter","message_racquet"]]=0
eval_dat["backcountry_p"] = 0  
eval_dat["backcountry_p"] = model.predict_proba(backcountry_set)[:, 1]

In [17]:
# Predictions on final model
# winter
winter_set=Xs.copy()
winter_set["message_winter"]=1
strength_set[["message_strength", "message_endurance", "message_water","message_team","message_backcountry","message_racquet"]]=0
eval_dat["winter_p"] = 0  
eval_dat["winter_p"] = model.predict_proba(winter_set)[:, 1]

In [18]:
# Predictions on final model
# racquet
racquet_set=Xs.copy()
racquet_set["message_racquet"]=1
strength_set[["message_strength", "message_endurance", "message_water","message_team","message_backcountry","message_winter"]]=0
eval_dat["racquet_p"] = 0  
eval_dat["racquet_p"] = model.predict_proba(racquet_set)[:, 1]

In [19]:
# get the message that will result in the highest profit for each customer
eval_dat["to_offer_i"] = (
    eval_dat[
        [
            "strength_p",
            "water_p",
            "team_p",
            "backcountry_p",
            "winter_p",
            "racquet_p",
            "endurance_p",
        ]
    ]
    .idxmax(axis=1)
    .str.replace("strength_p", "strength")
    .replace("water_p", "water")
    .replace("team_p", "team")
    .replace("backcountry_p", "backcountry")
    .replace("winter_p", "winter")
    .replace("racquet_p", "racquet")
    .replace("endurance_p", "endurance")
)

In [20]:
# using the predctions for each of the messages, get the prediction of the value
eval_dat["p_target_i"] = eval_dat[
    [
        "strength_p",
        "water_p",
        "team_p",
        "backcountry_p",
        "winter_p",
        "racquet_p",
        "endurance_p",
    ]
].max(axis=1)

### Q1: Determine the message (i.e., endurance, strength, water, team, backcountry, winter, or racquet) predicted to lead to the highest probability of purchase. Describe your approach.

In [22]:
reps = eval_dat[eval_dat["representative"] == 1].reset_index(drop=True)

In [23]:
#for each customer, the message that will lead to the highest probability of purchase
q1_xg = reps[["custid", "to_offer_i"]]

Unnamed: 0,custid,to_offer_i
0,U45198803,strength
1,U22197752,endurance
2,U19423462,strength
3,U23888305,strength
4,U16954857,endurance
...,...,...
99995,U12620333,endurance
99996,U18623424,endurance
99997,U64468968,team
99998,U33721691,endurance


With the XGBoost model we predicted the probability of purchase for each department type if the promotional message of the particular department is pushed to all the customers. After finding the probabilities for seven departments for each customer, we identified the department that has highest probability for each customers chose the department as target for the particular customer.

### Q2: For each message, report the percentage of customers for whom that message maximizes their probability of purchase.

In [25]:
# for each of the 7 message types, plus targeting apprach, the percentage of customers for whom that message maximizes their probability of purchase
q2_xg = reps['to_offer_i'].value_counts(normalize = True)* 100
q2_xg = pd.DataFrame(q2_xg)
q2_xg.columns = ['customers (%)']

endurance      48.763
strength       21.756
water          11.372
racquet         8.512
backcountry     5.364
team            2.513
winter          1.720
Name: to_offer_i, dtype: float64

##### The XGBoost model predicted that endurance would be the message that would have the highest chance of maximizing the probability of purchase in the representative data at 48.76%, in second and third, were strength 21.76% and water 11.37%, and last place winter at 1.72%.

##### At this point, because of the results of the previous two questions, we decided not to continue on with using this model to answer the next questions and focus our efforts on fine-tuning model types that will provide us with better predictions. The XGBoost regression did not give us the predictions we were hoping for, likely because of the type of data at hand.