In [1]:
import pandas as pd
import numpy as np
import math
import json
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

UsageError: Line magic function `%` not found.


In [2]:
portfolio = pd.read_json('../input/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../input/profile.json', orient='records', lines=True)
transcript = pd.read_json('../input/transcript.json', orient='records', lines=True)

In [3]:
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')
profile['became_member_year'] = pd.DatetimeIndex(profile['became_member_on']).year
profile['gender'] = profile['gender'].fillna('N')
profile['income'] = profile['income'].fillna(profile['income'].median())

In [4]:
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90, 120]
group_names = ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', 'Missing']
profile['age_categories'] = pd.cut(profile['age'], bins, labels=group_names)

In [26]:
bins = [29999, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000]
labels = ['30k-39k', '40k-49k', '50k-59k', '60k-69k', '70k-79k', '80k-89k', '90k-99k', '100k-109k', '110k-120k']
profile['income_binned'] = pd.cut(profile['income'], bins=bins, labels=labels)

In [5]:
dummies = pd.get_dummies(portfolio['channels'].apply(pd.Series).stack(), prefix='channel').sum(level=0)

In [6]:
portfolio = portfolio.merge(dummies, left_index=True, right_index=True)
portfolio = portfolio.drop('channels', axis=1)

In [7]:
values_df = pd.DataFrame(transcript['value'].tolist())
values_df['offerid'] = values_df['offer id'].combine_first(values_df['offer_id'])
values_df.drop(['offer id', 'offer_id'], axis=1, inplace=True)
index_df = pd.DataFrame({'idx': transcript.index.values.tolist()})
combined = index_df.merge(values_df, left_index=True, right_index=True)
combined_index = combined.set_index('idx')

In [8]:
transcript = transcript.merge(combined_index, left_index=True, right_index=True)

In [9]:
transcript = transcript.merge(portfolio, left_on='offerid', right_on='id', how='left')
transcript = transcript.rename(columns={'reward_x': 'reward_paid', 'reward_y': 'reward_planned', 'offerid': 'offer_id'})

In [10]:
transcript['amount'] = transcript['amount'].fillna(0)
transcript = pd.get_dummies(transcript, columns=['event', 'offer_type'], prefix=None)
transcript = transcript.rename(columns={
    'event_offer completed': 'event_offer_completed',
    'event_offer received': 'event_offer_received',
    'event_offer viewed': 'event_offer_viewed'
})

In [11]:
transcript = transcript.drop(['value', 'id'], axis=1)

In [12]:
# adding general aggregated metrics
transcript['total_amount'] = transcript.groupby(['person'], as_index=False)['amount'].transform(np.sum).round()
transcript['total_transactions'] = transcript.groupby(['person'], as_index=False)['event_transaction'].transform(np.sum)
transcript['total_offers_received'] = transcript.groupby(['person'], as_index=False)['event_offer_received'].transform(np.sum)
transcript['total_offers_viewed'] = transcript.groupby(['person'], as_index=False)['event_offer_viewed'].transform(np.sum)
transcript['total_offers_completed'] = transcript.groupby(['person'], as_index=False)['event_offer_completed'].transform(np.sum)
transcript['total_offers_bogo'] = transcript.loc[transcript['event_offer_received'] == 1, :].groupby(['person'], as_index=False)['offer_type_bogo'].transform(np.sum)
transcript['total_offers_discount'] = transcript.loc[transcript['event_offer_received'] == 1, :].groupby(['person'], as_index=False)['offer_type_discount'].transform(np.sum)
transcript['total_offers_informational'] = transcript.loc[transcript['event_offer_received'] == 1, :].groupby(['person'], as_index=False)['offer_type_informational'].transform(np.sum)

# filling nans for each person with the first found values in the series groupby('person')
transcript['total_offers_bogo'] = transcript.groupby(['person'], as_index=False)['total_offers_bogo'].apply(lambda c: c.fillna(0) if c.fillna(0).sum() == 0 else c.fillna(method='ffill').fillna(method='bfill')).reset_index(level=0, drop=True)
transcript['total_offers_discount'] = transcript.groupby(['person'], as_index=False)['total_offers_discount'].apply(lambda c: c.fillna(0) if c.fillna(0).sum() == 0 else c.fillna(method='ffill').fillna(method='bfill')).reset_index(level=0, drop=True)
transcript['total_offers_informational'] = transcript.groupby(['person'], as_index=False)['total_offers_informational'].apply(lambda c: c.fillna(0) if c.fillna(0).sum() == 0 else c.fillna(method='ffill').fillna(method='bfill')).reset_index(level=0, drop=True)

# adding offers that did convert the person
transcript['converted_bogo'] = transcript.loc[transcript['event_offer_completed'] == 1, :].groupby(['person'], as_index=False)['offer_type_bogo'].transform(np.sum)
transcript['converted_discount'] = transcript.loc[transcript['event_offer_completed'] == 1, :].groupby(['person'], as_index=False)['offer_type_discount'].transform(np.sum)
transcript['converted_informational'] = transcript.loc[transcript['event_offer_completed'] == 1, :].groupby(['person'], as_index=False)['offer_type_informational'].transform(np.sum)

# filling nans for each person with the first found values in the series groupby('person')
transcript['converted_bogo'] = transcript.groupby(['person'], as_index=False)['converted_bogo'].apply(lambda c: c.fillna(0) if c.fillna(0).sum() == 0 else c.fillna(method='ffill').fillna(method='bfill')).reset_index(level=0, drop=True)
transcript['converted_discount'] = transcript.groupby(['person'], as_index=False)['converted_discount'].apply(lambda c: c.fillna(0) if c.fillna(0).sum() == 0 else c.fillna(method='ffill').fillna(method='bfill')).reset_index(level=0, drop=True)
transcript['converted_informational'] = transcript.groupby(['person'], as_index=False)['converted_informational'].apply(lambda c: c.fillna(0) if c.fillna(0).sum() == 0 else c.fillna(method='ffill').fillna(method='bfill')).reset_index(level=0, drop=True)

In [173]:
joined_df = profile.merge(transcript, how='inner', left_on='id', right_on='person')

In [228]:
df = joined_df.copy()

In [229]:
df['conversion_rate_bogo'] = df['converted_bogo'].div(df['total_offers_bogo'].where(df['total_offers_bogo'] != 0, np.nan))
df['conversion_rate_discount'] = df['converted_discount'].div(df['total_offers_discount'].where(df['total_offers_discount'] !=0, np.nan))

In [230]:
df.head(10)

Unnamed: 0,gender,age,id,became_member_on,income,became_member_year,age_categories,income_binned,person,time,...,total_offers_viewed,total_offers_completed,total_offers_bogo,total_offers_discount,total_offers_informational,converted_bogo,converted_discount,converted_informational,conversion_rate_bogo,conversion_rate_discount
0,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,168,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
1,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,216,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
2,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,336,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
3,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,348,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
4,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,360,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
5,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,408,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
6,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,408,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
7,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,414,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
8,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,444,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4
9,N,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,64000.0,2017,Missing,60k-69k,68be06ca386d4c31939f3a4f0e3dd783,504,...,5,2,0.0,5.0,0.0,0.0,2.0,0.0,,0.4


In [231]:
df = df[df['event_offer_completed'] == 1]

In [232]:
def offer_type(df):
    bogo = df['conversion_rate_bogo'] > 0.2
    discount = df['conversion_rate_discount'] > 0.2
    if bogo:
        return 'bogo'
    elif discount:
        return 'discount'
    else:
        return 'none'

df['offer_type'] = df.apply(offer_type, axis=1)

In [233]:
df = df.loc[df['offer_type'] != 'none', :]

In [234]:
df.columns

Index(['gender', 'age', 'id', 'became_member_on', 'income',
       'became_member_year', 'age_categories', 'income_binned', 'person',
       'time', 'amount', 'reward_paid', 'offer_id', 'reward_planned',
       'difficulty', 'duration', 'channel_email', 'channel_mobile',
       'channel_social', 'channel_web', 'event_offer_completed',
       'event_offer_received', 'event_offer_viewed', 'event_transaction',
       'offer_type_bogo', 'offer_type_discount', 'offer_type_informational',
       'total_amount', 'total_transactions', 'total_offers_received',
       'total_offers_viewed', 'total_offers_completed', 'total_offers_bogo',
       'total_offers_discount', 'total_offers_informational', 'converted_bogo',
       'converted_discount', 'converted_informational', 'conversion_rate_bogo',
       'conversion_rate_discount', 'offer_type'],
      dtype='object')

In [235]:
df = df[['gender', 'income_binned', 'became_member_year', 'age_categories', 'difficulty', 'duration', 'channel_email', 'channel_web', 'channel_mobile', 'channel_social', 'offer_type']]

In [216]:
df.head(30)

Unnamed: 0,gender,income_binned,became_member_year,age_categories,difficulty,duration,channel_email,channel_web,channel_mobile,channel_social,offer_type
0,N,60k-69k,2017,Missing,10.0,7.0,1.0,1.0,1.0,0.0,discount
2,N,60k-69k,2017,Missing,20.0,10.0,1.0,1.0,0.0,0.0,discount
5,N,60k-69k,2017,Missing,10.0,10.0,1.0,1.0,1.0,1.0,discount
9,N,60k-69k,2017,Missing,7.0,7.0,1.0,1.0,1.0,1.0,discount
16,N,60k-69k,2017,Missing,10.0,10.0,1.0,1.0,1.0,1.0,discount
23,F,110k-120k,2017,50-59,5.0,7.0,1.0,1.0,1.0,0.0,bogo
24,F,110k-120k,2017,50-59,0.0,4.0,1.0,1.0,1.0,0.0,bogo
28,N,60k-69k,2018,Missing,5.0,7.0,1.0,1.0,1.0,0.0,none
33,N,60k-69k,2018,Missing,0.0,3.0,1.0,0.0,1.0,1.0,none
37,F,90k-99k,2017,70-79,5.0,7.0,1.0,1.0,1.0,0.0,bogo


In [236]:
df['offer_type'].value_counts()

bogo        27429
discount     6109
Name: offer_type, dtype: int64

In [237]:
df = pd.get_dummies(df, columns=['age_categories', 'income_binned', 'became_member_year', 'gender'])

In [85]:
#df.to_csv('../input/processed.csv')

In [238]:
from sklearn.preprocessing import LabelEncoder


X = df.drop('offer_type', axis=1)
y = df['offer_type']

le = LabelEncoder()
y = le.fit_transform(y)

In [239]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7, test_size=0.2)

In [240]:
from sklearn.svm import SVC

modelSVC = SVC()
modelSVC.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [241]:
prediction = modelSVC.predict(X_test)
accuracy_score(y_test, prediction)

0.8307990459153249

In [223]:
from sklearn.ensemble import RandomForestClassifier
modelRFC = RandomForestClassifier(n_estimators=10)
modelRFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [224]:
predictionRFC = modelRFC.predict(X_test)
accuracy_score(y_test, predictionRFC)

0.6256554798112218

In [225]:
from sklearn.ensemble import GradientBoostingClassifier

modelGBC = GradientBoostingClassifier()
modelGBC.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [226]:
predictGBC = modelGBC.predict(X_test)
accuracy_score(y_test, predictGBC)

0.6377163083377032

In [195]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

params = {
              "n_estimators": [50, 100, 150, 200],
              "learning_rate": [0.003, 0.03, 0.3, 1]
             }

scorer = make_scorer(accuracy_score)
clf = AdaBoostClassifier(random_state=7)

grid_obj = GridSearchCV(clf, params, scorer, verbose=10)
grid_fit = grid_obj.fit(X_train, y_train)
best_clf = grid_fit.best_estimator_
best_clf

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] learning_rate=0.003, n_estimators=50 ............................
[CV]  learning_rate=0.003, n_estimators=50, score=0.817, total=   0.6s
[CV] learning_rate=0.003, n_estimators=50 ............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=50, score=0.817, total=   0.5s
[CV] learning_rate=0.003, n_estimators=50 ............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=50, score=0.817, total=   0.6s
[CV] learning_rate=0.003, n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.7s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=100, score=0.817, total=   1.1s
[CV] learning_rate=0.003, n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.7s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=100, score=0.817, total=   1.1s
[CV] learning_rate=0.003, n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.8s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=100, score=0.817, total=   1.0s
[CV] learning_rate=0.003, n_estimators=150 ...........................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    4.8s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=150, score=0.817, total=   1.7s
[CV] learning_rate=0.003, n_estimators=150 ...........................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.5s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=150, score=0.817, total=   1.6s
[CV] learning_rate=0.003, n_estimators=150 ...........................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    8.1s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=150, score=0.817, total=   1.6s
[CV] learning_rate=0.003, n_estimators=200 ...........................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    9.8s remaining:    0.0s


[CV]  learning_rate=0.003, n_estimators=200, score=0.817, total=   2.1s
[CV] learning_rate=0.003, n_estimators=200 ...........................
[CV]  learning_rate=0.003, n_estimators=200, score=0.817, total=   2.2s
[CV] learning_rate=0.003, n_estimators=200 ...........................
[CV]  learning_rate=0.003, n_estimators=200, score=0.817, total=   2.2s
[CV] learning_rate=0.03, n_estimators=50 .............................
[CV] . learning_rate=0.03, n_estimators=50, score=0.817, total=   0.6s
[CV] learning_rate=0.03, n_estimators=50 .............................
[CV] . learning_rate=0.03, n_estimators=50, score=0.817, total=   0.6s
[CV] learning_rate=0.03, n_estimators=50 .............................
[CV] . learning_rate=0.03, n_estimators=50, score=0.817, total=   0.6s
[CV] learning_rate=0.03, n_estimators=100 ............................
[CV]  learning_rate=0.03, n_estimators=100, score=0.817, total=   1.0s
[CV] learning_rate=0.03, n_estimators=100 ............................
[CV

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  1.1min finished


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.3,
                   n_estimators=100, random_state=7)

In [196]:
accuracy_score(y_test, best_clf.predict(X_test))

0.8253424657534246

In [86]:
#!pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/7d/9d/dcd1e61fd0eeea1c2ed4fa32761e437012776d08530666b5ce843b45fc32/xgboost-1.0.1.tar.gz (820kB)
[K     |████████████████████████████████| 829kB 1.3MB/s eta 0:00:01
Building wheels for collected packages: xgboost
  Building wheel for xgboost (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/tarasowski/Library/Caches/pip/wheels/51/10/c1/d7b4a0cd3e7a3391958df48d09724c3c37be36c22b24c5d9f9
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-1.0.1


In [227]:
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=7, n_estimators=300, learning_rate=0.03, random_state=7)
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {round(accuracy * 100, 2)}%")

Accuracy: 64.09%


In [198]:
print('Customer attributes:', '\n', X_test.iloc[0, :], '\n', 
      '\nPrediction (Offer): ', le.inverse_transform([y_pred[0]])[0],
      '\nOriginal (Offer): ', le.inverse_transform([y_test[0]])[0])

Customer attributes: 
 difficulty                 5.0
duration                   5.0
channel_email              1.0
channel_web                1.0
channel_mobile             1.0
channel_social             1.0
age_categories_10-19       0.0
age_categories_20-29       0.0
age_categories_30-39       0.0
age_categories_40-49       0.0
age_categories_50-59       0.0
age_categories_60-69       1.0
age_categories_70-79       0.0
age_categories_80-89       0.0
age_categories_Missing     0.0
income_binned_30k-39k      0.0
income_binned_40k-49k      0.0
income_binned_50k-59k      0.0
income_binned_60k-69k      0.0
income_binned_70k-79k      0.0
income_binned_80k-89k      0.0
income_binned_90k-99k      0.0
income_binned_100k-109k    0.0
income_binned_110k-120k    1.0
became_member_year_2013    0.0
became_member_year_2014    0.0
became_member_year_2015    0.0
became_member_year_2016    0.0
became_member_year_2017    1.0
became_member_year_2018    0.0
gender_F                   0.0
gender_M        

In [199]:
def recommend(attributes, model):
    return le.inverse_transform(model.predict(attributes))

In [204]:
recommend(X_test.iloc[100:550, :], model)

array(['bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'discount', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'discount', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'discount',
       'bogo', 'bogo', 'bogo', 'bogo', 'discount', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo',
       'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo', 'bogo'