In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import seaborn as sns

In [3]:
%run ../src/model_test_lib.py

In [4]:
# Render floats with three decimal places in pandas output.
pd.set_option('display.float_format', '{:.3f}'.format)

In [5]:
# Load the processed train/test frames. Forward slashes (the same style the
# %run path in this notebook uses) resolve on Windows, Linux and macOS alike,
# whereas backslash-separated paths only resolve on Windows.
train = pd.read_pickle('../data/processed/train_v1.pkl')
test = pd.read_pickle('../data/processed/test_v1.pkl')

In [6]:
# Start from every column of the training frame; the target and identifier
# columns are removed from this list in a later cell.
features = train.columns.tolist()

In [7]:
# Column names of the two targets and of the identifier columns.
target_open, target_click = 'is_open', 'is_click'
pid, campaign_id, user_id = 'id', 'campaign_id', 'user_id'

In [8]:
# Strip the targets and identifier columns so only model inputs remain.
# list.remove raises ValueError if one of the columns is missing.
for _non_feature in (target_open, target_click, pid, campaign_id, user_id):
    features.remove(_non_feature)

In [9]:
# memory_cleaner is not defined in this notebook; presumably it comes from
# ../src/model_test_lib.py via %run — TODO confirm. Its recorded output is a
# list of memory percentages.
memory_cleaner()

Allocation: 2.20%
Unreference: -2.20%
Collect: 0.00%
Overall: 0.00%


### Predict all is_open = mode(is_open)

In [None]:
# Baseline: predict the most frequent is_open value for every training row.
majority_open = int(train['is_open'].mode()[0])
y = [majority_open for _ in range(len(train))]

In [None]:
# Accuracy of the constant (mode) prediction against the true is_open labels.
metrics.accuracy_score(train['is_open'], y)

In [None]:
# Confusion matrix of the constant baseline, drawn as an annotated heatmap.
cmf = metrics.confusion_matrix(train['is_open'], y)
# fmt='d' prints the integer cell counts in full; seaborn's default annotation
# format ('.2g') abbreviates large counts to two significant digits.
sns.heatmap(cmf, annot=True, fmt='d');

The base accuracy is high because the events are imbalanced. 

### Let's check how some basic models perform with their default hyperparameters

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Baseline classifiers, constructed without any explicit hyperparameters.
model_names = ['LR', 'DTC']
model_list = [LogisticRegression(), DecisionTreeClassifier()]

In [None]:
# Evaluate every model in model_list on the full feature list against is_open.
# test_classifiers is not defined in this notebook; presumably it comes from
# ../src/model_test_lib.py (loaded with %run) — TODO confirm what it reports.
test_classifiers(model_names, model_list, train, features, target_open)

##### We can observe that the models perform no better than the mode prediction. This is because most of these algorithms are written to simply maximize accuracy. Also, Precision and F-score are ill-defined: both the True Positives and the False Positives for is_open are 0, i.e. no row is predicted as opened.
### Let's set the hyperparameter class_weight to 'balanced', so that the algorithm treats both classes equally

In [11]:
# Class-weighted linear and tree models, plus a four-layer neural network.
# NOTE(review): no class_weight is passed to the MLP, so only LR and DTC are
# balanced here. The recorded run of a later cell stops with a MemoryError
# once it reaches this MLP.
model_names = ['LR', 'DTC', 'MLP']
model_list = [
    LogisticRegression(class_weight='balanced'),
    DecisionTreeClassifier(class_weight='balanced'),
    MLPClassifier(
        hidden_layer_sizes=(1024, 1024, 1024, 1024),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        learning_rate_init=0.001,
        batch_size=100,
        max_iter=64,
        early_stopping=True,
        random_state=42,
        verbose=True,
    ),
]

In [None]:
# Re-run the evaluation on the full feature list with the current
# model_names / model_list.
# NOTE(review): test_classifiers is defined outside this notebook — confirm
# which metrics it reports.
test_classifiers(model_names, model_list, train, features, target_open)

#### Let's use only a few of the features instead of all of them

In [None]:
# Show the full list of candidate feature names.
features

In [15]:
# Hand-listed feature groups.
original_features = [
    'no_of_images',
    'no_of_internal_links',
    'no_of_sections',
    'total_links',
]
derived_feature_external_link = ['no_of_external_links']
time_features = ['day_of_week', 'time_group']

# Groups picked out of the feature list by column-name prefix.
communication_features = [name for name in features if name.startswith('communication_')]
binned_features = [name for name in features if name.startswith('bin_')]
sub_features = [name for name in features if name.startswith('sub -')]
body_features = [name for name in features if name.startswith('body -')]

In [16]:
# Sanity check: together, the groups must cover exactly the feature list.
set().union(
    original_features,
    derived_feature_external_link,
    time_features,
    communication_features,
    binned_features,
    sub_features,
    body_features,
) == set(features)

True

#### Let's not include the original features and the external-link feature, as they are already part of the binned features. Then create a separate group of features whose two values differ in open rate by at least 0.05

In [17]:
# The body features are left out because a user cannot see the email body
# before opening the email.
new_feature_group = []
for _group in (time_features, communication_features, binned_features, sub_features):
    new_feature_group.extend(_group)

In [None]:
# Keep the two-valued features whose open rate differs by at least 0.05
# between their two values.
selected_features = []
for feat in new_feature_group:
    # Group once and reuse the result for both the sum and the count.
    opens_by_value = train.groupby(feat)['is_open']
    group = opens_by_value.sum() / opens_by_value.count()

    # Only features with exactly two distinct values are considered. groupby
    # sorts its keys, so position 0 is the lower value ("absent") and
    # position 1 the higher one ("present").
    if len(group) == 2:
        # Series.get_values() was deprecated in pandas 0.25 and removed in
        # 1.0; positional .iloc access works on every pandas version.
        absent = group.iloc[0]
        present = group.iloc[1]

        if abs(present - absent) >= 0.05:
            selected_features.append(feat)

In [None]:
# Show which features were selected.
selected_features

In [None]:
# Retest the same balanced models, this time restricted to selected_features.
test_classifiers(model_names, model_list, train, selected_features, target_open)

In [None]:
# Score the same models on selected_features with the ROC-AUC based helper.
# NOTE(review): test_classifiers_roc_auc_score is defined outside this
# notebook; the recorded output of a later call shows a "ROC AUC Score" and a
# "Cross-Validation ROC AUC Score" per model.
test_classifiers_roc_auc_score(model_names, model_list, train, selected_features, target_open)

In [None]:
# Number of candidate features in new_feature_group.
len(new_feature_group)

In [None]:
# Replace the manual selection with the result of select_features, called
# with k=35.
# NOTE(review): select_features is defined outside this notebook; presumably
# k is the number of features to keep — confirm.
selected_features = select_features(train, new_feature_group, target_open, k=35)

In [None]:
# Score the models again on the current selected_features list.
test_classifiers_roc_auc_score(model_names, model_list, train, selected_features, target_open)

In [18]:
# Score the models on the whole new_feature_group.
# NOTE(review): model_list still contains the MLP here, and the recorded run
# below ends with a MemoryError when it reaches that model. The commented-out
# lists keep only LR and DTC (with extra hyperparameters) as an alternative.
# model_names = ['LR', 'DTC']
# model_list = [
#     LogisticRegression(class_weight='balanced', max_iter=1000),
#     DecisionTreeClassifier(class_weight='balanced', min_samples_split=0.05)
# ]
test_classifiers_roc_auc_score(model_names, model_list, train, new_feature_group, target_open)

Allocation: 2.20%
Unreference: -2.20%
Collect: 0.00%
Overall: 0.00%
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
ROC AUC Score : 0.6989
Cross-Validation ROC AUC Score : 0.3699


Allocation: 2.74%
Unreference: -2.67%
Collect: 0.00%
Overall: 0.06%
DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
ROC AUC Score : 0.699
Cross-Validation ROC AUC Score : 0.4507


Allocation: 2.86%
Unreference: -2.67%
Collect: 0.00%
Overall: 0.19%
MLPClassifier(activation='relu', alpha=0

MemoryError: 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the training rows for evaluation (fixed seed).
train_X, test_X, train_y, test_y = train_test_split(
    train[new_feature_group],
    train[target_open],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Balanced decision tree; a float min_samples_split is read by scikit-learn
# as a fraction of the training samples.
model = DecisionTreeClassifier(class_weight='balanced', min_samples_split=0.05)

In [None]:
# Fit on the 70% training split only.
model.fit(train_X, train_y)

In [None]:
# Class probabilities for the hold-out rows, one column per class.
pred_proba = model.predict_proba(test_X)

In [None]:
# Hold-out ROC AUC, scored on the second probability column.
metrics.roc_auc_score(test_y, pred_proba[:,1])

### LGBM

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the data in the Dataset format LightGBM accepts.
# The full training frame is used here; the commented-out line is the variant
# built from the train_X / train_y split instead.
# d_train = lgb.Dataset(train_X, label=train_y)
d_train = lgb.Dataset(train[new_feature_group], label=train[target_open])

In [None]:
# LightGBM parameters for the is_open model.
# These values can be tuned to try for a better score.
params = dict(
    task='train',
    boosting_type='dart',
    objective='binary',
    is_unbalance=True,
    metric='auc',
    learning_rate=0.1,
    max_depth=51,
    num_leaves=175,
    feature_fraction=0.5,
    max_bin=256,
    bagging_fraction=0.8,
    bagging_freq=3,
)

In [None]:
# 5-fold stratified, shuffled cross-validation for up to 1000 boosting rounds,
# logging every 20 rounds.
# NOTE(review): params selects boosting_type='dart'; LightGBM reports early
# stopping as unavailable in dart mode, so early_stopping_rounds=500 may have
# no effect — confirm against the installed LightGBM version.
lgb_cv = lgb.cv(params, d_train, num_boost_round=1000, nfold= 5, shuffle=True, stratified=True,
                verbose_eval=20, early_stopping_rounds=500)

In [None]:
## Find the number of boosting rounds with the highest / lowest mean CV AUC.
## lgb_cv['auc-mean'][i] is the score after i + 1 boosting rounds, so the
## 0-based list position is converted to a round count. A count is what
## num_boost_round expects; the bare index trained one round too few (and
## zero rounds when the first round was the best).
auc_mean = lgb_cv['auc-mean']
nround_max = int(np.argmax(auc_mean)) + 1
nround_min = int(np.argmin(auc_mean)) + 1
print("MAX Rounds = " + str(nround_max) + " & max auc: " + str(np.max(auc_mean)))
print("MIN Rounds = " + str(nround_min) + " & min auc: " + str(np.min(auc_mean)))

In [None]:
## Train the final booster on the full training Dataset, using nround_max
## (computed in the previous cell) as the number of boosting rounds.
model = lgb.train(params, d_train, num_boost_round=nround_max)

In [None]:
import pickle

# Persist the trained booster. The forward-slash path resolves on every OS,
# and the with-block closes the file handle even if pickling fails (the bare
# open(...) call never closed it explicitly).
filename = '../models/model_lgbm_open_v1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Read the saved booster back for prediction. The with-block closes the file
# handle once loading is done.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files that were produced by this project.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Predictions of the reloaded model on the training rows themselves.
pred = model.predict(train[new_feature_group])

In [None]:
# Peek at the first five predictions.
pred[:5]

In [None]:
# Sweep decision thresholds from 0.350 up to (excluding) 0.800 in steps of
# 0.001. Each threshold turns the training predictions into hard 0/1 labels,
# which are scored with roc_auc_score against the true training labels; the
# best score and its threshold are kept in best_ac / best_i.
best_ac, best_i = -1, -1
for threshold in np.arange(0.350, 0.800, 0.001):
    hard_labels = np.where(pred > threshold, 1, 0)
    score = metrics.roc_auc_score(train[target_open], hard_labels)
    print('i = ' + str(threshold) + ', ac = ' + str(score))
    if score > best_ac:
        best_ac, best_i = score, threshold

In [None]:
# Best score found by the threshold sweep, then the threshold that gave it.
print(best_ac)
print(best_i)

In [None]:
# Predict on the test frame; this rebinds pred, replacing the training
# predictions.
pred = model.predict(test[new_feature_group])

In [None]:
# Convert the test predictions to hard 0/1 labels.
# NOTE(review): the cutoff is a fixed 0.5; the best_i threshold found by the
# sweep above is not used here — confirm this is intended.
pred = np.where(pred > 0.5, 1, 0)

In [None]:
# Share of test rows labelled 1 (pred holds 0/1 labels after the previous cell).
pred.sum() / len(pred)

In [None]:
# Store the predicted labels on the test frame as its is_open column
# (this mutates test in place).
test['is_open'] = pred

In [None]:
# Save the test frame with its predicted is_open column. Forward slashes keep
# the path valid on Windows, Linux and macOS; the backslash form only
# resolved on Windows.
test.to_pickle('../data/processed/test_open_pred_v1.pkl')

### Deep learning - Keras

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Convert the train/test split into plain NumPy arrays for Keras.
k_train_X, k_train_y = [
    np.array(part.reset_index(drop=True)) for part in (train_X, train_y)
]
k_test_X, k_test_y = [
    np.array(part.reset_index(drop=True)) for part in (test_X, test_y)
]

In [None]:
# Shape of the training array; its second entry is used as the network's
# input_dim below.
k_train_X.shape

In [None]:
# Feed-forward binary classifier with a single sigmoid output unit.
# The model has to be a Sequential container: assigning
# keras.layers.SimpleRNN(units, ...) referenced an undefined name `units`
# and produced a layer object, which has no .add() method.
model = Sequential()
model.add(Dense(16, input_dim=k_train_X.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Only 'accuracy' is tracked during training. roc_auc_pred was dropped from
# the metrics list: it is defined only in the last cell of this notebook
# (NameError on a top-to-bottom run), and it forwards to scikit-learn's
# roc_auc_score, which needs concrete arrays rather than the symbolic tensors
# Keras hands to metric functions. ROC AUC is computed from the predictions
# after training instead.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Keras takes class_weight as a {class_index: weight} dict; the string
# 'balanced' is a scikit-learn convention that Keras does not interpret
# (depending on the version it is ignored or rejected). Build the same
# weighting explicitly: n_samples / (n_classes * samples_in_class).
_classes, _counts = np.unique(k_train_y, return_counts=True)
balanced_weights = {
    int(label): len(k_train_y) / float(len(_classes) * count)
    for label, count in zip(_classes, _counts)
}
model.fit(k_train_X, k_train_y, epochs=10, class_weight=balanced_weights)

In [None]:
# Evaluate on the hold-out arrays. Index 0 of scores is the loss; index 1 is
# the first compiled metric, printed here as a percentage.
scores = model.evaluate(k_test_X, k_test_y)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
# The sigmoid output already is the positive-class probability, so predict()
# returns what predict_proba() used to; predict_proba was removed from newer
# Keras releases while predict() exists in all of them.
pred_proba = model.predict(k_test_X)

In [None]:
# Hard 0/1 labels from the sigmoid output, thresholded at 0.5. This is the
# rule predict_classes applied for a single-unit output; predict_classes was
# removed from newer Keras releases.
pred = (model.predict(k_test_X) > 0.5).astype('int32')

In [None]:
# Hold-out ROC AUC of the network, scored on its predicted probabilities.
metrics.roc_auc_score(k_test_y, pred_proba)

In [None]:
# Confusion matrix of the network's hard predictions on the hold-out set.
cmf = metrics.confusion_matrix(k_test_y, pred)

In [None]:
# Plot the confusion matrix for the two classes.
# NOTE(review): plot_confusion_matrix is not defined in this notebook;
# presumably it comes from ../src/model_test_lib.py and the second argument
# is the list of class labels — confirm.
plot_confusion_matrix(cmf, [0, 1])

In [None]:
# import keras.backend as K

def roc_auc_pred(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` scored against ``y_true``.

    Thin wrapper that forwards both arguments, unchanged, to
    ``metrics.roc_auc_score`` and returns its result.
    """
    auc = metrics.roc_auc_score(y_true, y_pred)
    return auc