In [None]:
%pylab inline

In [None]:
import seaborn as sns

In [None]:
%run ../src/model_test_lib.py

In [None]:
def _three_decimals(value):
    """Format a number with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the formatter for every float pandas renders in this session.
pd.set_option('display.float_format', _three_decimals)

In [None]:
# Forward slashes keep these relative paths valid on Windows, Linux and macOS
# (and match the `%run ../src/model_test_lib.py` call); backslash-separated
# paths only resolve on Windows.
# NOTE: read_pickle unpickles arbitrary objects - only load trusted files.
train = pd.read_pickle('../data/processed/train_v1.pkl')
test = pd.read_pickle('../data/processed/test_open_pred_v1.pkl')

In [None]:
# Disabled filter: would keep only the rows whose `is_open` value equals 1 in
# both frames and renumber the index. It is commented out, so the unfiltered
# train/test frames are what the following cells use.
# train = train[(train['is_open']) == 1].reset_index(drop=True)
# test = test[(test['is_open']) == 1].reset_index(drop=True)

In [None]:
train.head()

In [None]:
len(train)

In [None]:
features = train.columns.tolist()

In [None]:
target_open = 'is_open'
target_click = 'is_click'
pid = 'id'
campaign_id = 'campaign_id'
user_id = 'user_id'

In [None]:
# Rebuild the feature list from train.columns instead of chaining list.remove():
# .remove() raises ValueError as soon as this cell is executed a second time,
# because the names are already gone from `features`.
non_feature_columns = [target_open, target_click, pid, campaign_id, user_id]

# Still fail fast (as .remove() did) when an expected column is absent.
missing_columns = [col for col in non_feature_columns if col not in train.columns]
if missing_columns:
    raise ValueError('columns not found in train: ' + ', '.join(missing_columns))

# Column order of the training frame is preserved.
features = [col for col in train.columns if col not in non_feature_columns]

In [None]:
# memory_cleaner is not defined in this notebook; it is presumably provided by
# the `%run ../src/model_test_lib.py` cell - TODO confirm what it releases.
memory_cleaner()

### Baseline: predict is_click = mode(is_click) for every row

In [None]:
y = [int(train['is_click'].mode()[0])] * len(train)

In [None]:
metrics.accuracy_score(train['is_click'], y)

In [None]:
metrics.roc_auc_score(train['is_click'], y)

### Let's check how some basic models perform with their default hyperparameters

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
model_names = ['LR', 'DTC']
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier()
]

In [None]:
test_classifiers(model_names, model_list, train, features, target_click)

### Let's set the hyperparameter class_weight to 'balanced', so that the algorithm gives both classes equal weight

In [None]:
model_names = ['LR', 'DTC']
model_list = [
    LogisticRegression(class_weight='balanced'),
    DecisionTreeClassifier(class_weight='balanced')
]

In [None]:
test_classifiers(model_names, model_list, train, features, target_click)

In [None]:
def _features_starting_with(prefix):
    """Return the entries of `features` whose name begins with `prefix`."""
    return [name for name in features if name.startswith(prefix)]


# Explicitly named groups.
original_features = [
    'no_of_images',
    'no_of_internal_links',
    'no_of_sections',
    'total_links',
]
derived_feature_external_link = ['no_of_external_links']
time_features = ['day_of_week', 'time_group']

# Groups identified by a shared column-name prefix.
communication_features = _features_starting_with('communication_')
binned_features = _features_starting_with('bin_')
sub_features = _features_starting_with('sub -')
body_features = _features_starting_with('body -')

In [None]:
# Sanity check: the feature groups taken together must cover exactly the
# columns listed in `features`.
grouped_features = set().union(
    original_features,
    derived_feature_external_link,
    time_features,
    communication_features,
    binned_features,
    sub_features,
    body_features,
)
grouped_features == set(features)

#### Let's not include the original features and the external link feature, as they are already part of the binned features. And create a separate group of features that have at least 0.05 event-distinguishing capability

In [None]:
# Candidate model inputs: every group except the original/external-link
# counts, plus the is_open column.
new_feature_group = [
    *time_features,
    *communication_features,
    *binned_features,
    *sub_features,
    *body_features,
    target_open,
]

In [None]:
# Minimum difference in click rate between the two values of a feature for the
# feature to be kept.
min_click_rate_gap = 0.05

selected_features = []
for feat in new_feature_group:
    # Click rate per distinct feature value (sum / count is the group mean).
    group = train.groupby(feat)[target_click].mean()

    # Only two-valued features are compared; anything else is skipped.
    if len(group) == 2:
        # groupby sorts its keys, so position 0 holds the smaller feature value.
        # Positional .iloc replaces Series.get_values(), which was deprecated in
        # pandas 0.25 and removed in pandas 1.0.
        absent = group.iloc[0]
        present = group.iloc[1]

        if abs(present - absent) >= min_click_rate_gap:
            selected_features.append(feat)

In [None]:
# Show which features passed the click-rate filter.
selected_features

In [None]:
# Retest the same balanced model
# (model_names / model_list are reused unchanged; only the feature list differs.)
test_classifiers(model_names, model_list, train, selected_features, target_click)

In [None]:
# Size of the full candidate group.
len(new_feature_group)

In [None]:
# Rank the candidates against the click target: it is the label every model in
# this notebook is fitted on, and new_feature_group itself contains target_open,
# so ranking against target_open would trivially favour that column.
# NOTE(review): assumes the third positional argument of select_features is the
# target column name - confirm against ../src/model_test_lib.py.
selected_features = select_features(train, new_feature_group, target_click, k=100)

In [None]:
# Evaluate the current model_list on the selected feature subset with
# target_click as the label.
# NOTE(review): test_classifiers_roc_auc_score is not defined in this notebook
# (presumably it comes from model_test_lib.py); its evaluation protocol is not
# visible here.
test_classifiers_roc_auc_score(model_names, model_list, train, selected_features, target_click)

In [None]:
# Balanced classifiers with a higher iteration cap for the logistic regression
# and a minimum split size of 5% of the samples for the tree.
tuned_models = {
    'LR': LogisticRegression(class_weight='balanced', max_iter=1000),
    'DTC': DecisionTreeClassifier(class_weight='balanced', min_samples_split=0.05),
}
model_names = list(tuned_models.keys())
model_list = list(tuned_models.values())

# Evaluate on the full candidate group.
test_classifiers_roc_auc_score(model_names, model_list, train, new_feature_group, target_click)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable.
holdout_split = train_test_split(
    train[new_feature_group],
    train[target_click],
    test_size=0.3,
    random_state=42,
)
train_X, test_X, train_y, test_y = holdout_split

In [None]:
# A fixed random_state makes the fitted tree (and therefore the hold-out AUC)
# repeatable across runs; the seed matches the one used for train_test_split.
model = DecisionTreeClassifier(class_weight='balanced', min_samples_split=0.05, random_state=42)

In [None]:
# Fit the tree on the training portion of the split.
model.fit(train_X, train_y)

In [None]:
# Per-class probability estimates for the held-out rows.
pred_proba = model.predict_proba(test_X)

In [None]:
# ROC AUC on the hold-out, using the second probability column
# (presumably the click class - confirm with model.classes_).
metrics.roc_auc_score(test_y, pred_proba[:,1])

### LGBM

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the training data in the container LightGBM expects. Every labelled row
# is used; an earlier variant built it from the train_X / train_y split only:
#   lgb.Dataset(train_X, label=train_y)
d_train = lgb.Dataset(
    data=train[new_feature_group],
    label=train[target_click],
)

In [None]:
# LightGBM training parameters.
# These values can be tuned further to try for a better score.
params = dict(
    task='train',
    boosting_type='dart',
    objective='binary',
    is_unbalance=True,
    metric='auc',
    learning_rate=0.1,
    max_depth=51,
    num_leaves=175,
    feature_fraction=0.5,
    max_bin=256,
    bagging_fraction=0.8,
    bagging_freq=3,
)

In [None]:
# 5-fold, shuffled, stratified cross-validation for up to 1000 boosting rounds.
# The result is indexed later by the 'auc-mean' key.
# NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword arguments
# of older LightGBM releases (replaced by callbacks in 4.0) - confirm the
# installed version.
# NOTE(review): params sets boosting_type='dart'; LightGBM reports that early
# stopping is not available in dart mode, so early_stopping_rounds=500 may have
# no effect - confirm.
lgb_cv = lgb.cv(params, d_train, num_boost_round=1000, nfold= 5, shuffle=True, stratified=True,
                verbose_eval=20, early_stopping_rounds=500)

In [None]:
## get the number of boosting rounds with the highest (and lowest) mean CV AUC
auc_means = lgb_cv['auc-mean']

# argmax / argmin return 0-based positions (first occurrence on ties), while
# entry k holds the score after k + 1 rounds. Add 1 so the value can be passed
# directly as num_boost_round; the raw position would train one round too few,
# and zero rounds when the first round is the best.
nround_max = int(np.argmax(auc_means)) + 1
nround_min = int(np.argmin(auc_means)) + 1
print("MAX Rounds = " + str(nround_max) + " & max auc: " + str(np.max(auc_means)))
print("MIN Rounds = " + str(nround_min) + " & min auc: " + str(np.min(auc_means)))

In [None]:
# Fit the final booster on the full training set for the round count chosen
# from the cross-validation results.
model = lgb.train(
    params=params,
    train_set=d_train,
    num_boost_round=nround_max,
)

In [None]:
import pickle

In [None]:
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the backslash form only resolves on Windows.
filename = '../models/model_lgbm_click_v2.pkl'
# Saving is disabled. To persist the current `model`, uncomment the lines below
# (the with-block closes the file handle):
# with open(filename, 'wb') as model_file:
#     pickle.dump(model, model_file)

In [None]:
# Read and predict
# This replaces the in-memory `model` with the object stored in `filename`, so
# that file must already exist.
# NOTE: pickle.load can execute arbitrary code - only load model files from a
# trusted source.
# The with-block closes the file handle, which the bare open() call left open.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Score the training frame itself with the loaded model
# (presumably the rows it was fitted on - results will be optimistic if so).
pred = model.predict(train[new_feature_group])

In [None]:
# Peek at the first five predictions.
pred[:5]

In [None]:
# Hard 0/1 labels: 1 where the score is strictly above 0.5.
pred_class = (pred > 0.5).astype(int)

In [None]:
# ROC AUC is a ranking metric, so pass the continuous scores. Thresholded 0/1
# labels reduce the curve to a single operating point and understate the AUC.
metrics.roc_auc_score(train[target_click], pred)

In [None]:
# Scan cut-offs from 0.350 up to (not including) 0.800 in steps of 0.001 and
# log every cut-off that strictly improves the score on the training labels.
# An earlier variant scored with accuracy instead:
#   metrics.accuracy_score(test_y, np.where(preds > i, 1, 0))
best_ac, best_i = -1, -1
for i in np.arange(0.350, 0.800, 0.001):
    hard_labels = np.where(pred > i, 1, 0)
    ac = metrics.roc_auc_score(train[target_click], hard_labels)

    # Skip cut-offs that do not beat the best score seen so far.
    if not ac > best_ac:
        continue

    print('i = ' + str(i) + ', ac = ' + str(ac))
    best_ac, best_i = ac, i

In [None]:
pred = model.predict(test[new_feature_group])

In [None]:
pred_class = np.where(pred > 0.5, 1, 0)

In [None]:
pred_class.sum() / len(pred_class)

In [None]:
# Store the raw scores from model.predict (not the thresholded pred_class) in
# the click column of the test frame.
test[target_click] = pred

In [None]:
# Write the id and click-score columns without the index.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the backslash form only resolves on Windows.
test[[pid, target_click]].to_csv('../data/processed/submission_lgbm_v2.csv', index=False)