In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from bayes_opt import BayesianOptimization

In [2]:
### Constants
seed = 69
cv_size = .3

train = pd.read_csv("data/Train.csv", encoding="ISO-8859-1")
test = pd.read_csv("data/Test.csv", encoding="ISO-8859-1")

X = train.loc[:, train.columns != 'revenue']
y = train["revenue"]

# encode string input values as integers
features = []
encode_cols = ["month", "channelGrouping", "browser", "operatingSystem", "deviceCategory", "continent", 
                "subContinent", "country", "region", "metro", "city", "topLevelDomain", "campaign", "source", 
                "medium", "keyword", "referralPath", "adContent", "adwordsClickInfo.page", "adwordsClickInfo.slot",
                "adwordsClickInfo.adNetworkType", "bounces", "newVisits"]

other_cols = ["visitNumber", "timeSinceLastVisit", "isMobile", "isTrueDirect", "hits", "pageviews"]

# encode string input values as integers
encoded_x = None
for i in encode_cols:
    col = X[i].apply(str)
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(col.fillna('0'))
    feature = feature.reshape(X.shape[0], 1)
    onehot_encoder = OneHotEncoder(sparse=False)
    feature = onehot_encoder.fit_transform(feature)
    if encoded_x is None:
        encoded_x = feature
    else:
        encoded_x = np.concatenate((encoded_x, feature), axis=1)
print("X shape: ", encoded_x.shape)

X = np.concatenate((X[other_cols].fillna(0), encoded_x), axis=1)
y = Binarizer().transform(y.values.reshape(-1, 1))

X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=cv_size)#, random_state=seed)

X shape:  (83578, 2498)


In [3]:
def xgbf(n_components, lamb, alpha):
    pca = PCA(n_components=int(n_components))
    X_train1 = pca.fit_transform(X_train)
    X_cv1 = pca.transform(X_cv)
    xgtrain = xgb.DMatrix(X_train1, y_train)
    xgcv = xgb.DMatrix(X_cv1, y_cv)
    # Specify sufficient boosting iterations to reach a minimum
    num_round = 5000
    evallist = [(xgcv, 'eval'), (xgtrain, 'train')]
    # Leave most parameters as default
    param = {'objective': 'binary:logistic',
             'tree_method': 'gpu_hist', # Use GPU accelerated algorithm
             'lambda': lamb,
             'alpha': alpha,
             }
    bst = xgb.train(param, xgtrain, num_round, evallist, early_stopping_rounds=10, verbose_eval = 10)
    
    pred = bst.predict(xgcv)
    
    f1_list = []
    thresholds = np.linspace(0.15, .55, 100)
    for i in thresholds:
        bin_pred = Binarizer(threshold=i).transform(pred.reshape(-1,1))
        f1 = f1_score(y_cv, bin_pred)
        f1_list.append(f1)
#     fig, ax = plt.subplots(figsize=(20, 10))
#     plt.plot(f1_list)
    return np.max(f1_list)

In [4]:
xgbf(100, 1.1, .1)

[0]	eval-error:0.05755	train-error:0.051774
Multiple eval metrics have been passed: 'train-error' will be used for early stopping.

Will train until train-error hasn't improved in 10 rounds.
[10]	eval-error:0.051169	train-error:0.041707
[20]	eval-error:0.05073	train-error:0.036989
[30]	eval-error:0.050371	train-error:0.032442
[40]	eval-error:0.051169	train-error:0.027246
[50]	eval-error:0.051288	train-error:0.02246
[60]	eval-error:0.051926	train-error:0.018682
[70]	eval-error:0.051886	train-error:0.015606
[80]	eval-error:0.051847	train-error:0.012734
[90]	eval-error:0.052844	train-error:0.010461
[100]	eval-error:0.053601	train-error:0.008444
[110]	eval-error:0.053801	train-error:0.006632
[120]	eval-error:0.053601	train-error:0.005453
[130]	eval-error:0.053641	train-error:0.00388
[140]	eval-error:0.053442	train-error:0.002957
[150]	eval-error:0.053442	train-error:0.002325
[160]	eval-error:0.05396	train-error:0.001521
[170]	eval-error:0.054559	train-error:0.001196
[180]	eval-error:0.0544

0.7252834692659638

In [None]:
XGB_BO = BayesianOptimization(xgbf, {
#                                      'max_depth': (2, 12),
#                                      'gamma': (0.001, 10.0),
#                                      'min_child_weight': (0, 20),
#                                      'max_delta_step': (0, 10),
#                                      'subsample': (0.4, 1.0),
#                                      'colsample_bytree': (0.4, 1.0),
                                     'n_components': (50, 2000),
                                     'lamb': (1.0, 1.7),
                                     'alpha': (0.0, 0.5)
                                    })

XGB_BO.explore({
#               'max_depth':            [3, 8, 3, 8, 8, 3, 8, 3],
#               'gamma':                [0.5, 8, 0.2, 9, 0.5, 8, 0.2, 9],
#               'min_child_weight':     [0.2, 0.2, 0.2, 0.2, 12, 12, 12, 12],
#               'max_delta_step':       [1, 2, 2, 1, 2, 1, 1, 2],
#               'subsample':            [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
#               'colsample_bytree':     [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
              'n_components':         [50, 200, 400, 200, 1200, 600, 800, 750],
              'lamb':                 [1.0, 1.0, 1.2, 1.2, 1.4, 1.5, 1.0, 1.3],
              'alpha':                [0.0, 0.0, 0.15, 0.15, 0.25, 0.3, 0.1, 0.15]
              })

XGB_BO.maximize(init_points=2, n_iter=15, acq='ei', xi=0.0001)

[31mInitialization[0m
[94m----------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |      lamb |   n_components | 
[0]	eval-error:0.058188	train-error:0.052184
Multiple eval metrics have been passed: 'train-error' will be used for early stopping.

Will train until train-error hasn't improved in 10 rounds.
[10]	eval-error:0.050371	train-error:0.041365
[20]	eval-error:0.050371	train-error:0.037382
[30]	eval-error:0.049972	train-error:0.033998
[40]	eval-error:0.050451	train-error:0.02822
[50]	eval-error:0.05061	train-error:0.02446
[60]	eval-error:0.051567	train-error:0.021127
[70]	eval-error:0.052126	train-error:0.016905
[80]	eval-error:0.052285	train-error:0.015349
[90]	eval-error:0.052205	train-error:0.01217
[100]	eval-error:0.052205	train-error:0.00964
[110]	eval-error:0.052405	train-error:0.00817
[120]	eval-error:0.052883	train-error:0.007094
[130]	eval-error:0.053841	train-error:0.005675
[140]	eval-error:0.053163	train-erro

  " state: %s" % convergence_dict)


[0]	eval-error:0.058347	train-error:0.051313
Multiple eval metrics have been passed: 'train-error' will be used for early stopping.

Will train until train-error hasn't improved in 10 rounds.
[10]	eval-error:0.050411	train-error:0.039023
[20]	eval-error:0.049773	train-error:0.033587
[30]	eval-error:0.050251	train-error:0.026596
[40]	eval-error:0.051847	train-error:0.018665
[50]	eval-error:0.052245	train-error:0.013469
[60]	eval-error:0.053322	train-error:0.009435
[70]	eval-error:0.053203	train-error:0.006803
[80]	eval-error:0.054319	train-error:0.004769
[90]	eval-error:0.054399	train-error:0.002889
[100]	eval-error:0.054559	train-error:0.002102
[110]	eval-error:0.055037	train-error:0.001145
[120]	eval-error:0.055875	train-error:0.000598
[130]	eval-error:0.055236	train-error:0.000444
[140]	eval-error:0.054798	train-error:0.000325
[150]	eval-error:0.055396	train-error:0.000205
[160]	eval-error:0.055476	train-error:0.000188
Stopping. Best iteration:
[157]	eval-error:0.055595	train-error:0

In [3]:
def xgbf(n_components, lamb, alpha):
    pca = PCA(n_components=int(n_components))
    X_train1 = pca.fit_transform(X_train)
    X_cv1 = pca.transform(X_cv)
    xgtrain = xgb.DMatrix(X_train1, y_train)
    xgcv = xgb.DMatrix(X_cv1, y_cv)
    # Specify sufficient boosting iterations to reach a minimum
    num_round = 5000
    evallist = [(xgcv, 'eval'), (xgtrain, 'train')]
    # Leave most parameters as default
    param = {'objective': 'binary:logistic',
             'tree_method': 'gpu_hist', # Use GPU accelerated algorithm
             'lambda': lamb,
             'alpha': alpha,
             }
    bst = xgb.train(param, xgtrain, num_round, evallist, early_stopping_rounds=10, verbose_eval = 10)
    
    pred = bst.predict(xgcv)
    
    f1_list = []
    thresholds = np.linspace(0.15, .55, 100)
    for i in thresholds:
        bin_pred = Binarizer(threshold=i).transform(pred.reshape(-1,1))
        f1 = f1_score(y_cv, bin_pred)
        f1_list.append(f1)
#     fig, ax = plt.subplots(figsize=(20, 10))
#     plt.plot(f1_list)
    return bst, thresholds[np.argmax(f1_list)]

In [4]:
bst, thresh = xgbf(1600, 1.7, 0)

[0]	eval-error:0.058108	train-error:0.050646
Multiple eval metrics have been passed: 'train-error' will be used for early stopping.

Will train until train-error hasn't improved in 10 rounds.
[10]	eval-error:0.052445	train-error:0.036818
[20]	eval-error:0.051847	train-error:0.030494
[30]	eval-error:0.052006	train-error:0.02487
[40]	eval-error:0.053282	train-error:0.017401
[50]	eval-error:0.054559	train-error:0.012341
[60]	eval-error:0.054957	train-error:0.007931
[70]	eval-error:0.056034	train-error:0.005504
[80]	eval-error:0.056752	train-error:0.003094
[90]	eval-error:0.057031	train-error:0.001863
[100]	eval-error:0.05727	train-error:0.001145
[110]	eval-error:0.057789	train-error:0.000718
[120]	eval-error:0.057948	train-error:0.00041
[130]	eval-error:0.057789	train-error:0.000325
[140]	eval-error:0.05759	train-error:0.000256
[150]	eval-error:0.057749	train-error:0.000222
Stopping. Best iteration:
[146]	eval-error:0.057629	train-error:0.000222



In [5]:
# encode string input values as integers
features = []
encode_cols = ["month", "channelGrouping", "browser", "operatingSystem", "deviceCategory", "continent", 
                "subContinent", "country", "region", "metro", "city", "topLevelDomain", "campaign", "source", 
                "medium", "keyword", "referralPath", "adContent", "adwordsClickInfo.page", "adwordsClickInfo.slot",
                "adwordsClickInfo.adNetworkType", "bounces", "newVisits"]

other_cols = ["visitNumber", "timeSinceLastVisit", "isMobile", "isTrueDirect", "hits", "pageviews"]

# encode string input values as integers
encoded_x = None
for i in encode_cols:
    col = test[i].apply(str)
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(col.fillna('0'))
    feature = feature.reshape(test.shape[0], 1)
    onehot_encoder = OneHotEncoder(sparse=False)
    feature = onehot_encoder.fit_transform(feature)
    if encoded_x is None:
        encoded_x = feature
    else:
        encoded_x = np.concatenate((encoded_x, feature), axis=1)
print("X shape: ", encoded_x.shape)

test = np.concatenate((test[other_cols].fillna(0), encoded_x), axis=1)

X shape:  (56208, 2118)


In [10]:
pca = PCA(n_components=1600)
test1 = pca.fit_transform(test)
xgtest = xgb.DMatrix(test1)
pred = bst.predict(xgtest)

In [11]:
bin_pred = Binarizer(threshold=thresh).transform(pred.reshape(-1,1))

In [32]:
test_raw = pd.read_csv("data/Test.csv", encoding="ISO-8859-1")

In [33]:
sub = test_raw['sessionId']

In [37]:
sub = pd.DataFrame(sub)

In [39]:
sub['predictedClass'] = bin_pred

In [41]:
sub.astype(int)

Unnamed: 0,sessionId,predictedClass
0,100000110,0
1,200000120,0
2,500000150,0
3,500000250,0
4,700000170,0
5,800000180,0
6,1000000110,0
7,1200000112,0
8,1800000118,0
9,1800000218,0
