In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os.path
import gc
gc.enable()

  return f(*args, **kwds)


In [2]:
def unpack_pickle(obj_name):
    file_path = "{0}.pkl".format(obj_name)
    n_bytes = 2**31
    max_bytes = 2**31 - 1
    data = bytearray(n_bytes)
    
    ## read
    bytes_in = bytearray(0)
    input_size = os.path.getsize(file_path)
    with open(file_path, 'rb') as f_in:
        for _ in range(0, input_size, max_bytes):
            bytes_in += f_in.read(max_bytes)
    loaded_data = pickle.loads(bytes_in)
    return loaded_data

In [3]:
X = unpack_pickle('X')

In [4]:
y = unpack_pickle('y')

In [5]:
X.shape, y.shape

((7730792, 73), (7730792,))

In [6]:
X.head()

Unnamed: 0,EngineVersion,AppVersion,AvSigVersion,AVProductStatesIdentifier,AVProductsInstalled,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,...,AppVersion_3,AvSigVersion_1,AvSigVersion_2,AvSigVersion_3,OsBuildLab_1,OsBuildLab_2,OsBuildLab_3,Census_OSVersion_1,Census_OSVersion_2,Census_OSVersion_3
0,0.556665,0.536882,0.595976,53447.0,1.0,29,128035.0,18.0,35.0,-85,...,0.536868,0.501162,0.556937,0.595976,0.525496,0.525496,0.534392,0.501158,0.501158,0.526698
1,0.400884,0.49673,0.409338,53447.0,1.0,93,1482.0,18.0,119.0,64,...,0.451809,0.501162,0.400011,0.409338,0.525496,0.525496,0.534392,0.501158,0.501158,0.526698
2,0.556665,0.536882,0.532939,53447.0,1.0,86,153579.0,18.0,64.0,49,...,0.536868,0.501162,0.556937,0.532939,0.525496,0.525496,0.534392,0.501158,0.501158,0.526698
3,0.556665,0.536882,0.7013,53447.0,1.0,88,20710.0,0.0,117.0,115,...,0.536868,0.501162,0.556937,0.7013,0.525496,0.525496,0.534392,0.501158,0.501158,0.526698
4,0.556665,0.536882,0.526942,53447.0,1.0,18,37376.0,0.0,277.0,75,...,0.536868,0.501162,0.556937,0.526942,0.525496,0.525496,0.534392,0.501158,0.501158,0.526698


In [7]:
y.head()

0    0
1    0
2    0
3    1
4    1
Name: HasDetections, dtype: int8

In [8]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, StratifiedKFold



In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.22, random_state=4000)

In [10]:
def score(params):
    print("Training with params : {}".format(params)) 
    
    params = {
        'learning_rate': float(params['learning_rate']),
        'max_depth': int(params['max_depth']),
        'colsample_bytree' : float(params['colsample_bytree']),
        'subsample' : float(params['subsample']),
        'reg_alpha' : float(params['reg_alpha']),
        'reg_lambda' : float(params['reg_lambda']),
    }
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_estimators=100,
        **params
    )
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, preds)
    gc.collect()
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}

In [11]:
def optimize(trials):
    params = {
        'learning_rate': hp.quniform('learning_rate', 0.1, 0.9, 0.1),
        'max_depth': hp.quniform('max_depth', 10, 100, 10),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 0.9, 0.1),
        'subsample' : hp.quniform('subsample', 0.4, 0.9, 0.1),
        'reg_alpha' : hp.quniform('reg_alpha', 0.1, 0.9, 0.1),
        'reg_lambda' : hp.quniform('reg_lambda', 0.1, 0.9, 0.1),
    }
    best = fmin(score, params, algo=tpe.suggest, trials=trials, max_evals=100)
    print(best)

In [12]:
#Trials object where the history of search will be stored
trials = Trials()

In [13]:
optimize(trials)

Training with params : {'colsample_bytree': 0.2, 'learning_rate': 0.30000000000000004, 'max_depth': 100.0, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.9, 'subsample': 0.4}
	Score 0.7223769279674205                            


Training with params : {'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.6000000000000001, 'max_depth': 30.0, 'reg_alpha': 0.2, 'reg_lambda': 0.5, 'subsample': 0.6000000000000001}
	Score 0.726117563241451                                                         


Training with params : {'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.2, 'max_depth': 90.0, 'reg_alpha': 0.8, 'reg_lambda': 0.1, 'subsample': 0.5}
	Score 0.7212389154464701                                                        


Training with params : {'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.1, 'max_depth': 50.0, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.9, 'subsample': 0.7000000000000001}
	Score 0.71660437457634                                  

	Score 0.7248324843106921                                                           


Training with params : {'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 40.0, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 0.6000000000000001, 'subsample': 0.7000000000000001}
	Score 0.7161784318786424                                                           


Training with params : {'colsample_bytree': 0.1, 'learning_rate': 0.2, 'max_depth': 30.0, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 0.9, 'subsample': 0.7000000000000001}
	Score 0.7154239633634365                                                           


Training with params : {'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.30000000000000004, 'max_depth': 10.0, 'reg_alpha': 0.9, 'reg_lambda': 0.7000000000000001, 'subsample': 0.6000000000000001}
	Score 0.7241471456066437                                                           


Training with params : {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth

Training with params : {'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.5, 'max_depth': 20.0, 'reg_alpha': 0.4, 'reg_lambda': 0.2, 'subsample': 0.5}
	Score 0.7246273192336812                                                           


Training with params : {'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 30.0, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 0.5, 'subsample': 0.9}
	Score 0.7161790085980428                                                           


Training with params : {'colsample_bytree': 0.1, 'learning_rate': 0.1, 'max_depth': 90.0, 'reg_alpha': 0.4, 'reg_lambda': 0.1, 'subsample': 0.9}
	Score 0.7073510792168402                                                           


Training with params : {'colsample_bytree': 0.2, 'learning_rate': 0.2, 'max_depth': 90.0, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.1, 'subsample': 0.9}
	Score 0.7201153805112418                                                           


Training with params : {'

Training with params : {'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.1, 'max_depth': 80.0, 'reg_alpha': 0.5, 'reg_lambda': 0.6000000000000001, 'subsample': 0.9}
	Score 0.7155243145690962                                                         


Training with params : {'colsample_bytree': 0.2, 'learning_rate': 0.6000000000000001, 'max_depth': 60.0, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 0.8, 'subsample': 0.8}
	Score 0.7250294160844765                                                         


Training with params : {'colsample_bytree': 0.1, 'learning_rate': 0.5, 'max_depth': 30.0, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 0.5, 'subsample': 0.6000000000000001}
	Score 0.7219397968646798                                                         


Training with params : {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 50.0, 'reg_alpha': 0.5, 'reg_lambda': 0.2, 'subsample': 0.7000000000000001}
	Score 0.7223389627952262                                  