In [1]:
cd scripts

/home/ubuntu/facebook/scripts


In [2]:
import itertools
from multiprocessing import Pool
import numpy as np
#Import libraries:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

from sklearn.cross_validation import train_test_split
import pickle

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

import traceback
import grid_generation as grid

In [3]:

params_range = {'a': range(1, 5),
          'b': range(6, 9)}

def transform_x(X, x_transformer = None):
    """
    X = [[x, y, a, t]]
    """
    fw = [500., 1000., 4., 3., 2., 10., 10.]
    minute_v = X[:, 3]%60
    hour_v = X[:, 3]//60
    weekday_v = hour_v//24
    month_v = weekday_v//30
    year_v = (weekday_v//365 + 1)*fw[5]
    hour_v = ((hour_v%24 + 1) + minute_v/60.0)*fw[2]
    hour_v_2 = (X[:, 3]%(60*60*24))//(60*60*2)
    hour_v_3 = (X[:, 3]%(60*60*24))//(60*60*3)
    hour_v_4 = (X[:, 3]%(60*60*24))//(60*60*4)
    hour_v_6 = (X[:, 3]%(60*60*24))//(60*60*6)
    hour_v_8 = (X[:, 3]%(60*60*24))//(60*60*8)
    weekday_v = (weekday_v%7 + 1)*fw[3]
    month_v = (month_v%12 +1)*fw[4]
    accuracy_v = np.log10(X[:, 2])*fw[6]
    x_v = X[:, 0]*fw[0]
    y_v = X[:, 1]*fw[1]
    return np.hstack((x_v.reshape(-1, 1),
                     y_v.reshape(-1, 1),
                     accuracy_v.reshape(-1, 1),
                     hour_v.reshape(-1, 1),
                     hour_v_2.reshape(-1, 1),
                     hour_v_3.reshape(-1, 1),
                     hour_v_4.reshape(-1, 1),
                     hour_v_6.reshape(-1, 1),
                     hour_v_8.reshape(-1, 1),
                     weekday_v.reshape(-1, 1),
                     month_v.reshape(-1, 1),
                     year_v.reshape(-1, 1)))

def transform_y(y, y_transformer = None):
    """
    place_ids to encoded array
    """
    y = y.astype(int)
    if y_transformer == None:
        label_encoder = LabelEncoder()
        label_encoder.fit(y)
        y_transformer = {'encoder': label_encoder}
    new_y = y_transformer['encoder'].transform(y).reshape(-1, 1)
    return (new_y, y_transformer)

def map3eval(preds, dtrain):
    actual = dtrain.get_label()
    predicted = preds.argsort(axis=1)[:,-np.arange(1,4)]
    metric = 0.
    for i in range(3):
        metric += np.sum(actual==predicted[:,i])/(i+1)
    metric /= actual.shape[0]
    return 'MAP@3', metric

def load_data(m, n):
    f = g.getGridFile(m, n)
    return np.loadtxt(f, delimiter = ',')

def get_preds(probs, encoder):
    return encoder.inverse_transform(np.argsort(probs, axis = 1)[:, ::-1][:, :3])


def get_dtrain_enc(m, n):
    data = load_data(m, n)
    M = g.M

    mask = np.array(map(lambda x: M[m][n][x] > 3, data[:, 5]))
    train = data[mask, :]
    print data.shape, "data_shape"


    X = transform_x(train[:, (1, 2, 3, 4)])
    y, enc = transform_y(train[:, 5])
    print X.shape, "X shape"
    print y.shape, "y shape"
    print len(enc['encoder'].classes_), "no of classes"

    dtrain = xgb.DMatrix(X, label=np.ravel(y))
    return (dtrain, enc)


orig_params = {
            'silent': 0,
            'nthread': 8,
            'eta': 0.1,
            'objective': 'multi:softprob',
            'max_depth': 8,
            'min_child_weight': 5,
            'gamma': 0.32,
            'subsample': 0.9,
            'colsample_bytree': 0.7,
            'scale_pos_weight': 1
            }

X = 200
Y = 50
xd = 20
yd = 5
rx = 5
ry = 10
xD = rx*xd
yD = ry*yd
XX = 10000
YY = 10000

eps = 0.001

m = XX/xd
n = YY/yd

M = XX/xD
N = YY/yD


g = grid.Grid(X = 200, Y = 50, xd = 20, yd = 5, pref = 'grid')
g.generateCardinalityMatrix()

tup = lambda t: list(itertools.izip(itertools.repeat(t[0]), t[1]))

def get_list_of_params(params_range):
    pr = list(map(tup, params_range.items()))
    pro = map(dict, list(itertools.product(*pr)))
    return pro


def grid_search_xgb(params_range_dict):
    grid_params_list = get_list_of_params(params_range_dict)
    p = Pool(4)
    maps = p.map(get_map_of_xgb, grid_params_list)
    sorted_maps = sorted(maps, cmp = lambda x, y: cmp(x['map'], y['map']), reverse = True)
    print "top map results", sorted_maps[:3]
    return sorted_maps

def get_map_of_xgb(grid_param):
    global orig_params
    num_class = {'num_class': len(enc['encoder'].classes_)}
    orig_params.update(num_class)
    orig_params.update(grid_param)
    # print orig_params, grid_param
    temp_cv = xgb.cv(orig_params, dtrain, num_boost_round = 100,
             early_stopping_rounds = 20, feval = map3eval, maximize = True)
    temp_map = temp_cv['test-MAP@3-mean'][temp_cv.shape[0]-1]
    grid_param['map'] = temp_map
    # print "cv results", grid_param
    return grid_param



In [4]:
#m, n = (12, 50) (37, 50) (12, 150) (37, 150)
dtrain, enc = get_dtrain_enc(12, 50)
param_range1 = {
    'max_depth': range(2, 7, 1),
    'min_child_weight': range(1, 7, 1)
}

param_range2 = {
    'gamma': [i/10.0 for i in range(0, 6)]
}

param_range3 = {
    'subsample': [i/100.0 for i in range(55, 100, 5)],
    'colsample_bytree': [i/100.0 for i in range(55, 100, 5)]
}

param_range4 = {
 'alpha':[0, 1e-5, 0.001, 0.005, 0.01, 0.05, 0.1]
}

(4810, 6) data_shape
(4530, 12) X shape
(4530, 1) y shape
91 no of classes


In [5]:
try:
    result1 = grid_search_xgb(param_range1)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result2 = grid_search_xgb(param_range2)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result3 = grid_search_xgb(param_range3)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result4 = grid_search_xgb(param_range4)
except Exception, e:
    print e
    print traceback.format_exc()


cv results {'map': 0.63885199999999998, 'max_depth': 3, 'min_child_weight': 5}
cv results {'map': 0.63796900000000001, 'max_depth': 5, 'min_child_weight': 5}
cv results {'map': 0.64238433333333333, 'max_depth': 3, 'min_child_weight': 1}
cv results {'map': 0.64459166666666656, 'max_depth': 5, 'min_child_weight': 1}
cv results {'map': 0.63642399999999999, 'max_depth': 3, 'min_child_weight': 7}
cv results {'map': 0.63708633333333331, 'max_depth': 5, 'min_child_weight': 7}
cv results {'map': 0.64150099999999999, 'max_depth': 3, 'min_child_weight': 3}
cv results {'map': 0.639073, 'max_depth': 5, 'min_child_weight': 3}
cv results {'map': 0.64128066666666672, 'max_depth': 7, 'min_child_weight': 1}
cv results {'map': 0.64061800000000002, 'max_depth': 7, 'min_child_weight': 5}
cv results {'map': 0.64061800000000002, 'max_depth': 9, 'min_child_weight': 5}
cv results {'map': 0.64083866666666667, 'max_depth': 9, 'min_child_weight': 1}
cv results {'map': 0.63730699999999996, 'max_depth': 7, 'min_ch

In [6]:
#m, n = (12, 50) (37, 50) (12, 150) (37, 150)
dtrain, enc = get_dtrain_enc(37, 50)
try:
    result1 = grid_search_xgb(param_range1)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result2 = grid_search_xgb(param_range2)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result3 = grid_search_xgb(param_range3)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result5 = grid_search_xgb(param_range5)
except Exception, e:
    print e
    print traceback.format_exc()

(2653, 6) data_shape
(2373, 12) X shape
(2373, 1) y shape
73 no of classes
cv results {'map': 0.62031166666666671, 'max_depth': 3, 'min_child_weight': 5}
cv results {'map': 0.61820466666666662, 'max_depth': 5, 'min_child_weight': 5}
cv results {'map': 0.61736199999999997, 'max_depth': 5, 'min_child_weight': 1}
cv results {'map': 0.61862633333333339, 'max_depth': 3, 'min_child_weight': 1}
cv results {'map': 0.61651899999999993, 'max_depth': 3, 'min_child_weight': 7}
cv results {'map': 0.61230499999999999, 'max_depth': 5, 'min_child_weight': 7}
cv results {'map': 0.61525533333333338, 'max_depth': 3, 'min_child_weight': 3}
cv results {'map': 0.61694066666666669, 'max_depth': 5, 'min_child_weight': 3}
cv results {'map': 0.61820466666666662, 'max_depth': 7, 'min_child_weight': 5}
cv results {'map': 0.61609766666666665, 'max_depth': 7, 'min_child_weight': 1}
cv results {'map': 0.61651933333333331, 'max_depth': 9, 'min_child_weight': 5}
cv results {'map': 0.61567666666666676, 'max_depth': 7, 

In [None]:
#m, n = (12, 50) (37, 50) (12, 150) (37, 150)
dtrain, enc = get_dtrain_enc(12, 150)
try:
    result1 = grid_search_xgb(param_range1)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result2 = grid_search_xgb(param_range2)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result3 = grid_search_xgb(param_range3)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result5 = grid_search_xgb(param_range5)
except Exception, e:
    print e
    print traceback.format_exc()

(4329, 6) data_shape
(4085, 12) X shape
(4085, 1) y shape
94 no of classes
cv results {'map': 0.60739633333333332, 'max_depth': 5, 'min_child_weight': 5}
cv results {'map': 0.60960099999999995, 'max_depth': 5, 'min_child_weight': 1}
cv results {'map': 0.60837599999999992, 'max_depth': 3, 'min_child_weight': 5}
cv results {'map': 0.60764166666666664, 'max_depth': 3, 'min_child_weight': 1}
cv results {'map': 0.60764133333333337, 'max_depth': 5, 'min_child_weight': 7}
cv results {'map': 0.60862099999999997, 'max_depth': 3, 'min_child_weight': 7}
cv results {'map': 0.60690666666666671, 'max_depth': 5, 'min_child_weight': 3}
cv results {'map': 0.60494700000000001, 'max_depth': 3, 'min_child_weight': 3}
cv results {'map': 0.6083763333333333, 'max_depth': 7, 'min_child_weight': 5}
cv results {'map': 0.60788633333333342, 'max_depth': 7, 'min_child_weight': 1}
cv results {'map': 0.60788633333333342, 'max_depth': 9, 'min_child_weight': 5}
cv results {'map': 0.60690666666666659, 'max_depth': 9, '

In [None]:
#m, n = (12, 50) (37, 50) (12, 150) (37, 150)
dtrain, enc = get_dtrain_enc(37, 150)
try:
    result1 = grid_search_xgb(param_range1)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result2 = grid_search_xgb(param_range2)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result3 = grid_search_xgb(param_range3)
except Exception, e:
    print e
    print traceback.format_exc()
try:
    result5 = grid_search_xgb(param_range5)
except Exception, e:
    print e
    print traceback.format_exc()

(3879, 6) data_shape
(3597, 12) X shape
(3597, 1) y shape
77 no of classes
cv results {'map': 0.68529333333333342, 'max_depth': 5, 'min_child_weight': 1}
cv results {'map': 0.68584933333333342, 'max_depth': 5, 'min_child_weight': 5}
cv results {'map': 0.68445933333333331, 'max_depth': 3, 'min_child_weight': 5}
cv results {'map': 0.68306933333333342, 'max_depth': 3, 'min_child_weight': 1}
cv results {'map': 0.68640533333333342, 'gamma': 0.1}
cv results {'map': 0.68557133333333331, 'gamma': 0.3}
cv results {'map': 0.68723933333333331, 'gamma': 0.2}
cv results {'map': 0.68751733333333342, 'gamma': 0.0}
cv results {'map': 0.68668333333333331, 'gamma': 0.5}
cv results {'map': 0.68473733333333342, 'gamma': 0.4}
top map results [{'map': 0.68751733333333342, 'gamma': 0.0}, {'map': 0.68723933333333331, 'gamma': 0.2}, {'map': 0.68668333333333331, 'gamma': 0.5}]
cv results {'scale_pos_weight': 1, 'map': 0.68807333333333343}
cv results {'scale_pos_weight': 4, 'map': 0.68807333333333343}
cv results

In [None]:
params_dict = [[{} for nt in range(n + 1)] for mt in range(m + 1)]
for Mt in range(M):
    for Nt in range(N):
        orig_params = {
            'silent': 0,
            'nthread': 8,
            'eta': 0.1,
            'objective': 'multi:softprob',
            'max_depth': 8,
            'min_child_weight': 5,
            'gamma': 0.32,
            'subsample': 0.9,
            'colsample_bytree': 0.7,
            'scale_pos_weight': 1
            }
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print Mt, "Mt"
        print Nt, "Nt"
        x = (Mt * xD)*eps + eps
        y = (Nt * yD)*eps + eps
        print x, y
        (mt, nt) = grid.get_grids((x, y), X, Y, xd, yd)[0]
        print mt, "mt"
        print nt, "nt"
        
        dtrain, enc = get_dtrain_enc(mt, nt)
        
        for param_range in [param_range1, param_range2, param_range3, param_range4]:
            result = None
            try:
                result = grid_search_xgb(param_range)
            except Exception, e:
                print e
                print traceback.format_exc()
        
            if result != None:
                temp_param = result[0]
                del(temp_param['map'])
                orig_params.update(temp_param)
        for mc in range(rx):
            for nc in range(ry):
                if (mt + mc) < (g.max_m + 1) and (nt + nc) < (g.max_n + 1):
                    params_dict[mt + mc][nt + nc] = dict(orig_params) 
        
        
        
        print "computed params for big grid %s, %s" %(Mt, Nt)
        print orig_params
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0 Mt
0 Nt
0.001 0.001
0 mt
0 nt
(2756, 6) data_shape
(2645, 12) X shape
(2645, 1) y shape
43 no of classes
