In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb

import gzip

import sklearn.preprocessing

from scipy.spatial.distance import cosine


In [2]:
# We split trainfile into train and valid pretty early on
trainfile_orig = pd.read_csv('../walmart-input/train.csv')
trainfile = trainfile_orig.copy()

test_orig = pd.read_csv('../walmart-input/test.csv')
test = test_orig.copy()

sub_orig = pd.read_csv('../walmart-input/sample_submission.csv')
sub = sub_orig.copy()

In [3]:
def chop_triptype(tt):
    return int(tt[9:])

triptype_map = {}
triptypestr_map = {}

c = 0
for k in sub.loc[0].keys():
    if 'TripType_' in k:
        triptypestr_map[k] = c
        triptype_map[chop_triptype(k)] = c
        
        c += 1

triptype_map[45] = triptype_map[999]

print(triptypestr_map)
print(triptype_map)



{'TripType_5': 2, 'TripType_20': 12, 'TripType_38': 30, 'TripType_39': 31, 'TripType_36': 28, 'TripType_33': 25, 'TripType_21': 13, 'TripType_37': 29, 'TripType_34': 26, 'TripType_35': 27, 'TripType_12': 7, 'TripType_22': 14, 'TripType_14': 8, 'TripType_15': 9, 'TripType_27': 19, 'TripType_41': 33, 'TripType_18': 10, 'TripType_8': 5, 'TripType_7': 4, 'TripType_32': 24, 'TripType_30': 22, 'TripType_40': 32, 'TripType_44': 36, 'TripType_999': 37, 'TripType_25': 17, 'TripType_29': 21, 'TripType_24': 16, 'TripType_28': 20, 'TripType_31': 23, 'TripType_26': 18, 'TripType_19': 11, 'TripType_9': 6, 'TripType_42': 34, 'TripType_6': 3, 'TripType_43': 35, 'TripType_4': 1, 'TripType_3': 0, 'TripType_23': 15}
{3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 12: 7, 14: 8, 15: 9, 18: 10, 19: 11, 20: 12, 21: 13, 22: 14, 23: 15, 24: 16, 25: 17, 26: 18, 27: 19, 28: 20, 29: 21, 30: 22, 31: 23, 32: 24, 33: 25, 34: 26, 35: 27, 36: 28, 37: 29, 38: 30, 39: 31, 40: 32, 41: 33, 42: 34, 43: 35, 44: 36, 45: 37, 999: 

In [4]:
# replace days of week.  Yes this could be defined with one line...

dow = {}
dow['Monday'] = 1
dow['Tuesday'] = 2
dow['Wednesday'] = 3
dow['Thursday'] = 4
dow['Friday'] = 5
dow['Saturday'] = 6
dow['Sunday'] = 7

def preproc(df):
    df.Weekday.replace(dow, inplace=True)
    
    df.FinelineNumber.replace({np.nan: -1}, inplace=True)

    df.FinelineNumber = df.FinelineNumber.astype(int)
    df.FinelineNumber += 1

    # there are nan's that need to get converted
    df.DepartmentDescription = df.DepartmentDescription.astype(str)
    
    if 'TripType' in df:
        enc = sklearn.preprocessing.OneHotEncoder()
        enc.fit(df.TripType.reshape(-1, 1))
    
        df.TripTypeOneHot = enc.transform(df.TripType.reshape(-1, 1)).toarray()

# replace type 999 with 45 to keep from wasting memory in structs
#trainfile.TripType.replace({999: 45}, inplace=True)

trainfile.TripType.replace(triptype_map, inplace=True)

preproc(trainfile)
preproc(test)

In [5]:
# Now split the train data into train+valid.  2.5% with seed 0 appears to be pretty good, at least until this
# gets competitive

visits = np.unique(trainfile.VisitNumber)

# determine visits to go in validation set
np.random.seed(0)
validation_visits = np.random.choice(visits, int(len(visits) * .025))

# compute (raw) rows in validation set
validation_set = (trainfile.VisitNumber == validation_visits[0])

for i in range(1, len(validation_visits)):
    validation_set |= trainfile.VisitNumber == validation_visits[i]

valid = trainfile.ix[validation_set]
    
# flip that around to get the train set
training_set = np.invert(validation_set)
train = trainfile.ix[training_set]

In [6]:
# mclogloss.py (todo: copy in original link)
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    idea from this post:
    http://www.kaggle.com/c/emc-data-science/forums/t/2149/is-anyone-noticing-difference-betwen-validation-and-leaderboard-error/12209#post12209

    Parameters
    ----------
    y_true : array, shape = [n_samples]
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota


In [7]:
def build_visitmaps(df):
    visits = np.unique(df.VisitNumber)
    visitmap = {}

    vtype = np.zeros(len(visits))

    if 'TripType' in df:
        for i in range(0, len(visits)):
            vtype[i] = df.ix[df.VisitNumber == visits[i]].iloc[0].TripType
    
    for i in range(0, len(visits)):
        visitmap[visits[i]] = i
        
    return visits, vtype, visitmap
    
valid_visits, valid_type, valid_visitmap = build_visitmaps(valid)
test_visits, test_type, test_visitmap = build_visitmaps(test)

In [8]:
# make categorical #'s.  TODO: move up to preproc???
ddcat = np.unique(train.DepartmentDescription)

# build map
ddmap = {}
for i in range(0, len(ddcat)):
    ddmap[ddcat[i]] = i
           
# apply map
def makeddint(df):
    ddcatout = np.zeros(len(df.DepartmentDescription))
    for i in range(0, len(ddcat)):
        ddcatout[np.where(df.DepartmentDescription == ddcat[i])] = i
    
    # XXX: this is raising a SettingWithCopyWarning on valid and possibly train
    df['DepartmentDescriptionInt'] = ddcatout.astype(int)
    
    return df

train = makeddint(train)
valid = makeddint(valid)
test = makeddint(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
train_psc = train[train.ScanCount >= 1]
# Construct set of usable DepartmentDescriptionInt keys (with >1000 per dept)
vc= train_psc.DepartmentDescriptionInt.value_counts()

ddi_len = np.zeros(len(ddcat))
ddi_keys = {}

for i in vc.iteritems():
    ddi_len[i[0]] = i[1]
    
    if i[1] > 1000:
        ddi_keys[i[0]] = True

In [11]:
train_psc = train[train.ScanCount >= 1]
subsetted = np.full(len(train_psc), False, dtype=bool)

enc = sklearn.preprocessing.OneHotEncoder()
enc.fit(train_psc.TripType.reshape(-1, 1))

a = enc.transform(train_psc.TripType.reshape(-1, 1)).toarray()

flmap = {}
flnd = {}

ddmax = np.max(train.DepartmentDescriptionInt) + 1

fnum = 6
# 0 - % items returned
# 1 - 1 item
# 2 - 2 items
# 3 - 3-4 items
# 4 - 5-9 items
# 5 - 10+ items

f_ddstart = 6
f_ddend = 6 + ddmax

# 6-ddmax: coarse dept description
fnum += ddmax 

# rest of fnum: fineline mapping
for cat in range(0, ddmax):
#for cat in [20]:
    catmask = train_psc.DepartmentDescriptionInt == cat
    subset = train_psc[catmask]
    
    #print(cat, len(subset))
    if len(subset) < 10:
        continue
        
#    vc = subset.Upc.value_counts()
    vc = subset.FinelineNumber.value_counts()
    
    for iterit in vc.iteritems():
        fln = iterit[0]
        if (iterit[1] < 10): # or (len(subset) < (iterit[1] * 2)):
            continue
            
        fnum += 1
                
        flmap[(cat, fln)] = fnum
        
print(len(flmap.keys()))
for i in flmap:
    print(i, flmap[i])

6119
(32, 8246) 3003
(15, 9101) 1071
(57, 1220) 5254
(24, 3601) 1910
(43, 2025) 3962
(66, 1034) 6146
(60, 8002) 5552
(25, 6811) 2274
(5, 6101) 497
(31, 2018) 2928
(3, 2031) 251
(19, 5961) 1573
(57, 4606) 5353
(17, 2505) 1342
(12, 59) 813
(31, 6661) 2862
(37, 600) 3447
(25, 9805) 2373
(33, 6056) 3079
(24, 4152) 2040
(3, 6707) 273
(5, 4003) 450
(2, 912) 166
(36, 3068) 3352
(24, 2839) 2146
(28, 1168) 2566
(13, 2286) 895
(57, 1263) 5266
(49, 3553) 4286
(15, 906) 913
(55, 405) 5102
(23, 257) 1906
(17, 3664) 1340
(41, 6167) 3737
(17, 3054) 1237
(11, 83) 715
(62, 3232) 5805
(23, 5732) 1827
(5, 4521) 459
(24, 2258) 2038
(29, 8236) 2692
(8, 655) 664
(24, 3031) 2034
(19, 535) 1635
(24, 3307) 1943
(43, 1081) 3837
(66, 1402) 6032
(16, 1509) 1079
(62, 4937) 5658
(30, 156) 2771
(17, 4642) 1293
(13, 3762) 871
(17, 2713) 1381
(19, 1057) 1577
(12, 6537) 781
(33, 6434) 3069
(41, 6567) 3703
(15, 3001) 924
(30, 3811) 2720
(12, 90) 800
(52, 1119) 4741
(23, 3458) 1833
(24, 2404) 2122
(57, 173) 5443
(49, 421

In [17]:

def make_ddcomb(df, num_visits = 100000000):
    
    num_ents = len(df)
    
    visits = np.sort(np.unique(df.VisitNumber))
    
    num_visits = min(num_visits, len(visits))

    ddmax = np.max(train.DepartmentDescriptionInt)

    mat = np.zeros((num_visits, (ddmax * 1) + fnum + 2))
    tt = np.zeros(num_visits)
 
    df_scancount = df.ScanCount.values
    df_visitnumber = df.VisitNumber.values
    df_triptype = df.TripType.values if ('TripType' in df) else np.zeros(len(df))
    df_ddint = df.DepartmentDescriptionInt.values
    df_fln = df.Upc.values
    df_fln = df.FinelineNumber.values
    #    df_fmap = df.fmap.values
    df_weekday = df.Weekday.values
    
    
    visitmap = {}
    
    vnum = -1
    
    icount = np.zeros(num_visits + 1)
    
    for i in range(0, num_ents):
        try:
            visit = visitmap[df_visitnumber[i]]
        except:
            vnum += 1

            if (vnum + 1) == num_visits:
                break
            
            visitmap[df_visitnumber[i]] = vnum
            visit = vnum
            
            tt[vnum] = df_triptype[i]
#            mat[visit][fnum + 1] = df_weekday[i] >= 6
#            mat[visit][fnum + 2] = df_weekday[i] == 1
#            mat[visit][fnum + df_weekday[i]] = 1
            
        icount[visit] += 1
    
        if True: #df_scancount[i] > 0:
            dept = df_ddint[i]
            fln = df_fln[i]
            
            # allocation space: 
            # 0 - % returns
            # 1 - ddmax - regular map
            # ddmax - ddmax*2 - leftovers
            # ddmax*2 >= +fmap - map
            
            mat[visit][6 + df_ddint[i]] += (1 + ((df_scancount[i] - 1) * .25))
            
            try:
                feature = flmap[(dept, fln)]
                mat[visit][feature] += (1 + ((df_scancount[i] - 1) * .25))
            except:
                None
                #feature = 1 + ddmax + dept
                #mat[visit][feature] += (1 + ((df_scancount[i] - 1) * .25))
                
        if df_scancount[i] < 0:
            mat[visit][0] += 1
            
    vnum += 1
    for i in range(0, vnum):
        if np.sum(mat[i][f_ddstart:f_ddend]):
#            mat[i][] /= np.sum(mat[i][f_ddstart:f_ddmax])
            mat[i][f_ddstart:f_ddend] /= np.sum(mat[i][f_ddstart:f_ddend])
        if np.sum(mat[i][f_ddend:fnum]):
            mat[i][f_ddend:fnum] /= np.sum(mat[i][f_ddend:fnum])
        if icount[i] > 0:
            mat[i][0] /= icount[i]
            
            if icount[i] == 1:
                mat[i][1] = 1
            elif icount[i] == 2:
                mat[i][2] = 1
            elif icount[i] < 5:
                mat[i][3] = 1
            elif icount[i] < 10:
                mat[i][4] = 1
            else:
                mat[i][5] = 1
                
            
    return mat, visits, tt

train_ddmat, train_visitnum, train_tt = make_ddcomb(train)
#train_ddmat, train_visitnum, train_tt = make_ddcomb(train,num_visits=5000)
valid_ddmat, valid_visitnum, valid_tt = make_ddcomb(valid)
#test_ddmat, test_visitnum, test_tt = make_ddcomb(test)
    


In [23]:
n = 16000
n = len(train_tt)
dtrain = xgb.DMatrix(train_ddmat[0:n], label=train_tt[0:n].tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 8,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.35,
          'colsample_bytree': 1,
          'eta': 0.1}

watchlist = [(dtrain, 'train'), (dtest, 'eval')]
#watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bstc = xgb.train(xgb_params, dtrain, 2000, evals=watchlist, verbose_eval=False, early_stopping_rounds=25)
#bst = xgb.train(xgb_params, dtrain, 10)

predictv2 = bstc.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictv2))

# .707850 for categorized counts :) (may be even lower)
# .707369 712 - 1.0 ss, 0.2 cat
# .717141 - 0.35/1.0

0.718428756593


Stopping. Best iteration:
[417]	train-mlogloss:0.341163	eval-mlogloss:0.717141



In [24]:

dtrain = xgb.DMatrix(train_ddmat, label=train_tt.tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

bst_cb = {}
predict_cb = {}
#for subs in [0.2, 0.35, 0.5, 0.65, 0.8, 0.9, 1.0]: #np.arange(0.1, 0.9, 0.1):
#    for cb in [0.2, 0.4, 0.6, 0.8, 0.9, 1.0]:
def dorunrun(subs, cb):
        xgb_params = {'max_depth': 8,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': subs,
          'colsample_bytree': cb,
          'eta': 0.01}

        watchlist = [(dtrain, 'train'), (dtest, 'eval')]
        bst_cb[(subs, cb)] = xgb.train(xgb_params, dtrain, 10000, evals=watchlist, verbose_eval=False, early_stopping_rounds=50)

        predict_cb[(subs, cb)] = bst_cb[(subs, cb)].predict(xgb.DMatrix(valid_ddmat))
        print(subs, cb, multiclass_log_loss(valid_tt, predict_cb[(subs, cb)]))

dorunrun(0.35, 0.8)
dorunrun(1.0, 0.2)

Stopping. Best iteration:
[4749]	train-mlogloss:0.326093	eval-mlogloss:0.698906

Stopping. Best iteration:
[7694]	train-mlogloss:0.252616	eval-mlogloss:0.692075



0.35 0.8 0.698924226577
1.0 0.2 0.692082645321


In [None]:
n = 4000
n = len(train_tt)
dtrain = xgb.DMatrix(train_ddmat[0:n], label=train_tt[0:n].tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 20,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.75,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(xgb_params, dtrain, 4000) #, evals=watchlist, verbose_eval=True)
#bst = xgb.train(xgb_params, dtrain, 10)

predictv = bst.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictv))

# 1.22593422041 - reference "fast" run w/4000
# 1.21145138814 - new ref fast, 80 feat

# .9320 lowest w/24 md
# .927077 w/20
# .927421 w/18



In [None]:
dtrain = xgb.DMatrix(train_ddmat, label=train_tt.tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 64,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.8,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst10 = xgb.train(xgb_params, dtrain, 1250, evals=watchlist, verbose_eval=True)
#bst = xgb.train(xgb_params, dtrain, 10)

#predict10 = bst10.predict(xgb.DMatrix(valid_ddmat))
#print(multiclass_log_loss(valid_tt, predict10))

#predict = bst10.predict(xgb.DMatrix(train_ddmat))
#print(multiclass_log_loss(train_tt[0:len(train_ddmat)], predict))

# [1024]	eval-mlogloss:1.249856	train-mlogloss:0.456847

# [938]	eval-mlogloss:1.287829	train-mlogloss:0.475465 - fr .985
# [855]	eval-mlogloss:1.306764	train-mlogloss:0.503785 - fr .95

# [850]	eval-mlogloss:1.256834	train-mlogloss:0.522326 - fr .975 on base

# [942]	eval-mlogloss:1.253929	train-mlogloss:0.474049 .99 on base

# [300]	eval-mlogloss:1.263669	train-mlogloss:1.064013 - regular
# [293]	eval-mlogloss:1.263084	train-mlogloss:1.071790 - .99 cosine distance check, 57features
# [1249]	eval-mlogloss:0.980208	train-mlogloss:0.611616 - ibid

'''
with returns in one column:

+=2:
[269]	eval-mlogloss:1.262641	train-mlogloss:1.079731
[500]	eval-mlogloss:1.060215	train-mlogloss:0.809933
[1151]	eval-mlogloss:0.966263	train-mlogloss:0.611675

+=1
[270]	eval-mlogloss:1.261297	train-mlogloss:1.079624
[500]	eval-mlogloss:1.057744	train-mlogloss:0.810384
[1132]	eval-mlogloss:0.966242	train-mlogloss:0.614787
[1249]	eval-mlogloss:0.963500	train-mlogloss:0.598182

short run:
[1068]	eval-mlogloss:1.257743	train-mlogloss:0.455562
'''

In [None]:
dtrain = xgb.DMatrix(train_ddmat, label=train_tt.tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 12,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.75,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(xgb_params, dtrain, 2000) #, evals=watchlist, verbose_eval=True)
#bst = xgb.train(xgb_params, dtrain, 10)
predictv = bst.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictv))


In [None]:
dtrain = xgb.DMatrix(train_ddmat, label=train_tt.tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 12,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.75,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(xgb_params, dtrain, 1450) #, evals=watchlist, verbose_eval=True)
#bst = xgb.train(xgb_params, dtrain, 10)

predictv = bst.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictv))


In [None]:
for n in range(500,len(train_tt),500):
    dtrain = xgb.DMatrix(train_ddmat[0:n], label=train_tt[0:n].tolist())
    dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

    xgb_params = {'max_depth': 12,
            'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.75,
          'eta': 0.05}

    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(xgb_params, dtrain, 300) #, evals=watchlist, verbose_eval=True)
    #bst = xgb.train(xgb_params, dtrain, 10)

    predictv = bst.predict(xgb.DMatrix(valid_ddmat))
    print(n, multiclass_log_loss(valid_tt, predictv))


In [None]:
dtrain = xgb.DMatrix(train_ddmat[0:4000], label=train_tt[0:4000].tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 10,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.75,
          'eta': 0.05}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(xgb_params, dtrain, 500, evals=watchlist, verbose_eval=True)
#bst = xgb.train(xgb_params, dtrain, 10)

#predictv = bst.predict(xgb.DMatrix(valid_ddmat))
#print(multiclass_log_loss(valid_tt, predictv))


In [None]:
dtrain = xgb.DMatrix(train_ddmat, label=train_tt.tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 12,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.8,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst10 = xgb.train(xgb_params, dtrain, 1000, evals=watchlist, verbose_eval=True)
#bst = xgb.train(xgb_params, dtrain, 10)

predictv = bst10.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predict11))


In [None]:
dtrain = xgb.DMatrix(train_ddmat, label=train_tt[0:len(train_ddmat)].tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 11,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst11 = xgb.train(xgb_params, dtrain, 1250, evals=watchlist, verbose_eval=False)
#bst = xgb.train(xgb_params, dtrain, 10)

predict11 = bst11.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predict11))

predict = bst.predict(xgb.DMatrix(train_ddmat))
print(multiclass_log_loss(train_tt[0:len(train_ddmat)], predict))

In [None]:
dtrain = xgb.DMatrix(train_ddmat, label=train_tt[0:len(train_ddmat)].tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 10,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'seed': 12345,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst10a = xgb.train(xgb_params, dtrain, 1250, evals=watchlist, verbose_eval=False)
#bst = xgb.train(xgb_params, dtrain, 10)

predict10a = bst10a.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predict10a))

predict = bst.predict(xgb.DMatrix(train_ddmat))
print(multiclass_log_loss(train_tt[0:len(train_ddmat)], predict))

In [17]:
test_ddmat, test_visitnum, test_tt = make_ddcomb(test)

In [18]:
n = 4000
n = len(train_tt)
dtrain = xgb.DMatrix(train_ddmat[0:n], label=train_tt[0:n].tolist())
dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

xgb_params = {'max_depth': 20,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.3,
          'colsample_bytree': 0.75,
          'eta': 0.01}

watchlist = [(dtest, 'eval')]
bst = xgb.train(xgb_params, dtrain, 4000, evals=watchlist, verbose_eval=False, early_stopping_rounds=25)
#bst = xgb.train(xgb_params, dtrain, 10)

predictv = bst.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictv))

0.861043324963


Stopping. Best iteration:
[3100]	eval-mlogloss:0.861015



In [144]:
test_ddmat, test_visitnum, test_tt = make_ddcomb(test)
predict = bst_a[7].predict(xgb.DMatrix(test_ddmat))

subkeys = {}

for k in sub.loc[0].keys():
    if 'TripType_' in k:
        subkeys[k] = int(k[9:])
        
        if subkeys[k] == 999:
            subkeys[k] = 45
            
sub = sub_orig.copy()

predict_t = predict.transpose()
for k in subkeys.keys():
    pk = triptype_map[subkeys[k]]
    sub[k] = predict_t[pk]
    
with gzip.open('xgb-718.gz', 'wt') as write_file:
    sub.to_csv(write_file, index=False)

4.0 4.0
4.0 4.0
2.0 1.5
1.0 1.0
2.0 1.5
2.0 2.0
10.0 10.25
8.0 8.75
7.0 7.0
7.0 7.0


In [30]:

for i in range(len(predictv)):
    print(i, valid_tt[i], predictv[i][valid_tt[i]])

0 4.0 0.367053
1 34.0 0.559628
2 32.0 0.427736
3 5.0 0.514302
4 6.0 0.793134
5 6.0 0.165349
6 17.0 0.601169
7 32.0 0.843089
8 4.0 0.979511
9 2.0 0.793617
10 32.0 0.00677302
11 19.0 0.307958
12 4.0 0.873693
13 33.0 0.000883636
14 36.0 0.213921
15 4.0 0.979511
16 32.0 0.954743
17 0.0 0.99587
18 6.0 0.366529
19 0.0 0.997805
20 2.0 0.917615
21 4.0 0.515005
22 30.0 0.533601
23 26.0 0.310212
24 28.0 0.780221
25 0.0 0.999439
26 5.0 0.721697
27 24.0 0.971244
28 5.0 0.919701
29 37.0 0.999623
30 4.0 0.965865
31 2.0 0.9855
32 1.0 0.962265
33 32.0 0.168561
34 2.0 0.9855
35 37.0 0.997042
36 5.0 0.891583
37 5.0 0.891583
38 32.0 0.227922
39 0.0 0.364438
40 4.0 0.99396
41 14.0 0.748018
42 5.0 0.950764
43 2.0 0.887212
44 14.0 0.397617
45 30.0 0.585726
46 6.0 0.698669
47 5.0 0.8795
48 5.0 0.922604
49 6.0 0.779722
50 4.0 0.886031
51 37.0 0.997667
52 0.0 0.998822
53 11.0 0.260285
54 31.0 0.378776
55 6.0 0.0144954
56 36.0 0.251065
57 37.0 0.973435
58 22.0 0.265517
59 5.0 0.922604
60 22.0 0.265517
61 31.0 0

  app.launch_new_instance()


In [31]:
predictv[656]

array([  3.37984311e-05,   2.97883489e-05,   3.84287589e-04,
         5.05797179e-05,   1.77430883e-02,   1.38986437e-03,
         1.83235656e-03,   1.17183384e-03,   7.14346897e-05,
         6.33266580e-04,   3.37166637e-01,   1.52447128e-05,
         1.45834568e-03,   7.90645063e-05,   2.01356059e-04,
         1.15878693e-05,   5.46564755e-04,   8.60346074e-04,
         1.31736917e-04,   8.72696546e-05,   2.20847178e-05,
         2.81206608e-01,   1.83548094e-04,   1.41125192e-05,
         1.83781330e-03,   4.09990258e-04,   1.71466061e-04,
         1.92533957e-03,   1.16790773e-03,   2.78570247e-03,
         6.08719625e-02,   1.13923281e-01,   6.30103925e-04,
         4.33381356e-04,   1.44492939e-01,   2.36133430e-02,
         8.26069270e-04,   1.58587366e-03], dtype=float32)

In [41]:
dtrain = xgb.DMatrix(predictv[100:], label=valid_tt[100:].tolist())
dtest = xgb.DMatrix(predictv[0:100], label=valid_tt[0:100].tolist())

xgb_params = {'max_depth': 6,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'seed': 12345,
          'subsample': 0.25,
          'colsample_bytree': 1.0,
          'eta': 0.01}

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bstl2 = xgb.train(xgb_params, dtrain, 1250, evals=watchlist, verbose_eval=True)
#bst = xgb.train(xgb_params, dtrain, 10)

[0]	eval-mlogloss:3.568552	train-mlogloss:3.572205
[1]	eval-mlogloss:3.499104	train-mlogloss:3.507906
[2]	eval-mlogloss:3.436606	train-mlogloss:3.447335
[3]	eval-mlogloss:3.372570	train-mlogloss:3.386324
[4]	eval-mlogloss:3.312791	train-mlogloss:3.330973
[5]	eval-mlogloss:3.259954	train-mlogloss:3.280166
[6]	eval-mlogloss:3.206841	train-mlogloss:3.230270
[7]	eval-mlogloss:3.157535	train-mlogloss:3.183311
[8]	eval-mlogloss:3.113109	train-mlogloss:3.140368
[9]	eval-mlogloss:3.064487	train-mlogloss:3.095471
[10]	eval-mlogloss:3.022351	train-mlogloss:3.055498
[11]	eval-mlogloss:2.979074	train-mlogloss:3.014484
[12]	eval-mlogloss:2.938092	train-mlogloss:2.976585
[13]	eval-mlogloss:2.898180	train-mlogloss:2.938605
[14]	eval-mlogloss:2.859780	train-mlogloss:2.903468
[15]	eval-mlogloss:2.821076	train-mlogloss:2.868733
[16]	eval-mlogloss:2.786227	train-mlogloss:2.836865
[17]	eval-mlogloss:2.752142	train-mlogloss:2.805043
[18]	eval-mlogloss:2.718093	train-mlogloss:2.773438
[19]	eval-mlogloss:2.6

KeyboardInterrupt: 

In [62]:
np.max(train.DepartmentDescriptionInt)

68

In [None]:
bst_a = {}
for d in range(6,9):
    n = len(train_tt)
    dtrain = xgb.DMatrix(train_ddmat[0:n], label=train_tt[0:n].tolist())
    dtest = xgb.DMatrix(valid_ddmat, label=valid_tt.tolist())

    xgb_params = {'max_depth': d,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.5,
          'colsample_bytree': 0.8,
          'eta': 0.01}

    watchlist = [(dtest, 'eval')]
    bst_a[d] = xgb.train(xgb_params, dtrain, 10000, evals=watchlist, verbose_eval=False, early_stopping_rounds=100)

    try:
        predictv = bst_a[d].predict(xgb.DMatrix(valid_ddmat))
        print(d, multiclass_log_loss(valid_tt, predictv))

#        bst_a[d].save_model('dmp{0}.dmp'.format(d))
    except:
        None

In [50]:
predict_v = {}
for k in bst_a.keys():
    predict_v[k] = bst_a[k].predict(xgb.DMatrix(valid_ddmat))
    print(k, multiclass_log_loss(valid_tt, predict_v[k]))

8 0.718888711947
6 0.720522389446
7 0.718519640167


In [102]:
s = [6, 7]

count = 0
p = None
for i in s:
    count += 1
    try:
        p += predict_v[i].copy()
    except:
        p = predict_v[i].copy()
    print(i, multiclass_log_loss(valid_tt, p / count))

6 0.72574163027
7 0.72531458472


In [91]:
multiclass_log_loss(valid_tt, predict_v[7])

0.73167963243421852

In [15]:
predictv = bsta.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictv))


0.75344228234


In [26]:
tm = []
for x in flmap.keys():
    tm.append(x[1])

In [30]:
len(np.unique(tm))

3965

In [33]:
len(flmap.keys())

3965

NameError: name 'predict_v' is not defined

In [48]:
bst_b = bst_a.copy()

NameError: name 'bst_a' is not defined

In [51]:
px = predict_v[7].transpose()

In [58]:
px = px.copy()
import sklearn.metrics

In [138]:
tt = np.zeros(len(valid_tt))
for i in range(px.shape[0]):
    for j in range(px.shape[1]):
        tt[j] = valid_tt[j] == i
        
    if np.sum(tt) > 25:
        print(i, np.sum(tt), sklearn.metrics.roc_auc_score(tt, px[i]), sklearn.metrics.log_loss(tt, px[i]))

0 89.0 0.998495406638 0.00921656065909
2 114.0 0.99715717225 0.0281430195106
3 34.0 0.999809344654 0.00590514609805
4 133.0 0.991077581086 0.0543550637115
5 284.0 0.988834411508 0.0866848036855
6 221.0 0.980891021918 0.0981872107827
14 29.0 0.99666919451 0.0161953095474
16 70.0 0.99108240311 0.0390734607616
17 89.0 0.992230827004 0.04163777929
22 27.0 0.99503726044 0.0187096128156
24 43.0 0.994571961862 0.0249479491007
25 36.0 0.991061130334 0.0253556248
27 57.0 0.980319633654 0.0482101070189
28 72.0 0.990102030853 0.0423854045597
29 72.0 0.986812878344 0.0451053864164
30 65.0 0.986515718185 0.0440399370502
31 264.0 0.952845998371 0.168061876825
32 160.0 0.989351005484 0.0632874681263
34 42.0 0.974920497254 0.0427529660013
35 28.0 0.945350985222 0.0455154073291
36 41.0 0.967881421337 0.0531941478741
37 197.0 0.995004094424 0.0339994282408


In [90]:
pc = predict_v[7].copy()
for i in range(0, len(valid_tt)):
#    pc[i][31] = (valid_tt[i] == 31)
#    pc[i][35] = (valid_tt[i] == 35)
#    pc[i][6] = (valid_tt[i] == 6)
    pc[i] /= np.sum(pc[i])

In [91]:
multiclass_log_loss(valid_tt, pc)

0.71851963858402246

In [134]:
n = len(train_tt)

key = 31

ttt_key = np.zeros(n)
vtt_key = np.zeros(len(valid_tt))

for i in range(n):
    ttt_key[i] = 1 if (train_tt[i] == key) else 0
    
for i in range(len(valid_tt)):
    vtt_key[i] = 1 if (valid_tt[i] == key) else 0

dtrain = xgb.DMatrix(train_ddmat[0:n], label=ttt_key)
dtest = xgb.DMatrix(valid_ddmat, label=vtt_key)

xgb_params = {'max_depth': 12,
          'objective': 'reg:linear',
          'eval_metric': 'logloss',
#          'num_class': 1,
          'subsample': 0.4,
          'colsample_bytree': 0.8,
          'eta': 0.01}

watchlist = [(dtrain, 'train'), (dtest, 'eval')]
#watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bsta = xgb.train(xgb_params, dtrain, 5000, evals=watchlist, verbose_eval=True, early_stopping_rounds=500)
#bst = xgb.train(xgb_params, dtrain, 10)

Will train until eval error hasn't decreased in 500 rounds.
[0]	train-logloss:0.685874	eval-logloss:0.686139
[1]	train-logloss:0.678624	eval-logloss:0.679283
[2]	train-logloss:0.671590	eval-logloss:0.672483
[3]	train-logloss:0.664524	eval-logloss:0.665696
[4]	train-logloss:0.657862	eval-logloss:0.659356
[5]	train-logloss:0.651208	eval-logloss:0.652969
[6]	train-logloss:0.644649	eval-logloss:0.646709
[7]	train-logloss:0.638134	eval-logloss:0.640541
[8]	train-logloss:0.631758	eval-logloss:0.634505
[9]	train-logloss:0.625483	eval-logloss:0.628526
[10]	train-logloss:0.619379	eval-logloss:0.622639
[11]	train-logloss:0.613400	eval-logloss:0.616891
[12]	train-logloss:0.607530	eval-logloss:0.611279
[13]	train-logloss:0.601765	eval-logloss:0.605796
[14]	train-logloss:0.596184	eval-logloss:0.600370
[15]	train-logloss:0.590618	eval-logloss:0.595038
[16]	train-logloss:0.585032	eval-logloss:0.589628
[17]	train-logloss:0.579612	eval-logloss:0.584421
[18]	train-logloss:0.574369	eval-logloss:0.579353


In [135]:
huh = bsta.predict(xgb.DMatrix(valid_ddmat))
huh[huh < 0] = 0
huh[huh > 1] = 1

In [136]:
huh

array([ 0.        ,  0.27075088,  0.5225513 , ...,  0.22783819,
        0.20815966,  0.        ], dtype=float32)

In [137]:
pc = predict_v[7].copy()
print(multiclass_log_loss(valid_tt, pc))
for i in range(0, len(valid_tt)):
    pc[i][31] = huh[i]
#    pc[i][31] = (valid_tt[i] == 31)
#    pc[i][35] = (valid_tt[i] == 35)
#    pc[i][6] = (valid_tt[i] == 6)
    pc[i] /= np.sum(pc[i])
print(multiclass_log_loss(valid_tt, pc))

0.718519640167
0.726872708659


In [139]:
predict_t = bst_a[7].predict(xgb.DMatrix(train_ddmat))

KeyboardInterrupt: 

In [145]:
pickle.dump(predict_v[7], file=open('predictv-718.pkl', 'wb'))

In [142]:
import pickle

In [37]:
pickle.dump(predict, file=open('predict-6813.pkl', 'wb'))

In [149]:
len(train_psc[train_psc.DepartmentDescriptionInt == 0]) / len(train_psc)

0.0005715705361883715

In [150]:
len(train_psc) / len(train)

0.9761203208683289

In [23]:
print(multiclass_log_loss(valid_tt, predictv))


0.727273044324


In [122]:
predictv = bstb.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictv))


0.727273044324


In [121]:
predictvc = bstc.predict(xgb.DMatrix(valid_ddmat))
print(multiclass_log_loss(valid_tt, predictvc))

0.741308078585


In [31]:
print(multiclass_log_loss(valid_tt, (predictvc+predictv)/2))

0.716402033546


In [35]:
len(train.Upc.unique())

96804

In [36]:
len(train.FinelineNumber.unique())

5188

In [37]:
5188*5188

26915344

In [81]:
nfl = len(train.FinelineNumber.unique())
flgrid = np.zeros((38, nfl, nfl))

In [45]:
flgrid.shape

(39, 5188, 5188)

In [78]:
flnum = 0
flremap = {}
for i in np.unique(train.FinelineNumber):
    flremap[i] = flnum
    flnum += 1

In [82]:
for i in range(0, 38):
    subset = train[train.TripType == i]
    print(i, len(subset))
    
    for v in np.unique(subset.VisitNumber):
        subset2 = subset[subset.VisitNumber == v]
        
        for m in range(len(subset2)):
            ma = flremap[subset2.iloc[m].FinelineNumber]
            for n in range(m + 1, len(subset2)):
                na = flremap[subset2.iloc[n].FinelineNumber]
                flgrid[i][ma][na] += 1
#                print(m, ma, na, flgrid[i][ma][na])

0 6665
1 882
2 13463
3 3322
4 22629
5 22306
6 16401
7 2035
8 35
9 7051
10 2936
11 1144
12 3055
13 3903
14 3499
15 316
16 17645
17 26992
18 2436
19 4492
20 2612
21 2072
22 4759
23 1723
24 13461
25 9657
26 4637
27 12128
28 21510
29 38058
30 28929
31 93043
32 169650
33 5423
34 18901
35 6190
36 19785
37 17168


In [83]:
flgrid[0]

array([[  0.,   2.,   0., ...,   0.,   0.,   0.],
       [  3.,  60.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       ..., 
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [58]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [67]:
np.sum(flgrid[32])

0.0

In [88]:
from scipy import sparse

In [93]:
csr[0] = sparse.csr_matrix(flgrid[0])

In [92]:
csr = {}

In [95]:
for i in range(0,38):
    csr[i] = sparse.csr_matrix(flgrid[i])

In [96]:
import pickle

In [97]:
pickle.dump(csr, file=open('/run/shm/csr.pkl', 'wb'))

In [98]:
csr[0]

<5188x5188 sparse matrix of type '<class 'numpy.float64'>'
	with 949 stored elements in Compressed Sparse Row format>

In [99]:
aflgrid = np.zeros((nfl, nfl))
for i in range(0, 38):
    aflgrid += flgrid[i]

array([[  2.51600000e+03,   3.40000000e+01,   3.00000000e+00, ...,
          0.00000000e+00,   9.00000000e+00,   1.00000000e+00],
       [  4.50000000e+01,   1.65000000e+02,   1.20000000e+01, ...,
          1.00000000e+00,   4.00000000e+00,   5.00000000e+00],
       [  6.00000000e+00,   1.00000000e+01,   3.70000000e+01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          8.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   6.00000000e+00,   0.00000000e+00],
       [  1.80000000e+01,   5.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.90000000e+01]])

In [119]:
vr = np.zeros((len(np.unique(valid.VisitNumber)), 38))
c = 0

for v in np.unique(valid.VisitNumber):
    subset = valid[valid.VisitNumber == v]
    if (len(subset) < 2):
        vr[c][32] = 1
    else:
        for m in range(len(subset)):
            try:
                ma = flremap[subset.iloc[m].FinelineNumber]

                for n in range(m + 1, len(subset)):
                    try:
                        na = flremap[subset.iloc[n].FinelineNumber]
                    
                        for g in range(0, 38):
                            vr[c][g] += np.log(flgrid[g][ma][na])
                    except:
                        None
            except:
                None
                
    if (np.sum(vr[c])) > 0:
        vr[c] /= np.sum(vr[c])

    c += 1
    
    if not (c % 100):
        print(c)

        
    
        

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300


In [118]:
np.sum(vr[1])

1.0

In [120]:
print(multiclass_log_loss(valid_tt, vr))


12.5330969423


In [25]:
pkl = []
for k in predict_cb.keys():
    pkl.append(k)
    
for m in range(0, len(pkl)):
    print(pkl[m], multiclass_log_loss(valid_tt, predict_cb[pkl[m]]))
    for n in range(m + 1, len(pkl)):
            v = multiclass_log_loss(valid_tt, (predict_cb[pkl[m]] + predict_cb[pkl[n]] / 2))
            if v < 0.705:
                print(pkl[m], pkl[n], multiclass_log_loss(valid_tt, predict_cb[pkl[n]]), v)

(1.0, 0.2) 0.692082645321
(1.0, 0.2) (0.35, 0.8) 0.698924226577 0.68350884736
(0.35, 0.8) 0.698924226577


In [26]:
pa = predict_cb[(1.0, 0.2)]
pb = predict_cb[(0.35, 0.8)]

In [35]:
for i in np.arange(0.1, 2, 0.1):
    print(i, multiclass_log_loss(valid_tt, ((pa * i) + pb) / (1 + i)))

0.1 0.692901810983
0.2 0.689412218784
0.3 0.687055037598
0.4 0.68539714695
0.5 0.684206496156
0.6 0.683342099048
0.7 0.682712484115
0.8 0.682255633201
0.9 0.68192814764
1.0 0.681698883614
1.1 0.681545030794
1.2 0.681449570532
1.3 0.681399666345
1.4 0.681385474235
1.5 0.681399392577
1.6 0.681435471175
1.7 0.681489026265
1.8 0.681556340126
1.9 0.68163446954


In [36]:
pm = ((pa * 1.4) + pb) / 2.4

In [32]:
import pickle
pickle.dump(pm, file=open('predict-6814.pkl', 'wb'))

In [34]:
pa = predict_cb[(1.0, 0.2)]
pb = predict_cb[(0.35, 0.8)]

test_ddmat, test_visitnum, test_tt = make_ddcomb(test)
predicta = bst_cb[(1.0, 0.2)].predict(xgb.DMatrix(test_ddmat))
predictb = bst_cb[(0.35, 0.8)].predict(xgb.DMatrix(test_ddmat))
predict = ((predicta * 1.4) + predictb) / 2.4
subkeys = {}

for k in sub.loc[0].keys():
    if 'TripType_' in k:
        subkeys[k] = int(k[9:])
        
        if subkeys[k] == 999:
            subkeys[k] = 45
            
sub = sub_orig.copy()

predict_t = predict.transpose()
for k in subkeys.keys():
    pk = triptype_map[subkeys[k]]
    sub[k] = predict_t[pk]
    
with gzip.open('xgb-718.gz', 'wt') as write_file:
    sub.to_csv(write_file, index=False)