In [1]:
import pyltr
import numpy as np
import sklearn.tree
import random
import math

In [2]:
data_archive_val = np.load('data/val_suggest_matrix.npz')
data_archive_test = np.load('data/test_suggest_matrix.npz')
data_archive_train = np.load('data/train_suggest_matrix.npz')

data_val = data_archive_val['arr_0']
data_test = data_archive_test['arr_0']
data_train = data_archive_train['arr_0']

In [3]:
#col0=anchor ID, col1=suggestion ID. col 2-19 features, col 19 is scores
def make_sessions_ids(anchor_ids):
    
    sessions_ids = np.zeros([len(anchor_ids)])
    iters = len(anchor_ids) /20
    for i in range(iters):
        sessions_ids[i*20:(i+1)*20] = i
    return sessions_ids

feature_idx = 20
train_data_anchor_ids = data_train[:,0]
train_data_suggestion_ids = data_train[:,1]
train_data_features = data_train[:,3:feature_idx]
train_data_label_scores = data_train[:,20]
train_data_session_ids = make_sessions_ids(train_data_anchor_ids)



val_data_anchor_ids = data_val[:,0]
val_data_suggestion_ids = data_val[:,1]
val_data_features = data_val[:,3:feature_idx]
val_data_label_scores = data_val[:,20]
val_data_session_ids = make_sessions_ids(val_data_anchor_ids)


test_data_anchor_ids = data_test[:,0]
test_data_suggestion_ids = data_test[:,1]
test_data_features = data_test[:,3:feature_idx]
test_data_label_scores = data_test[:,20]
test_data_session_ids = make_sessions_ids(test_data_anchor_ids)




In [4]:
metric = pyltr.metrics.NDCG(k=20)

In [5]:
print test_data_features.shape

(146260, 17)


In [6]:
n_trees = 100
model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=n_trees,
    learning_rate=1./n_trees,
    max_features=0.3,
    query_subsample=(1./n_trees),
    max_leaf_nodes=10,
    min_samples_leaf=200,
    verbose=1,
    random_state=None
)           

monitor = pyltr.models.monitors.ValidationMonitor(
    val_data_features, val_data_label_scores, val_data_session_ids, metric=metric, stop_after=5)

In [7]:
model.fit(train_data_features, train_data_label_scores, train_data_session_ids)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.3803       0.1211        7.14m                                         
    2       0.5027       0.1686        6.37m                                         
    3       0.5070       0.0251        6.05m                                         
    4       0.5890       0.0085        5.88m                                         
    5       0.5987       0.0147        5.74m                                         
    6       0.6071       0.0040        5.64m                                         
    7       0.5936       0.0087        5.55m                                         
    8       0.6142       0.0031        5.46m                                         
    9       0.5906       0.0021        5.39m                                         
   10       0.5884      -0.0005        5.32m                                         
   15       0.5947      -0.0027        4.99m         

<pyltr.models.lambdamart.LambdaMART at 0x10b25b090>

In [8]:
print test_data_features.shape
Epred = model.predict(test_data_features)

print Epred.shape

print 'Random ranking:', metric.calc_mean_random(val_data_session_ids, val_data_label_scores)
print 'Our model:', metric.calc_mean(test_data_session_ids, test_data_label_scores, Epred)

(146260, 17)
(146260,)
Random ranking: 0.352013419096
Our model: 0.586834316622


In [9]:
same = 0
not_same = 0
for i in range ( len(data_train)/20):
    if np.argmax(data_train[i*20:(i+1)*20,2]) == np.argmax(data_train[i*20:(i+1)*20,20]):
        same += 1
    else:
        not_same += 1

print same / float(not_same + same)
 

0.05785077951


In [10]:
def MMR_Score(Epred, test_data_session_ids, test_data_label_scores):
    sessions = len(test_data_session_ids)/20
    sum_score = 0
    for session_idx in range(0, sessions):
        pred_idx = np.argmax(Epred[session_idx*20:(session_idx +1)*20])
        sum_score += 1. /( pred_idx + 1)
    return sum_score/ sessions


print MMR_Score(Epred, test_data_session_ids, test_data_label_scores)

0.693137640259
