In [1]:
import pyltr
import numpy as np
import sklearn.tree
import random
import math
import os

In [2]:
NUM_TREES = 10
DATA_FOLDER = 'data/'
EXPERIMENTS = ['long-tail','noisy','query-length/c1','query-length/c2','query-length/c3','']
NUM_EXPERIMENTS = 5
MODES = ['train','val','test']



In [49]:
file_dict = {
    'long-tail' :{
        'train': 'train_longtail_suggest_matrix.npz',
        'val': 'val_longtail_suggest_matrix.npz',
        'test': 'test_longtail_suggest_matrix.npz',
    },
    'noisy' :{
        'train': 'tr_noisy_suggest_matrix.npz',
        'val': 'val_noisy_suggest_matrix.npz',
        'test': 'test_noisy_suggest_matrix.npz',
    },
    'query-length/c1' : {
        'train': 'tr_c1_suggest_matrix.npz',
        'val': 'val_c1_suggest_matrix.npz',
        'test': 'test_c1_suggest_matrix.npz',
    },
    'query-length/c2' : {
        'train': 'tr_c2_suggest_matrix.npz',
        'val': 'val_c2_suggest_matrix.npz',
        'test': 'test_c2_suggest_matrix.npz',
    },
    'query-length/c3': {
        'train': 'tr_c3_suggest_matrix.npz',
        'val': 'val_c3_suggest_matrix.npz',
        'test': 'test_c3_suggest_matrix.npz',
    },
    '' : {
        'train': 'train_suggest_matrix.npz',
        'val': 'val_suggest_matrix.npz',
        'test': 'test_suggest_matrix.npz',
    }
}

#col0=anchor ID, col1=suggestion ID. col 2-19 features, col 19 is scores
def make_sessions_ids(anchor_ids):
    
    sessions_ids = np.zeros([len(anchor_ids)])
    iters = len(anchor_ids) /20
    for i in range(iters):
        sessions_ids[i*20:(i+1)*20] = i
    return sessions_ids

def make_data(data_folder, experiment, file_dict,mode ,feature_idx=20):
            
        path = os.path.join(data_folder, experiment, file_dict[experiment][mode] )
        
        print path
        data_train = np.load(path)['arr_0']
       
        data_anchor_ids = data_train[:,0]
        data_suggestion_ids = data_train[:,1]
        data_features = data_train[:,3:feature_idx]
        data_label_scores = data_train[:,20]
        data_session_ids = make_sessions_ids(data_anchor_ids)
        
        return data_anchor_ids, data_suggestion_ids, data_features, data_label_scores, data_session_ids
    
def MRR_Score(Epred, test_data_session_ids, test_data_label_scores):
    sessions = len(test_data_session_ids)/20
    idx_array = []
    for session_idx in range(1, sessions):
        predicted_scores = Epred[session_idx*20:(session_idx +1)*20]
        
        labels = test_data_label_scores[session_idx*20:(session_idx +1)*20]

        merged = np.array([predicted_scores, labels]).T
        

        merged = merged[merged[:, 0].argsort()]

        pred_idx = np.argmax(merged[:,1][::-1]) 
        idx_array.append(pred_idx)

    
    idx_array = np.histogram(idx_array)[0]
    
    sum_score = 0
    for i,val in enumerate(idx_array):
        sum_score += (1./(i+1)) * val
    return sum_score / sessions




In [4]:
"""
Some small demo's
metric = pyltr.metrics.NDCG(k=20)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=n_trees,
    learning_rate=1./n_trees,
    max_features=0.3,
    query_subsample=(1./n_trees),
    max_leaf_nodes=10,
    min_samples_leaf=200,
    verbose=1,
    random_state=None
)           

monitor = pyltr.models.monitors.ValidationMonitor(
    val_data_features, val_data_label_scores, val_data_session_ids, metric=metric, stop_after=5)
    
    model.fit(train_data_features, train_data_label_scores, train_data_session_ids)
    
    Epred = model.predict(test_data_features)

print 'Random ranking:', metric.calc_mean_random(val_data_session_ids, val_data_label_scores)
print 'Our model:', metric.calc_mean(test_data_session_ids, test_data_label_scores, Epred)

print MMR_Score(Epred, test_data_session_ids, test_data_label_scores)

"""



"\nSome small demo's\nmetric = pyltr.metrics.NDCG(k=20)\n\nmodel = pyltr.models.LambdaMART(\n    metric=metric,\n    n_estimators=n_trees,\n    learning_rate=1./n_trees,\n    max_features=0.3,\n    query_subsample=(1./n_trees),\n    max_leaf_nodes=10,\n    min_samples_leaf=200,\n    verbose=1,\n    random_state=None\n)           \n\nmonitor = pyltr.models.monitors.ValidationMonitor(\n    val_data_features, val_data_label_scores, val_data_session_ids, metric=metric, stop_after=5)\n    \n    model.fit(train_data_features, train_data_label_scores, train_data_session_ids)\n    \n    Epred = model.predict(test_data_features)\n\nprint 'Random ranking:', metric.calc_mean_random(val_data_session_ids, val_data_label_scores)\nprint 'Our model:', metric.calc_mean(test_data_session_ids, test_data_label_scores, Epred)\n\nprint MMR_Score(Epred, test_data_session_ids, test_data_label_scores)\n\n"

In [5]:
metric = pyltr.metrics.NDCG(k=20)
result_dict = {
    'long-tail' :[],
    'noisy' :[],
    'query-length/c1' : [],
    'query-length/c2' : [],
    'query-length/c3': [],
    '' : []
}

for experiment in EXPERIMENTS:
    for epoch in range(NUM_EXPERIMENTS):
        print "--- new experiment ---" 
        train_data_anchor_ids, train_data_suggestion_ids, train_data_features, \
        train_data_label_scores, train_data_session_ids =  make_data(DATA_FOLDER, experiment, file_dict,'train' ,feature_idx=20)
        val_data_anchor_ids, val_data_suggestion_ids, val_data_features, \
        val_data_label_scores, val_data_session_ids =  make_data(DATA_FOLDER, experiment, file_dict,'val' ,feature_idx=20) 

        test_data_anchor_ids, test_data_suggestion_ids, test_data_features, \
        test_data_label_scores, test_data_session_ids =  make_data(DATA_FOLDER, experiment, file_dict,'test' ,feature_idx=20) 
        
        model = pyltr.models.LambdaMART(
            metric=metric,
            n_estimators=NUM_TREES,
            learning_rate=1./NUM_TREES,
            max_features=0.3,
            query_subsample=(1./NUM_TREES),
            max_leaf_nodes=10,
            min_samples_leaf=200,
            verbose=1,
            random_state=None
        )
        
        model.fit(train_data_features, train_data_label_scores, train_data_session_ids)
        
        Epred = model.predict(test_data_features)
        
        random_ranking = metric.calc_mean_random(val_data_session_ids, val_data_label_scores)
        model_ndcg = metric.calc_mean(test_data_session_ids, test_data_label_scores, Epred)
        mrr_score = MRR_Score(Epred, test_data_session_ids, test_data_label_scores)
        print "Experiment: ", epoch + 1
        print "MMR Score for " + experiment + " : ", mrr_score
        result_dict[experiment].append(mrr_score)
    print "__________________________________________________"

--- new experiment ---
 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.4195       0.1957        6.80s                                         
    2       0.5374       0.1142        5.51s                                         
    3       0.5865       0.0422        4.89s                                         
    4       0.5953       0.0195        4.19s                                         
    5       0.6475       0.0457        3.48s                                         
    6       0.6725       0.0018        2.81s                                         
    7       0.6376       0.0036        2.08s                                         
    8       0.6561       0.0031        1.37s                                         
    9       0.6770       0.0042        0.68s                                         
   10       0.6667       0.0005        0.00s                                         
Experiment:  1
MMR Score for l

KeyboardInterrupt: 

In [53]:
"""
ADJ model implementation
"""





val_data_anchor_ids, val_data_suggestion_ids, val_data_features, \
val_data_label_scores, val_data_session_ids =  make_data(DATA_FOLDER, '', file_dict,'val' ,feature_idx=20) 

test_data_anchor_ids, test_data_suggestion_ids, test_data_features, \
test_data_label_scores, test_data_session_ids =  make_data(DATA_FOLDER, 'query-length/c3', file_dict,'val' ,feature_idx=20)

print val_data_label_scores.shape
print test_data_label_scores.shape


data/val_suggest_matrix.npz
data/query-length/c3/val_c3_suggest_matrix.npz
(226600,)
(226600,)


In [54]:
def MMR_Score(Epred, test_data_session_ids, test_data_label_scores):
    sessions = len(test_data_session_ids)/20
    idx_array = []
    for session_idx in range(200, sessions):
        predicted_scores = Epred[session_idx*20:(session_idx +1)*20]
        
        labels = test_data_label_scores[session_idx*20:(session_idx +1)*20]
        
        merged = np.array([predicted_scores, labels]).T
        
        print merged
        merged = merged[merged[:, 0].argsort()]

        pred_idx = np.argmax(merged[:,1][::-1]) 
        idx_array.append(pred_idx)

        break 
    idx_array = np.histogram(idx_array)[0]
    
    sum_score = 0
    for i,val in enumerate(idx_array):
        sum_score += (1./(i+1)) * val
    return sum_score / sessions


print MMR_Score(val_data_features[:,0], val_data_session_ids,  val_data_label_scores)
print MMR_Score(test_data_features[:,0], test_data_session_ids,  test_data_label_scores)

[[ 34.   0.]
 [ 14.   0.]
 [ 13.   1.]
 [  6.   0.]
 [  4.   0.]
 [  3.   0.]
 [  3.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  1.   0.]]
1.4710208885e-05
[[ 34.   0.]
 [ 14.   0.]
 [ 13.   1.]
 [  6.   0.]
 [  4.   0.]
 [  3.   0.]
 [  3.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  2.   0.]
 [  1.   0.]]
1.4710208885e-05


In [17]:
0.5243168286960164

0.5243168286960164