# Init

In [1]:
import pandas
from rankpy.queries import Queries
from rankpy.queries import find_constant_features
from rankpy.models import LambdaMART

In [32]:
for i in range(16):
    
    training_queries = Queries.load_from_text('data/svmlight_training_{0}.txt'.format(i))
    validation_queries = Queries.load_from_text('data/svmlight_validation_{0}.txt'.format(i))
    test_queries = Queries.load_from_text('data/svmlight_test_{0}.txt'.format(i))

    # Print basic info about query datasets.
    print('Train queries: %s' % training_queries)
    print('Valid queries: %s' % validation_queries)
    print('Test queries: %s' % test_queries)
    
    # Set this to True in order to remove queries containing all documents
    # of the same relevance score -- these are useless for LambdaMART.
    remove_useless_queries = True

    # Find constant query-document features.
    cfs = find_constant_features([training_queries, validation_queries, test_queries])

    # Get rid of constant features and (possibly) remove useless queries.
    training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
    validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
    test_queries.adjust(remove_features=cfs)

    # Print basic info about query datasets.
    print('Train queries: %s' % training_queries)
    print('Valid queries: %s' % validation_queries)
    print('Test queries: %s' % test_queries)
    
    model = LambdaMART(metric='nDCG@3', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=50, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)

    model.fit(training_queries, validation_queries=validation_queries)
    
    print('%s on the test queries: %.8f'
             % (model.metric, model.evaluate(test_queries, n_jobs=1)))

    model.save('models/lambdamart_{0}_{1}'.format(model.metric, i))

Train queries: Queries (7375 queries, 44250 documents, 8 features, 1 max. relevance)
Valid queries: Queries (1331 queries, 7986 documents, 8 features, 1 max. relevance)
Test queries: Queries (1294 queries, 7764 documents, 8 features, 1 max. relevance)
Train queries: Queries (4240 queries, 25440 documents, 8 features, 1 max. relevance)
Valid queries: Queries (698 queries, 4188 documents, 8 features, 1 max. relevance)
Test queries: Queries (1294 queries, 7764 documents, 8 features, 1 max. relevance)
nDCG@3 on the test queries: 0.42834832
Train queries: Queries (7339 queries, 44034 documents, 8 features, 1 max. relevance)
Valid queries: Queries (1287 queries, 7722 documents, 8 features, 1 max. relevance)
Test queries: Queries (1374 queries, 8244 documents, 8 features, 1 max. relevance)
Train queries: Queries (4149 queries, 24894 documents, 8 features, 1 max. relevance)
Valid queries: Queries (658 queries, 3948 documents, 8 features, 1 max. relevance)
Test queries: Queries (1374 queries, 8

In [8]:
prediction = model.predict(test_queries, compact=False)

In [98]:
def predict(model, features, queries):
    
    pred = []
    
    prediction = model.predict(queries, compact=True, n_jobs=-1)
    
    print('Getting place ids...')
    
    place_ids = features['place_id'].values
    
    for index, qid in enumerate(queries.query_ids):
        
        query_range = range(queries.query_indptr[index], queries.query_indptr[index+1])
        # get 3 highest prediction scores for each query
        docs = prediction[query_range].argsort()[-3:][::-1]
        query_place_ids = place_ids[query_range][docs]
        query_place_ids = map(str, query_place_ids)
        query_place_ids = ' '.join(query_place_ids)
        pred.append([qid, query_place_ids])
        
        '''docs = prediction[index].argsort()[-3:][::-1]
        place_ids = features[features.row_id == qid]['place_id'].iloc[docs].apply(str).str.cat(sep=' ')
        pred.append([qid, place_ids])'''
        
    return pred

In [99]:
predictions = []

for i in range(16):
    
    print('Loading features {0}...'.format(i))
    features = pandas.read_csv('data/features_test_{0}.csv'.format(i))
    features.sort_values('row_id', inplace=True)

    print('Loading queries {0}...'.format(i))
    queries = Queries.load_from_text('data/svmlight_unlabeled_{0}.txt'.format(i))
    #queries = Queries.load_from_text('data/svmlight_unlabeled_{0}.txt'.format('test'))
    print('Queries: %s' % queries)
    
    print('Loading model {0}...'.format(i))
    model = LambdaMART.load('models/lambdamart_{0}_{1}'.format('nDCG@3', i))
    
    print('Prediction {0}...'.format(i))
    #prediction = model.predict(queries, compact=False)
    predictions.extend(predict(model, features, queries))
    
print('Generating submission...')
submission = pandas.DataFrame(predictions, columns=['row_id', 'place_id'])
submission.sort_values('row_id', inplace=True)

submission.to_csv('submission_lambdamart.gz', index=False, compression='gzip')

Loading features 0...
Loading queries 0...
Queries: Queries (535822 queries, 2679110 documents, 8 features, 0 max. relevance)
Loading model 0...
Prediction 0...
Getting place ids...
Loading features 1...
Loading queries 1...
Queries: Queries (537340 queries, 2686700 documents, 8 features, 0 max. relevance)
Loading model 1...
Prediction 1...
Getting place ids...
Loading features 2...
Loading queries 2...
Queries: Queries (544336 queries, 2721680 documents, 8 features, 0 max. relevance)
Loading model 2...
Prediction 2...
Getting place ids...
Loading features 3...
Loading queries 3...
Queries: Queries (531828 queries, 2659140 documents, 8 features, 0 max. relevance)
Loading model 3...
Prediction 3...
Getting place ids...
Loading features 4...
Loading queries 4...
Queries: Queries (529968 queries, 2649840 documents, 8 features, 0 max. relevance)
Loading model 4...
Prediction 4...
Getting place ids...
Loading features 5...
Loading queries 5...
Queries: Queries (544254 queries, 2721270 docum

In [100]:
print("Submission size ok?", submission.shape[0] == 8607230)

('Submission size ok?', True)


# Evaluation

In [109]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        #print(i, p, actual, predicted)
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

def format_prediction(x):
    if isinstance(x, str):
        return x.split()
    return []

score = mapk(val['place_id'].map(lambda x : str(x)), val['predicted'].map(lambda x: format_prediction(x)), k=3)
score

0.1347844674159942

# Submission file

In [111]:
#predictions = pd.concat([val['row_id'], val['predicted']], axis=1, keys=['row_id', 'place_id'])
#print("Prediction size ok?", predictions.shape[0] == 8607230)
#predictions[['row_id', 'place_id']].to_csv('submission.gz', index=False, compression='gzip')
test[['row_id','predicted']].to_csv('submission.gz', index=False, compression='gzip')

Prediction size ok? False


# 1-NN Benchmark

In [None]:
#train = pd.read_csv('train.csv')
#test = pd.read_csv('test.csv')

tree = KDTree(train[['x', 'y', 'weekhour', 'day']])
_, ind = tree.query(test[['x','y', 'weekhour', 'day']], k=1)
ind1 = [x[0] for x in ind]
test['place_id'] = train.iloc[ind1].place_id.values
test[['row_id', 'place_id']].to_csv('submission.gz', index=False, compression='gzip')

In [45]:
_, ind = place_tree.query(test[['x','y']], k=1)
ind1 = [x[0] for x in ind]
test['place_id'] = agg_place.iloc[ind1].place_id.values
test[['row_id', 'place_id']].to_csv('submission_meanxy.gz', index=False, compression='gzip')

In [76]:
test.iloc[0] #['place_id'] = 'test'

row_id            0.0000
x                 0.1675
y                 1.3608
accuracy        107.0000
time         930883.0000
weekhour         50.0000
dayofyear       282.0000
day               9.0000
Name: 0, dtype: float64

In [14]:
test['place_id'] = pd.Series(predictions)

In [16]:
test[['row_id', 'place_id']].to_csv('submission_spatial_multivariate.gz', index=False, compression='gzip')

In [56]:
places.iloc[0].row_id.count

<bound method Series.count of count    73.0
Name: 0, dtype: float64>