# Init

In [1]:
import pandas
import numpy
import logging
from sklearn.datasets import dump_svmlight_file

# Extract features from spatial and time models

In [2]:
#with open('data/gmms.pickle', 'wb') as handle:
#  gmms = pickle.load(handle)

'''def get_spatial_likelihood(place_index, x, y):
    
    gmm = gmms[place_index]
    if gmm:
        #print('Hit!')
        return gmm.score([x,y])
    
    #print('gaussian not found')
    return 0'''

def get_spatial_likelihood(place_index, distance, d=50, e=4):
    """Score a candidate place by its distance from the check-in.

    Implements the distance decay ``(d / (distance + d)) ** e`` taken from
    "Learning to Rank for Spatiotemporal Search" (Blake Shaw et al.).

    Parameters
    ----------
    place_index
        Identifier of the candidate place. Not used by the formula; kept so
        the signature stays compatible with existing callers.
    distance
        Distance between the check-in and the candidate place.
    d
        Scale constant of the decay (default 50, the original hard-coded value).
    e
        Exponent of the decay (default 4, the original hard-coded value).

    Returns
    -------
    The decay value; it equals 1 when ``distance`` is 0 and, for positive
    ``d`` and ``e``, shrinks as a non-negative ``distance`` grows.
    """
    return (d / (distance + d)) ** e

def get_spatial_likelihoods(place_indexes, distances, accuracy):
    """Compute the spatial likelihood of every candidate place of one check-in.

    ``distances[k]`` is paired with the k-th entry of ``place_indexes``.
    ``accuracy`` is accepted for interface compatibility but is not used.

    Returns a list with one likelihood per entry of ``place_indexes``.
    """
    likelihoods = []
    for position, place_index in enumerate(place_indexes):
        likelihoods.append(get_spatial_likelihood(place_index, distances[position]))
    return likelihoods
    
def get_checkin_spatial_features(checkin, place_indexes, distances):
    """Build one feature record per candidate place of a single check-in.

    Parameters
    ----------
    checkin
        Row-like object exposing ``row_id`` and ``accuracy`` attributes.
    place_indexes
        Array of candidate place indexes (must expose ``.size``).
    distances
        Distances to the candidates, aligned position-by-position with
        ``place_indexes``.

    Returns
    -------
    list of dict
        One dict per candidate with the keys ``checkin_id``, ``place_index``,
        ``distance`` and ``spatial_likelihood``. The list is empty when there
        are no candidates.
    """
    # Always return a list: the caller feeds the result to list.extend(),
    # which raises TypeError on None.
    if place_indexes.size == 0:
        return []

    spatial_likelihoods = get_spatial_likelihoods(place_indexes, distances, checkin.accuracy)

    return [
        {
            'checkin_id': checkin.row_id,
            'place_index': place_index,
            'distance': distances[i],
            'spatial_likelihood': spatial_likelihoods[i],
        }
        for i, place_index in enumerate(place_indexes)
    ]
    
def get_spatial_features(checkins, nearest_places, distances):
    """Collect the spatial feature records of every check-in into one frame.

    Parameters
    ----------
    checkins
        DataFrame of check-ins; each row must provide ``row_id`` and
        ``accuracy``.
    nearest_places
        Indexable by a check-in's DataFrame index label; yields the array of
        candidate place indexes for that check-in.
    distances
        Indexable the same way; yields the matching distances.

    Returns
    -------
    pandas.DataFrame
        One row per (check-in, candidate place) pair. Empty when no check-in
        produced any record.
    """
    features = []

    # itertuples() keeps each column's dtype. iterrows() converts the row to
    # a single-dtype Series, which turns integer row_id values into floats,
    # and is considerably slower.
    for checkin in checkins.itertuples(index=True):
        index = checkin.Index
        checkin_features = get_checkin_spatial_features(checkin, nearest_places[index], distances[index])
        # Tolerate a helper that reports "no candidates" as None or [].
        if checkin_features:
            features.extend(checkin_features)

    return pandas.DataFrame(features)
    
#%prun -s cumulative predict()

# Split into training, validation and test sets

In [3]:
def split_training_set(train, train_test_ratio = 8.0):
    """Split a frame chronologically into train, validation and test parts.

    With ``r = train_test_ratio`` and ``T = train.time.max()``, the cut points
    are ``(r - 1) / (r + 1) * T`` and ``r / (r + 1) * T``. Rows up to the first
    cut form the training part, rows between the cuts the validation part, and
    later rows the test part. Validation and test therefore cover equally long
    intervals. The interval lengths are printed in days, treating ``time`` as
    minutes.

    Returns a tuple ``(train_small, val_small, test_small)`` of independent
    copies.
    """
    minutes_per_day = 60 * 24

    latest = train.time.max()
    # First take the train+validation span, then carve the validation
    # interval (same length as the test interval) out of it.
    train_time = train_test_ratio * latest / (train_test_ratio + 1)
    val_time = latest - train_time
    test_time = val_time
    train_time = train_time - test_time

    print('Train interval:', train_time / minutes_per_day, 'days')
    print('Validation interval:', val_time / minutes_per_day, 'days')
    print('Test interval:', test_time / minutes_per_day, 'days')

    val_end = train_time + val_time
    in_train = train.time <= train_time
    in_val = (train.time > train_time) & (train.time <= val_end)
    in_test = train.time > val_end

    return (train[in_train].copy(), train[in_val].copy(), train[in_test].copy())

In [5]:
# Load the place model. Both frames are read as module-level globals by
# get_features(): `places` supplies place_id / accuracy / checkins per place,
# `places_hour` supplies time_likelihood per (place_id, weekhour).
places = pandas.read_csv('data/places.csv')
# Optional filter on the per-place check-in count, currently disabled:
#places = places[places.checkins > 90]
places_hour = pandas.read_csv('data/places_hour.csv')

In [11]:
# Calculate relevance for each document: 1 if correct place, 0 otherwise
def relevance(a):
    """Return 1 when the first two entries of ``a`` are equal, otherwise 0."""
    return 1 if a[0] == a[1] else 0

def get_features(checkins, name, file_index, n_neighbors = 5):
    """Assemble the per-(check-in, candidate place) feature table.

    Steps:
      1. load the precomputed nearest-place indexes and distances from disk,
      2. expand every check-in into one row per candidate place with its
         spatial features,
      3. join the place attributes from the global ``places`` frame,
      4. join the hourly likelihood from the global ``places_hour`` frame,
      5. when the check-ins are labeled, add a 0/1 ``relevance`` column.

    Parameters
    ----------
    checkins
        DataFrame of check-ins; ``row_id`` and ``weekhour`` are used as merge
        keys. A ``place_id`` column marks the data as labeled.
    name, file_index, n_neighbors
        Used only to build the names of the ``.npy`` files to load.

    Returns
    -------
    pandas.DataFrame
        The merged feature table, sorted by ``row_id``. Missing values are
        replaced by 0.
    """
    print(checkins.head())
    print(checkins.shape)

    print('Loading nearest neighbors...')

    # NOTE(review): the 'neareast' spelling presumably matches the files
    # written by the neighbor-search step; verify before renaming.
    nearest = numpy.load('data/neareast_{0}_{1}_{2}.npy'.format(n_neighbors, name, file_index))
    distances = numpy.load('data/distances_{0}_{1}_{2}.npy'.format(n_neighbors, name, file_index))

    print(nearest.shape)

    print('Getting query spatial features...')

    spatial_features = get_spatial_features(checkins, nearest, distances)

    print(spatial_features.head())
    print("Merging spatial features...")

    # Merge spatial features: one output row per (check-in, candidate place).
    checkins = pandas.merge(checkins, spatial_features, left_on='row_id',
                            right_on='checkin_id', suffixes=('_checkin', '_feature'), sort=False)
    checkins = checkins.drop('checkin_id', axis=1)

    print(checkins.head())
    print(checkins.shape)
    print("Merging place spatial features...")

    # Merge place features. place_index is matched against the index of
    # `places`; overlapping columns get the _checkin / _place suffixes.
    checkins = pandas.merge(checkins, places[['place_id', 'accuracy', 'checkins']], left_on='place_index',
                            right_index=True, suffixes=('_checkin', '_place'), sort=False)

    print(checkins.head())
    print(checkins.shape)
    print("Merging time features...")

    # Merge time features. Labeled data carries the candidate's id in
    # 'place_id_place' (the check-in's own place_id was suffixed above);
    # unlabeled data carries it in plain 'place_id'.
    if 'place_id_place' in checkins.columns:
        place_key = 'place_id_place'
    elif 'place_id' in checkins.columns:
        place_key = 'place_id'
    else:
        place_key = None

    if place_key is not None:
        checkins = pandas.merge(checkins, places_hour[['place_id', 'weekhour', 'time_likelihood']], how='left',
                                left_on=[place_key, 'weekhour'], right_on=['place_id', 'weekhour'],
                                suffixes=('_checkin', '_place_hour'), sort=False)

    # (place, weekhour) pairs without an entry in places_hour get likelihood 0.
    checkins = checkins.fillna(0)

    print(checkins.head())
    print(checkins.shape)

    if 'place_id_checkin' in checkins.columns:
        print("Calculating relevance...")
        # Relevance of each candidate: 1 if it is the true place, 0 otherwise.
        # Compared column-wise rather than with a row-wise apply(), which is
        # slow and relied on integer keys acting as positions on a
        # label-indexed Series (deprecated in pandas).
        checkins['relevance'] = (checkins['place_id_checkin'] == checkins['place_id_place']).astype(int)

    return checkins.sort_values('row_id')

def save_features(checkins, name, file_index, n_neighbors = 5):
    """Compute the feature table for ``checkins`` and write it to disk.

    The full table is written as CSV. Labeled input (a ``place_id`` column is
    present) is additionally split chronologically and dumped as three
    SVMLight files (training, validation, test) with ``relevance`` as target.
    Unlabeled input is dumped as a single SVMLight file with all-zero
    targets. Every SVMLight file uses ``row_id`` as the query id.
    """
    features = get_features(checkins, name, file_index, n_neighbors)
    features.to_csv('data/features_{0}_{1}.csv'.format(name, file_index), index=False)

    print(features.head())

    has_labels = 'place_id' in checkins.columns

    if has_labels:
        print("Splitting  training set...")
        train, val, test = split_training_set(features)

    col_names = ['accuracy_checkin', 'weekhour', 'day', 'spatial_likelihood', 'time_likelihood', 'accuracy_place',
                 'checkins', 'distance']

    print("Dumping SVMLight files...")

    # Generate the SVMLight format file(s)
    if not has_labels:
        dump_svmlight_file(features[col_names].values, [0] * features.shape[0],
                           'data/svmlight_unlabeled_{0}.txt'.format(file_index),
                           query_id=features.row_id.values)
        return

    outputs = (
        (train, 'data/svmlight_training_{0}.txt'),
        (val, 'data/svmlight_validation_{0}.txt'),
        (test, 'data/svmlight_test_{0}.txt'),
    )
    for subset, path_template in outputs:
        dump_svmlight_file(subset[col_names].values, subset['relevance'].values,
                           path_template.format(file_index),
                           query_id=subset.row_id.values)

In [47]:
# Build and save the features for each of the 16 training files.
for i in range(16):
    print('Loading trainng data...')
    train_path = 'data/train_{0}.csv'.format(i)
    # Only the first 10000 rows of each file are processed here.
    checkins = pandas.read_csv(train_path).head(10000)
    save_features(checkins, 'train', i)

Loading trainng data...
   row_id       x       y  accuracy    time    place_id  weekhour  day
0      12  0.8829  1.3445        64  574488  7652380351        44    3
1      39  1.2191  1.3462       743  477469  6171384989        65   28
2      54  1.9264  0.2691        64  220833  4228252677        16    3
3     109  0.4995  1.4831       155  769344  9841775341        30   19
4     112  2.2360  1.3655        66  623174  7663031065        18    9
(10000, 8)
Loading nearest neighbors...
(1767637, 6)
Getting query spatial features...
   checkin_id  distance  place_index  spatial_likelihood
0        12.0  0.005100        53411            0.999592
1        12.0  0.005842       107947            0.999533
2        12.0  0.008224        51094            0.999342
3        12.0  0.010704        25762            0.999144
4        12.0  0.025281        69546            0.997980
Merging spatial features...
   row_id       x       y  accuracy    time    place_id  weekhour  day  \
0      12  0.8829  

In [12]:
# Build and save the features for each of the 16 test files (all rows).
for i in range(16):
    print('Loading test data...')
    test_path = 'data/test_{0}.csv'.format(i)
    checkins = pandas.read_csv(test_path)
    save_features(checkins, 'test', i)

Loading test data...
   row_id       x       y  accuracy    time  weekhour  day
0       0  0.1675  1.3608       107  930883        50    9
1       3  0.9990  1.0591        62  907285         3   23
2       5  0.1771  0.0022       161  814077         7   20
3      39  1.4303  0.2069        23  862332       120   22
4      41  0.2945  0.6054        57  972213        18    7
(535822, 7)
Loading nearest neighbors...
(535822, 5)
Getting query spatial features...
   checkin_id  distance  place_index  spatial_likelihood
0         0.0  0.019959        51776            0.998405
1         0.0  0.031605        34258            0.997476
2         0.0  0.043062        17096            0.996562
3         0.0  0.046956        82027            0.996252
4         0.0  0.048588        41072            0.996122
Merging spatial features...
   row_id       x       y  accuracy    time  weekhour  day  distance  \
0       0  0.1675  1.3608       107  930883        50    9  0.019959   
1       0  0.1675  1.360

In [None]:
'''for i in range(1):
    
    print('Loading trainng data...')
    
    test = pandas.read_csv('data/test_{0}.csv'.format(i))
    
    print(test.head())
    
    features = get_features(test, 'test')
    features.to_csv('data/features_{0}_{1}.csv'.format('train', i), index=False)
    
    print(train.head())
    print("Splitting  training set...")
    
    train, val, test = split_training_set(train)
    
    col_names = ['accuracy_checkin', 'weekhour', 'day', 'spatial_likelihood', 'time_likelihood', 'accuracy_place',
            'checkins', 'distance']

    print("Dumping SVMLight files...")
    
    # Generate the SVMLight format file
    dump_svmlight_file(train[col_names].values, train['relevance'].values, 'data/svmlight_training_{0}.txt'.format(i),
                       query_id=train.row_id.values)
    dump_svmlight_file(val[col_names].values, val['relevance'].values, 'data/svmlight_validation_{0}.txt'.format(i),
                       query_id=val.row_id.values)
    dump_svmlight_file(test[col_names].values, test['relevance'].values, 'data/svmlight_test_{0}.txt'.format(i),
                       query_id=test.row_id.values)'''

In [9]:
# Scratch check of list repetition: evaluates to a list of ten zeros.
[0] * 10

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]