In [1]:
import os

import numpy as np
import pandas as pd

# Initial setup

In [2]:
# Sequence lengths (number of consecutive points per feature) to evaluate
sequences = [1]

# Directory holding the '<dataset>_train.csv' / '<dataset>_test.csv' files
folder = './datasets/'

# Dataset names; every entry must also be a key of `dd` below
datasets = ['generic_nyc_poi-hour']

# Indicates the column from each dataset to be analysed
dd = {'generic_nyc_poi-hour': 'poi-hour'}

In [None]:
# Example with multiple datasets
# datasets = ['d_1', 'd_2']
# dd = {'d_1': 'c_1', 'd_2': 'c_2'}

In [5]:
def to_file(core_name, x_train, x_test, y_train, y_test):
    """Write the train/test feature matrices and label vectors to CSV files.

    Four files are produced, all sharing the `core_name` prefix:
    '<core_name>-x_train.csv' and '<core_name>-x_test.csv' (no header row,
    no index) and '<core_name>-y_train.csv' and '<core_name>-y_test.csv'
    (single 'label' column, no index).

    Parameters
    ----------
    core_name : str
        Path prefix of the output files; its parent directory is created
        when it does not exist yet.
    x_train, x_test : array-like
        Feature matrices, one row per trajectory.
    y_train, y_test : array-like
        Labels, one per trajectory.
    """
    # Create the target directory up front so a fresh checkout without an
    # 'outputs/' folder does not fail on the first write.
    out_dir = os.path.dirname(core_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pd.DataFrame(x_train).to_csv(core_name + '-x_train.csv', index=False, header=False)
    pd.DataFrame(x_test).to_csv(core_name + '-x_test.csv', index=False, header=False)
    pd.DataFrame(y_train, columns=['label']).to_csv(core_name + '-y_train.csv', index=False)
    pd.DataFrame(y_test, columns=['label']).to_csv(core_name + '-y_test.csv', index=False)

# POI-F: POI Frequency

In [6]:
def poi(df_train, df_test, possible_sequences, seq2idx):
    """Build POI-F features (raw sequence counts per trajectory) and save them.

    For every trajectory (rows sharing a 'tid'), each window of `sequence`
    consecutive values of the `feature` column is counted in the column
    given by `seq2idx`. The matrices and labels are handed to `to_file`.

    NOTE: `sequence`, `feature` and `dataset` are not parameters; they are
    read from the notebook's global namespace and must be set before calling.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain 'tid', 'label' and the `feature` column.
    possible_sequences : sequence of tuple
        Known sequences; its length is the number of feature columns.
    seq2idx : dict
        Maps each sequence tuple to its column index.

    Raises
    ------
    KeyError
        If a training window is missing from `seq2idx`. Test windows that
        are missing are ignored.
    """
    print('Starting POI...')
    method = 'poi'
    n_features = len(possible_sequences)

    # Train
    train_tids = df_train['tid'].unique()
    x_train = np.zeros((len(train_tids), n_features))
    # unique() and drop_duplicates() both keep first-appearance order, so
    # row i of x_train and entry i of y_train describe the same trajectory
    # (sorting the labels by tid misaligned them on unsorted input).
    y_train = df_train.drop_duplicates(subset='tid')['label'].values

    for i, tid in enumerate(train_tids):
        traj_pois = df_train[df_train['tid'] == tid][feature].values
        for start in range(len(traj_pois) - sequence + 1):
            window = tuple(traj_pois[start:start + sequence])
            x_train[i][seq2idx[window]] += 1

    # Test
    test_tids = df_test['tid'].unique()
    x_test = np.zeros((len(test_tids), n_features))
    y_test = df_test.drop_duplicates(subset='tid')['label'].values

    for i, tid in enumerate(test_tids):
        traj_pois = df_test[df_test['tid'] == tid][feature].values
        for start in range(len(traj_pois) - sequence + 1):
            window = tuple(traj_pois[start:start + sequence])
            # Dict lookup instead of scanning the possible_sequences list;
            # windows never seen in training have no column and are skipped.
            col = seq2idx.get(window)
            if col is not None:
                x_test[i][col] += 1

    core_name = 'outputs/'+method+'_'+feature+'_'+str(sequence)+'_'+dataset
    to_file(core_name, x_train, x_test, y_train, y_test)

# NPOI-F: Normalized POI Frequency

In [7]:
def npoi(df_train, df_test, possible_sequences, seq2idx):
    """Build NPOI-F features (length-normalized sequence counts) and save them.

    Works like the raw count features, but each trajectory's count vector
    is divided by the number of points in that trajectory. The matrices and
    labels are handed to `to_file`.

    NOTE: `sequence`, `feature` and `dataset` are not parameters; they are
    read from the notebook's global namespace and must be set before calling.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain 'tid', 'label' and the `feature` column.
    possible_sequences : sequence of tuple
        Known sequences; its length is the number of feature columns.
    seq2idx : dict
        Maps each sequence tuple to its column index.

    Raises
    ------
    KeyError
        If a training window is missing from `seq2idx`. Test windows that
        are missing are ignored.
    """
    print('Starting NPOI...')
    method = 'npoi'
    n_features = len(possible_sequences)

    # Train
    train_tids = df_train['tid'].unique()
    x_train = np.zeros((len(train_tids), n_features))
    # unique() and drop_duplicates() both keep first-appearance order, so
    # row i of x_train and entry i of y_train describe the same trajectory
    # (sorting the labels by tid misaligned them on unsorted input).
    y_train = df_train.drop_duplicates(subset='tid')['label'].values

    for i, tid in enumerate(train_tids):
        traj_pois = df_train[df_train['tid'] == tid][feature].values
        for start in range(len(traj_pois) - sequence + 1):
            window = tuple(traj_pois[start:start + sequence])
            x_train[i][seq2idx[window]] += 1
        # Normalize by the trajectory length (number of points)
        x_train[i] /= len(traj_pois)

    # Test
    test_tids = df_test['tid'].unique()
    x_test = np.zeros((len(test_tids), n_features))
    y_test = df_test.drop_duplicates(subset='tid')['label'].values

    for i, tid in enumerate(test_tids):
        traj_pois = df_test[df_test['tid'] == tid][feature].values
        for start in range(len(traj_pois) - sequence + 1):
            window = tuple(traj_pois[start:start + sequence])
            # Dict lookup instead of scanning the possible_sequences list;
            # windows never seen in training have no column and are skipped.
            col = seq2idx.get(window)
            if col is not None:
                x_test[i][col] += 1
        x_test[i] /= len(traj_pois)

    core_name = 'outputs/'+method+'_'+feature+'_'+str(sequence)+'_'+dataset
    to_file(core_name, x_train, x_test, y_train, y_test)

# WNPOI-F: Weighted Normalized POI Frequency.

In [8]:
def wnpoi(df_train, df_test, possible_sequences, seq2idx):
    """Build WNPOI-F features (class-weighted, normalized counts) and save them.

    Each trajectory's count vector is divided by the number of points in the
    trajectory and then multiplied, column by column, by
    log2(number of classes / number of classes containing the sequence).
    Sequences found in no training class get weight 0 (instead of an
    infinite weight that would turn the features into NaN).

    NOTE: `sequence`, `feature` and `dataset` are not parameters; they are
    read from the notebook's global namespace and must be set before calling.

    NOTE(review): the output name carries a '_grrr' suffix that the other
    two feature writers in this notebook do not use; it looks like a
    leftover but is kept so existing consumers of the files keep working.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain 'tid', 'label' and the `feature` column.
    possible_sequences : sequence of tuple
        Known sequences; its length is the number of feature columns.
    seq2idx : dict
        Maps each sequence tuple to its column index.

    Raises
    ------
    KeyError
        If a training trajectory window is missing from `seq2idx`. Test
        windows and class-level windows that are missing are ignored.
    """
    print('Starting WNPOI...')
    method = 'wnpoi'
    n_features = len(possible_sequences)

    # Count, for every sequence, the number of classes it occurs in.
    train_labels = df_train['label'].unique()
    class_counts = np.zeros(n_features)
    for label in train_labels:
        class_pois = df_train[df_train['label'] == label][feature].values
        seen = set()
        for start in range(len(class_pois) - sequence + 1):
            # Windows over the concatenated class points can span two
            # trajectories and be unknown to seq2idx; skip those.
            col = seq2idx.get(tuple(class_pois[start:start + sequence]))
            if col is not None:
                seen.add(col)
        if seen:
            class_counts[list(seen)] += 1

    # Only divide where the count is non-zero; unseen sequences keep weight 0.
    weights = np.zeros(n_features)
    present = class_counts > 0
    weights[present] = np.log2(len(train_labels) / class_counts[present])

    # Train
    train_tids = df_train['tid'].unique()
    x_train = np.zeros((len(train_tids), n_features))
    # unique() and drop_duplicates() both keep first-appearance order, so
    # row i of x_train and entry i of y_train describe the same trajectory
    # (sorting the labels by tid misaligned them on unsorted input).
    y_train = df_train.drop_duplicates(subset='tid')['label'].values

    for i, tid in enumerate(train_tids):
        traj_pois = df_train[df_train['tid'] == tid][feature].values
        for start in range(len(traj_pois) - sequence + 1):
            window = tuple(traj_pois[start:start + sequence])
            x_train[i][seq2idx[window]] += 1
        x_train[i] /= len(traj_pois)
        x_train[i] *= weights

    # Test
    test_tids = df_test['tid'].unique()
    x_test = np.zeros((len(test_tids), n_features))
    y_test = df_test.drop_duplicates(subset='tid')['label'].values

    for i, tid in enumerate(test_tids):
        traj_pois = df_test[df_test['tid'] == tid][feature].values
        for start in range(len(traj_pois) - sequence + 1):
            window = tuple(traj_pois[start:start + sequence])
            # Dict lookup instead of scanning the possible_sequences list;
            # windows never seen in training have no column and are skipped.
            col = seq2idx.get(window)
            if col is not None:
                x_test[i][col] += 1
        x_test[i] /= len(traj_pois)
        x_test[i] *= weights

    core_name = 'outputs/'+method+'_'+feature+'_'+str(sequence)+'_'+dataset+'_grrr'
    to_file(core_name, x_train, x_test, y_train, y_test)

In [10]:
def do_all(sequence, dataset, feature, folder):
    """Load one dataset and generate the POI, NPOI and WNPOI feature files.

    Reads '<folder><dataset>_train.csv' and '<folder><dataset>_test.csv',
    derives the feature vocabulary (every distinct window of `sequence`
    consecutive values of the training `feature` column, in first-seen
    order) and calls `poi`, `npoi` and `wnpoi` with it.

    Parameters
    ----------
    sequence : int
        Number of consecutive points forming one feature.
    dataset : str
        Dataset name used to build the input file names.
    feature : str
        Column of the dataset to analyse.
    folder : str
        Directory prefix of the input files (concatenated as-is, so it
        should end with a path separator).
    """
    print('Dataset: {}, Feature: {}, Sequence: {}'.format(dataset, feature, sequence))
    df_train = pd.read_csv(folder+dataset+'_train.csv')
    df_test = pd.read_csv(folder+dataset+'_test.csv')

    points = df_train[feature].values
    windows = (tuple(points[start:start + sequence])
               for start in range(len(points) - sequence + 1))
    # dict.fromkeys de-duplicates while keeping first-seen order, without
    # rescanning a growing list for every window.
    possible_sequences = list(dict.fromkeys(windows))
    seq2idx = {seq: col for col, seq in enumerate(possible_sequences)}

    # poi/npoi/wnpoi read `sequence`, `feature` and `dataset` from the
    # notebook's global namespace rather than from their arguments. Publish
    # the values this call received so a direct do_all(...) call does not
    # silently run with stale (or undefined) globals.
    globals().update(sequence=sequence, feature=feature, dataset=dataset)

    poi(df_train, df_test, possible_sequences, seq2idx)
    npoi(df_train, df_test, possible_sequences, seq2idx)
    wnpoi(df_train, df_test, possible_sequences, seq2idx)

# Run all experiments

In [11]:
# Run every (sequence length, dataset) combination.
# NOTE: the loop variables `sequence`, `dataset` and `feature` live in the
# notebook's global namespace, and poi/npoi/wnpoi read them from there
# (they are not passed through as arguments). Do not rename them or move
# this loop into a function without updating those helpers.
for sequence in sequences:
    for dataset in datasets:
        # Column to analyse for this dataset; raises KeyError when the
        # dataset name has no entry in `dd`.
        feature = dd[dataset]
        do_all(sequence, dataset, feature, folder)

Dataset: generic_nyc_poi-hour, Feature: poi-hour, Sequence: 1
Starting WNPOI_SEQ...
