In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
# list the entries available under ../input
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

['ey-nextwave', 'ndscindexes']


In [2]:
# raw competition tables: one row per trajectory
TRAIN_CSV = "../input/ey-nextwave/data_train/data_train.csv"
TEST_CSV = "../input/ey-nextwave/data_test/data_test.csv"
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [3]:
# peek at the last rows of the raw training table
df_train.tail()

Unnamed: 0.1,Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit
814257,814257,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_4,02:21:11,02:21:11,,,,3744666.0,-19256790.0,3744666.0,-19256790.0
814258,814258,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_5,06:02:17,06:02:17,,,,3744732.0,-19256140.0,3744732.0,-19256140.0
814259,814259,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_7,09:52:13,09:52:13,,,,3744666.0,-19256790.0,3744666.0,-19256790.0
814260,814260,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_8,14:20:26,14:27:15,,,,3741043.0,-19290510.0,3741057.0,-19289360.0
814261,814261,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_10,14:56:46,15:04:05,0.0,0.0,0.0,3743948.0,-19235600.0,3744842.0,-19262180.0


In [4]:
# peek at the last rows of the raw test table (the final row has no exit position)
df_test.tail()

Unnamed: 0.1,Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit
202932,202932,fff9552047b095e8242b4913f3289a26_25,traj_fff9552047b095e8242b4913f3289a26_25_3,11:23:33,11:23:33,,,,3762713.0,-19354930.0,3762713.0,-19354930.0
202933,202933,fff9552047b095e8242b4913f3289a26_25,traj_fff9552047b095e8242b4913f3289a26_25_4,12:12:10,12:12:10,,,,3761040.0,-19352740.0,3761040.0,-19352740.0
202934,202934,fff9552047b095e8242b4913f3289a26_25,traj_fff9552047b095e8242b4913f3289a26_25_5,13:08:14,13:12:01,,,,3762680.0,-19355700.0,3762683.0,-19355290.0
202935,202935,fff9552047b095e8242b4913f3289a26_25,traj_fff9552047b095e8242b4913f3289a26_25_6,14:14:36,14:14:36,,,,3761776.0,-19357720.0,3761776.0,-19357720.0
202936,202936,fff9552047b095e8242b4913f3289a26_25,traj_fff9552047b095e8242b4913f3289a26_25_7,15:30:54,15:30:54,,,,3762713.0,-19354780.0,,


In [5]:
# normalising location information
# Borders of the target rectangle, in the raw coordinates of the CSV files.
X_MIN, X_MAX = 3750901.5068, 3770901.5068
Y_MIN, Y_MAX = -19268905.6133, -19208905.6133
# Centre of the rectangle along each axis.
X_MID = X_MIN + 0.5 * (X_MAX - X_MIN)
Y_MID = Y_MIN + 0.5 * (Y_MAX - Y_MIN)


def normalise_X(arr):
    """Shift x so the rectangle centre is 0, then divide by 10000.

    Works on scalars as well as numpy arrays / pandas Series.
    """
    centred = arr - X_MID
    return centred / 10000


def normalise_Y(arr):
    """Shift y so the rectangle centre is 0, then divide by 100000.

    Works on scalars as well as numpy arrays / pandas Series.
    """
    # extra zero (compared with normalise_X) by design, seems to make figure to be in proportion
    # looking for evidence that the x-axis and y-axis fulfil some ratio
    centred = arr - Y_MID
    return centred / 100000


# Rectangle borders expressed in normalised coordinates.
x_min, x_max = (normalise_X(border) for border in (X_MIN, X_MAX))
y_min, y_max = (normalise_Y(border) for border in (Y_MIN, Y_MAX))
print("Borders:")
print("{:.4f} < X < {:.4f}".format(x_min, x_max))
print("{:.4f} < Y < {:.4f}".format(y_min, y_max))

# Overwrite the raw coordinates with their normalised values in both tables.
# NOTE: this is in place, so re-running the cell normalises a second time.
for frame in (df_train, df_test):
    for x_col in ('x_entry', 'x_exit'):
        frame[x_col] = normalise_X(frame[x_col])
for frame in (df_train, df_test):
    for y_col in ('y_entry', 'y_exit'):
        frame[y_col] = normalise_Y(frame[y_col])

Borders:
-1.0000 < X < 1.0000
-0.3000 < Y < 0.3000


In [6]:
# normalising time information
def convert_time(time_sting):
    """Turn an "HH:MM:SS" string into a normalised float.

    The time of day is converted to seconds, shifted so that 15:00:00 maps
    to 0 and scaled so that ten hours correspond to 1 (05:00:00 -> -1.0).
    """
    parts = time_sting.split(":")
    hours, minutes, secs = int(parts[0]), int(parts[1]), int(parts[2])
    seconds_of_day = hours * 60 * 60 + minutes * 60 + secs
    offset = 15 * 60 * 60   # reference time 15:00:00
    scale = 10 * 60 * 60    # ten hours
    return (seconds_of_day - offset) / scale

# add normalised entry / exit times next to the original time strings
for frame in (df_train, df_test):
    frame["t_entry"] = frame["time_entry"].apply(convert_time)
    frame["t_exit"] = frame["time_exit"].apply(convert_time)

In [7]:
# obtaining metadata from IDs
# trajectory_id is "_"-separated; its last piece becomes tid_0 and the
# second-to-last piece becomes tid_1, both stored as integers.
for frame in (df_train, df_test):
    id_pieces = frame['trajectory_id'].str.split("_")
    frame['tid_0'] = id_pieces.str[-1].astype(int)
    frame['tid_1'] = id_pieces.str[-2].astype(int)

In [8]:
# extract relevant information and rearrange
columns = (
    ['hash', 'tid_0']
    + ['t_entry', 't_exit']
    + ['x_entry', 'y_entry', 'x_exit', 'y_exit']
    + ['vmax', 'vmin', 'vmean']
    + ['time_entry', 'time_exit']
    + ['trajectory_id', 'tid_1']
)
# keep only these columns, in this order, for both tables
df_train, df_test = df_train[columns], df_test[columns]

In [9]:
# tid_1 is likely the day of the month, this information may be useful
# largest tid_0 and tid_1 found in the test table
print(int(df_test['tid_0'].max()), int(df_test['tid_1'].max()))

54 31


In [10]:
# most frequent hash (the last one if several hashes tie), then show all of its rows
hash_counts_mode = df_train['hash'].mode()
hash_most_freq = hash_counts_mode.iloc[-1]
df_train[df_train['hash'].eq(hash_most_freq)]

Unnamed: 0,hash,tid_0,t_entry,t_exit,x_entry,y_entry,x_exit,y_exit,vmax,vmin,vmean,time_entry,time_exit,trajectory_id,tid_1
808281,fe1ebe07b2f3adca2e60e10f2d95099f_9,0,-1.439111,-1.439111,0.196111,-0.000211,0.196111,-0.000211,-1.0,-1.0,-1.0,00:36:32,00:36:32,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_0,9
808282,fe1ebe07b2f3adca2e60e10f2d95099f_9,1,-1.404222,-1.404222,0.196445,0.004345,0.196445,0.004345,-1.0,-1.0,-1.0,00:57:28,00:57:28,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_1,9
808283,fe1ebe07b2f3adca2e60e10f2d95099f_9,2,-1.369028,-1.369028,0.200341,0.004003,0.200341,0.004003,-1.0,-1.0,-1.0,01:18:35,01:18:35,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_2,9
808284,fe1ebe07b2f3adca2e60e10f2d95099f_9,3,-1.334472,-1.334472,0.196111,-0.000325,0.196111,-0.000325,-1.0,-1.0,-1.0,01:39:19,01:39:19,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_3,9
808285,fe1ebe07b2f3adca2e60e10f2d95099f_9,4,-1.300222,-1.300222,0.197447,0.006965,0.197447,0.006965,-1.0,-1.0,-1.0,01:59:52,01:59:52,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_4,9
808286,fe1ebe07b2f3adca2e60e10f2d95099f_9,5,-1.204111,-1.204111,0.197113,0.009357,0.197113,0.009357,-1.0,-1.0,-1.0,02:57:32,02:57:32,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_5,9
808287,fe1ebe07b2f3adca2e60e10f2d95099f_9,6,-1.064333,-1.064333,0.197781,0.006737,0.197781,0.006737,-1.0,-1.0,-1.0,04:21:24,04:21:24,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_6,9
808288,fe1ebe07b2f3adca2e60e10f2d95099f_9,7,-1.031722,-1.031722,0.201566,0.006395,0.201566,0.006395,-1.0,-1.0,-1.0,04:40:58,04:40:58,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_7,9
808289,fe1ebe07b2f3adca2e60e10f2d95099f_9,8,-0.966444,-0.966444,0.196,0.004915,0.196,0.004915,-1.0,-1.0,-1.0,05:20:08,05:20:08,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_8,9
808290,fe1ebe07b2f3adca2e60e10f2d95099f_9,9,-0.864306,-0.864306,0.198226,0.005028,0.198226,0.005028,-1.0,-1.0,-1.0,06:21:25,06:21:25,traj_fe1ebe07b2f3adca2e60e10f2d95099f_9_9,9


# BASELINE SUBMISSION

In [11]:
# rows without an exit position are the trajectories to predict
has_no_exit = df_test['x_exit'].isnull()
df_test_1st_traj_only = df_test[has_no_exit]
# submission frame: a single column "id" holding the trajectory ids
df_submit = (
    df_test_1st_traj_only[['trajectory_id']]
    .copy()
    .rename(columns={'trajectory_id': 'id'})
)

# helper function to determine if point is inside
def is_inside(arr_x, arr_y):
    """Flag normalised points lying strictly inside the target rectangle.

    arr_x / arr_y are array-likes (numpy arrays or pandas Series) of
    normalised coordinates. Returns 1.0 where
    x_min < x < x_max and y_min < y < y_max, otherwise 0.0
    (NaN coordinates compare False and therefore give 0.0).
    """
    within_x = (arr_x > x_min) & (arr_x < x_max)
    within_y = (arr_y > y_min) & (arr_y < y_max)
    return (within_x & within_y).astype(float)

# baseline: predict "inside" whenever the entry point of the trajectory is inside
entry_x = df_test_1st_traj_only['x_entry']
entry_y = df_test_1st_traj_only['y_entry']
df_submit['target'] = is_inside(entry_x, entry_y)
df_submit.to_csv('submission.csv', index=False)
df_submit.tail()

Unnamed: 0,id,target
202899,traj_ffe98f6e0adf12f9c7b51c4e9607a87a_15_13,0.0
202913,traj_fff607ecd3f8d3dcb65791e8b4c22a5f_3_25,1.0
202914,traj_fff813b56230c2f026f783f5b9f9ca90_19_0,0.0
202929,traj_fff9400843a88c3bfe52e7ce8bf97316_19_17,0.0
202936,traj_fff9552047b095e8242b4913f3289a26_25_7,0.0


# DATASET PIVOTING

In [12]:
# One row per hash; the columns become a (feature, tid_0) MultiIndex.
# index/columns are passed by keyword: positional use is deprecated and no
# longer accepted by pandas >= 2.0.
p_train = df_train.pivot(index='hash', columns='tid_0')
p_train.tail()

Unnamed: 0_level_0,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,...,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1
tid_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,58,59,60,61,62,64,65,66
hash,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2
fffd57d840cb6c16553cc8efc907ef6d_29,-0.322222,-0.304667,,-0.243778,,-0.094722,0.020083,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
fffe0c49b72b5d066949457ea383a77e_3,-0.700111,,-0.651556,0.002611,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
fffe926c252d7dfc79dec28c3a7ba0bb_3,-0.945639,-0.905194,-0.712694,-0.661806,-0.109528,0.053472,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
ffff74ab47d4fa20695231ce8a293c98_1,-0.474583,-0.294972,-0.274083,0.013861,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
ffffc6359725f0e1feac9ef1872ab207_11,-1.474278,,,-1.296194,-1.264694,-0.896194,,-0.512972,-0.065944,,-0.005389,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
# One row per hash; the columns become a (feature, tid_0) MultiIndex.
# index/columns are passed by keyword: positional use is deprecated and no
# longer accepted by pandas >= 2.0.
p_test = df_test.pivot(index='hash', columns='tid_0')
p_test.tail()

Unnamed: 0_level_0,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,t_entry,...,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1,tid_1
tid_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,54
hash,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2
ffe98f6e0adf12f9c7b51c4e9607a87a_15,-1.365694,-1.26975,-1.219111,-1.120028,,,-0.652972,,-0.506472,-0.447389,,-0.095722,-0.000444,0.061639,,,,,,,,,,,,,,,,,,,,,,,,,,,...,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
fff607ecd3f8d3dcb65791e8b4c22a5f_3,,-1.414667,,-1.317861,,-1.048056,-1.023556,-1.005722,,-0.980222,-0.949167,-0.923944,,,,-0.724139,,-0.683139,,,-0.347278,-0.322722,,-0.041139,,-0.008472,,,,,,,,,,,,,,,...,,,3.0,,3.0,,,3.0,3.0,,3.0,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
fff813b56230c2f026f783f5b9f9ca90_19,0.013389,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
fff9400843a88c3bfe52e7ce8bf97316_19,-1.098333,,-0.932833,-0.899167,-0.784583,-0.619917,-0.539389,-0.517889,-0.4985,-0.438667,-0.409528,-0.37725,-0.317889,,-0.098722,-0.048556,,0.022861,,,,,,,,,,,,,,,,,,,,,,,...,,19.0,19.0,,19.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
fff9552047b095e8242b4913f3289a26_25,-0.702889,,-0.39575,-0.36075,-0.279722,-0.186278,-0.075667,0.0515,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
def obtain_matrix(row, max_len=21):
    """Turn one pivoted hash into a fixed-height feature matrix.

    Parameters
    ----------
    row : pandas.DataFrame
        A one-row slice of the pivot table (e.g. ``p_train.iloc[[i]]``) whose
        columns are a (feature, tid_0) MultiIndex.
    max_len : int, default 21
        Height the result is padded to with all-NaN rows. A hash with more
        than ``max_len`` trajectories is returned as is (neither padded nor
        truncated).

    Returns
    -------
    dict
        "targets"       : (x_exit, y_exit) of the trajectory with the highest
                          tid_0, read before those two cells are blanked.
        "df_hash"       : feature frame, highest tid_0 first, with x_exit /
                          y_exit of the first row set to NaN and padding rows
                          labelled 99 appended.
        "matrix"        : ``df_hash.values``.
        "trajectory_id" : trajectory_id of the trajectory with the highest tid_0.
    """
    feature_columns = ['t_entry', 't_exit',
                       'x_entry', 'y_entry', 'x_exit', 'y_exit',
                       'vmax', 'vmin', 'vmean', 'tid_0', 'tid_1']

    # Stacking moves tid_0 from the columns into the index: one row per trajectory.
    # tid_0 values this hash does not have are all-NaN rows; drop them explicitly so
    # the result does not depend on whether stack() removes them itself.
    df_hash = row.stack().dropna(how="all").iloc[::-1].reset_index()
    trajectory_id = df_hash.loc[0, "trajectory_id"]

    # .copy() so the assignments below act on an independent frame
    # (avoids SettingWithCopyWarning).
    df_hash = df_hash[feature_columns].copy()
    targets = df_hash.loc[0, "x_exit"], df_hash.loc[0, "y_exit"]

    # Hide the exit position that is to be predicted.
    df_hash.loc[0, "x_exit"] = np.nan
    df_hash.loc[0, "y_exit"] = np.nan

    # Pad with NaN rows up to max_len (pd.concat replaces the removed DataFrame.append).
    n_missing = max_len - df_hash.shape[0]
    if n_missing > 0:
        padding = pd.DataFrame(np.nan,
                               index=[99] * n_missing,
                               columns=df_hash.columns)
        df_hash = pd.concat([df_hash, padding])

    return {"targets" : targets,
            "df_hash" : df_hash,
            "matrix" : df_hash.values,
            "trajectory_id" : trajectory_id}

# inspect the conversion for one example hash
sample_output = obtain_matrix(p_train.iloc[[323]])
print(np.shape(sample_output["matrix"]))
print(sample_output["targets"])
sample_output["df_hash"]
# note that x_exit and y_exit of the first row are blanked in the matrix
# (their values are returned separately as the targets)

(21, 11)
(-1.6122742073539644, -0.29982424471039326)


Unnamed: 0,t_entry,t_exit,x_entry,y_entry,x_exit,y_exit,vmax,vmin,vmean,tid_0,tid_1
0,0.028083,0.028083,-1.612274,-0.299824,,,,,,17.0,19.0
1,-0.152417,-0.152417,-1.612274,-0.299824,-1.612274,-0.299824,,,,16.0,19.0
2,-0.298111,-0.298111,-1.612274,-0.299824,-1.612274,-0.299824,,,,15.0,19.0
3,-0.430444,-0.430444,-1.612274,-0.299824,-1.612274,-0.299824,,,,14.0,19.0
4,-0.50175,-0.50175,-1.612274,-0.299824,-1.612274,-0.299824,,,,13.0,19.0
5,-0.807222,-0.807222,-1.612274,-0.299824,-1.612274,-0.299824,,,,11.0,19.0
6,-0.891333,-0.891333,-1.856843,-0.489747,-1.856843,-0.489747,,,,10.0,19.0
7,-0.908806,-0.908806,-1.619621,-0.572908,-1.619621,-0.572908,,,,9.0,19.0
8,-0.944778,-0.944778,-1.596022,-0.546938,-1.596022,-0.546938,,,,8.0,19.0
9,-0.979806,-0.979806,-1.60515,-0.574633,-1.60515,-0.574633,,,,7.0,19.0


In [15]:
# one matrix and one trajectory id (the one to predict) per test hash
test_data, test_ids = [], []

n_test_hashes = p_test.shape[0]
for row_pos in tqdm(range(n_test_hashes)):
    converted = obtain_matrix(p_test.iloc[[row_pos]])
    test_data.append(converted["matrix"])
    test_ids.append(converted["trajectory_id"])

HBox(children=(IntProgress(value=0, max=33515), HTML(value='')))

In [16]:
# report the shapes, then write both lists as .npy files
print(np.shape(test_data), np.shape(test_ids), sep="\n")
np.save("test_data", test_data)
np.save("test_ids", test_ids)

(33515, 21, 11)
(33515,)


In [17]:
# one matrix and one (x_exit, y_exit) target pair per training hash
train_data, train_targets = [], []

n_train_hashes = p_train.shape[0]
for row_pos in tqdm(range(n_train_hashes)):
    converted = obtain_matrix(p_train.iloc[[row_pos]])
    train_data.append(converted["matrix"])
    train_targets.append(converted["targets"])

HBox(children=(IntProgress(value=0, max=134063), HTML(value='')))

In [18]:
# evaluate if the targets are inside
train_targets = np.array(train_targets)
# column 0 holds x_exit, column 1 holds y_exit
target_x, target_y = train_targets[:, 0], train_targets[:, 1]
train_targets_inside = is_inside(target_x, target_y)

In [19]:
# report the shapes, then write the three training arrays as .npy files
print(np.shape(train_data),
      np.shape(train_targets),
      np.shape(train_targets_inside),
      sep="\n")
np.save("train_data", train_data)
np.save("train_targets", train_targets)
np.save("train_targets_inside", train_targets_inside)

(134063, 21, 11)
(134063, 2)
(134063,)


# TRAIN-TEST SPLIT INDICES

In [20]:
# standardised 4-fold train-test split for clustering purposes
from sklearn.model_selection import StratifiedKFold, KFold


def _as_object_array(arrays):
    """Pack arrays of possibly different lengths into a 1-D object array."""
    packed = np.empty(len(arrays), dtype=object)
    for pos, arr in enumerate(arrays):
        packed[pos] = arr
    return packed


# Folds are stratified on the inside/outside label so each fold keeps the class ratio.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
trn_index_list = []
val_index_list = []
for trn_index, val_index in skf.split(np.arange(len(train_data)),
                                      train_targets_inside.astype(int)):
    trn_index_list.append(trn_index)
    val_index_list.append(val_index)

# The per-fold index arrays differ in length whenever the sample count is not a
# multiple of 4 (e.g. 134063 here). Handing such a ragged list straight to np.save
# relies on implicit object-array creation, which NumPy >= 1.24 rejects with a
# ValueError, so the folds are packed into an explicit object array first.
# Because the saved arrays have dtype=object, load them with
# np.load(..., allow_pickle=True).
np.save("trn_index_list", _as_object_array(trn_index_list))
np.save("val_index_list", _as_object_array(val_index_list))

In [21]:
!ls

__notebook__.ipynb  test_ids.npy	      trn_index_list.npy
__output__.json     train_data.npy	      val_index_list.npy
submission.csv	    train_targets.npy
test_data.npy	    train_targets_inside.npy


In [22]:
# TODO: document the specification of every saved file wherever it is not clear enough
# TODO (optional): make the conversion of the pivot table into a 3D array faster