# create_dataset.ipynb

The original jane-street-market-prediction dataset will be modified to suit our needs.

1. The dataset is too big (6 GB), so we need to trim it down.
2. The competition is closed and we have no access to its test set, so we use this train set to generate our own unseen test set.
3. We need to generate our own output values (y-values), since they are only provided indirectly (they are derived from the `resp` column).

In [3]:
''' data and math '''
import pandas as pd
import numpy as np

''' plotting images '''
from matplotlib import pyplot as plt
%matplotlib inline

''' traversing directories '''
import os
from pathlib import Path

''' utilities '''
from tqdm import tqdm

In [4]:
# Mount Google Drive (Colab only) and derive the project root directory that
# every later cell uses for reading and writing files.
from google.colab import drive

# Single source of truth for the mount point: both the mount call and root_dir
# are built from it, so the two paths cannot drift apart.
mount_dir = '/content/gdrive'
drive.mount(mount_dir, force_remount=True)
root_dir = Path(mount_dir) / 'My Drive' / 'it3011_project'

Mounted at /content/gdrive


# Basic data inspection and cleaning

In [5]:
# Read the full competition training file from Drive and preview its first rows.
raw_csv_path = root_dir / "data/jane-street-market-prediction/train.csv"
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,...,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,feature_101,feature_102,feature_103,feature_104,feature_105,feature_106,feature_107,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,,-0.989982,-1.05509,,,-2.667671,-2.001475,-1.703595,-2.196892,,,1.483295,1.307466,,,1.1752,0.967805,1.60841,1.319365,,,-0.515073,-0.448988,,,...,1.15877,,3.754522,7.137163,-1.863069,,0.434466,,-0.292035,0.317003,-2.60582,,2.896986,,1.485813,4.147254,-2.238831,,-0.892724,,-0.156332,0.622816,-3.921523,,2.561593,,3.457757,6.64958,-1.472686,,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,,-0.151877,-0.384952,,,1.225838,0.789076,1.11058,1.102281,,,-0.5906,-0.625682,,,-0.543425,-0.547486,-0.7066,-0.667806,,,0.910558,0.914465,,,...,1.157671,,1.297679,1.281956,-2.427595,,0.024913,,-0.413607,-0.073672,-2.434546,,0.949879,,0.724655,1.622137,-2.20902,,-1.332492,,-0.586619,-1.040491,-3.946097,,0.98344,,1.357907,1.612348,-1.664544,,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,0.806463,0.400221,-0.614188,-0.3548,,,5.448261,2.668029,,,3.836342,2.183258,3.902698,3.045431,,,-1.141082,-0.979962,,,-1.157585,-0.966803,-1.430973,-1.103432,,,5.131559,4.314714,,,...,2.420089,,0.800962,1.143663,-3.214578,,1.585939,,0.193996,0.953114,-2.674838,,2.200085,,0.537175,2.156228,-3.568648,,1.193823,,0.097345,0.796214,-4.090058,,2.548596,,0.882588,1.817895,-2.432424,,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.0,-0.00473,-0.003273,-0.000461,-0.000476,-0.0032,-1,1.174378,0.34464,0.066872,0.009357,-1.006373,-0.676458,,,4.508206,2.48426,,,2.902176,1.799163,3.1927,2.848359,,,-1.401637,-1.428248,,,-1.421175,-1.487976,-1.756415,-1.647543,,,4.766182,4.528353,,,...,2.330484,,0.182066,1.088451,-3.527752,,-1.338859,,-1.257774,-1.194013,-1.719062,,-0.94019,,-1.510224,-1.781693,-3.373969,,2.513074,,0.424964,1.992887,-2.616856,,0.561528,,-0.994041,0.09956,-2.485993,,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.14378,,,2.683018,1.450991,,,1.257761,0.632336,0.905204,0.575275,,,2.550883,2.484082,,,2.502828,2.60644,2.731251,2.566561,,,-1.477905,-1.722451,,,...,4.345282,,2.737738,2.602937,-1.785502,,-0.172561,,-0.299516,-0.420021,-2.354611,,0.762192,,1.59862,0.623132,-1.74254,,-0.934675,,-0.373013,-1.21354,-3.677787,,2.684119,,2.861848,2.134804,-1.279284,,,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4


In [6]:
# (rows, columns) of the raw frame, before any cleaning is applied
data.shape

(2390491, 138)

In [9]:
# Data cleaning: impute missing values, index by ts_id, derive the binary
# target and drop rows whose weight is zero.
#
# Imputation approach based on
# https://www.kaggle.com/carlmcbrideellis/jane-street-eda-of-day-0-and-feature-importance
# https://www.kaggle.com/nicholashojx/jane-street-keras-notebook-mlp

# These five features are imputed with their column mean; every other column
# is forward-filled and then backward-filled below.
discrete_features = ['feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45']
data[discrete_features] = data[discrete_features].fillna(value = data[discrete_features].mean())

# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
# The backward fill only affects gaps at the very top of the frame, which the
# forward fill cannot reach.
# NOTE(review): the fill runs over the whole frame in file order, so values can
# carry over from one date into the next - confirm this is intended.
data = data.ffill()
data = data.bfill()

# set ts_id to index
data = data.set_index("ts_id")

# 'action' is effectively the y-value, either 1/0 representing buy/pass action. binary classification
# It is derived BEFORE the row filter so the new column is written to the full
# frame rather than to a filtered slice (which raises SettingWithCopyWarning).
data['action'] = (data['resp'] > 0).astype(int)

# remove entries that do not contribute to learning because weight is zero
data = data[data['weight'] != 0]

data.head()

Unnamed: 0_level_0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,...,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,feature_101,feature_102,feature_103,feature_104,feature_105,feature_106,feature_107,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,action
ts_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,0.57609,0.303593,-0.151877,-0.384952,3.225978,3.368788,1.225838,0.789076,1.11058,1.102281,1.452184,0.827972,-0.5906,-0.625682,-6.785697,-3.77935,-0.543425,-0.547486,-0.7066,-0.667806,-2.861279,-2.412291,0.910558,0.914465,0.872447,0.739741,...,1.157671,5.253745,1.297679,1.281956,-2.427595,-0.093176,0.024913,3.915322,-0.413607,-0.073672,-2.434546,2.579373,0.949879,10.783993,0.724655,1.622137,-2.20902,-0.245208,-1.332492,2.426087,-0.586619,-1.040491,-3.946097,2.187677,0.98344,9.451299,1.357907,1.612348,-1.664544,5.543606,2.095326,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,0
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.14378,0.57609,0.303593,2.683018,1.450991,3.225978,3.368788,1.257761,0.632336,0.905204,0.575275,1.452184,0.827972,2.550883,2.484082,-6.785697,-3.77935,2.502828,2.60644,2.731251,2.566561,-2.861279,-2.412291,-1.477905,-1.722451,0.872447,0.739741,...,4.345282,5.253745,2.737738,2.602937,-1.785502,-0.093176,-0.172561,3.915322,-0.299516,-0.420021,-2.354611,2.579373,0.762192,10.783993,1.59862,0.623132,-1.74254,-0.245208,-0.934675,2.426087,-0.373013,-1.21354,-3.677787,2.187677,2.684119,9.451299,2.861848,2.134804,-1.279284,5.543606,2.095326,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,0
6,0,0.190575,-0.001939,-0.002301,0.001088,0.005963,0.000709,-1,-3.172026,-3.093182,-0.030588,-0.043175,0.097058,0.053483,0.57609,0.303593,-6.299415,-4.202503,3.225978,3.368788,-3.490652,-2.675391,-3.568659,-4.095882,1.452184,0.827972,-3.848427,-4.144813,-6.785697,-3.77935,-3.75633,-4.3108,-4.363503,-4.510323,-2.861279,-2.412291,0.206028,0.126117,0.872447,0.739741,...,7.952549,5.253745,3.833684,5.016898,-1.619438,-0.093176,-0.033943,3.915322,-0.256141,-0.309591,-2.34367,2.579373,1.282053,10.783993,1.742529,1.22305,-1.723221,-0.245208,-0.07684,2.426087,-0.169793,-0.364885,-3.570331,2.187677,5.158737,9.451299,3.648643,4.029178,-1.215047,5.543606,2.095326,0.336873,4.076447,0.614783,6.622176,0.800618,5.231595,0.361506,3.921714,1
7,0,3.820844,0.017395,0.021361,0.031163,0.03697,0.033473,-1,0.44605,-0.46621,0.498751,0.244116,0.412528,0.22414,0.57609,0.303593,0.277257,-0.458632,3.225978,3.368788,2.637622,1.432136,1.374071,0.724772,1.452184,0.827972,-2.016985,-1.997592,-6.785697,-3.77935,-1.974451,-2.079689,-2.479893,-2.30535,-2.861279,-2.412291,1.161571,0.844683,0.872447,0.739741,...,0.199255,5.253745,-0.121806,-0.168498,-2.408017,-0.093176,-1.338859,3.915322,-1.257774,-1.194013,-2.647017,2.579373,-0.94019,10.783993,-1.510224,-1.781693,-3.313275,-0.245208,-1.949191,2.426087,-1.085762,-2.444205,-3.511121,2.187677,-1.519479,9.451299,-1.4683,-1.909212,-1.978238,5.543606,2.095326,2.101997,4.846202,1.479875,5.261328,2.305066,4.571762,2.201537,4.429745,1
8,0,0.116557,-0.00546,-0.007301,-0.009085,-0.003546,-0.001677,1,-3.172026,-3.093182,-0.363836,-0.291496,0.128422,0.096168,0.57609,0.303593,-3.727364,-2.451716,3.225978,3.368788,-2.825845,-2.304248,-2.534,-3.179482,1.452184,0.827972,0.004298,-0.05633,-6.785697,-3.77935,-0.011601,-0.053375,-0.001363,-0.059892,-2.861279,-2.412291,-2.263565,-2.955754,0.872447,0.739741,...,-0.22783,5.253745,-0.636395,-0.424707,-3.964324,-0.093176,-1.338859,3.915322,-0.769447,-1.194013,-2.956092,2.579373,-0.94019,10.783993,-0.827823,-1.781693,-4.047596,-0.245208,-2.416447,2.426087,-1.110782,-2.927062,-5.018341,2.187677,-2.176029,9.451299,-1.786967,-2.537213,-2.956049,5.543606,2.095326,1.537913,4.785838,1.637435,6.968002,2.354338,5.825499,1.778029,4.740577,0


In [10]:
# (rows, columns) after cleaning: zero-weight rows were dropped, ts_id moved
# into the index and the 'action' column was added
data.shape

(1981287, 138)

In [11]:
# Class balance check: share of rows whose 'action' label is 1.
positive_count = data['action'].sum()
positive_count / len(data)

0.5044130406145096

In [12]:
# Number of rows per date, largest first.
data['date'].value_counts()

44     13355
459    11909
45      9967
85      9743
47      9585
       ...  
14      1687
270      536
36       230
2         22
294       11
Name: date, Length: 500, dtype: int64

We have about 1.98 million rows in total

We can see that there are 500 different dates. The description of the data on the official Kaggle site indicates that these dates do not have any relationship with each other, but are simply used to group entries and calculate an overall evaluation score. Therefore we can work with a subset of this data, grouped by date, in order to reduce the size of the dataset (6 GB is too big) and to create our own test set as well.

In [13]:
# Date ids in the order produced by value_counts (most rows first).
data_date_keys = data['date'].value_counts().index
data_date_keys

Int64Index([ 44, 459,  45,  85,  47,  38,  12,  43,  18, 161,
            ...
            171, 102, 122, 136, 113,  14, 270,  36,   2, 294],
           dtype='int64', length=500)

In [14]:
# Row counts per date, in the same order as data_date_keys.
data_date_values = data['date'].value_counts().to_numpy()
data_date_values

array([13355, 11909,  9967,  9743,  9585,  9463,  9228,  8551,  8537,
        8467,  8056,  7856,  7847,  7706,  7694,  7617,  7523,  7301,
        7190,  7185,  7085,  6965,  6952,  6822,  6790,  6722,  6709,
        6581,  6462,  6412,  6382,  6362,  6342,  6275,  6274,  6230,
        6011,  6005,  5941,  5909,  5864,  5827,  5800,  5797,  5734,
        5725,  5707,  5685,  5672,  5656,  5638,  5632,  5619,  5605,
        5602,  5578,  5497,  5481,  5478,  5432,  5339,  5324,  5317,
        5304,  5293,  5285,  5280,  5219,  5198,  5142,  5135,  5126,
        5088,  5040,  5039,  5035,  5031,  5024,  5011,  5009,  5006,
        5001,  4989,  4976,  4971,  4920,  4914,  4907,  4900,  4897,
        4849,  4848,  4847,  4824,  4780,  4757,  4746,  4736,  4727,
        4710,  4707,  4703,  4688,  4687,  4664,  4646,  4628,  4625,
        4617,  4606,  4587,  4582,  4570,  4569,  4568,  4556,  4545,
        4500,  4490,  4479,  4479,  4448,  4421,  4384,  4383,  4380,
        4375,  4373,

# Getting the slice of data that we want

In [15]:
# Per-date bookkeeping, in the same descending-count order as data_date_keys:
#   keys           - date id
#   key_counts     - number of rows for that date
#   percentage     - that date's share of all rows
#   sum_percentage - share of all rows covered by this date and all dates before it
#
# The running total is computed with a single np.cumsum pass instead of
# re-summing the prefix data_date_values[:i+1] on every loop iteration.
total_rows = data.shape[0]

keys = list(data_date_keys)
key_counts = list(data_date_values)
percentage = list(data_date_values / total_rows)
sum_percentage = list(np.cumsum(data_date_values) / total_rows)

In [16]:
# Combine the per-date lists into a single table, one row per date.
data_key_value = pd.DataFrame({
    'keys': keys,
    'key_counts': key_counts,
    'percentage': percentage,
    'sum_percentage': sum_percentage,
})
data_key_value

Unnamed: 0,keys,key_counts,percentage,sum_percentage
0,44,13355,0.006741,0.006741
1,459,11909,0.006011,0.012751
2,45,9967,0.005031,0.017782
3,85,9743,0.004918,0.022699
4,47,9585,0.004838,0.027537
...,...,...,...,...
495,14,1687,0.000851,0.999597
496,270,536,0.000271,0.999867
497,36,230,0.000116,0.999983
498,2,22,0.000011,0.999994


We want to reduce our 6 GB dataset to approximately 1 GB in total: about 700 MB for train and 300 MB for test. We will select the appropriate slice of dates using the dataframe above.

In [17]:
# Rough size (in megabytes) of the cleaned data, obtained by scaling a
# 6000 MB figure for the original file by the fraction of rows that remain.
nrows_original = 2390491
nrows_cleaned = len(data)

fraction_rows_kept = nrows_cleaned / nrows_original
size_cleaned = fraction_rows_kept * 6000
size_cleaned

4972.920625929986

In [18]:
# First row position in data_key_value where the cumulative row share exceeds
# the 700 MB budget, and likewise for the 1000 MB budget.
cumulative_share = data_key_value['sum_percentage']

over_700mb = cumulative_share > 700/size_cleaned
index_700mb = np.where(over_700mb)[0].min()
print(index_700mb)

over_1000mb = cumulative_share > 1000/size_cleaned
index_1000mb = np.where(over_1000mb)[0].min()
print(index_1000mb)

35
56


In [19]:
# data_date_keys ranks the dates by row count, largest first. Split by rank:
#   train <- ranks 0 .. index_700mb                (inclusive)
#   test  <- ranks index_700mb+1 .. index_1000mb   (inclusive)
train_stop = index_700mb + 1
test_stop = index_1000mb + 1
train_dates = data_date_keys[0:train_stop]
test_dates = data_date_keys[train_stop:test_stop]

train = data[data['date'].isin(train_dates)]
print(train.shape)

test = data[data['date'].isin(test_dates)]
print(test.shape)

# verify that test is about 30% of the new dataset
n_train = train.shape[0]
n_test = test.shape[0]
n_test / (n_test + n_train)

(280145, 138)
(120504, 138)


0.30077199743416305

In [20]:
# index_70_percent = np.min(np.where(data_key_value.sum_percentage > 0.7))
# print(index_70_percent)

In [21]:
# # Therefore, we will slice the keys ordered by their count in descending order as such
# # 1. train: [:index_70_percent+1]
# # 2. test: [index_70_percent+1:]

# train = data[data.date.isin(data_date_keys[:index_70_percent+1])]
# print(train.shape)

# test = data[data.date.isin(data_date_keys[index_70_percent+1:])]
# print(test.shape)

# # verify that test is about 30% of the new dataset
# test.shape[0] / (test.shape[0] + train.shape[0])

# Saving this slice in a separate csv

In [22]:
# Write both splits to Drive. index=False leaves the frame index (ts_id) out
# of the files, so only the regular columns are stored.
output_targets = {
    "data/train.csv": train,
    "data/test.csv": test,
}
for relative_path, split_frame in output_targets.items():
    split_frame.to_csv(root_dir / relative_path, index=False)
print("saved")

saved


In [23]:
# Read the saved train split back from disk to confirm it was written correctly.
train_csv_path = root_dir / "data/train.csv"
train_check = pd.read_csv(train_csv_path)
train_check.head()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,...,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,feature_101,feature_102,feature_103,feature_104,feature_105,feature_106,feature_107,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,action
0,1,9.821427,-0.014818,-0.019394,-0.019283,-0.000944,-0.007844,1,5.285973,3.011525,-1.393305,-0.735402,-0.90802,-0.471151,-0.058441,-0.221505,5.039879,2.003304,-1.118338,-1.584739,3.888114,1.897805,2.326891,1.192483,0.409369,0.258356,4.356158,3.063248,0.56156,0.161109,4.148956,3.256711,4.659288,3.25072,-0.772802,-0.94957,-0.854551,-0.568027,-1.244171,-1.812601,...,-1.515613,-0.646447,0.318745,0.637013,-2.433448,0.52113,0.922601,0.625503,0.577809,0.439945,-2.20459,-0.878449,1.249999,-0.394968,1.205414,1.116665,-2.806147,0.535458,-0.47949,0.32506,0.17239,-0.016806,-3.116017,-0.903685,0.265058,-0.869325,1.135212,0.882452,-1.872665,0.076443,-3.401221,4.728494,5.309723,3.199164,4.911131,4.783926,4.397508,5.122683,4.998204,0
1,1,0.83815,-0.002198,-0.002562,-0.016964,-0.029155,-0.025479,1,-3.172026,-3.093182,1.068607,0.506972,1.252115,0.665742,-0.058441,-0.221505,-5.424993,-3.081836,-1.118338,-1.584739,7.689749,4.512754,0.984244,-0.0166,0.409369,0.258356,1.158439,0.607663,0.56156,0.161109,8.307519,6.775615,1.862624,1.058943,-0.772802,-0.94957,-3.362551,-3.193465,-1.244171,-1.812601,...,5.110923,-0.646447,5.589854,5.105871,-1.844534,0.52113,-0.412149,0.625503,1.322766,0.139397,-2.070829,-0.878449,-0.027765,-0.394968,11.161628,4.178262,-1.028454,0.535458,-1.133088,0.32506,0.814667,-0.193307,-3.457956,-0.903685,2.775705,-0.869325,10.258894,5.395499,-0.995543,0.076443,-3.401221,1.998274,3.805928,3.327486,8.298933,4.331993,6.636454,2.742866,4.421242,0
2,1,0.115654,0.025969,0.033227,0.026518,-0.007137,0.005205,-1,-1.99578,-2.397085,0.631661,0.260796,-0.606878,-0.318215,-0.058441,-0.221505,0.204725,-0.701564,-1.118338,-1.584739,-0.603418,-0.861179,0.43908,-0.598599,0.409369,0.258356,-0.890208,-0.646109,0.56156,0.161109,-0.886307,-0.570991,-1.106967,-0.692671,-0.772802,-0.94957,1.127499,0.633285,-1.244171,-1.812601,...,0.119925,-0.646447,0.884953,0.188384,-2.765658,0.52113,-0.504551,0.625503,-0.430036,-0.46412,-2.685705,-0.878449,-0.459645,-0.394968,0.233487,-0.397084,-2.96352,0.535458,-2.44893,0.32506,-0.544723,-1.970004,-4.225394,-0.903685,-1.560801,-0.869325,0.728752,-0.509478,-2.054889,0.076443,-3.401221,0.57111,1.961126,0.115093,1.678828,0.269419,1.380691,0.414362,1.482215,1
3,1,0.051481,0.00405,0.010332,0.011625,0.015717,0.020223,-1,-3.172026,-3.093182,0.191,0.067753,0.51863,0.281647,-0.058441,-0.221505,0.441757,-0.384421,-1.118338,-1.584739,-0.691991,-0.850861,0.11522,-0.712472,0.409369,0.258356,-1.818579,-1.766257,0.56156,0.161109,-1.806182,-1.849151,-2.253638,-2.042811,-0.772802,-0.94957,1.263765,0.915293,-1.244171,-1.812601,...,9.352288,-0.646447,4.177239,6.526127,-1.006093,0.52113,-0.511856,0.625503,-0.544758,-0.69039,-2.664273,-0.878449,-0.416616,-0.394968,1.662032,-0.769501,-1.762745,0.535458,-0.747797,0.32506,-0.537692,-0.940779,-3.816933,-0.903685,5.3201,-0.869325,3.824343,4.534711,-0.9766,0.076443,-3.401221,0.030937,3.310013,0.616731,6.730575,0.775897,5.260658,0.133797,3.438925,1
4,1,0.401532,-0.003251,-0.00781,-0.009187,-0.016094,-0.018165,1,-1.668771,-2.172676,-0.380094,-0.234366,-0.610825,-0.366429,-0.058441,-0.221505,-3.193625,-1.963297,-1.118338,-1.584739,-2.259535,-1.622893,-2.970833,-3.263832,0.409369,0.258356,1.289194,0.870253,0.56156,0.161109,1.013953,0.595025,1.399014,0.856032,-0.772802,-0.94957,-1.014198,-0.88135,-1.244171,-1.812601,...,3.632851,-0.646447,4.839271,7.96502,-0.945646,0.52113,-0.338409,0.625503,-0.488419,-0.548905,-2.647692,-0.878449,0.638093,-0.394968,1.912758,0.562299,-1.728074,0.535458,-1.749586,0.32506,-0.425381,-0.575655,-3.761331,-0.903685,2.182024,-0.869325,4.376453,5.783612,-0.940999,0.076443,-3.401221,0.015795,3.271065,0.60238,6.678767,0.749335,5.200363,0.107684,3.377546,0


In [24]:
# (rows, columns) of the reloaded train split; compare with the shape printed
# for `train` when the split was created
train_check.shape

(280145, 138)

In [25]:
# Read the saved test split back from disk to confirm it was written correctly.
test_csv_path = root_dir / "data/test.csv"
test_check = pd.read_csv(test_csv_path)
test_check.head()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,...,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,feature_101,feature_102,feature_103,feature_104,feature_105,feature_106,feature_107,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,action
0,3,7.481076,-0.005057,-0.008262,-0.014311,-0.018114,-0.015419,1,2.663412,3.184785,-0.108482,-0.101635,0.340776,0.337564,0.268371,0.208574,1.357578,1.293543,1.538665,2.419556,1.411396,1.448887,0.562188,0.986741,1.502681,2.025775,2.068331,2.909274,3.535245,2.574664,2.485053,3.677383,2.229291,3.007236,-0.158237,-0.232188,-0.529661,-0.740136,-0.192601,-0.287046,...,-0.406214,3.173434,-0.027215,-0.543127,-2.913095,0.798549,0.851514,1.160112,-0.023394,0.38465,-2.356273,4.166244,1.543714,3.762793,0.53152,1.449822,-2.818499,1.137364,-0.478218,0.698121,-0.383816,-0.836964,-3.720498,3.804839,0.743319,4.320519,0.345498,0.265471,-2.057983,-0.550845,2.073743,0.387822,5.05496,0.519173,7.369856,0.867638,6.39919,0.772381,5.851821,0
1,3,2.541412,0.00505,0.00956,0.021086,0.029987,0.023465,1,2.231333,0.929773,-0.076187,-0.073584,-0.704473,-0.400425,0.268371,0.208574,-0.80934,-1.135182,1.538665,2.419556,-0.968611,-0.989023,-0.752374,-1.582346,1.502681,2.025775,3.04536,2.316931,3.535245,2.574664,2.405342,1.877978,3.238101,2.390879,-0.158237,-0.232188,-2.552519,-2.531221,-0.192601,-0.287046,...,0.619696,3.173434,-0.617359,0.074816,-4.068894,0.798549,1.411089,1.160112,0.061512,0.818424,-2.604537,4.166244,-0.230667,3.762793,-0.985122,-0.717096,-5.136856,1.137364,2.874373,0.698121,0.570497,2.311909,-3.108549,3.804839,-0.570055,4.320519,-2.023391,-0.972688,-3.41103,-0.550845,2.073743,3.095879,-0.526921,2.911567,-0.650246,3.96617,-0.564345,3.375062,-0.643761,1
2,3,0.405529,0.005226,0.008275,0.010546,0.01244,0.011228,1,-0.817273,-0.204123,-0.108383,-0.10999,-0.835789,-1.189301,0.268371,0.208574,-0.912162,-0.157431,1.538665,2.419556,0.080956,0.618816,-1.046966,-1.123527,1.502681,2.025775,0.48134,1.038576,3.535245,2.574664,0.784753,1.706369,0.675991,1.359656,-0.158237,-0.232188,-0.106605,-0.135824,-0.192601,-0.287046,...,-1.515613,3.173434,-0.595608,-0.457747,-2.486037,0.798549,-0.026142,1.160112,-0.041383,-0.211637,-2.078765,4.166244,-0.138229,3.762793,0.080302,-0.437355,-2.75382,1.137364,-2.058798,0.698121,-0.458743,-1.757123,-2.972844,3.804839,-2.025423,4.320519,-0.489641,-1.578295,-1.878226,-0.550845,2.073743,-1.055147,2.307767,-1.005259,2.467189,-1.565015,2.016626,-1.326771,1.915892,1
3,3,0.492947,-0.001117,-0.000892,-0.007417,-0.013809,-0.006213,1,-3.172026,-3.093182,0.152462,0.095534,-0.778204,-0.641386,0.268371,0.208574,-2.853978,-1.845645,1.538665,2.419556,1.710502,1.294546,-2.151323,-2.709621,1.502681,2.025775,-0.11325,-0.177388,3.535245,2.574664,2.179006,2.594442,0.02313,-0.04494,-0.158237,-0.232188,-1.435509,-1.885774,-0.192601,-0.287046,...,-0.166387,3.173434,-0.31493,-0.080715,-2.786589,0.798549,-0.333619,1.160112,-0.627701,-0.56426,-2.965181,4.166244,-0.095864,3.762793,0.053721,-0.534112,-3.293924,1.137364,-2.463752,0.698121,-1.13731,-2.643127,-4.72883,3.804839,-1.184614,4.320519,-0.318396,-1.018013,-2.158885,-0.550845,2.073743,0.257676,5.837066,1.414953,13.321513,1.831488,10.93244,1.089419,8.258604,0
4,3,0.491824,0.000363,0.000379,-0.006306,-0.014401,-0.009981,1,-3.172026,-3.093182,0.117693,0.060338,-0.055102,-0.051371,0.268371,0.208574,-3.460218,-2.211427,1.538665,2.419556,2.176433,1.585178,-1.211952,-1.769649,1.502681,2.025775,-0.125835,-0.193056,3.535245,2.574664,2.620725,2.989435,0.020895,-0.057339,-0.158237,-0.232188,-1.991974,-2.477487,-0.192601,-0.287046,...,-0.437305,3.173434,-0.172517,-0.564273,-2.632831,0.798549,-0.062365,1.160112,0.25476,-0.211133,-2.146654,4.166244,0.178233,3.762793,1.483567,0.210266,-2.254241,1.137364,-1.945715,0.698121,-0.259185,-2.089565,-3.466355,3.804839,-1.018744,4.320519,1.167875,-0.928127,-1.742049,-0.550845,2.073743,-0.250886,2.720679,0.695885,7.404032,1.015041,6.152228,0.369038,4.270006,0


In [26]:
# (rows, columns) of the reloaded test split; compare with the shape printed
# for `test` when the split was created
test_check.shape

(120504, 138)

# All looks good, we are ready to use this train/test set for machine learning