In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os.path as path
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Directory (relative to the notebook's working directory) that holds the .dat
# files read and written by the cells below.
data_dir = "./ember2018"

# Loading one subset of the original dataset

In [3]:
# Paths of the feature-matrix and label files inside data_dir.
fn_x_test, fn_y_test = (
    path.join(data_dir, fname) for fname in ('X_test.dat', 'y_test.dat')
)

In [4]:
# Map both raw float32 files read-only; nothing is copied into RAM here.
# The shapes are hard-coded: 200000 rows, 2381 feature columns.
X_test = np.memmap(fn_x_test, mode='r', dtype=np.float32, shape=(200000, 2381))
y_test = np.memmap(fn_y_test, mode='r', dtype=np.float32, shape=(200000,))

In [5]:
# Shapes of the mapped feature matrix and label vector.
tuple(arr.shape for arr in (X_test, y_test))

((200000, 2381), (200000,))

In [6]:
# Both objects should still be memory maps (not in-RAM arrays) at this point.
tuple(map(type, (X_test, y_test)))

(numpy.memmap, numpy.memmap)

In [7]:
# Re-display the input shapes right before building the splitter.
np.shape(X_test), np.shape(y_test)

((200000, 2381), (200000,))

In [10]:
# Splitter producing 5 stratified shuffles, each holding out 50% of the rows;
# the seed is fixed so the drawn indices are reproducible.
sss = StratifiedShuffleSplit(
    random_state=0,
    test_size=0.5,
    n_splits=5,
)

In [11]:
# Number of splits the splitter will yield.
sss.get_n_splits(X=X_test, y=y_test)

5

# Taking a stratified sample of 100K rows

In [13]:
%%time
for train_index, test_index in sss.split(X_test, y_test):
    print('Train: ', train_index, 'Test: ', test_index)
    #train_samp1, test_samp2 = X_test_s[train_index], y_test[test_index]
    X= X_test[train_index]
    y= y_test[train_index]

Train:  [146912  66072 175536 ...  58722 195481  23322] Test:  [ 99878  47205 106464 ...  38356 168948  29702]
Train:  [ 59320  15709 148367 ...  83745  88676  70146] Test:  [ 47256  39338  77648 ... 153905 106089 185906]
Train:  [101021 123742  45023 ... 104009 138881 168197] Test:  [167155 119998  59122 ... 176464  32642  25128]
Train:  [172377  71262  21143 ...  83329  64292 152824] Test:  [146680  54804  41485 ...  25295  18369  38197]
Train:  [ 80202  71319 194547 ...  60331 152556  98197] Test:  [ 56810  51205 105849 ...  98706 137818  97925]
CPU times: user 1.8 s, sys: 13 s, total: 14.8 s
Wall time: 4min 4s


In [14]:
# Shapes of the in-memory sample (features, labels).
tuple(arr.shape for arr in (X, y))

((100000, 2381), (100000,))

In [15]:
# Display the labels selected for the sample (NumPy abbreviates the repr).
y

array([1., 0., 1., ..., 0., 1., 0.], dtype=float32)

In [16]:
# Display the feature rows selected for the sample (NumPy abbreviates the repr).
X

array([[1.09190784e-01, 4.44486458e-03, 8.36170465e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.55203685e-01, 1.52413500e-02, 6.67317724e-03, ...,
        3.05288000e+05, 0.00000000e+00, 0.00000000e+00],
       [5.66591844e-02, 8.29706714e-03, 6.16767630e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.88515157e-01, 1.41237341e-02, 7.58019090e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.92788597e-02, 3.28338030e-03, 2.14415393e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.96198833e-02, 4.85010119e-03, 4.23697056e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], dtype=float32)

# Serialize the samples

In [17]:
# Output paths for the sampled features and labels, next to the input files.
fn_X_sample, fn_y_sample = (
    path.join(data_dir, fname)
    for fname in ('X_sample-100K.dat', 'y_sample-100K.dat')
)

In [18]:
# Create (or overwrite) the output files as writable float32 memory maps
# with the same shapes as the in-memory sample.
X_sample = np.memmap(fn_X_sample, mode='w+', dtype=np.float32, shape=X.shape)
y_sample = np.memmap(fn_y_sample, mode='w+', dtype=np.float32, shape=y.shape)

In [19]:
# Copy the in-memory sample into the file-backed arrays.
X_sample[:] = X
y_sample[:] = y

# Push the written data to disk right away, so persistence does not depend on
# the memmap objects being deleted or garbage-collected later.
X_sample.flush()
y_sample.flush()

In [20]:
# Confirm each memmap is backed by the absolute form of the intended path.
tuple(
    mm.filename == path.abspath(fn)
    for mm, fn in ((X_sample, fn_X_sample), (y_sample, fn_y_sample))
)

(True, True)

In [21]:
# Compare the container types: file-backed outputs vs. in-memory sample arrays.
tuple(map(type, (X_sample, X, y_sample, y)))

(numpy.memmap, numpy.ndarray, numpy.memmap, numpy.ndarray)

In [22]:
# Each file-backed array should have the same shape as its in-memory source.
tuple(arr.shape for arr in (X_sample, X, y_sample, y))

((100000, 2381), (100000, 2381), (100000,), (100000,))

In [23]:
# Drop the notebook's name for the writable feature memmap. NumPy's memmap
# documentation states that deleting the object flushes pending changes to
# disk. NOTE(review): this only happens once no other reference to it remains.
del X_sample

In [24]:
# Drop the notebook's name for the writable label memmap. NumPy's memmap
# documentation states that deleting the object flushes pending changes to
# disk. NOTE(review): this only happens once no other reference to it remains.
del y_sample