# 02 Split the data into training and test sets

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
from datetime import datetime
import seaborn as sns
from pandas import Grouper
from missingpy import MissForest

from pandas.plotting import register_matplotlib_converters
from missingpy import MissForest

I read the data I have cleaned and slightly preprocessed in the EDA.

In [3]:
# Load the dataframe prepared in the EDA notebook.
# NOTE: the file is read with pd.read_pickle, so it is a pickle file even though its name ends in '.csv'.
data = pd.read_pickle('tidy_dataframe_capstone.csv')

In [4]:
data.shape

(35048, 28)

In [5]:
data.head()

Unnamed: 0_level_0,Weekday,Season,Year,Month,Day,Hour,SO2,NO2,CO,O3,...,it_rained,Lag-1,Lag-2,Lag-3,Lag-4,Lag-5,Lag-6,Lag-7,Lag-8,EMA
Date and Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-01 16:00:00,Friday,winter,2013,3,1,16,7.0,9.0,300.0,72.0,...,0,12.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.925471
2013-03-01 17:00:00,Friday,winter,2013,3,1,17,8.0,14.0,300.0,67.0,...,0,11.0,12.0,3.0,3.0,3.0,3.0,3.0,3.0,5.75326
2013-03-01 18:00:00,Friday,winter,2013,3,1,18,9.0,20.0,400.0,58.0,...,0,13.0,11.0,12.0,3.0,3.0,3.0,3.0,3.0,6.15014
2013-03-01 19:00:00,Friday,winter,2013,3,1,19,7.0,29.0,500.0,50.0,...,0,10.0,13.0,11.0,12.0,3.0,3.0,3.0,3.0,6.480855
2013-03-01 20:00:00,Friday,winter,2013,3,1,20,6.0,29.0,500.0,47.0,...,0,10.0,10.0,13.0,11.0,12.0,3.0,3.0,3.0,6.840187


In [6]:
data.tail()

Unnamed: 0_level_0,Weekday,Season,Year,Month,Day,Hour,SO2,NO2,CO,O3,...,it_rained,Lag-1,Lag-2,Lag-3,Lag-4,Lag-5,Lag-6,Lag-7,Lag-8,EMA
Date and Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-02-28 19:00:00,Tuesday,winter,2017,2,28,19,3.0,24.0,400.0,72.0,...,0,11.0,11.0,6.0,11.0,3.0,3.0,6.0,4.0,57.043537
2017-02-28 20:00:00,Tuesday,winter,2017,2,28,20,3.0,41.0,500.0,50.0,...,0,11.0,11.0,11.0,6.0,11.0,3.0,3.0,6.0,56.799528
2017-02-28 21:00:00,Tuesday,winter,2017,2,28,21,4.0,38.0,500.0,54.0,...,0,13.0,11.0,11.0,11.0,6.0,11.0,3.0,3.0,56.562412
2017-02-28 22:00:00,Tuesday,winter,2017,2,28,22,4.0,30.0,400.0,59.0,...,0,14.0,13.0,11.0,11.0,11.0,6.0,11.0,3.0,56.315528
2017-02-28 23:00:00,Tuesday,winter,2017,2,28,23,4.0,38.0,600.0,49.0,...,0,12.0,14.0,13.0,11.0,11.0,11.0,6.0,11.0,56.075553


## Split the data into train and test data

I will split the dataset into train and test sets. Since time series data cannot be shuffled and the order has to be maintained, I need to make sure that the test set covers a later period in time than the training set, to prevent any information about the future from "leaking" into the model during training. I decide to use the last six months of the data (i.e. from 2016-09-01 to 2017-02-28) for the test set.

In [7]:
# Define a function to split the data into training and test sets

def train_test_split(df, start_dt):

    '''
    This function performs a chronological train/test split for time-series data.

    Rows whose index is strictly before `start_dt` form the train set; rows whose
    index is at or after `start_dt` form the test set. The 'PM2.5' column is the
    target and all remaining columns are the features. The shapes of the four
    resulting pieces are printed.

    Arguments
    ---------
    - df        : the data (features & target); its index must be comparable
                  with `start_dt`
    - start_dt  : the starting date for the test set (e.g. '2016-09-01')

    Outputs
    -------
    - X_tr,X_te : feature values (`.values` of the feature columns) for the
                  train / test set
    - y_tr,y_te : the 'PM2.5' column of the train / test set

    The tuple is returned in the order (X_tr, X_te, y_tr, y_te).
    '''

    target_col = 'PM2.5'

    # Use the `start_dt` argument for the boundary. Relying on a notebook-level
    # variable here would make the function fail on a fresh kernel and ignore
    # whatever date the caller passes in.
    train_rows = df[df.index < start_dt]
    test_rows = df[df.index >= start_dt]

    # Train set
    X_tr = train_rows.drop(target_col, axis=1).values
    y_tr = train_rows[target_col]

    # Test set
    X_te = test_rows.drop(target_col, axis=1).values
    y_te = test_rows[target_col]

    print('Train set')
    print('---------')
    print('Features: {} Target: {}\n'.format(X_tr.shape,y_tr.shape))

    print('Test set')
    print('--------')
    print('Features: {} Target: {}'.format(X_te.shape,y_te.shape))

    return (X_tr,X_te,y_tr,y_te)

In [8]:
# First timestamp of the test set: rows strictly before it are used for training,
# rows at or after it are used for testing.
test_start_dt = '2016-09-01'
X_tr, X_te, y_tr, y_te = train_test_split(data, test_start_dt)

Train set
---------
Features: (30704, 27) Target: (30704,)

Test set
--------
Features: (4344, 27) Target: (4344,)


In [9]:
pd.to_datetime('02-01-2013')

Timestamp('2013-02-01 00:00:00')

In [10]:
pd.to_datetime('01-03-2013')

Timestamp('2013-01-03 00:00:00')

In [11]:
pd.to_datetime('01-20-2013')

Timestamp('2013-01-20 00:00:00')

In [12]:
pd.to_datetime('01-04-2013')

Timestamp('2013-01-04 00:00:00')

In [13]:
pd.to_datetime('2016-09-01')

Timestamp('2016-09-01 00:00:00')

I am checking the last observation of the training set.

In [16]:
# X_tr has 30704 rows, so position 30703 is its final row.
X_tr[30703]

array(['Wednesday', 'summer', 2016, 8, 31, 23, 2.0, 33.0, 200.0, 48.0,
       27.7, 991.2, -2.2, 0.0, 'NW', 1.1, 49.0, 0, 6.0, 13.0, 12.0, 12.0,
       11.0, 8.0, 4.0, 11.0, 35.67288448364514], dtype=object)

data_tr is the dataframe that corresponds to the training set.

In [17]:
# Training rows as a DataFrame (target column included). The boundary is taken from
# test_start_dt, the same value used for the X/y split, instead of repeating a
# hard-coded timestamp that could drift out of sync with it.
data_tr = data[data.index < test_start_dt]

In [18]:
# Test rows as a DataFrame (target column included). The boundary is taken from
# test_start_dt, the same value used for the X/y split, instead of repeating a
# hard-coded timestamp that could drift out of sync with it.
data_te = data[data.index >= test_start_dt]

In [19]:
data_tr.head(3)

Unnamed: 0_level_0,Weekday,Season,Year,Month,Day,Hour,SO2,NO2,CO,O3,...,it_rained,Lag-1,Lag-2,Lag-3,Lag-4,Lag-5,Lag-6,Lag-7,Lag-8,EMA
Date and Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-01 16:00:00,Friday,winter,2013,3,1,16,7.0,9.0,300.0,72.0,...,0,12.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.925471
2013-03-01 17:00:00,Friday,winter,2013,3,1,17,8.0,14.0,300.0,67.0,...,0,11.0,12.0,3.0,3.0,3.0,3.0,3.0,3.0,5.75326
2013-03-01 18:00:00,Friday,winter,2013,3,1,18,9.0,20.0,400.0,58.0,...,0,13.0,11.0,12.0,3.0,3.0,3.0,3.0,3.0,6.15014


In [20]:
data_tr.tail(3)

Unnamed: 0_level_0,Weekday,Season,Year,Month,Day,Hour,SO2,NO2,CO,O3,...,it_rained,Lag-1,Lag-2,Lag-3,Lag-4,Lag-5,Lag-6,Lag-7,Lag-8,EMA
Date and Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-08-31 21:00:00,Wednesday,summer,2016,8,31,21,2.0,22.0,200.0,68.0,...,0,12.0,12.0,11.0,8.0,4.0,11.0,11.0,19.0,36.004423
2016-08-31 22:00:00,Wednesday,summer,2016,8,31,22,2.0,24.0,200.0,63.0,...,0,13.0,12.0,12.0,11.0,8.0,4.0,11.0,11.0,35.838193
2016-08-31 23:00:00,Wednesday,summer,2016,8,31,23,2.0,33.0,200.0,48.0,...,0,6.0,13.0,12.0,12.0,11.0,8.0,4.0,11.0,35.672884


In [21]:
data_te.head(3)

Unnamed: 0_level_0,Weekday,Season,Year,Month,Day,Hour,SO2,NO2,CO,O3,...,it_rained,Lag-1,Lag-2,Lag-3,Lag-4,Lag-5,Lag-6,Lag-7,Lag-8,EMA
Date and Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-09-01 00:00:00,Thursday,summer,2016,9,1,0,2.0,36.0,300.0,36.0,...,0,6.0,6.0,13.0,12.0,12.0,11.0,8.0,4.0,35.541733
2016-09-01 01:00:00,Thursday,summer,2016,9,1,1,2.0,36.0,300.0,28.0,...,0,12.0,6.0,6.0,13.0,12.0,12.0,11.0,8.0,35.439008
2016-09-01 02:00:00,Thursday,summer,2016,9,1,2,2.0,33.0,300.0,28.0,...,0,17.0,12.0,6.0,6.0,13.0,12.0,12.0,11.0,35.259291


In [22]:
data_te.tail(3)

Unnamed: 0_level_0,Weekday,Season,Year,Month,Day,Hour,SO2,NO2,CO,O3,...,it_rained,Lag-1,Lag-2,Lag-3,Lag-4,Lag-5,Lag-6,Lag-7,Lag-8,EMA
Date and Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-02-28 21:00:00,Tuesday,winter,2017,2,28,21,4.0,38.0,500.0,54.0,...,0,13.0,11.0,11.0,11.0,6.0,11.0,3.0,3.0,56.562412
2017-02-28 22:00:00,Tuesday,winter,2017,2,28,22,4.0,30.0,400.0,59.0,...,0,14.0,13.0,11.0,11.0,11.0,6.0,11.0,3.0,56.315528
2017-02-28 23:00:00,Tuesday,winter,2017,2,28,23,4.0,38.0,600.0,49.0,...,0,12.0,14.0,13.0,11.0,11.0,11.0,6.0,11.0,56.075553


In [23]:
data_tr.shape

(30704, 28)

In [24]:
data_te.shape

(4344, 28)

In [25]:
X_tr.shape

(30704, 27)

In [26]:
X_te.shape

(4344, 27)

Checking the last entry of the training set.

In [27]:
# NOTE(review): X_tr has 30704 rows (valid positions 0..30703), so the slice 30711:30712
# is out of range and yields an empty array; X_tr[-1:] would show the actual last row.
X_tr[30711:30712]

array([], shape=(0, 27), dtype=object)

In [28]:
X_tr

array([['Friday', 'winter', 2013, ..., 3.0, 3.0, 4.925471414010265],
       ['Friday', 'winter', 2013, ..., 3.0, 3.0, 5.753259698051195],
       ['Friday', 'winter', 2013, ..., 3.0, 3.0, 6.150140202624346],
       ...,
       ['Wednesday', 'summer', 2016, ..., 11.0, 19.0, 36.00442251994568],
       ['Wednesday', 'summer', 2016, ..., 11.0, 11.0, 35.838193032300545],
       ['Wednesday', 'summer', 2016, ..., 4.0, 11.0, 35.67288448364514]],
      dtype=object)

I now need to drop the target variable (PM2.5) from the dataframes so that they have the same columns as the X_tr array.

In [29]:
data_tr.columns

Index(['Weekday', 'Season', 'Year', 'Month', 'Day', 'Hour', 'SO2', 'NO2', 'CO',
       'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'PM10', 'PM2.5',
       'it_rained', 'Lag-1', 'Lag-2', 'Lag-3', 'Lag-4', 'Lag-5', 'Lag-6',
       'Lag-7', 'Lag-8', 'EMA'],
      dtype='object')

In [30]:
data_te.columns

Index(['Weekday', 'Season', 'Year', 'Month', 'Day', 'Hour', 'SO2', 'NO2', 'CO',
       'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'PM10', 'PM2.5',
       'it_rained', 'Lag-1', 'Lag-2', 'Lag-3', 'Lag-4', 'Lag-5', 'Lag-6',
       'Lag-7', 'Lag-8', 'EMA'],
      dtype='object')

I save the data_tr and data_te because they will be used for the missing value imputation of 'wd'.

In [31]:
# Written with to_pickle, so the file content is a pickle even though the name ends in '.csv';
# it has to be loaded back with pd.read_pickle (see the commented example below).
data_tr.to_pickle('training_dataset_df.csv')
# df = pd.read_pickle(file_name)

In [32]:
# Written with to_pickle, so the file content is a pickle even though the name ends in '.csv';
# it has to be loaded back with pd.read_pickle.
data_te.to_pickle('test_dataset_df.csv')

I save the target for the training and test subsets.

In [33]:
# Save the targets in NumPy binary format. np.save appends '.npy' to a file name that
# lacks it, so these calls write 'y_tr.npy' and 'y_te.npy'.
np.save('y_tr', y_tr)
np.save('y_te', y_te)

In [39]:
# Also save the targets via to_pickle. The files are pickles even though their names
# end in '.csv'; load them back with pd.read_pickle.
y_tr.to_pickle('y_tr.csv')
y_te.to_pickle('y_te.csv')

In [37]:
pd.DataFrame(y_te)

Unnamed: 0_level_0,PM2.5
Date and Time,Unnamed: 1_level_1
2016-09-01 00:00:00,12.0
2016-09-01 01:00:00,17.0
2016-09-01 02:00:00,3.0
2016-09-01 03:00:00,8.0
2016-09-01 04:00:00,14.0
2016-09-01 05:00:00,13.0
2016-09-01 06:00:00,7.0
2016-09-01 07:00:00,8.0
2016-09-01 08:00:00,10.0
2016-09-01 09:00:00,3.0
