In [40]:
import pandas as pd
import src.util as utils
from sklearn.model_selection import train_test_split

In [41]:
# Load the project configuration; later cells read file paths, column lists
# and value ranges from it.
config = utils.load_config()

In [42]:
# Read the raw CSV from the path configured under "data_path".
dataset = pd.read_csv(config["data_path"])

In [43]:
# Preview the loaded frame (the display is truncated to the first and last rows).
dataset

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
0,1654733331,20.000,57.36,0,400,12306,18520,939.735,0.00,0.00,0.00,0.000,0.000,0,0
1,1654733332,20.015,56.67,0,400,12345,18651,939.744,0.00,0.00,0.00,0.000,0.000,1,0
2,1654733333,20.029,55.96,0,400,12374,18764,939.738,0.00,0.00,0.00,0.000,0.000,2,0
3,1654733334,20.044,55.28,0,400,12390,18849,939.736,0.00,0.00,0.00,0.000,0.000,3,0
4,1654733335,20.059,54.69,0,400,12403,18921,939.744,0.00,0.00,0.00,0.000,0.000,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,1655130047,18.438,15.79,625,400,13723,20569,936.670,0.63,0.65,4.32,0.673,0.015,5739,0
62626,1655130048,18.653,15.87,612,400,13731,20588,936.678,0.61,0.63,4.18,0.652,0.015,5740,0
62627,1655130049,18.867,15.84,627,400,13725,20582,936.687,0.57,0.60,3.95,0.617,0.014,5741,0
62628,1655130050,19.083,16.04,638,400,13712,20566,936.680,0.57,0.59,3.92,0.611,0.014,5742,0


In [44]:
# Count the missing values in each column.
dataset.isna().sum()

UTC               0
Temperature[C]    0
Humidity[%]       0
TVOC[ppb]         0
eCO2[ppm]         0
Raw H2            0
Raw Ethanol       0
Pressure[hPa]     0
PM1.0             0
PM2.5             0
NC0.5             0
NC1.0             0
NC2.5             0
CNT               0
Fire Alarm        0
dtype: int64

In [45]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
dataset.dtypes

UTC                 int64
Temperature[C]    float64
Humidity[%]       float64
TVOC[ppb]           int64
eCO2[ppm]           int64
Raw H2              int64
Raw Ethanol         int64
Pressure[hPa]     float64
PM1.0             float64
PM2.5             float64
NC0.5             float64
NC1.0             float64
NC2.5             float64
CNT                 int64
Fire Alarm          int64
dtype: object

In [46]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
count,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0,62630.0
mean,1654792000.0,15.970424,48.539499,1942.057528,670.021044,12942.453936,19754.257912,938.627649,100.594309,184.46777,491.463608,203.586487,80.049042,10511.386157,0.714626
std,110002.5,14.359576,8.865367,7811.589055,1905.885439,272.464305,609.513156,1.331344,922.524245,1976.305615,4265.661251,2214.738556,1083.383189,7597.870997,0.451596
min,1654712000.0,-22.01,10.74,0.0,400.0,10668.0,15317.0,930.852,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1654743000.0,10.99425,47.53,130.0,400.0,12830.0,19435.0,938.7,1.28,1.34,8.82,1.384,0.033,3625.25,0.0
50%,1654762000.0,20.13,50.15,981.0,400.0,12924.0,19501.0,938.816,1.81,1.88,12.45,1.943,0.044,9336.0,1.0
75%,1654778000.0,25.4095,53.24,1189.0,438.0,13109.0,20078.0,939.418,2.09,2.18,14.42,2.249,0.051,17164.75,1.0
max,1655130000.0,59.93,75.2,60000.0,60000.0,13803.0,21410.0,939.861,14333.69,45432.26,61482.03,51914.68,30026.438,24993.0,1.0


In [47]:
# Convert the UTC column from integer epoch seconds to pandas datetimes.
# Bracket indexing is used for the assignment: attribute-style assignment only
# updates a column when it already exists and otherwise silently creates a
# plain attribute on the DataFrame object instead of a column.
dataset["UTC"] = pd.to_datetime(dataset["UTC"], unit="s")

In [48]:
def check_data(input_data, config):
    """Validate the dtypes and value ranges of ``input_data`` against ``config``.

    Checks run in a fixed order and stop at the first failure:

    1. The datetime, int and float columns of the frame (as selected by
       ``select_dtypes``) must equal ``config["datetime_columns"]``,
       ``config["int_columns"]`` and ``config["float_columns"]`` exactly:
       same names, same order.
    2. Every value of each range-checked column must lie inside the inclusive
       ``[low, high]`` pair stored under the matching ``config["range_*"]`` key.

    Columns are paired with their range key by *position* inside
    ``config["float_columns"]`` / ``config["int_columns"]``. The int column at
    position 4 is not range-checked.
    NOTE(review): presumably position 0 of ``float_columns`` is temperature,
    position 1 humidity, and so on -- confirm against the config file, since a
    reordered list would silently pair columns with the wrong range.

    Args:
        input_data: pandas DataFrame to validate.
        config: mapping providing the ``*_columns`` lists and ``range_*`` pairs
            described above.

    Returns:
        None when every check passes.

    Raises:
        AssertionError: on the first failed check, with a message naming it.
            Raised explicitly rather than through ``assert`` statements so the
            validation is not stripped when Python runs with ``-O``.
    """
    row_count = len(input_data)

    # Dtype checks: the message label is the same word as the dtype selector.
    for dtype_name in ("datetime", "int", "float"):
        found_columns = input_data.select_dtypes(dtype_name).columns.to_list()
        if found_columns != config[f"{dtype_name}_columns"]:
            raise AssertionError(f"an error occurs in {dtype_name} column(s).")

    # (config key of the column list, position in that list,
    #  config key of the [low, high] pair, label used in the error message)
    range_checks = (
        ("float_columns", 0, "range_temperature", "temperature"),
        ("float_columns", 1, "range_humidity", "humidity"),
        ("float_columns", 2, "range_pressure", "pressure"),
        ("float_columns", 3, "range_pm1", "pm1"),
        ("float_columns", 4, "range_pm25", "pm25"),
        ("float_columns", 5, "range_nc05", "nc05"),
        ("float_columns", 6, "range_nc1", "nc1"),
        ("float_columns", 7, "range_nc25", "nc25"),
        ("int_columns", 0, "range_tvoc", "tvoc"),
        ("int_columns", 1, "range_eco2", "eco2"),
        ("int_columns", 2, "range_raw_h2", "raw h2"),
        ("int_columns", 3, "range_raw_ethanol", "raw ethanol"),
        ("int_columns", 5, "range_fire_alarm", "fire alarm"),
    )

    for columns_key, position, range_key, label in range_checks:
        column_name = config[columns_key][position]
        bounds = config[range_key]
        in_range = input_data[column_name].between(bounds[0], bounds[1])
        # Compare the in-range count with the row count so that any entry that
        # is not positively inside the bounds counts as a failure.
        if in_range.sum() != row_count:
            raise AssertionError(f"an error occurs in {label} range.")

In [49]:
# Validate the frame against the configured column lists and value ranges;
# an AssertionError is raised on the first failed check.
check_data(dataset, config)

In [50]:
# Pull the configured feature columns and the label out of the frame as
# independent copies, so later changes to them do not touch `dataset`.
x = dataset.loc[:, config["data_columns"]].copy()
y = dataset.loc[:, config["label"]].copy()

In [51]:
# Preview the feature columns selected via config["data_columns"].
x

Unnamed: 0,Temperature[C],Humidity[%],Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,CNT
0,20.000,57.36,939.735,0.00,0.00,0.00,0.000,0.000,0,400,12306,18520,0
1,20.015,56.67,939.744,0.00,0.00,0.00,0.000,0.000,0,400,12345,18651,1
2,20.029,55.96,939.738,0.00,0.00,0.00,0.000,0.000,0,400,12374,18764,2
3,20.044,55.28,939.736,0.00,0.00,0.00,0.000,0.000,0,400,12390,18849,3
4,20.059,54.69,939.744,0.00,0.00,0.00,0.000,0.000,0,400,12403,18921,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,18.438,15.79,936.670,0.63,0.65,4.32,0.673,0.015,625,400,13723,20569,5739
62626,18.653,15.87,936.678,0.61,0.63,4.18,0.652,0.015,612,400,13731,20588,5740
62627,18.867,15.84,936.687,0.57,0.60,3.95,0.617,0.014,627,400,13725,20582,5741
62628,19.083,16.04,936.680,0.57,0.59,3.92,0.611,0.014,638,400,13712,20566,5742


In [52]:
# Preview the label selected via config["label"].
y

0        0
1        0
2        0
3        0
4        0
        ..
62625    0
62626    0
62627    0
62628    0
62629    0
Name: Fire Alarm, Length: 62630, dtype: int64

In [53]:
# Split 70 % train / 30 % hold-out, then halve the hold-out into validation and
# test sets. Both splits are stratified on the label and use a fixed seed.
# The hold-out gets its own names so x_test / y_test are assigned exactly once:
# re-running this cell always starts from x and y instead of re-splitting an
# already halved test set.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)

In [36]:
# NOTE(review): this cell's execution count (36) is lower than that of the
# cells above (40-54), so the saved pickles may predate the current split --
# re-run it after a fresh Restart & Run All.

# Persist the full frame first, then the feature/label pair of every split.
utils.pickle_dump(dataset, config["data_cleaned_path"])

for path_key, features, labels in (
    ("train_set_path", x_train, y_train),
    ("valid_set_path", x_valid, y_valid),
    ("test_set_path", x_test, y_test),
):
    # Entry 0 of each configured path pair receives the features, entry 1 the labels.
    utils.pickle_dump(features, config[path_key][0])
    utils.pickle_dump(labels, config[path_key][1])