# Preparing the Missing Data

In [10]:
import pandas as pd
import numpy as np


def create_mask_dataset(data_x, miss_rate, seed=None):
    """Build a random 0/1 mask with the same shape as ``data_x``.

    Every cell is drawn independently: 0 with probability ``miss_rate / 100``
    and 1 otherwise. Only the shape of ``data_x`` is used; its values are
    never read.

    Parameters
    ----------
    data_x : object with a 2-element ``shape``
        Array whose ``(rows, columns)`` shape the mask should match.
    miss_rate : float
        Percentage, from 0 to 100, of cells expected to be 0.
    seed : int, optional
        When given, the mask is drawn from a private
        ``np.random.RandomState(seed)`` so the same seed always yields the
        same mask. When omitted, NumPy's global random state is used, exactly
        as before this parameter existed (so ``np.random.seed`` still works).

    Returns
    -------
    numpy.ndarray
        Array of shape ``data_x.shape`` containing only 0s and 1s.

    Raises
    ------
    ValueError
        If ``miss_rate`` is not within [0, 100] (this also rejects NaN).
    """
    # Fail early with a readable message instead of letting the sampler
    # complain about invalid probabilities.
    if not 0 <= miss_rate <= 100:
        raise ValueError(
            "miss_rate must be a percentage between 0 and 100, got %r" % (miss_rate,)
        )

    rows, columns = data_x.shape

    # Set the percentage of 0s and 1s
    percent_zeros = miss_rate / 100
    percent_ones = 1 - percent_zeros

    # Both the np.random module and a RandomState instance expose ``choice``
    # with the same signature, so the unseeded path is unchanged.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generate a random array with 0s and 1s based on the specified percentages
    result_array = rng.choice([0, 1], size=(rows, columns), p=[percent_zeros, percent_ones])

    return result_array
    

def data_loader(data_name, miss_rate, data_path='dataset.parquet'):
    """Load the dataset and build a randomly masked copy of its features.

    Parameters
    ----------
    data_name : str
        Currently unused: the file that is read is chosen by ``data_path``.
        Kept in the signature so existing callers keep working.
    miss_rate : float
        Percentage (0-100) of feature cells to blank out; forwarded to
        ``create_mask_dataset``.
    data_path : str, optional
        Parquet file to read. Defaults to ``'dataset.parquet'``, the path
        that used to be hard-coded here.

    Returns
    -------
    data_x : numpy.ndarray
        The selected feature columns, unmodified.
    miss_data_x : numpy.ndarray
        Copy of ``data_x`` with ``np.nan`` written wherever ``data_m`` is 0.
    data_m : numpy.ndarray
        The mask returned by ``create_mask_dataset`` for ``data_x``.
    data_label : numpy.ndarray
        The ``'sleeping'`` column.
    """
    feature_columns = ['step','timestamp','year','month','day','hour','minute','anglez','enmo']

    ## Load data
    df_par = pd.read_parquet(data_path)

    ### Split the label column from the feature columns
    data_label = df_par['sleeping'].to_numpy()
    data_x = df_par[feature_columns].to_numpy()

    ## Introducing missing data
    data_m = create_mask_dataset(data_x, miss_rate)
    miss_data_x = data_x.copy()
    # An integer array cannot hold NaN (the assignment below would raise), so
    # promote it to float first. Arrays of any other dtype are left as they are.
    if np.issubdtype(miss_data_x.dtype, np.integer):
        miss_data_x = miss_data_x.astype(float)
    miss_data_x[data_m == 0] = np.nan

    return data_x, miss_data_x, data_m, data_label


## Fix the seed of NumPy's global random state, which create_mask_dataset
## draws from, so the mask and the saved files are identical on every
## Restart & Run All.
SEED=42
np.random.seed(SEED)

## Percentage of feature cells to blank out
miss_rate=20
ori_data_x,missing_data_x,data_m,data_label=data_loader('child_mind',miss_rate)

## Storing as Pandas DataFrame
## (the column order must match the feature columns selected in data_loader)
columns=['step','timestamp', 'year','month','day', 'hour','minute','anglez', 'enmo']
ori_df=pd.DataFrame(ori_data_x, columns=columns)
missing_df=pd.DataFrame(missing_data_x, columns=columns)
mask_df=pd.DataFrame(data_m, columns=columns)
label_df=pd.DataFrame(data_label, columns=['sleeping'])

## Saving as parquet file
ori_df.to_parquet('original_data.parquet')
missing_df.to_parquet('missing_data.parquet')
mask_df.to_parquet('mask_data.parquet')
label_df.to_parquet('label_data.parquet')


In [11]:
# Preview the first rows (up to 20) of the unmasked feature table.
ori_df.head(20)

Unnamed: 0,step,timestamp,year,month,day,hour,minute,anglez,enmo
0,0,2018-11-05 14:00:00+00:00,2018,11,5,14,0,-30.845301,0.0447
1,1,2018-11-05 14:00:05+00:00,2018,11,5,14,0,-34.181801,0.0443
2,2,2018-11-05 14:00:10+00:00,2018,11,5,14,0,-33.877102,0.0483
3,3,2018-11-05 14:00:15+00:00,2018,11,5,14,0,-34.282101,0.068
4,4,2018-11-05 14:00:20+00:00,2018,11,5,14,0,-34.385799,0.0768
5,5,2018-11-05 14:00:25+00:00,2018,11,5,14,0,-34.925598,0.0511
6,6,2018-11-05 14:00:30+00:00,2018,11,5,14,0,-30.513399,0.1073
7,7,2018-11-05 14:00:35+00:00,2018,11,5,14,0,-30.509399,0.0649
8,8,2018-11-05 14:00:40+00:00,2018,11,5,14,0,-32.8806,0.0485
9,9,2018-11-05 14:00:45+00:00,2018,11,5,14,0,-34.674999,0.0462


In [12]:
# Preview the masked feature table; blanked-out cells appear as NaN / NaT.
missing_df.head(20)

Unnamed: 0,step,timestamp,year,month,day,hour,minute,anglez,enmo
0,0.0,NaT,2018.0,11.0,5.0,14.0,0.0,-30.845301,0.0447
1,,2018-11-05 14:00:05+00:00,2018.0,,5.0,14.0,,,0.0443
2,2.0,2018-11-05 14:00:10+00:00,2018.0,11.0,5.0,14.0,0.0,-33.877102,0.0483
3,3.0,NaT,,11.0,5.0,14.0,0.0,-34.282101,0.068
4,4.0,NaT,2018.0,11.0,5.0,14.0,0.0,-34.385799,0.0768
5,5.0,2018-11-05 14:00:25+00:00,2018.0,11.0,5.0,14.0,0.0,-34.925598,
6,6.0,NaT,2018.0,,5.0,,,-30.513399,0.1073
7,7.0,2018-11-05 14:00:35+00:00,,11.0,5.0,14.0,,-30.509399,0.0649
8,8.0,NaT,2018.0,11.0,,14.0,0.0,-32.8806,0.0485
9,9.0,NaT,2018.0,11.0,5.0,,0.0,,0.0462


In [13]:
# Preview the mask: 0 marks a cell that was blanked out in missing_df, 1 a cell that was kept.
mask_df.head(20)

Unnamed: 0,step,timestamp,year,month,day,hour,minute,anglez,enmo
0,1,0,1,1,1,1,1,1,1
1,0,1,1,0,1,1,0,0,1
2,1,1,1,1,1,1,1,1,1
3,1,0,0,1,1,1,1,1,1
4,1,0,1,1,1,1,1,1,1
5,1,1,1,1,1,1,1,1,0
6,1,0,1,0,1,0,0,1,1
7,1,1,0,1,1,1,0,1,1
8,1,0,1,1,0,1,1,1,1
9,1,0,1,1,1,0,1,0,1


In [None]:
# Preview the first rows of the 'sleeping' label column.
label_df.head()