In [1]:
!pip install xgboost



In [2]:
!pip install tensorflow



In [3]:
### utils

# coding=utf-8
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''Utility functions for GAIN.

(1) normalization: MinMax Normalizer
(2) renormalization: Recover the data from normalized data
(3) rounding: Handle categorical variables after imputation
(4) rmse_loss: Evaluate imputed data in terms of RMSE
(5) xavier_init: Xavier initialization
(6) binary_sampler: sample binary random variables
(7) uniform_sampler: sample uniform random variables
(8) sample_batch_index: sample random batch index
'''
 
# Necessary packages
import numpy as np
# import tensorflow as tf
##IF USING TF 2 use following import to still use TF < 2.0 Functionalities
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()


def normalization (data, parameters=None):
    '''
    Min-max normalize every column of data into (approximately) [0, 1].

    NaN entries are ignored when computing the statistics and stay NaN in
    the output. The input array is never modified.

    Args:
    - data: original data, a 2-D numpy array (rows = samples, cols = features)
    - parameters: optional dict with keys 'min_val' and 'max_val' as returned
      by a previous call; when given, these statistics are reused instead of
      being recomputed from data

    Returns:
    - norm_data: normalized data (floating point)
    - norm_parameters: dict with 'min_val' (per-column minimum) and
      'max_val' (per-column maximum *after* subtracting the minimum, i.e.
      the column range) for renormalization
    '''

    _, dim = data.shape

    # Always work on a floating-point copy: assigning the scaled values back
    # into an integer array would silently truncate them to 0.
    if np.issubdtype(data.dtype, np.floating):
        norm_data = data.copy()
    else:
        norm_data = data.astype(float)

    if parameters is None:

        # MinMax normalization with statistics computed from the data
        min_val = np.zeros(dim)
        max_val = np.zeros(dim)

        for i in range(dim):
            col_min = np.nanmin(norm_data[:, i])
            min_val[i] = col_min
            norm_data[:, i] = norm_data[:, i] - col_min

            # Taken after the shift, so this is the column range.
            col_range = np.nanmax(norm_data[:, i])
            max_val[i] = col_range
            # 1e-6 guards against division by zero for constant columns.
            norm_data[:, i] = norm_data[:, i] / (col_range + 1e-6)

        # Return norm_parameters for renormalization
        norm_parameters = {'min_val': min_val,
                           'max_val': max_val}

    else:
        min_val = parameters['min_val']
        max_val = parameters['max_val']

        # Apply the supplied statistics column by column
        for i in range(dim):
            norm_data[:, i] = norm_data[:, i] - min_val[i]
            norm_data[:, i] = norm_data[:, i] / (max_val[i] + 1e-6)

        norm_parameters = parameters

    return norm_data, norm_parameters


def renormalization (norm_data, norm_parameters):
    '''
    Map min-max normalized data back to its original scale.

    Args:
    - norm_data: normalized data (2-D array)
    - norm_parameters: dict holding 'min_val' and 'max_val' per feature,
      as produced by normalization()

    Returns:
    - renorm_data: a new array on the original scale
    '''
    offsets = norm_parameters['min_val']
    spans = norm_parameters['max_val']

    _, n_features = norm_data.shape
    restored = norm_data.copy()

    # Undo the two normalization steps in reverse order, one column at a time.
    for col in range(n_features):
        scale = spans[col] + 1e-6
        restored[:, col] = restored[:, col] * scale
        restored[:, col] = restored[:, col] + offsets[col]

    return restored


def rounding (imputed_data, data_x):
    '''
    Round the imputed values of columns that look categorical.

    A column is treated as categorical when its observed (non-NaN) values
    in data_x take fewer than 20 distinct values.

    Args:
    - imputed_data: imputed data
    - data_x: original data with missing values (NaN)

    Returns:
    - rounded_data: copy of imputed_data with categorical columns rounded
    '''

    _, n_features = data_x.shape
    result = imputed_data.copy()

    for col in range(n_features):
        column = data_x[:, col]
        observed = column[~np.isnan(column)]
        # Continuous columns (20 or more distinct values) are left untouched.
        if len(np.unique(observed)) >= 20:
            continue
        result[:, col] = np.round(result[:, col])

    return result


def rmse_loss (ori_data, imputed_data, data_m):
    '''
    RMSE between original and imputed data, measured on missing cells only.

    Both inputs are min-max normalized with the statistics of ori_data
    before the error is computed.

    Args:
    - ori_data: original data without missing values
    - imputed_data: imputed data
    - data_m: indicator matrix for missingness (1 = observed, 0 = missing)

    Returns:
    - rmse: Root Mean Squared Error
    '''

    scaled_truth, stats = normalization(ori_data)
    scaled_guess, _ = normalization(imputed_data, stats)

    # Weight is 1 exactly where the value was missing.
    missing = 1-data_m
    squared_error = np.sum((missing * scaled_truth - missing * scaled_guess)**2)
    n_missing = np.sum(missing)

    return np.sqrt(squared_error/float(n_missing))


def xavier_init(size):
    '''
    Draw a random-normal tensor whose stddev is 1 / sqrt(size[0] / 2).

    Args:
    - size: shape of the tensor to create; size[0] is used as the input
      dimension

    Returns:
    - initialized random tensor of the requested shape.
    '''
    fan_in = size[0]
    return tf.random_normal(shape = size, stddev = 1. / tf.sqrt(fan_in / 2.))
      

def binary_sampler(p, rows, cols):
    '''
    Draw a rows x cols matrix of independent 0/1 values.

    Args:
    - p: probability of 1
    - rows: the number of rows
    - cols: the number of columns

    Returns:
    - binary_random_matrix: integer matrix of zeros and ones.
    '''
    draws = np.random.uniform(0., 1., size = [rows, cols])
    hits = draws < p
    # Multiplying by 1 turns the boolean mask into integers.
    return 1*hits


def uniform_sampler(low, high, rows, cols):
    '''
    Draw a rows x cols matrix of uniform random values.

    Args:
    - low: low limit
    - high: high limit
    - rows: the number of rows
    - cols: the number of columns

    Returns:
    - uniform_random_matrix: matrix drawn with np.random.uniform(low, high).
    '''
    shape = [rows, cols]
    samples = np.random.uniform(low, high, size = shape)
    return samples


def sample_batch_index(total, batch_size):
    '''
    Pick the row indices of one mini-batch, without replacement.

    Args:
    - total: total number of samples
    - batch_size: batch size

    Returns:
    - batch_idx: the first batch_size entries of a random permutation of
      range(total)
    '''
    shuffled = np.random.permutation(total)
    return shuffled[:batch_size]

Instructions for updating:
non-resource variables are not supported in the long term


In [45]:
# coding=utf-8
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''GAIN function.
Date: 2020/02/28
Reference: J. Yoon, J. Jordon, M. van der Schaar, "GAIN: Missing Data 
           Imputation using Generative Adversarial Nets," ICML, 2018.
Paper Link: http://proceedings.mlr.press/v80/yoon18a/yoon18a.pdf
Contact: jsyoon0823@gmail.com
'''

# Necessary packages
#import tensorflow as tf
##IF USING TF 2 use following import to still use TF < 2.0 Functionalities
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import numpy as np
from tqdm import tqdm

# from utils import normalization, renormalization, rounding
# from utils import xavier_init
# from utils import binary_sampler, uniform_sampler, sample_batch_index


# Per-iteration loss history, shared by every gain() call in this kernel.
d_loss=[]
g_loss=[]

def gain (data_x, gain_parameters):
    '''
    Impute missing values in data_x with a GAIN model trained on data_x.

    Args:
    - data_x: original data with missing values (2-D array, NaN = missing)
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size (clamped to the number of rows of data_x)
      - hint_rate: Hint rate
      - alpha: Hyperparameter weighting the reconstruction (MSE) loss
      - iterations: Iterations
      - loss_dir (optional): directory that receives d_loss.csv and
        g_loss.csv; defaults to 'parquet/80'

    Returns:
    - imputed_data: imputed data on the original scale

    Side effects:
    - Appends one discriminator and one generator loss value per iteration
      to the module-level lists d_loss and g_loss, then writes both lists to
      CSV. NOTE(review): the lists are never cleared, so they (and the CSV
      files) accumulate the losses of every gain() call made so far.
    '''
    # pandas is not imported by this cell's import block.
    import pandas as pd

    # Define mask matrix (1 = observed, 0 = missing)
    data_m = 1-np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    loss_dir = gain_parameters.get('loss_dir', 'parquet/80')

    # Other parameters
    no, dim = data_x.shape

    # sample_batch_index() returns at most `no` indices, while the noise and
    # hint matrices are sized by batch_size; clamp so the shapes always agree.
    batch_size = min(batch_size, no)

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization; missing entries become 0. The fill value must be passed
    # by keyword: the second positional argument of nan_to_num is `copy`.
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    # Build the model in its own graph so repeated calls do not keep piling
    # variables and optimizer ops onto the shared default graph.
    with tf.Graph().as_default():

        ## GAIN architecture
        # Input placeholders
        # Data vector
        X = tf.placeholder(tf.float32, shape = [None, dim])
        # Mask vector
        M = tf.placeholder(tf.float32, shape = [None, dim])
        # Hint vector
        H = tf.placeholder(tf.float32, shape = [None, dim])

        # Discriminator variables
        D_W1 = tf.Variable(xavier_init([dim*2, h_dim])) # Data + Hint as inputs
        D_b1 = tf.Variable(tf.zeros(shape = [h_dim]))

        D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
        D_b2 = tf.Variable(tf.zeros(shape = [h_dim]))

        D_W3 = tf.Variable(xavier_init([h_dim, dim]))
        D_b3 = tf.Variable(tf.zeros(shape = [dim]))  # Multi-variate outputs

        theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

        # Generator variables
        # Data + Mask as inputs (Random noise is in missing components)
        G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))
        G_b1 = tf.Variable(tf.zeros(shape = [h_dim]))

        G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
        G_b2 = tf.Variable(tf.zeros(shape = [h_dim]))

        G_W3 = tf.Variable(xavier_init([h_dim, dim]))
        G_b3 = tf.Variable(tf.zeros(shape = [dim]))

        theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

        ## GAIN functions
        # Generator
        def generator(x,m):
            # Concatenate Mask and Data
            inputs = tf.concat(values = [x, m], axis = 1)
            G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
            G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
            # MinMax normalized output
            G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
            return G_prob

        # Discriminator
        def discriminator(x, h):
            # Concatenate Data and Hint
            inputs = tf.concat(values = [x, h], axis = 1)
            D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
            D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
            D_logit = tf.matmul(D_h2, D_W3) + D_b3
            D_prob = tf.nn.sigmoid(D_logit)
            return D_prob

        ## GAIN structure
        # Generator
        G_sample = generator(X, M)

        # Combine with observed data
        Hat_X = X * M + G_sample * (1-M)

        # Discriminator
        D_prob = discriminator(Hat_X, H)

        ## GAIN loss (1e-8 keeps the logarithms finite)
        D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                    + (1-M) * tf.log(1. - D_prob + 1e-8))

        G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))

        MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

        D_loss = D_loss_temp
        G_loss = G_loss_temp + alpha * MSE_loss

        ## GAIN solver
        D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
        G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

        ## Iterations; the context manager closes the session when done.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # Start Iterations
            for _ in tqdm(range(iterations)):

                # Sample batch
                batch_idx = sample_batch_index(no, batch_size)
                X_mb = norm_data_x[batch_idx, :]
                M_mb = data_m[batch_idx, :]
                # Sample random vectors
                Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
                # Sample hint vectors
                H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
                H_mb = M_mb * H_mb_temp

                # Combine random vectors with observed vectors
                X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

                _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                      feed_dict = {M: M_mb, X: X_mb, H: H_mb})
                _, G_loss_curr, _ = sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict = {X: X_mb, M: M_mb, H: H_mb})
                d_loss.append(D_loss_curr)
                g_loss.append(G_loss_curr)

            ## Return imputed data.
            # This runs ONCE, after training: imputing the full dataset
            # inside the loop repeats the work on every iteration and leaves
            # imputed_data undefined when iterations == 0.
            Z_mb = uniform_sampler(0, 0.01, no, dim)
            X_mb = data_m * norm_data_x + (1-data_m) * Z_mb

            imputed_data = sess.run([G_sample], feed_dict = {X: X_mb, M: data_m})[0]

    # Keep observed values, use generator output only where data was missing
    imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    # Persist the loss history collected so far
    d_loss_df=pd.DataFrame(d_loss)
    g_loss_df=pd.DataFrame(g_loss)

    d_loss_df.to_csv(f'{loss_dir}/d_loss.csv')
    g_loss_df.to_csv(f'{loss_dir}/g_loss.csv')

    return imputed_data


In [44]:
import argparse
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

def create_mask_dataset(data_x,miss_rate):
    '''
    Build a random 0/1 mask with the same shape as data_x.

    Args:
    - data_x: 2-D array whose shape the mask copies
    - miss_rate: percentage (0-100); each cell is 0 with this probability

    Returns:
    - array of 0s and 1s with shape data_x.shape
    '''
    n_rows, n_cols = data_x.shape

    # Convert the percentage to per-cell probabilities for the two outcomes.
    p_zero = miss_rate/100
    p_one = 1 - p_zero

    mask = np.random.choice([0, 1], size=(n_rows, n_cols), p=[p_zero, p_one])
    return mask
    

def data_loader(data_name, miss_rate, path='dataset.parquet'):
    '''
    Load the dataset and create a copy with randomly removed values.

    Args:
    - data_name: dataset name (currently unused; kept for callers)
    - miss_rate: percentage (0-100) of feature cells to replace by NaN
    - path: parquet file to read; it must contain the column 'sleeping' and
      the eight feature columns selected below

    Returns:
    - data_x: complete feature matrix
    - miss_data_x: copy of data_x with NaN where the mask is 0
    - data_m: 0/1 mask (1 = kept, 0 = removed)
    - data_label: values of the 'sleeping' column
    '''
    ## Load data
    df_par=pd.read_parquet(path)
    data_label=df_par['sleeping'].to_numpy()

    ### Select the feature columns
    feature_columns=['step','year','month','day','hour','minute','anglez','enmo']
    data_x=df_par[feature_columns].to_numpy()

    # NaN cannot be stored in an integer array, so make sure the feature
    # matrix is floating point before values are removed.
    if not np.issubdtype(data_x.dtype, np.floating):
        data_x=data_x.astype(float)

    ##Introducing missing data
    data_m=create_mask_dataset(data_x,miss_rate)
    miss_data_x=data_x.copy()
    miss_data_x[data_m==0] = np.nan

    return data_x,miss_data_x,data_m,data_label

    
data_name='child_sleep'   
# Percentage of cells removed from the data.
# NOTE(review): main() reads ori_data_x, missing_data_x, data_m and label_df
# from the module scope, so this cell must be re-run with the matching
# miss_rate before each main(...) call.
miss_rate=80 ## CHANGE THIS FOR OTHER RATES
### Step 1: Preparing Data and Loading Data
ori_data_x,missing_data_x,data_m,data_label=data_loader('child_sleep',miss_rate)

## Wrap the arrays in pandas DataFrames with named feature columns
columns=['step', 'year','month','day', 'hour','minute','anglez', 'enmo']
ori_df=pd.DataFrame(ori_data_x, columns=columns)
missing_df=pd.DataFrame(missing_data_x, columns=columns)
mask_df=pd.DataFrame(data_m, columns=columns)
label_df=pd.DataFrame(data_label, columns=['sleeping'])

# ## Saving as parquet file

# if miss_rate==20:
#     ori_df.to_parquet(r'parquet/20/original_data_X.parquet')
#     missing_df.to_parquet(r'parquet/20/missing_data_X.parquet')
#     mask_df.to_parquet(r'parquet/20/mask_data_X.parquet')
#     label_df.to_parquet(r'parquet/20/label_data.parquet')
# if miss_rate==50:
#     ori_df.to_parquet(r'parquet/50/original_data_X.parquet')
#     missing_df.to_parquet(r'parquet/50/missing_data_X.parquet')
#     mask_df.to_parquet(r'parquet/50/mask_data_X.parquet')
#     label_df.to_parquet(r'parquet/50/label_data.parquet')
# if miss_rate==80:
#     ori_df.to_parquet(r'parquet/20/original_data_X.parquet')
#     missing_df.to_parquet(r'parquet/20/missing_data_X.parquet')
#     mask_df.to_parquet(r'parquet/20/mask_data_X.parquet')
#     label_df.to_parquet(r'parquet/20/label_data.parquet')

In [46]:
import argparse
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


def main(data_name,miss_rate, batch_size, hint_rate, alpha, iterations):
    '''
    Run the GAIN imputation experiment and evaluate KNN / XGBoost on it.

    The data is NOT loaded here: the function reads the module-level
    variables ori_data_x, missing_data_x, data_m and label_df prepared by
    the data-loading cell.

    Steps:
    1. Split the rows, without shuffling, into a leading training part of
       (1 - miss_rate/100) of the rows and a trailing test part.
    2. Save the splits under parquet/<miss_rate>/.
    3. Fit KNN and XGBoost on the complete training data.
    4. Impute the training and the test split with gain() and report RMSE.
    5. Classify the imputed test rows stacked on top of the complete test
       rows and print the reports.

    Args:
    - data_name: dataset name (currently unused)
    - miss_rate: missing percentage; selects the split size and output folder
    - batch_size, hint_rate, alpha, iterations: GAIN parameters

    Returns:
    - the RMSE of the imputed *training* split
    '''
    out_dir = f'parquet/{miss_rate}'

    # 'loss_dir' tells gain() where to write its loss curves; an
    # implementation that does not know the key simply ignores it.
    gain_parameters = {'batch_size': batch_size,
                       'hint_rate': hint_rate,
                       'alpha': alpha,
                       'iterations': iterations,
                       'loss_dir': out_dir}

    feature_columns = ['step', 'year','month','day', 'hour','minute','anglez', 'enmo']

    ### Step 2: Split dataset

    # Number of leading rows used for training
    num_training_samples = int((1-(miss_rate/100))*len(ori_data_x))

    # Split the indices without shuffling
    training_idx = np.arange(num_training_samples)
    test_idx = np.arange(num_training_samples, ori_data_x.shape[0])

    # Complete data, data with missing values and masks for both splits
    x_train, x_test = ori_data_x[training_idx, :], ori_data_x[test_idx, :]
    train_miss_data_X, test_miss_data_X = missing_data_x[training_idx, :], missing_data_x[test_idx, :]
    train_data_m, test_data_m = data_m[training_idx, :], data_m[test_idx, :]

    labels_array = label_df.to_numpy()
    y_train, y_test = labels_array[training_idx], labels_array[test_idx]

    ##### Save the splits; the ffill / bfill baseline reads them back later.
    saved_splits = (
        ('train_x', x_train),
        ('test_x', x_test),
        ('train_miss_x', train_miss_data_X),
        ('test_miss_x', test_miss_data_X),
        ('train_y', y_train),
        ('test_y', y_test),
    )
    for file_stem, array in saved_splits:
        pd.DataFrame(array).to_parquet(f'{out_dir}/{file_stem}.parquet')

    ## KNN classifier trained on complete training data (no missing values)
    neigh = KNeighborsClassifier(n_neighbors=10)
    neigh.fit(x_train, np.ravel(y_train,order='C'))

    knn_train_accuracy = accuracy_score(y_train, neigh.predict(x_train))
    print(f"Train Accuracy on KNN: {knn_train_accuracy}")

    ## XGB classifier trained on complete training data (no missing values)
    xgb_model=xgb.XGBClassifier(objective='binary:logistic')
    xgb_model.fit(x_train,y_train)

    xgb_train_accuracy = accuracy_score(y_train, xgb_model.predict(x_train))
    print(f"Train Accuracy on XGB: {xgb_train_accuracy}")

    # Impute missing data for the training split and report its RMSE
    train_imputed_data_X = gain(train_miss_data_X, gain_parameters)
    rmse_train = rmse_loss(x_train, train_imputed_data_X, train_data_m)
    print('train_rmse',rmse_train)

    # Impute missing data for the test split and compute its RMSE
    test_imputed_data_X = gain(test_miss_data_X, gain_parameters)
    rmse_test = rmse_loss(x_test, test_imputed_data_X, test_data_m)

    # NOTE(review): ndarray.tofile writes the raw array bytes, NOT a Parquet
    # file, despite the extension. The format is kept unchanged here; switch
    # to DataFrame.to_parquet if a real Parquet file is wanted.
    if miss_rate in (20, 50, 80):
        test_imputed_data_X.tofile(f'{out_dir}/test_imputed_data_X.parquet')

    print('\n')
    print('Train RMSE Performance: ' + str(np.around(rmse_train, 4)))
    print('Test RMSE Performance GAIN : ' + str(np.around(rmse_test, 4)))

    ####CLASSIFICATION TASK: PREPARE TEST DATA ##################

    # Attach the labels as the last column of the imputed and of the complete
    # test features; ignore_index renumbers the columns 0..8 in both frames so
    # they line up when stacked.
    test_labels_df = pd.DataFrame(y_test)
    imputed_test_set = pd.concat(
        [pd.DataFrame(test_imputed_data_X), test_labels_df],
        axis=1, ignore_index=True)
    complete_test_set = pd.concat(
        [pd.DataFrame(x_test, columns=feature_columns), test_labels_df],
        axis=1, ignore_index=True)

    # Evaluation set = imputed rows followed by the complete rows
    full_test_set = pd.concat([imputed_test_set,complete_test_set])
    full_test_set.to_csv(f'{out_dir}/test_data.csv')

    full_test_x=full_test_set.iloc[:,:-1]
    full_test_y=full_test_set.iloc[:,-1] # last column of data frame (sleeping)

    print('\nKNN Classification')
    print('Classification Report:')

    predictions_knn = neigh.predict(full_test_x)
    print(classification_report(full_test_y, predictions_knn))
    print('\n')
    print("Accuracy Score of KNN Classifier: ")
    acc_knn = accuracy_score(full_test_y, predictions_knn)
    print(acc_knn)
    print('\n')

    print('\nXGB Classification')

    # Same value as printed after fitting; reused instead of predicting on
    # the whole training set a second time.
    print(f"Train Accuracy: {xgb_train_accuracy}")
    predictions_xgb=xgb_model.predict(full_test_x)
    print(classification_report(full_test_y, predictions_xgb))
    print('\n')
    print("Accuracy Score of XGB Classifier: ")
    acc_xgb = accuracy_score(full_test_y, predictions_xgb)
    print(acc_xgb)
    print('\n')

    # The RMSE of the training split is returned, not the test RMSE.
    return rmse_train

In [42]:
## arguments of main: dataset_name, miss_rate, batch_size, hint_rate, alpha, iterations
## NOTE(review): main() takes the masked data from module-level variables, so the
## data-preparation cell has to be run with the same miss_rate (20) beforehand.
rmse=main('child_sleep', 20, 128, 0.9, 100, 1500)

  if _pandas_api.is_sparse(col):


Train Accuracy on KNN: 0.9590118080810843
Train Accuracy on XGB: 0.9326885829391229


100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [2:34:44<00:00,  6.19s/it]


train_rmse 0.2772557404581131


100%|██████████████████████████████████████████████████████████████████████████████| 1500/1500 [37:51<00:00,  1.51s/it]




Train RMSE Performance: 0.2773
Test RMSE Performance GAIN : 0.2897

KNN Classification
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.66      0.58   2786568
           1       0.45      0.31      0.37   2479656

    accuracy                           0.50   5266224
   macro avg       0.48      0.49      0.47   5266224
weighted avg       0.49      0.50      0.48   5266224



Accuracy Score of KNN Classifier: 
0.49583078881566756



XGB Classification
Train Accuracy: 0.9326885829391229
              precision    recall  f1-score   support

           0       0.51      0.64      0.57   2786568
           1       0.44      0.32      0.37   2479656

    accuracy                           0.49   5266224
   macro avg       0.48      0.48      0.47   5266224
weighted avg       0.48      0.49      0.48   5266224



Accuracy Score of XGB Classifier: 
0.4884089624748207




In [43]:
############### Before running: make sure gain() writes g_loss / d_loss to parquet/50
############### and that the data-preparation cell was run with miss_rate=50.

## arguments of main: dataset_name, miss_rate, batch_size, hint_rate, alpha, iterations
rmse=main('child_sleep', 50, 128, 0.9, 100, 1500)

  if _pandas_api.is_sparse(col):


Train Accuracy on KNN: 0.9722030813729154
Train Accuracy on XGB: 0.9727268722333118


100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [1:37:18<00:00,  3.89s/it]


train_rmse 0.27054536546091384


100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [1:37:14<00:00,  3.89s/it]




Train RMSE Performance: 0.2705
Test RMSE Performance GAIN : 0.2843

KNN Classification
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.68      0.64   7924560
           1       0.40      0.32      0.35   5241000

    accuracy                           0.54  13165560
   macro avg       0.50      0.50      0.50  13165560
weighted avg       0.52      0.54      0.52  13165560



Accuracy Score of KNN Classifier: 
0.5358075159734945



XGB Classification
Train Accuracy: 0.9727268722333118
              precision    recall  f1-score   support

           0       0.61      0.70      0.66   7924560
           1       0.43      0.33      0.37   5241000

    accuracy                           0.56  13165560
   macro avg       0.52      0.52      0.51  13165560
weighted avg       0.54      0.56      0.54  13165560



Accuracy Score of XGB Classifier: 
0.5559610833113061




In [47]:
############## Before running: make sure gain() writes g_loss / d_loss to parquet/80
############## and that the data-preparation cell was run with miss_rate=80.

## arguments of main: dataset_name, miss_rate, batch_size, hint_rate, alpha, iterations
rmse=main('child_sleep', 80, 128, 0.9, 100, 1500)

  if _pandas_api.is_sparse(col):


Train Accuracy on KNN: 0.9780540964661194
Train Accuracy on XGB: 0.9924970880452818


100%|██████████████████████████████████████████████████████████████████████████████| 1500/1500 [30:21<00:00,  1.21s/it]


train_rmse 0.403437994069898


100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [2:01:36<00:00,  4.86s/it]




Train RMSE Performance: 0.4034
Test RMSE Performance GAIN : 0.4013

KNN Classification
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.69      0.65  12932738
           1       0.39      0.32      0.35   8132160

    accuracy                           0.55  21064898
   macro avg       0.50      0.50      0.50  21064898
weighted avg       0.53      0.55      0.54  21064898



Accuracy Score of KNN Classifier: 
0.5454763179959381



XGB Classification
Train Accuracy: 0.9924970880452818
              precision    recall  f1-score   support

           0       0.62      0.82      0.70  12932738
           1       0.41      0.20      0.27   8132160

    accuracy                           0.58  21064898
   macro avg       0.51      0.51      0.49  21064898
weighted avg       0.54      0.58      0.54  21064898



Accuracy Score of XGB Classifier: 
0.5784416805626118




## Forward Fill, Backward Fill Imputation on 20%, 50%, 80% missing data on KNN, XGB

In [81]:
def rmse_loss (ori_data, imputed_data, data_m):
    '''
    Compute RMSE loss between ori_data and imputed_data over the missing entries.

    Both arrays are passed through `normalization`; the parameters returned for
    ori_data are handed to the call that normalizes imputed_data, so the error
    is measured on the normalized scale rather than in raw units.

    Args:
    - ori_data: original data without missing values
    - imputed_data: imputed data (same shape as ori_data)
    - data_m: indicator matrix for missingness; entries equal to 0 are the
      ones that contribute to the error (weight 1 - data_m)

    Returns:
    - rmse: Root Mean Squared Error over the entries where data_m is 0.
      If no entry is missing the denominator is 0 and the result is NaN.
    '''
    ori_data, norm_parameters = normalization(ori_data)
    imputed_data, _ = normalization(imputed_data, norm_parameters)

    # Weight of each entry: 1 where the value was missing, 0 where it was observed.
    missing_weight = 1 - data_m

    # Only for missing values
    nominator = np.sum((missing_weight * ori_data - missing_weight * imputed_data)**2)
    denominator = np.sum(missing_weight)

    rmse = np.sqrt(nominator/float(denominator))

    return rmse

In [82]:
# Forward-/backward-fill baseline for one missing rate:
#   1. impute the test features with ffill and with bfill,
#   2. report the RMSE of each imputation against the complete test features,
#   3. score the KNN and XGB classifiers on (imputed test rows + original test rows).
miss_rate="80"

#### read parquet file with missing test data x
missing_test_df=pd.read_parquet(f"parquet/{miss_rate}/test_miss_x.parquet")
missing_test_data=missing_test_df.to_numpy()
### read parquet file with test (no missing) data x
ori_test_df=pd.read_parquet(f"parquet/{miss_rate}/test_x.parquet")
ori_test_data=ori_test_df.to_numpy()
### build mask test data x (1 = value present, 0 = value missing)
mask_test_df = pd.notna(missing_test_df).astype(int)
mask_test_data=mask_test_df.to_numpy()

### impute using ffill
### (DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form)
imputed_test_df_ffill=missing_test_df.ffill()
imputed_test_data_ffill=imputed_test_df_ffill.to_numpy()

### impute using bfill
imputed_test_df_bfill=missing_test_df.bfill()
imputed_test_data_bfill=imputed_test_df_bfill.to_numpy()


### Note: ffill cannot fill NaNs at the top of a column and bfill cannot fill NaNs
### at the bottom, so the opposite-direction fill is applied to the sets built below.

### prepare test data
## original test data (test_x.parquet is already loaded as ori_test_df; reuse it)
test_x_df=ori_test_df
test_y_df=pd.read_parquet(f'parquet/{miss_rate}/test_y.parquet')
test_df=pd.concat([ test_x_df, test_y_df], axis=1, ignore_index=True)

## ffill-imputed test rows (with labels) stacked on top of the original test rows
imputed_test_set_ffill = pd.concat([imputed_test_df_ffill,test_y_df ], axis=1, ignore_index=True)
full_test_set_ffill=pd.concat([imputed_test_set_ffill,test_df])

## bfill-imputed test rows (with labels) stacked on top of the original test rows
imputed_test_set_bfill = pd.concat([imputed_test_df_bfill, test_y_df ], axis=1, ignore_index=True)
full_test_set_bfill= pd.concat([imputed_test_set_bfill,test_df])


## fill the NaNs that the first pass could not reach (rows are filled, not dropped)
full_test_set_ffill=full_test_set_ffill.bfill()
full_test_set_bfill=full_test_set_bfill.ffill()

print('RMSE calculation')
### compute rmse for ffill imputation (last column is the label, so it is excluded)
imputed_test_set_ffill=imputed_test_set_ffill.bfill()
rmse_ffill=rmse_loss(ori_test_data, imputed_test_set_ffill.iloc[:, :-1].to_numpy(), mask_test_data)
print('rmse_ffill')
print(rmse_ffill)


### compute rmse for bfill imputation
imputed_test_set_bfill=imputed_test_set_bfill.ffill()
rmse_bfill=rmse_loss(ori_test_data, imputed_test_set_bfill.iloc[:, :-1].to_numpy(), mask_test_data)
print('rmse_bfill')
print(rmse_bfill)



### classification task

### read trainx
train_x_df=pd.read_parquet(f"parquet/{miss_rate}/train_x.parquet")
x_train=train_x_df.to_numpy()
### read train y
train_y_df=pd.read_parquet(f"parquet/{miss_rate}/train_y.parquet")
y_train=train_y_df.to_numpy()

# NOTE: `neigh` (KNN) and `xgb_model` (XGBoost) are NOT created in this cell; they must
# already exist in the kernel from an earlier cell. To retrain them here on the
# complete training data (no missing values):
#   neigh = KNeighborsClassifier(n_neighbors=10)
#   neigh.fit(x_train, np.ravel(y_train,order='C'))
#   xgb_model = xgb.XGBClassifier(objective='binary:logistic')
#   xgb_model.fit(x_train,y_train)

print('ACCURACY CALCULATION')

## find acc for knn ffill
predictions_knn_ffill = neigh.predict(full_test_set_ffill.iloc[:, :-1])
print(classification_report(full_test_set_ffill.iloc[:, -1], predictions_knn_ffill))
print("Accuracy Score of KNN Classifier (Forward Fill): ")
acc_knn_ffill = accuracy_score(full_test_set_ffill.iloc[:, -1], predictions_knn_ffill)

print(acc_knn_ffill)

### find acc for knn bfill
predictions_knn_bfill = neigh.predict(full_test_set_bfill.iloc[:, :-1])
print(classification_report(full_test_set_bfill.iloc[:, -1], predictions_knn_bfill))
print("Accuracy Score of KNN Classifier: (Backward Fill)")
# Score against the labels of the bfill set (previously the ffill set was used here).
acc_knn_bfill = accuracy_score(full_test_set_bfill.iloc[:, -1], predictions_knn_bfill)

print(acc_knn_bfill)

### find acc for xgboost ffill
predictions_xgb_ffill=xgb_model.predict(full_test_set_ffill.iloc[:, :-1])
print(classification_report(full_test_set_ffill.iloc[:, -1], predictions_xgb_ffill))
print("Accuracy Score of XGB Classifier (Forward Fill): ")
acc_xgb_ffill = accuracy_score(full_test_set_ffill.iloc[:, -1], predictions_xgb_ffill)

print(acc_xgb_ffill)

## find acc for xgboost bfill
predictions_xgb_bfill=xgb_model.predict(full_test_set_bfill.iloc[:, :-1])
print(classification_report(full_test_set_bfill.iloc[:, -1], predictions_xgb_bfill))
print("Accuracy Score of XGB Classifier (Backward Fill): ")
acc_xgb_bfill = accuracy_score(full_test_set_bfill.iloc[:, -1], predictions_xgb_bfill)


print(acc_xgb_bfill)



  imputed_test_df_ffill=missing_test_df.fillna(method='ffill')
  imputed_test_df_bfill=missing_test_df.fillna(method='bfill')
  full_test_set_ffill=full_test_set_ffill.fillna(method='bfill')
  full_test_set_bfill=full_test_set_bfill.fillna(method='ffill')


RMSE calculation


  imputed_test_set_ffill=imputed_test_set_ffill.fillna(method='bfill')


rmse_ffill
0.05235069069783292


  imputed_test_set_bfill=imputed_test_set_bfill.fillna(method='ffill')


rmse_bfill
0.052138546123293904
ACCURACY CALCULATION
              precision    recall  f1-score   support

           0       0.84      0.91      0.87  12932738
           1       0.83      0.73      0.78   8132160

    accuracy                           0.84  21064898
   macro avg       0.84      0.82      0.83  21064898
weighted avg       0.84      0.84      0.84  21064898

Accuracy Score of KNN Classifier (Forward Fill): 
0.8388493977041807
              precision    recall  f1-score   support

           0       0.84      0.91      0.87  12932738
           1       0.83      0.73      0.78   8132160

    accuracy                           0.84  21064898
   macro avg       0.84      0.82      0.83  21064898
weighted avg       0.84      0.84      0.84  21064898

Accuracy Score of KNN Classifier: (Backward Fill)
0.8387854524621956
              precision    recall  f1-score   support

           0       0.83      0.89      0.86  12932738
           1       0.81      0.70      0.75   

In [83]:
# Forward-/backward-fill baseline for one missing rate:
#   1. impute the test features with ffill and with bfill,
#   2. report the RMSE of each imputation against the complete test features,
#   3. score the KNN and XGB classifiers on (imputed test rows + original test rows).
miss_rate="50"

#### read parquet file with missing test data x
missing_test_df=pd.read_parquet(f"parquet/{miss_rate}/test_miss_x.parquet")
missing_test_data=missing_test_df.to_numpy()
### read parquet file with test (no missing) data x
ori_test_df=pd.read_parquet(f"parquet/{miss_rate}/test_x.parquet")
ori_test_data=ori_test_df.to_numpy()
### build mask test data x (1 = value present, 0 = value missing)
mask_test_df = pd.notna(missing_test_df).astype(int)
mask_test_data=mask_test_df.to_numpy()

### impute using ffill
### (DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form)
imputed_test_df_ffill=missing_test_df.ffill()
imputed_test_data_ffill=imputed_test_df_ffill.to_numpy()

### impute using bfill
imputed_test_df_bfill=missing_test_df.bfill()
imputed_test_data_bfill=imputed_test_df_bfill.to_numpy()


### Note: ffill cannot fill NaNs at the top of a column and bfill cannot fill NaNs
### at the bottom, so the opposite-direction fill is applied to the sets built below.

### prepare test data
## original test data (test_x.parquet is already loaded as ori_test_df; reuse it)
test_x_df=ori_test_df
test_y_df=pd.read_parquet(f'parquet/{miss_rate}/test_y.parquet')
test_df=pd.concat([ test_x_df, test_y_df], axis=1, ignore_index=True)

## ffill-imputed test rows (with labels) stacked on top of the original test rows
imputed_test_set_ffill = pd.concat([imputed_test_df_ffill,test_y_df ], axis=1, ignore_index=True)
full_test_set_ffill=pd.concat([imputed_test_set_ffill,test_df])

## bfill-imputed test rows (with labels) stacked on top of the original test rows
imputed_test_set_bfill = pd.concat([imputed_test_df_bfill, test_y_df ], axis=1, ignore_index=True)
full_test_set_bfill= pd.concat([imputed_test_set_bfill,test_df])


## fill the NaNs that the first pass could not reach (rows are filled, not dropped)
full_test_set_ffill=full_test_set_ffill.bfill()
full_test_set_bfill=full_test_set_bfill.ffill()

print('RMSE calculation')
### compute rmse for ffill imputation (last column is the label, so it is excluded)
imputed_test_set_ffill=imputed_test_set_ffill.bfill()
rmse_ffill=rmse_loss(ori_test_data, imputed_test_set_ffill.iloc[:, :-1].to_numpy(), mask_test_data)
print('rmse_ffill')
print(rmse_ffill)


### compute rmse for bfill imputation
imputed_test_set_bfill=imputed_test_set_bfill.ffill()
rmse_bfill=rmse_loss(ori_test_data, imputed_test_set_bfill.iloc[:, :-1].to_numpy(), mask_test_data)
print('rmse_bfill')
print(rmse_bfill)



### classification task

### read trainx
train_x_df=pd.read_parquet(f"parquet/{miss_rate}/train_x.parquet")
x_train=train_x_df.to_numpy()
### read train y
train_y_df=pd.read_parquet(f"parquet/{miss_rate}/train_y.parquet")
y_train=train_y_df.to_numpy()

# NOTE: `neigh` (KNN) and `xgb_model` (XGBoost) are NOT created in this cell; they must
# already exist in the kernel from an earlier cell. To retrain them here on the
# complete training data (no missing values):
#   neigh = KNeighborsClassifier(n_neighbors=10)
#   neigh.fit(x_train, np.ravel(y_train,order='C'))
#   xgb_model = xgb.XGBClassifier(objective='binary:logistic')
#   xgb_model.fit(x_train,y_train)

print('ACCURACY CALCULATION')

## find acc for knn ffill
predictions_knn_ffill = neigh.predict(full_test_set_ffill.iloc[:, :-1])
print(classification_report(full_test_set_ffill.iloc[:, -1], predictions_knn_ffill))
print("Accuracy Score of KNN Classifier (Forward Fill): ")
acc_knn_ffill = accuracy_score(full_test_set_ffill.iloc[:, -1], predictions_knn_ffill)

print(acc_knn_ffill)

### find acc for knn bfill
predictions_knn_bfill = neigh.predict(full_test_set_bfill.iloc[:, :-1])
print(classification_report(full_test_set_bfill.iloc[:, -1], predictions_knn_bfill))
print("Accuracy Score of KNN Classifier: (Backward Fill)")
# Score against the labels of the bfill set (previously the ffill set was used here).
acc_knn_bfill = accuracy_score(full_test_set_bfill.iloc[:, -1], predictions_knn_bfill)

print(acc_knn_bfill)

### find acc for xgboost ffill
predictions_xgb_ffill=xgb_model.predict(full_test_set_ffill.iloc[:, :-1])
print(classification_report(full_test_set_ffill.iloc[:, -1], predictions_xgb_ffill))
print("Accuracy Score of XGB Classifier (Forward Fill): ")
acc_xgb_ffill = accuracy_score(full_test_set_ffill.iloc[:, -1], predictions_xgb_ffill)

print(acc_xgb_ffill)

## find acc for xgboost bfill
predictions_xgb_bfill=xgb_model.predict(full_test_set_bfill.iloc[:, :-1])
print(classification_report(full_test_set_bfill.iloc[:, -1], predictions_xgb_bfill))
print("Accuracy Score of XGB Classifier (Backward Fill): ")
acc_xgb_bfill = accuracy_score(full_test_set_bfill.iloc[:, -1], predictions_xgb_bfill)


print(acc_xgb_bfill)



  imputed_test_df_ffill=missing_test_df.fillna(method='ffill')
  imputed_test_df_bfill=missing_test_df.fillna(method='bfill')
  full_test_set_ffill=full_test_set_ffill.fillna(method='bfill')
  full_test_set_bfill=full_test_set_bfill.fillna(method='ffill')


RMSE calculation


  imputed_test_set_ffill=imputed_test_set_ffill.fillna(method='bfill')


rmse_ffill
0.033358506134696875


  imputed_test_set_bfill=imputed_test_set_bfill.fillna(method='ffill')


rmse_bfill
0.03346554852162805
ACCURACY CALCULATION
              precision    recall  f1-score   support

           0       0.78      0.86      0.82   7924560
           1       0.75      0.62      0.68   5241000

    accuracy                           0.77  13165560
   macro avg       0.76      0.74      0.75  13165560
weighted avg       0.76      0.77      0.76  13165560

Accuracy Score of KNN Classifier (Forward Fill): 
0.7667789292669662
              precision    recall  f1-score   support

           0       0.78      0.86      0.82   7924560
           1       0.75      0.62      0.68   5241000

    accuracy                           0.77  13165560
   macro avg       0.76      0.74      0.75  13165560
weighted avg       0.77      0.77      0.76  13165560

Accuracy Score of KNN Classifier: (Backward Fill)
0.7667866007978392
              precision    recall  f1-score   support

           0       0.76      0.85      0.80   7924560
           1       0.72      0.59      0.65   5

In [84]:
# Forward-/backward-fill baseline for one missing rate:
#   1. impute the test features with ffill and with bfill,
#   2. report the RMSE of each imputation against the complete test features,
#   3. score the KNN and XGB classifiers on (imputed test rows + original test rows).
miss_rate="20"

#### read parquet file with missing test data x
missing_test_df=pd.read_parquet(f"parquet/{miss_rate}/test_miss_x.parquet")
missing_test_data=missing_test_df.to_numpy()
### read parquet file with test (no missing) data x
ori_test_df=pd.read_parquet(f"parquet/{miss_rate}/test_x.parquet")
ori_test_data=ori_test_df.to_numpy()
### build mask test data x (1 = value present, 0 = value missing)
mask_test_df = pd.notna(missing_test_df).astype(int)
mask_test_data=mask_test_df.to_numpy()

### impute using ffill
### (DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form)
imputed_test_df_ffill=missing_test_df.ffill()
imputed_test_data_ffill=imputed_test_df_ffill.to_numpy()

### impute using bfill
imputed_test_df_bfill=missing_test_df.bfill()
imputed_test_data_bfill=imputed_test_df_bfill.to_numpy()


### Note: ffill cannot fill NaNs at the top of a column and bfill cannot fill NaNs
### at the bottom, so the opposite-direction fill is applied to the sets built below.

### prepare test data
## original test data (test_x.parquet is already loaded as ori_test_df; reuse it)
test_x_df=ori_test_df
test_y_df=pd.read_parquet(f'parquet/{miss_rate}/test_y.parquet')
test_df=pd.concat([ test_x_df, test_y_df], axis=1, ignore_index=True)

## ffill-imputed test rows (with labels) stacked on top of the original test rows
imputed_test_set_ffill = pd.concat([imputed_test_df_ffill,test_y_df ], axis=1, ignore_index=True)
full_test_set_ffill=pd.concat([imputed_test_set_ffill,test_df])

## bfill-imputed test rows (with labels) stacked on top of the original test rows
imputed_test_set_bfill = pd.concat([imputed_test_df_bfill, test_y_df ], axis=1, ignore_index=True)
full_test_set_bfill= pd.concat([imputed_test_set_bfill,test_df])


## fill the NaNs that the first pass could not reach (rows are filled, not dropped)
full_test_set_ffill=full_test_set_ffill.bfill()
full_test_set_bfill=full_test_set_bfill.ffill()

print('RMSE calculation')
### compute rmse for ffill imputation (last column is the label, so it is excluded)
imputed_test_set_ffill=imputed_test_set_ffill.bfill()
rmse_ffill=rmse_loss(ori_test_data, imputed_test_set_ffill.iloc[:, :-1].to_numpy(), mask_test_data)
print('rmse_ffill')
print(rmse_ffill)


### compute rmse for bfill imputation
imputed_test_set_bfill=imputed_test_set_bfill.ffill()
rmse_bfill=rmse_loss(ori_test_data, imputed_test_set_bfill.iloc[:, :-1].to_numpy(), mask_test_data)
print('rmse_bfill')
print(rmse_bfill)



### classification task

### read trainx
train_x_df=pd.read_parquet(f"parquet/{miss_rate}/train_x.parquet")
x_train=train_x_df.to_numpy()
### read train y
train_y_df=pd.read_parquet(f"parquet/{miss_rate}/train_y.parquet")
y_train=train_y_df.to_numpy()

# NOTE: `neigh` (KNN) and `xgb_model` (XGBoost) are NOT created in this cell; they must
# already exist in the kernel from an earlier cell. To retrain them here on the
# complete training data (no missing values):
#   neigh = KNeighborsClassifier(n_neighbors=10)
#   neigh.fit(x_train, np.ravel(y_train,order='C'))
#   xgb_model = xgb.XGBClassifier(objective='binary:logistic')
#   xgb_model.fit(x_train,y_train)

print('ACCURACY CALCULATION')

## find acc for knn ffill
predictions_knn_ffill = neigh.predict(full_test_set_ffill.iloc[:, :-1])
print(classification_report(full_test_set_ffill.iloc[:, -1], predictions_knn_ffill))
print("Accuracy Score of KNN Classifier (Forward Fill): ")
acc_knn_ffill = accuracy_score(full_test_set_ffill.iloc[:, -1], predictions_knn_ffill)

print(acc_knn_ffill)

### find acc for knn bfill
predictions_knn_bfill = neigh.predict(full_test_set_bfill.iloc[:, :-1])
print(classification_report(full_test_set_bfill.iloc[:, -1], predictions_knn_bfill))
print("Accuracy Score of KNN Classifier: (Backward Fill)")
# Score against the labels of the bfill set (previously the ffill set was used here).
acc_knn_bfill = accuracy_score(full_test_set_bfill.iloc[:, -1], predictions_knn_bfill)

print(acc_knn_bfill)

### find acc for xgboost ffill
predictions_xgb_ffill=xgb_model.predict(full_test_set_ffill.iloc[:, :-1])
print(classification_report(full_test_set_ffill.iloc[:, -1], predictions_xgb_ffill))
print("Accuracy Score of XGB Classifier (Forward Fill): ")
acc_xgb_ffill = accuracy_score(full_test_set_ffill.iloc[:, -1], predictions_xgb_ffill)

print(acc_xgb_ffill)

## find acc for xgboost bfill
predictions_xgb_bfill=xgb_model.predict(full_test_set_bfill.iloc[:, :-1])
print(classification_report(full_test_set_bfill.iloc[:, -1], predictions_xgb_bfill))
print("Accuracy Score of XGB Classifier (Backward Fill): ")
acc_xgb_bfill = accuracy_score(full_test_set_bfill.iloc[:, -1], predictions_xgb_bfill)


print(acc_xgb_bfill)



  imputed_test_df_ffill=missing_test_df.fillna(method='ffill')
  imputed_test_df_bfill=missing_test_df.fillna(method='bfill')
  full_test_set_ffill=full_test_set_ffill.fillna(method='bfill')
  full_test_set_bfill=full_test_set_bfill.fillna(method='ffill')


RMSE calculation


  imputed_test_set_ffill=imputed_test_set_ffill.fillna(method='bfill')


rmse_ffill
0.033479263288629574


  imputed_test_set_bfill=imputed_test_set_bfill.fillna(method='ffill')


rmse_bfill
0.033710399181793166
ACCURACY CALCULATION
              precision    recall  f1-score   support

           0       0.52      0.66      0.58   2786568
           1       0.45      0.30      0.36   2479656

    accuracy                           0.49   5266224
   macro avg       0.48      0.48      0.47   5266224
weighted avg       0.48      0.49      0.48   5266224

Accuracy Score of KNN Classifier (Forward Fill): 
0.49375871592245224
              precision    recall  f1-score   support

           0       0.52      0.66      0.58   2786568
           1       0.45      0.31      0.36   2479656

    accuracy                           0.49   5266224
   macro avg       0.48      0.48      0.47   5266224
weighted avg       0.48      0.49      0.48   5266224

Accuracy Score of KNN Classifier: (Backward Fill)
0.49383011432859675
              precision    recall  f1-score   support

           0       0.51      0.65      0.57   2786568
           1       0.43      0.29      0.35 