# Preprocessing

In this notebook, we will be preprocessing the data and conducting any necessary transformations for our models.

***

# Initialization

Importing libraries and notebooks.

In [1]:
# Importing libraries
import import_ipynb

# Importing A_eda notebook
from A_eda import *

In [2]:
# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":
    
    # Checking python hash seed
    print('Make sure the following says \'0\': ', os.environ.get('PYTHONHASHSEED'))

Make sure the following says '0':  0


***

# Data cleaning: encoding + imputation

We will perform any necessary data cleaning steps in this section. In the previous notebook, we already dropped the duplicates, and established that the data is pretty well-balanced. However, we must deal with the many missing data. We will be using the **MICE (Multivariate Imputation of Chained Equations)** in order to impute the data.

The preliminary step before imputing is to encode the categorical labels.

In [3]:
# Creating a copy to preserve original folds
original_folds = folds.copy()

***

## Ordinal encoding: `time_of_day`

Since `time_of_day` is an ordinal feature, we will be encoding it using `OrdinalEncoder` with the specified order.

In [4]:
# Ordinal encode
from sklearn.preprocessing import OrdinalEncoder

# `time_of_day` should be encoded ordinally
# Creating a function to encode the splits
def KFoldOrdinalEncoder(folds, columns, orders):
    '''Perform ordinal encoding on all the columns specified in "columns" 
    and their corresponding label orders in "orders" for the feature 
    sets for each fold in "folds". Return the encoders for transformation later.'''
    encoders = {}
    
    for i, fold in enumerate(folds):
        X_train, X_test, y_train, y_test = fold
        fold_encoders = {}
        
        for col, order in zip(columns, orders):
            train_set = X_train.copy()
            test_set = X_test.copy()
            
            # Initialize encoder
            ordinal_encoder = OrdinalEncoder(categories=[order])
            
            # Encode (for both train and test)
            train_encoded = ordinal_encoder.fit_transform(X_train[[col]])
            test_encoded = ordinal_encoder.transform(X_test[[col]])
            
            # Create new encoded columns
            train_set.loc[:, f'{col}_en'] = train_encoded
            test_set.loc[:, f'{col}_en'] = test_encoded
            
            X_train = train_set
            X_test = test_set
            
            fold_encoders[col] = ordinal_encoder
        
        fold[0] = X_train
        fold[1] = X_test
        
        encoders[i + 1] = fold_encoders

    return encoders

In [5]:
# Ordinally encode `time_of_day`
oe_columns = ['time_of_day']
time_orders = ['Morning', 'Afternoon', 'Night', 'Evening', np.nan]
oe_orders = []
oe_orders.append(time_orders)

ordinal_encoders = KFoldOrdinalEncoder(folds, oe_columns, oe_orders)

***

## Label encoding the rest of the categorical features

We will now have to use `LabelEncoder` to encode the rest of our categorical features.

In [6]:
# Label encode
from sklearn.preprocessing import LabelEncoder

def KFoldLabelEncoder(folds, columns):
    '''Perform label encoding on all the columns specified in "columns" for each
    fold in "folds". Return the encoders for transformation later and a dict of the 
    labels and their corresponding values for when we process NaNs later.'''
    
    encoders = {}
    fold_labels = {}
    
    for i, fold in enumerate(folds):
        X_train, X_test, y_train, y_test = fold
        
        # Initialize encoders
        fold_encoders = {col: LabelEncoder() for col in columns}
        col_labels = {}
        
        for col, encoder in fold_encoders.items():
            train_set = X_train.copy()
            test_set = X_test.copy()
            
            # Encode (for both train and test)
            train_encoded = encoder.fit_transform(train_set.loc[:, col])
            test_encoded = encoder.transform(test_set.loc[:, col])
            
            # Add labels to col_labels
            col_labels[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
            
            # Create new encoded columns
            train_set.loc[:, f'{col}_en'] = train_encoded
            test_set.loc[:, f'{col}_en'] = test_encoded
            
            X_train = train_set
            X_test = test_set
        
        # Add column labels to each fold
        fold_labels[i + 1] = col_labels
        encoders[i + 1] = fold_encoders
        
        fold[0] = X_train
        fold[1] = X_test
    
    return encoders, fold_labels

In [7]:
# Label encode our categorical features
label_encoders, fold_labels = KFoldLabelEncoder(folds, categoricals)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how our data is looking
    display(folds[0][0].head())
    # display(fold_labels)

Unnamed: 0,id,age,gender,device_type,ad_position,browsing_history,time_of_day,sample_weight,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,670,22.0,,Desktop,Top,Shopping,Afternoon,0.22332,0.0,3,0,2,3
1,3044,,Male,Desktop,Top,,,0.255223,4.0,1,0,2,5
2,5912,41.0,Non-Binary,,Side,Education,Night,0.22332,3.0,2,3,1,0
3,5418,34.0,Male,,,Entertainment,Evening,0.178656,1.0,1,3,3,1
4,9452,39.0,Non-Binary,,,Social Media,Morning,1.786563,2.0,2,3,3,4


In [8]:
# Combine encoders
encoders_full = dict()

for fold, encoders in label_encoders.items():
    folds_oe = ordinal_encoders[fold]
    
    complete_dict = encoders.copy()
    complete_dict.update(folds_oe)
    
    encoders_full[fold] = complete_dict

***

## Reinstating NaNs for imputation

Before we impute, we will reinstate NaNs for our encoded columns before imputation.

In [9]:
# Reinstate NaNs
def KFoldNaNReinstater(folds, columns, fold_labels):
    '''Reinstate NaNs in every column in "columns" according to the label in "labels"
    for every fold in "folds".'''
    
    for fold, fold_no in zip(folds, fold_labels):
        X_train, X_test, y_train, y_test = fold
        labels = fold_labels[fold_no]
        
        # Unpack the labels
        for col, col_labels in labels.items():
            
            # Reinstate NaNs
            encode_col = f'{col}_en'
            X_train[encode_col] = X_train[encode_col].map(lambda x: np.nan if x == col_labels[np.nan] else x)

In [10]:
# Put back NaNs for folds
KFoldNaNReinstater(folds, categoricals, fold_labels)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how data is looking
    display(folds[0][0].head())

Unnamed: 0,id,age,gender,device_type,ad_position,browsing_history,time_of_day,sample_weight,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,670,22.0,,Desktop,Top,Shopping,Afternoon,0.22332,0.0,,0.0,2.0,3.0
1,3044,,Male,Desktop,Top,,,0.255223,,1.0,0.0,2.0,
2,5912,41.0,Non-Binary,,Side,Education,Night,0.22332,3.0,2.0,,1.0,0.0
3,5418,34.0,Male,,,Entertainment,Evening,0.178656,1.0,1.0,,,1.0
4,9452,39.0,Non-Binary,,,Social Media,Morning,1.786563,2.0,2.0,,,4.0


***

## Impute with MICE

With our NaNs reinstates, we can now impute using the MICE technique. We will briefly clean up the folds by removing the non-encoded columns and perform the imputation.

In [None]:
# Cleaning up the data before imputation
def KFoldSimplifier(folds, drop_columns):
    '''Remove the specified columns in "drop_columns" for each of the folds in "folds".'''
    for fold in folds:
        X_train, X_test, y_train, y_test = fold
        
        # Drop the columns
        X_train.drop(columns=drop_columns, inplace=True)
        
        X_test_cols = list(X_test.columns)
        drop_test_columns = list(set(drop_columns).intersection(set(X_test_cols)))
        X_test.drop(columns=drop_test_columns, inplace=True)

In [12]:
# Remove non-encoded columns
drop_cols = categoricals.copy()
drop_cols.append('sample_weight')

KFoldSimplifier(folds, drop_cols)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    print(drop_cols)
    
    # Check how data is looking
    display(folds[0][0].head())

['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day', 'sample_weight']


Unnamed: 0,id,age,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,670,22.0,0.0,,0.0,2.0,3.0
1,3044,,,1.0,0.0,2.0,
2,5912,41.0,3.0,2.0,,1.0,0.0
3,5418,34.0,1.0,1.0,,,1.0
4,9452,39.0,2.0,2.0,,,4.0


In [20]:
# Impute function
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def KFoldMICE(folds):
    '''Perform MICE imputation for each fold in "folds".'''
    for i, fold in enumerate(folds):
        # Unpack fold
        X_train, X_test, y_train, y_test = fold
        train_columns = X_train.columns
        test_columns = X_test.columns
        
        imputer = IterativeImputer(max_iter=10)
        
        # Impute the features in the fold
        train_imputed = imputer.fit_transform(X_train)
        test_imputed = imputer.transform(X_test)
        
        train_imputed = np.round(train_imputed).astype('int')
        test_imputed = np.round(train_imputed).astype('int')
        
        # Convert back to dataframe
        train_imputed_df = pd.DataFrame(data=train_imputed, columns=train_columns)
        test_imputed_df = pd.DataFrame(data=test_imputed, columns=test_columns)
        
        fold[0] = train_imputed_df
        fold[1] = test_imputed_df

In [24]:
# Impute
KFoldMICE(folds)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how data is looking
    # The training set of the first fold
    display(folds[0][0].head())

Unnamed: 0,id,age,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,670,22,0,1,0,2,3
1,3044,40,2,1,0,2,2
2,5912,41,3,2,1,1,0
3,5418,34,1,1,1,1,1
4,9452,39,2,2,1,1,4


We will now transform the encoded values back and perform one-hot encoding for our model.