# Preprocessing

In this notebook, we will be preprocessing the data and conducting any necessary transformations for our models.

***

# Initialization

Importing libraries and notebooks.

In [1]:
# Importing libraries
import import_ipynb

# Importing A_eda notebook
from A_eda import *

In [2]:
# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":
    
    # Checking python hash seed
    print('Make sure the following says \'0\': ', os.environ.get('PYTHONHASHSEED'))

Make sure the following says '0':  0


In [3]:
# Creating a copy to preserve original folds
original_folds = folds.copy()

***

## Check the structure of our fold data

Let's check the structure of our folds before proceeding to transforming our data.

In [4]:
# Check the structure of folds

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":
    
    for i, fold in enumerate(folds):
        X_train, X_test, y_train, y_test = fold
        print(f'Fold {i + 1}')
        print(f'Feature columns in training set: {list(X_train.columns)}')
        print(f'Feature columns in testing set: {list(X_test.columns)}\n')

Fold 1
Feature columns in training set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
Feature columns in testing set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']

Fold 2
Feature columns in training set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
Feature columns in testing set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']

Fold 3
Feature columns in training set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
Feature columns in testing set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']

Fold 4
Feature columns in training set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
Feature columns in testing set: ['age', 'gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']

Fold 5
Feature columns in training set: ['ag

Everything looks good!

***

# Ordinal encoding: `time_of_day`

Since `time_of_day` is an ordinal feature, we will be encoding it using `OrdinalEncoder` with the specified order.

In [5]:
# Ordinal encode
from sklearn.preprocessing import OrdinalEncoder

# `time_of_day` should be encoded ordinally
# Creating a function to encode the splits
def KFoldOrdinalEncoder(folds, columns, orders):
    '''Perform ordinal encoding on all the columns specified in "columns" 
    and their corresponding label orders in "orders" for the feature 
    sets for each fold in "folds". Return the encoders for transformation later.'''
    encoders = {}
    
    for i, fold in enumerate(folds):
        X_train, X_test, y_train, y_test = fold
        fold_encoders = {}
        
        for col, order in zip(columns, orders):
            train_set = X_train.copy()
            test_set = X_test.copy()
            
            # Initialize encoder
            ordinal_encoder = OrdinalEncoder(categories=[order])
            
            # Encode (for both train and test)
            train_encoded = ordinal_encoder.fit_transform(X_train[[col]])
            test_encoded = ordinal_encoder.transform(X_test[[col]])
            
            # Create new encoded columns
            train_set.loc[:, f'{col}_en'] = train_encoded
            test_set.loc[:, f'{col}_en'] = test_encoded
            
            X_train = train_set
            X_test = test_set
            
            fold_encoders[col] = ordinal_encoder
        
        fold[0] = X_train
        fold[1] = X_test
        
        encoders[i + 1] = fold_encoders

    return encoders

In [6]:
# Ordinally encode `time_of_day`
oe_columns = ['time_of_day']
time_orders = ['Morning', 'Afternoon', 'Night', 'Evening', np.nan]
oe_orders = []
oe_orders.append(time_orders)

ordinal_encoders = KFoldOrdinalEncoder(folds, oe_columns, oe_orders)

***

# Label encoding the rest of the categorical features

We will now have to use `LabelEncoder` to encode the rest of our categorical features.

In [7]:
# Label encode
from sklearn.preprocessing import LabelEncoder

def KFoldLabelEncoder(folds, columns):
    '''Perform label encoding on all the columns specified in "columns" for each
    fold in "folds". Return the encoders for transformation later and a dict of the 
    labels and their corresponding values for when we process NaNs later.'''
    
    encoders = {}
    fold_labels = {}
    
    for i, fold in enumerate(folds):
        X_train, X_test, y_train, y_test = fold
        
        # Initialize encoders
        fold_encoders = {col: LabelEncoder() for col in columns}
        col_labels = {}
        
        for col, encoder in fold_encoders.items():
            train_set = X_train.copy()
            test_set = X_test.copy()
            
            # Encode (for both train and test)
            train_encoded = encoder.fit_transform(train_set.loc[:, col])
            test_encoded = encoder.transform(test_set.loc[:, col])
            
            # Add labels to col_labels
            col_labels[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
            
            # Create new encoded columns
            train_set.loc[:, f'{col}_en'] = train_encoded
            test_set.loc[:, f'{col}_en'] = test_encoded
            
            X_train = train_set
            X_test = test_set
        
        # Add column labels to each fold
        fold_labels[i + 1] = col_labels
        encoders[i + 1] = fold_encoders
        
        fold[0] = X_train
        fold[1] = X_test
    
    return encoders, fold_labels

In [8]:
# Label encode our categorical features
label_encoders, fold_labels = KFoldLabelEncoder(folds, categoricals)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how our data is looking
    display(folds[0][0].head())
    # display(fold_labels)

Unnamed: 0,age,gender,device_type,ad_position,browsing_history,time_of_day,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,22.0,,Desktop,Top,Shopping,Afternoon,0.0,3,0,2,3
1,,Male,Desktop,Top,,,4.0,1,0,2,5
2,41.0,Non-Binary,,Side,Education,Night,3.0,2,3,1,0
3,34.0,Male,,,Entertainment,Evening,1.0,1,3,3,1
4,39.0,Non-Binary,,,Social Media,Morning,2.0,2,3,3,4


In [9]:
# Combine encoders
encoders_full = dict()

for fold, encoders in label_encoders.items():
    folds_oe = ordinal_encoders[fold]
    
    complete_dict = encoders.copy()
    complete_dict.update(folds_oe)
    
    encoders_full[fold] = complete_dict

***

# Reinstating NaNs for imputation

Before we impute, we will reinstate NaNs for our encoded columns before imputation.

In [10]:
# Reinstate NaNs
def KFoldNaNReinstater(folds, columns, fold_labels):
    '''Reinstate NaNs in every column in "columns" according to the label in "labels"
    for every fold in "folds".'''
    
    for fold, fold_no in zip(folds, fold_labels):
        X_train, X_test, y_train, y_test = fold
        labels = fold_labels[fold_no]
        
        # Unpack the labels
        for col, col_labels in labels.items():
            
            # Reinstate NaNs
            encode_col = f'{col}_en'
            X_train[encode_col] = X_train[encode_col].map(lambda x: np.nan if x == col_labels[np.nan] else x)

In [11]:
# Put back NaNs for folds
KFoldNaNReinstater(folds, categoricals, fold_labels)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how data is looking
    display(folds[0][0].head())

Unnamed: 0,age,gender,device_type,ad_position,browsing_history,time_of_day,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,22.0,,Desktop,Top,Shopping,Afternoon,0.0,,0.0,2.0,3.0
1,,Male,Desktop,Top,,,,1.0,0.0,2.0,
2,41.0,Non-Binary,,Side,Education,Night,3.0,2.0,,1.0,0.0
3,34.0,Male,,,Entertainment,Evening,1.0,1.0,,,1.0
4,39.0,Non-Binary,,,Social Media,Morning,2.0,2.0,,,4.0


***

# Impute with MICE

With our NaNs reinstates, we can now impute using the MICE technique. We will briefly clean up the folds by removing the non-encoded columns and perform the imputation.

In [12]:
# Cleaning up the data before imputation
def KFoldSimplifier(folds, drop_columns):
    '''Remove the specified columns in "drop_columns" for each of the folds in "folds".'''
    for fold in folds:
        X_train, X_test, y_train, y_test = fold
        
        # Drop the columns
        X_train.drop(columns=drop_columns, inplace=True)
        
        X_test_cols = list(X_test.columns)
        drop_test_columns = list(set(drop_columns).intersection(set(X_test_cols)))
        X_test.drop(columns=drop_test_columns, inplace=True)

In [13]:
# Remove non-encoded columns
drop_cols = categoricals.copy()

KFoldSimplifier(folds, drop_cols)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    print(drop_cols)
    
    # Check how data is looking
    display(folds[0][0].head())

['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']


Unnamed: 0,age,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,22.0,0.0,,0.0,2.0,3.0
1,,,1.0,0.0,2.0,
2,41.0,3.0,2.0,,1.0,0.0
3,34.0,1.0,1.0,,,1.0
4,39.0,2.0,2.0,,,4.0


In [14]:
# Impute function
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

def KFoldMICE(folds):
    '''Perform MICE imputation for each fold in "folds".'''
    for i, fold in enumerate(folds):
        # Unpack fold
        X_train, X_test, y_train, y_test = fold
        train_columns = X_train.columns
        test_columns = X_test.columns
        
        rf_regressor = RandomForestRegressor(n_estimators=100)
        imputer = IterativeImputer(estimator=rf_regressor, max_iter=50, missing_values=np.nan)
        
        # Impute the features in the fold
        train_imputed = imputer.fit_transform(X_train)
        test_imputed = imputer.transform(X_test)
        
        train_imputed = np.round(train_imputed).astype('int')
        test_imputed = np.round(train_imputed).astype('int')
        
        # Convert back to dataframe
        train_imputed_df = pd.DataFrame(data=train_imputed, columns=train_columns)
        test_imputed_df = pd.DataFrame(data=test_imputed, columns=test_columns)
        
        fold[0] = train_imputed_df
        fold[1] = test_imputed_df

In [15]:
# Impute
KFoldMICE(folds)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how data is looking
    # The training set of the first fold
    display(folds[0][0].head())



Unnamed: 0,age,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,22,0,1,0,2,3
1,37,2,1,0,2,2
2,41,3,2,1,1,0
3,34,1,1,1,2,1
4,39,2,2,0,1,4


In [19]:
# Checking remaining Na's

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":
    # Checking remaining Na's
    for i, fold in enumerate(folds):
        print(f'Fold {i + 1}')
        X_train, X_test, y_train, y_test = fold
        print('Still missing in train set:\n', (X_train == -1).sum())
        print('\nStill missing in test set:\n', (X_test== -1).sum(), '\n')

Fold 1
Still missing in train set:
 age                    0
time_of_day_en         0
gender_en              0
device_type_en         0
ad_position_en         0
browsing_history_en    0
dtype: int64

Still missing in test set:
 age                    0
time_of_day_en         0
gender_en              0
device_type_en         0
ad_position_en         0
browsing_history_en    0
dtype: int64 

Fold 2
Still missing in train set:
 age                    0
time_of_day_en         0
gender_en              0
device_type_en         0
ad_position_en         0
browsing_history_en    0
dtype: int64

Still missing in test set:
 age                    0
time_of_day_en         0
gender_en              0
device_type_en         0
ad_position_en         0
browsing_history_en    0
dtype: int64 

Fold 3
Still missing in train set:
 age                    0
time_of_day_en         0
gender_en              0
device_type_en         0
ad_position_en         0
browsing_history_en    0
dtype: int64

Still missing 

Luckily, we have no persisting NaN values! Since the imputation was computationally complex, to avoid running it a second time, we will export the data. Then, we will now transform the encoded values back (inverse transform) and perform one-hot encoding before feature selection.

***

# Exporting data + reimporting

To prevent re-running our computationally heavy imputation, we will export the data and re-import it to preserve it.

In [21]:
# Exporting folds
folds_path = 'data/folds'

if not os.path.exists(folds_path):
    os.makedirs(folds_path)
    

for i, fold in enumerate(folds):
    fold_path = folds_path + '/' + f'{i + 1}'
    
    if not os.path.exists(fold_path):
        os.makedirs(fold_path)
        
    X_train, X_test, y_train, y_test = fold
    X_train.to_csv(fold_path + '/' + 'X_train.csv', index=False)
    X_test.to_csv(fold_path + '/' + 'X_test.csv', index=False)
    y_train.to_csv(fold_path + '/' + 'y_train.csv', index=False, header=['click'])
    y_test.to_csv(fold_path + '/' + 'y_test.csv', index=False, header=['click'])


In [22]:
# If we want to revisit the folds
imputed_folds = folds.copy()

In [24]:
# Importing folds
folds = list()
folds_path = Path('data/folds')

# Iterate over each fold directory
for fold in sorted(folds_path.iterdir()):  # Ensure directories are processed in sorted order
    if fold.is_dir():
        # Load data from each fold's files
        X_train = pd.read_csv(fold / 'X_train.csv')
        X_test = pd.read_csv(fold / 'X_test.csv')
        y_train = pd.read_csv(fold / 'y_train.csv')
        y_test = pd.read_csv(fold / 'y_test.csv')

        # Append the data as a list
        folds.append([X_train, X_test, y_train, y_test])

In [None]:
# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how imported data is looking
    display(folds[0][0].head())

Unnamed: 0,age,time_of_day_en,gender_en,device_type_en,ad_position_en,browsing_history_en
0,22,0,1,0,2,3
1,37,2,1,0,2,2
2,41,3,2,1,1,0
3,34,1,1,1,2,1
4,39,2,2,0,1,4


***

# Inverse Transforming and One-Hot Encoding

In [None]:
# Inverse transform function

def KFoldInverseTransform(folds, transform_columns, encoders):
    # Inverse transform for each fold
    for i, fold in enumerate(folds):
        
        # Unpack folds and encoders
        X_train, X_test, y_train, y_test = fold
        column_encoders = encoders[i + 1]
        
        # Inverse transform each of the columns
        for column in transform_columns:
            encoded_col_train = X_train.loc[:, f'{column}_en']
            encoded_col_test = X_test.loc[:, f'{column}_en']
            
            column_encoder = column_encoders[column]
            
            X_train.loc[:, f'{column}'] = column_encoder.inverse_transform(encoded_col_train)
            X_test.loc[:, f'{column}'] = column_encoder.inverse_transform(encoded_col_test)
        
        fold[0] = X_train
        fold[1] = X_test

In [None]:
# Inverse transform the folds

# Do not transform time_of_day (it is already properly encoded)
nominal_categories = categoricals.copy()
nominal_categories.remove('time_of_day')

KFoldInverseTransform(folds, nominal_categories, encoders_full)

# NOTEBOOK EXCLUSIVE CODE
if __name__ == "__main__":

    # Check how data is looking
    # The training set of the first fold
    display(encoded_folds[0][0].head())