# Data Preprocessing

## 1. Preparing Environment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
# Suppress every warning raised after this point to keep cell output compact.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical
# warnings from pandas / scikit-learn — consider narrowing it to a category.
warnings.filterwarnings('ignore')

## 2. Data Loading

In [2]:
import sys
# Add the parent directory to the import path so the `src` package imports
# below can resolve. This must run before those imports.
# NOTE(review): '..' is resolved against the current working directory, so
# this presumably expects the kernel to start in the notebook's own folder.
sys.path.append('..')

from src.utils.utils import load_config
from src.data.load_data import load_data

# Read the project configuration and hand it to load_data, which returns the
# train and test DataFrames used by every later cell.
config = load_config('../configs/config.yaml')
train_df, test_df = load_data(config=config)

Loading training data from ../data/raw/train.csv
Loading test data from ../data/raw/test.csv
Train shape: (61609, 62)
Test shape: (41074, 61)


## 3. Handling Missing Values


### 3.1 Impute Numerical Features with KNN and Categorical Features with the Mode

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import time
from src.utils.utils import load_config, save_dataframe
import os

# Impute missing values: KNN for numerical features, training-set mode for
# categorical features. The imputer and the modes are fit on train_df only and
# then applied to test_df.
#
# Inputs  (defined by the data-loading cell): train_df, test_df
# Outputs (read by later cells):              train_df_imputed, test_df_imputed

TARGET_COL = 'target'
ID_COL = 'id'  # assumed to be a row identifier — TODO confirm against the data dictionary

# 1. Identify numerical and categorical columns
num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

# The target must never be imputed, whatever dtype it was loaded with.
# (test_df has one column fewer than train_df — presumably the target — so
# leaving it in either list would also break the test-side transforms.)
num_cols = [col for col in num_cols if col != TARGET_COL]
cat_cols = [col for col in cat_cols if col != TARGET_COL]

print(f"Identified {len(num_cols)} numerical columns and {len(cat_cols)} categorical columns for imputation.")

# 2. KNN Imputation for numerical columns
print("\n[INFO] Starting KNN imputation for numerical features...")
start_time = time.time()
imputer = KNNImputer(n_neighbors=5, weights='uniform')
train_df_imputed = train_df.copy()
test_df_imputed = test_df.copy()

# An identifier carries no similarity information, and its numeric range can
# dominate the neighbour distances, so it is kept out of the KNN feature matrix.
# Consequence: missing values in the id column itself are left untouched.
# NOTE(review): KNNImputer measures distance on raw feature values, so
# wide-range features still outweigh narrow ones — consider scaling first.
knn_cols = [col for col in num_cols if col != ID_COL]
if knn_cols:
    train_df_imputed[knn_cols] = imputer.fit_transform(train_df[knn_cols])
    test_df_imputed[knn_cols] = imputer.transform(test_df[knn_cols])
else:
    print("[WARN] No numerical feature columns found; skipping KNN imputation.")
elapsed = time.time() - start_time
print(f"[INFO] KNN imputation for numerical features completed in {elapsed:.2f} seconds.")

# 3. Mode imputation for categorical columns
print("\n[INFO] Starting mode imputation for categorical features...")
start_time = time.time()
for col in cat_cols:
    modes = train_df[col].mode()
    if modes.empty:
        # mode() ignores NaN, so an all-missing training column has no mode;
        # leave it as-is rather than failing on an empty lookup.
        print(f"[WARN] Column '{col}' has no non-missing training values; skipping mode imputation.")
        continue
    mode = modes.iloc[0]
    train_df_imputed[col] = train_df[col].fillna(mode)
    test_df_imputed[col] = test_df[col].fillna(mode)
elapsed = time.time() - start_time
print(f"[INFO] Mode imputation for categorical features completed in {elapsed:.2f} seconds.")

# 4. Check missing values after imputation
print("\n[INFO] Top columns with missing values in train after imputation:")
print(train_df_imputed.isnull().sum().sort_values(ascending=False).head(10))

print("\n[INFO] Top columns with missing values in test after imputation:")
print(test_df_imputed.isnull().sum().sort_values(ascending=False).head(10))

# 5. Save preprocessed data
print("\n[INFO] Saving preprocessed datasets...")
config = load_config('../configs/config.yaml')
processed_dir = config['data']['processed_dir']

# Make sure the destination exists so a fresh checkout can run this cell.
os.makedirs(processed_dir, exist_ok=True)

# Define save paths
train_save_path = os.path.join(processed_dir, "train_processed.csv")
test_save_path = os.path.join(processed_dir, "test_processed.csv")

# Save DataFrames
save_dataframe(train_df_imputed, train_save_path)
save_dataframe(test_df_imputed, test_save_path)

print("[INFO] Preprocessed datasets saved successfully.")

Identified 52 numerical columns and 9 categorical columns for imputation.

[INFO] Starting KNN imputation for numerical features...
[INFO] KNN imputation for numerical features completed in 641.62 seconds.

[INFO] Starting mode imputation for categorical features...
[INFO] Mode imputation for categorical features completed in 0.27 seconds.

[INFO] Top columns with missing values in train after imputation:
id                     0
emotional_charge_2     0
groove_efficiency_1    0
beat_frequency_1       0
organic_texture_2      0
composition_label_0    0
harmonic_scale_1       0
intensity_index_0      0
duration_ms_0          0
album_name_length      0
dtype: int64

[INFO] Top columns with missing values in test after imputation:
id                     0
emotional_charge_2     0
groove_efficiency_1    0
beat_frequency_1       0
organic_texture_2      0
composition_label_0    0
harmonic_scale_1       0
intensity_index_0      0
duration_ms_0          0
album_name_length      0
dtype: int64

## 4. Scaling the numerical features

In [6]:
from src.data.preprocess import DataStandardizer
from src.utils.utils import load_config, save_dataframe
import os

# Standardize the imputed frames: DataStandardizer is fit on the training
# frame (with 'target' passed as the target column) and the fitted instance is
# then applied to the test frame.
standardizer = DataStandardizer()
train_standardized = standardizer.fit_transform(train_df_imputed, target_col='target')
test_standardized = standardizer.transform(test_df_imputed)

# Resolve the output locations from the project configuration.
config = load_config('../configs/config.yaml')
processed_dir = config['data']['processed_dir']
train_save_path = os.path.join(processed_dir, "train_standardized.csv")
test_save_path = os.path.join(processed_dir, "test_standardized.csv")

# Write train first, then test, each to its own CSV path.
for frame, destination in (
    (train_standardized, train_save_path),
    (test_standardized, test_save_path),
):
    save_dataframe(frame, destination)

print("[INFO] Standardized datasets saved successfully.")

[INFO] DataFrame saved to ../data/processed\train_standardized.csv
[INFO] DataFrame saved to ../data/processed\test_standardized.csv
[INFO] Standardized datasets saved successfully.
