# Data Proprocessing

## 1. Preparing Environment

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.filterwarnings('ignore')

## 2. Data Loading

In [6]:
import sys
sys.path.append('..')

from src.utils.utils import load_config
from src.data.load_data import load_data

config = load_config('../configs/config.yaml')
train_df, test_df = load_data(config=config)

Loading training data from ../data/raw/train.csv
Loading test data from ../data/raw/test.csv
Train shape: (61609, 62)
Test shape: (41074, 61)


## 3. Handling Missing Values


### 3.1 Impute using KNN and Mode 

In [21]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import time
from src.utils.utils import load_config, save_dataframe
import os

# 1. Identify numerical and categorical columns
num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

if 'target' in num_cols:
    num_cols.remove('target')

print(f"Identified {len(num_cols)} numerical columns and {len(cat_cols)} categorical columns for imputation.")

# 2. KNN Imputation for numerical columns
print("\n[INFO] Starting KNN imputation for numerical features...")
start_time = time.time()
imputer = KNNImputer(n_neighbors=5, weights='uniform')
train_df_imputed = train_df.copy()
test_df_imputed = test_df.copy()

train_df_imputed[num_cols] = imputer.fit_transform(train_df[num_cols])
test_df_imputed[num_cols] = imputer.transform(test_df[num_cols])
elapsed = time.time() - start_time
print(f"[INFO] KNN imputation for numerical features completed in {elapsed:.2f} seconds.")

# 3. Mode imputation for categorical columns
print("\n[INFO] Starting mode imputation for categorical features...")
start_time = time.time()
for col in cat_cols:
    mode = train_df[col].mode()[0]
    train_df_imputed[col] = train_df[col].fillna(mode)
    test_df_imputed[col] = test_df[col].fillna(mode)
elapsed = time.time() - start_time
print(f"[INFO] Mode imputation for categorical features completed in {elapsed:.2f} seconds.")

# 4. Check missing values after imputation
print("\n[INFO] Top columns with missing values in train after imputation:")
print(train_df_imputed.isnull().sum().sort_values(ascending=False).head(10))

print("\n[INFO] Top columns with missing values in test after imputation:")
print(test_df_imputed.isnull().sum().sort_values(ascending=False).head(10))

# 5. Save preprocessed data
print("\n[INFO] Saving preprocessed datasets...")
config = load_config('../configs/config.yaml')
processed_dir = config['data']['processed_dir']

# Define save paths
train_save_path = os.path.join(processed_dir, "train_processed.csv")
test_save_path = os.path.join(processed_dir, "test_processed.csv")

# Save DataFrames
save_dataframe(train_df_imputed, train_save_path)
save_dataframe(test_df_imputed, test_save_path)

print("[INFO] Preprocessed datasets saved successfully.")

Identified 52 numerical columns and 9 categorical columns for imputation.

[INFO] Starting KNN imputation for numerical features...
[INFO] KNN imputation for numerical features completed in 797.80 seconds.

[INFO] Starting mode imputation for categorical features...
[INFO] Mode imputation for categorical features completed in 0.59 seconds.

[INFO] Top columns with missing values in train after imputation:
id                     0
emotional_charge_2     0
groove_efficiency_1    0
beat_frequency_1       0
organic_texture_2      0
composition_label_0    0
harmonic_scale_1       0
intensity_index_0      0
duration_ms_0          0
album_name_length      0
dtype: int64

[INFO] Top columns with missing values in test after imputation:
id                     0
emotional_charge_2     0
groove_efficiency_1    0
beat_frequency_1       0
organic_texture_2      0
composition_label_0    0
harmonic_scale_1       0
intensity_index_0      0
duration_ms_0          0
album_name_length      0
dtype: int64

## 4. Encode Categorical Columns

In [2]:
categorical_cols = [
    'composition_label_0',
    'composition_label_1',
    'weekday_of_release',
    'season_of_release',
    'lunar_phase',
    'creator_collective',
    'composition_label_2',
    'track_identifier'
]

high_card_cols = [
    'composition_label_0',
    'composition_label_1',
    'composition_label_2',
    'creator_collective',
    'track_identifier'
]

low_card_cols = [
    'weekday_of_release',
    'season_of_release',
    'lunar_phase',
    'release_period_in_month'
]

### 4.1 Frequency Encording for High-Cardinality Columns + One Hot Encoding for Low-Cardinality Columns

This step is done after the time related features had  been created

In [7]:
import sys
sys.path.append('..')
from src.data.preprocess import CombinedFrequencyEncoder
from src.data.load_data import load_data
from src.utils.utils import save_dataframe
import os

train_df, test_df = load_data(train_path="../data/processed/train_new1.csv",
                              test_path="../data/processed/test_new1.csv"
                              )

# Instantiate the encoder
freq_encoder = CombinedFrequencyEncoder(high_card_cols)

# Apply encoding
train_df, test_df = freq_encoder.fit_transform(train_df, test_df)

# Optionally drop the original high-cardinality columns
train_df.drop(columns=high_card_cols, inplace=True)
test_df.drop(columns=high_card_cols, inplace=True)

# Print results
print("Frequency encoding applied.")

    
# Apply OHE 
train_df = pd.get_dummies(train_df, columns=low_card_cols)
test_df = pd.get_dummies(test_df, columns=low_card_cols)

# Align columns to ensure train and test have the same features
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)
test_df = test_df.drop(columns=['target'])

# Print results
print("One Hot encoding applied.")

# Convert boolean columns to integers (0/1)
bool_cols_train = train_df.select_dtypes(include='bool').columns
train_df[bool_cols_train] = train_df[bool_cols_train].astype(int)
bool_cols_test = test_df.select_dtypes(include='bool').columns
test_df[bool_cols_test] = test_df[bool_cols_test].astype(int)

# Save the encoded data
config = load_config('../configs/config.yaml')
processed_dir = config['data']['processed_dir']

train_save_path = os.path.join(processed_dir, "train_encoded.csv")
test_save_path = os.path.join(processed_dir, "test_encoded.csv")

save_dataframe(train_df, train_save_path)
save_dataframe(test_df, test_save_path)

print("[INFO] Encoded datasets saved successfully.")

Loading training data from ../data/processed/train_new1.csv
Loading test data from ../data/processed/test_new1.csv
Train shape: (61609, 82)
Test shape: (41074, 81)
Frequency encoding applied.
One Hot encoding applied.
[INFO] DataFrame saved to ../data/processed\train_encoded.csv
[INFO] DataFrame saved to ../data/processed\test_encoded.csv
[INFO] Encoded datasets saved successfully.


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61609 entries, 0 to 61608
Data columns (total 96 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  61609 non-null  float64
 1   emotional_charge_2                  61609 non-null  float64
 2   groove_efficiency_1                 61609 non-null  float64
 3   beat_frequency_1                    61609 non-null  float64
 4   organic_texture_2                   61609 non-null  float64
 5   harmonic_scale_1                    61609 non-null  float64
 6   intensity_index_0                   61609 non-null  float64
 7   duration_ms_0                       61609 non-null  float64
 8   album_name_length                   61609 non-null  float64
 9   beat_frequency_0                    61609 non-null  float64
 10  beat_frequency_2                    61609 non-null  float64
 11  artist_count                        61609

In [9]:
train_df.head()

Unnamed: 0,id,emotional_charge_2,groove_efficiency_1,beat_frequency_1,organic_texture_2,harmonic_scale_1,intensity_index_0,duration_ms_0,album_name_length,beat_frequency_0,...,season_of_release_spring,season_of_release_summer,season_of_release_winter,lunar_phase_full,lunar_phase_new,lunar_phase_waning,lunar_phase_waxing,release_period_in_month_early,release_period_in_month_late,release_period_in_month_mid
0,76339.0,0.48285,1.169231,80.018,0.0201,1.0,0.789,154586.0,13.8,95.992,...,0,1,0,0,0,1,0,1,0,0
1,80006.0,0.267862,1.321321,147.966,0.334,6.0,0.715,46874.0,15.0,148.076,...,0,1,0,0,0,1,0,1,0,0
2,83501.0,0.242606,1.285319,142.98,0.111,4.0,0.7288,264665.0,7.0,124.738,...,0,0,0,1,0,0,0,0,0,1
3,81530.0,0.4264,1.279435,123.063,0.196,5.0,0.685,209208.0,5.0,99.5758,...,0,0,0,1,0,0,0,0,1,0
4,60534.0,0.0,0.974906,132.722,0.0811,6.0,0.856,215346.0,5.0,118.006,...,0,0,1,0,0,0,1,1,0,0
