# Feature Engineering

This notebook creates new features from the existing ones.

## 1. Setting up the Environment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
# NOTE(review): this ignores every warning category for the rest of the kernel
# session, which can also hide deprecation or data-quality warnings raised by
# the libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## 2. Load Data

In [2]:
import sys

# Make the parent directory importable so the `src` package resolves
# (presumably the project root sits one level above this notebook — confirm).
sys.path.append('..')

from src.data.load_data import load_data

# Locations of the processed train/test CSV files, passed to the loader by keyword.
processed_paths = {
    'train_path': '../data/processed/train_processed.csv',
    'test_path': '../data/processed/test_processed.csv',
}

# The loader hands back the two splits as a (train, test) pair.
train_df, test_df = load_data(**processed_paths)

Loading training data from ../data/processed/train_processed.csv
Loading test data from ../data/processed/test_processed.csv
Train shape: (61609, 62)
Test shape: (41074, 61)


## 3. Derive New Features from `publication_timestamp`

In [3]:
from src.data.build_features import create_date_features_train_test

# Derive date-based features from the publication timestamp on both splits.
# The target column name is forwarded too; how the helper uses it is defined
# in src/data/build_features.py and is not visible from this notebook.
# NOTE(review): the call rebinds train_df/test_df from themselves, so re-running
# this cell feeds already-augmented frames back into the helper.
train_df, test_df = create_date_features_train_test(
    train_df,
    test_df,
    date_col='publication_timestamp',
    target_col='target',
)

In [4]:
# Remove the raw timestamp column from both splits once the date features
# have been derived from it.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps the cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
train_df = train_df.drop(columns=['publication_timestamp'], errors='ignore')
test_df = test_df.drop(columns=['publication_timestamp'], errors='ignore')



In [5]:
# Summarise the training frame's columns, dtypes and non-null counts after the
# raw timestamp column has been dropped.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61609 entries, 0 to 61608
Data columns (total 82 columns):
 #   Column                              Non-Null Count  Dtype   
---  ------                              --------------  -----   
 0   id                                  61609 non-null  float64 
 1   emotional_charge_2                  61609 non-null  float64 
 2   groove_efficiency_1                 61609 non-null  float64 
 3   beat_frequency_1                    61609 non-null  float64 
 4   organic_texture_2                   61609 non-null  float64 
 5   composition_label_0                 61609 non-null  object  
 6   harmonic_scale_1                    61609 non-null  float64 
 7   intensity_index_0                   61609 non-null  float64 
 8   duration_ms_0                       61609 non-null  float64 
 9   album_name_length                   61609 non-null  float64 
 10  beat_frequency_0                    61609 non-null  float64 
 11  beat_frequency_2            

In [6]:
# Preview the first rows of the training frame, including the date-derived columns.
train_df.head()

Unnamed: 0,id,emotional_charge_2,groove_efficiency_1,beat_frequency_1,organic_texture_2,composition_label_0,harmonic_scale_1,intensity_index_0,duration_ms_0,album_name_length,...,is_new_year_release,month_sin,month_cos,day_sin,day_cos,songs_released_that_year,songs_released_that_month,release_period_in_month,release_month_popularity_score,release_dayofweek_popularity_score
0,76339.0,0.48285,1.169231,80.018,0.0201,Country Stuff (feat. Jake Owen),1.0,0.789,154586.0,13.8,...,0,1.224647e-16,-1.0,0.724793,0.688967,11679,1413,early,53.437764,56.875643
1,80006.0,0.267862,1.321321,147.966,0.334,Solitude,6.0,0.715,46874.0,15.0,...,0,-0.5,-0.8660254,0.201299,0.97953,6104,426,early,49.497283,48.965015
2,83501.0,0.242606,1.285319,142.98,0.111,BDFFRNT (Saved from Conformity),4.0,0.7288,264665.0,7.0,...,0,-0.5,0.8660254,-0.485302,-0.874347,2071,212,mid,54.297864,50.66579
3,81530.0,0.4264,1.279435,123.063,0.196,Headlights (feat. Ilsey),5.0,0.685,209208.0,5.0,...,0,-1.0,-1.83697e-16,-0.937752,0.347305,2757,289,late,52.833078,56.875643
4,60534.0,0.0,0.974906,132.722,0.0811,Afraid,6.0,0.856,215346.0,5.0,...,1,0.5,0.8660254,0.201299,0.97953,718,300,early,49.883403,44.80519


In [7]:
# Persist both splits (train first, then test) as CSV files without the index column.
for frame, out_path in (
    (train_df, '../data/processed/train_new1.csv'),
    (test_df, '../data/processed/test_new1.csv'),
):
    frame.to_csv(out_path, index=False)

## 4. Create Features from Highly Correlated Features

In [1]:
import sys

# Make the parent directory importable so the `src` package resolves.
sys.path.append('..')

from src.data.load_data import load_data
from src.data.build_features import create_composition_features

# NOTE(review): this cell's execution count restarts at 1 and it reads the
# *encoded* CSVs, not the train_new1/test_new1 files written in the previous
# section — the encoding step that produces them is not part of this notebook.
encoded_paths = {
    'train_path': "../data/processed/train_encoded.csv",
    'test_path': "../data/processed/test_encoded.csv",
}
train_df, test_df = load_data(**encoded_paths)

# Add the composition features to both splits.
# NOTE(review): n_clusters=5 is hard-coded here; the rationale for the value
# is not documented in this notebook.
train_df, test_df = create_composition_features(
    train_df,
    test_df,
    n_clusters=5,
)



Loading training data from ../data/processed/train_encoded.csv
Loading test data from ../data/processed/test_encoded.csv
Train shape: (61609, 97)
Test shape: (41074, 96)


In [6]:
import os
from src.utils.utils import load_config, save_dataframe

# Look up the processed-data directory in the shared project configuration.
config = load_config('../configs/config.yaml')
processed_dir = config['data']['processed_dir']

# Destination files for the two splits.
# NOTE(review): these are the same file names this section's load cell reads
# from, so (assuming processed_dir resolves to ../data/processed, as the logged
# output suggests) saving here overwrites the section's own input and a re-run
# would start from already-augmented data — confirm this is intended.
train_save_path = os.path.join(processed_dir, "train_encoded.csv")
test_save_path = os.path.join(processed_dir, "test_encoded.csv")

# Write train first, then test.
for frame, destination in (
    (train_df, train_save_path),
    (test_df, test_save_path),
):
    save_dataframe(frame, destination)

print("[INFO] Encoded datasets saved successfully.")

[INFO] DataFrame saved to ../data/processed\train_encoded.csv
[INFO] DataFrame saved to ../data/processed\test_encoded.csv
[INFO] Encoded datasets saved successfully.
