# Feature Engineering

In [1]:
# Importing required libraries and modules
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime

In [2]:
# Load the cleaned dataset (CSV stored under ../data/processed) into a DataFrame.
processed_data = pd.read_csv('../data/processed/cleaned_data.csv')

In [3]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20161 entries, 0 to 20160
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   track                   20161 non-null  object 
 1   artist                  20161 non-null  object 
 2   uri                     20161 non-null  object 
 3   danceability            20161 non-null  float64
 4   energy                  20161 non-null  float64
 5   key                     20161 non-null  int64  
 6   loudness                20161 non-null  float64
 7   mode                    20161 non-null  int64  
 8   speechiness             20161 non-null  float64
 9   acousticness            20161 non-null  float64
 10  instrumentalness        20161 non-null  float64
 11  liveness                20161 non-null  float64
 12  valence                 20161 non-null  float64
 13  tempo                   20161 non-null  float64
 14  duration_ms             20161 non-null

### Generating days since release

In [4]:
# Reference moment for the age computation. It includes the time of day and is
# different on every run, so `days_since_release` depends on when this cell executes.
curr_date = datetime.now()

# Parse every release date individually into a pandas Timestamp.
processed_data['release_date'] = processed_data['release_date'].map(pd.to_datetime)

# Whole days elapsed between each release date and the reference moment.
release_age = curr_date - processed_data['release_date']
processed_data['days_since_release'] = release_age.dt.days

### Generating number of artists

In [5]:
# Split the `artists` string on the '&&' separator and count the resulting pieces;
# the vectorised `.str` accessors replace a per-row Python lambda.
processed_data['num_artists'] = processed_data['artists'].str.split('&&').str.len()

# More than one piece means more than one credited artist. The comparison already
# yields a boolean column, so no per-row `True if ... else False` is needed.
processed_data['has_features'] = processed_data['num_artists'] > 1

### Generating length of song title and album title (in words)

In [6]:
# Count whitespace-separated words in the track and album titles.
# Splitting without an explicit separator collapses runs of whitespace and ignores
# leading/trailing blanks, so "A  B" counts as two words; splitting on a single ' '
# would report three because of the empty string between the two spaces.
processed_data['num_words_in_title'] = processed_data['track'].str.split().str.len()
processed_data['num_words_in_album'] = processed_data['album'].str.split().str.len()

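1. Partition the dataset by the target.
2. Find the most common words in the song and album names for each partition (say, the top 100).
3. For each track, count how many of those words appear in its song title and in its album title.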


## Approach 2

1. Find the most common words across all hit songs.
2. Extract the top 100 words.
3. For each track, add a feature counting how many of its words are in that top-100 list.
4. Possibly repeat the same steps for flop songs (open question).


## Basic NLP on track title and album title

### Track title

In [7]:
# Two independent bag-of-words models with identical settings: English stop words
# are removed and the vocabulary is capped at the 100 most frequent remaining terms.
hit_song_vectorizer = CountVectorizer(max_features=100, stop_words='english')
flop_song_vectorizer = CountVectorizer(max_features=100, stop_words='english')

# Track titles split by label: target == 1 feeds the "hit" vocabulary and
# target == 0 feeds the "flop" vocabulary.
# NOTE(review): the split uses `target` over every loaded row; if `target` relates to
# the quantity being modelled, confirm that downstream evaluation accounts for this
# (possible label leakage).
hit_tracks = processed_data.loc[processed_data['target'] == 1, 'track'].copy()
flop_tracks = processed_data.loc[processed_data['target'] == 0, 'track'].copy()

In [8]:
# Learn each vocabulary from its own subset of titles: hit titles for the hit
# vectorizer, flop titles for the flop vectorizer.
hit_song_vectorizer.fit(hit_tracks)
flop_song_vectorizer.fit(flop_tracks)

In [9]:
# Display the terms retained in the vocabulary learned from hit track titles.
hit_song_vectorizer.get_feature_names_out()

array(['ain', 'angel', 'away', 'baby', 'bad', 'beautiful', 'believe',
       'best', 'better', 'blue', 'body', 'boy', 'come', 'country',
       'crazy', 'dance', 'day', 'days', 'don', 'dream', 'dreams', 'eyes',
       'fall', 'feel', 'fly', 'forever', 'girl', 'girls', 'god', 'goes',
       'gone', 'gonna', 'good', 'goodbye', 'got', 'hard', 'heart',
       'heaven', 'hey', 'hold', 'home', 'hot', 'just', 'kiss', 'know',
       'la', 'lady', 'let', 'life', 'light', 'like', 'little', 'live',
       'll', 'lonely', 'long', 'look', 'love', 'make', 'man', 'mind',
       'miss', 'money', 'need', 'new', 'night', 'party', 'people', 'rain',
       'real', 'right', 'rock', 'roll', 'run', 'say', 'somebody', 'song',
       'stay', 'stop', 'summer', 'sweet', 'talk', 'tell', 'thing',
       'think', 'time', 'tonight', 'touch', 'town', 've', 'wait', 'wanna',
       'want', 'way', 'wild', 'woman', 'won', 'world', 'ya', 'young'],
      dtype=object)

In [10]:
# For every track title, recover the vocabulary terms that occur in it:
# transform -> inverse_transform yields one array of matched terms per row.
transformed_hit_track_array = hit_song_vectorizer.inverse_transform(
    hit_song_vectorizer.transform(processed_data['track'])
)
transformed_flop_track_array = flop_song_vectorizer.inverse_transform(
    flop_song_vectorizer.transform(processed_data['track'])
)

# The per-row feature is the number of vocabulary terms matched in the title.
num_hit_words_list = [len(matched_terms) for matched_terms in transformed_hit_track_array]
num_flop_words_list = [len(matched_terms) for matched_terms in transformed_flop_track_array]

In [11]:
# Attach the per-track counts of matched hit / flop vocabulary terms as new columns.
processed_data['num_hit_words_track'] = num_hit_words_list
processed_data['num_flop_words_track'] = num_flop_words_list

### Album title

In [12]:
# Same bag-of-words setup as for track titles, now for album titles: English stop
# words removed, vocabulary capped at the 100 most frequent remaining terms.
hit_album_vectorizer = CountVectorizer(max_features=100, stop_words='english')
flop_album_vectorizer = CountVectorizer(max_features=100, stop_words='english')

# Album titles split by label (target == 1 -> hits, target == 0 -> flops).
# NOTE(review): as with any label-derived vocabulary, confirm downstream evaluation
# accounts for the use of `target` here (possible label leakage).
hit_albums = processed_data.loc[processed_data['target'] == 1, 'album'].copy()
flop_albums = processed_data.loc[processed_data['target'] == 0, 'album'].copy()

In [13]:
# Learn each album-title vocabulary from its own subset: hit albums for the hit
# vectorizer, flop albums for the flop vectorizer.
hit_album_vectorizer.fit(hit_albums)
flop_album_vectorizer.fit(flop_albums)

In [14]:
# Display the terms retained in the vocabulary learned from flop album titles.
flop_album_vectorizer.get_feature_names_out()

array(['20', 'album', 'ambient', 'american', 'anniversary', 'anthems',
       'anthology', 'best', 'big', 'black', 'blood', 'bluegrass', 'blues',
       'bossa', 'boxset', 'christmas', 'classic', 'classical',
       'collection', 'complete', 'corazón', 'country', 'dance', 'death',
       'del', 'deluxe', 'edition', 'el', 'en', 'ep', 'essential',
       'exitos', 'expanded', 'fantasy', 'feat', 'final', 'folk', 'girl',
       'goes', 'good', 'gospel', 'greatest', 'guitar', 'hardcore',
       'harry', 'heavy', 'hip', 'hits', 'hop', 'house', 'ii', 'ipanema',
       'jazz', 'la', 'land', 'life', 'live', 'lo', 'loops', 'los', 'love',
       'mejor', 'metal', 'motion', 'music', 'new', 'night', 'nova',
       'original', 'piano', 'picture', 'pop', 'potter', 'ps', 'pt',
       'punk', 'remastered', 'remasterizado', 'rock', 'roll', 'series',
       'sessions', 'sleep', 'songs', 'soul', 'soundtrack', 'street',
       'super', 'thrash', 'time', 'trance', 'trio', 'version', 'vol',
       'volume', 

In [15]:
# For every album title, recover the vocabulary terms that occur in it:
# transform -> inverse_transform yields one array of matched terms per row.
transformed_hit_albums_array = hit_album_vectorizer.inverse_transform(
    hit_album_vectorizer.transform(processed_data['album'])
)
transformed_flop_albums_array = flop_album_vectorizer.inverse_transform(
    flop_album_vectorizer.transform(processed_data['album'])
)

# Count matched terms per row. Comprehensions keep the per-row variable local, so the
# `hit_albums` / `flop_albums` Series used to fit the vectorizers are not overwritten
# (a `for hit_albums, flop_albums in ...` loop would rebind those names to the last
# row's term arrays and break a re-run of the fitting cell).
num_hit_words_album_list = [len(matched_terms) for matched_terms in transformed_hit_albums_array]
num_flop_words_album_list = [len(matched_terms) for matched_terms in transformed_flop_albums_array]

In [16]:
# Attach the per-album counts of matched hit / flop vocabulary terms as new columns.
processed_data['num_hit_words_album'] = num_hit_words_album_list
processed_data['num_flop_words_album'] = num_flop_words_album_list

In [17]:
# Preview the first rows, including the newly engineered feature columns.
processed_data.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,target_popularity,days_since_release,num_artists,has_features,num_words_in_title,num_words_in_album,num_hit_words_track,num_flop_words_track,num_hit_words_album,num_flop_words_album
0,Suddenly Last Summer,The Motels,spotify:track:4fLIM0B1WwrLux9RdnMvze,0.716,0.753,2,-5.682,1,0.0286,0.162,...,0,8136,1,False,3,2,1,0,1,1
1,Sanctuary,Béla Fleck,spotify:track:3DwlNfiCQSdj0GOxYkR9Rq,0.36,0.542,5,-13.885,1,0.0339,0.368,...,0,13250,6,True,1,1,0,0,0,0
2,The Wild Rover,The Pogues,spotify:track:6JyYNPLalPgGa7XnclF5FO,0.656,0.512,7,-11.872,1,0.029,0.585,...,1,14711,1,False,3,6,1,0,3,2
3,In The Driver's Seat,John Schneider,spotify:track:6jJi8OXF5qaFdysB6sjWIT,0.642,0.889,2,-5.62,0,0.0494,0.375,...,0,16902,1,False,4,4,0,0,0,0
4,Young Wild And Free,Brighton Rock,spotify:track:7EBpncUwlHjLhQTetSLb9O,0.454,0.734,2,-15.559,1,0.041,0.124,...,0,13980,1,False,4,4,2,0,1,0


In [18]:
# Columns removed from the frame before it is exported.
columns_to_drop = ['artist', 'uri', 'target', 'release_date_precision', 'popularities']

# The drop mutates `processed_data` in place, so running this cell a second time
# raises KeyError because the columns are already gone.
processed_data.drop(columns=columns_to_drop, inplace=True)

# Write the engineered dataset to disk without the row index.
processed_data.to_csv('../data/engineered/final_data.csv', index=False)