### Cleaning the Data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw song dataset from the zipped CSV file.
music = pd.read_csv(filepath_or_buffer="../../data/raw/song_data.csv.zip")

In [None]:
# Restrict the analysis to the rows labelled 0 through 499
# (label-based slicing with .loc includes the end label).
songs = music.loc[0:499, :]
songs

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,Boulevard of Broken Dreams,73,262333,0.005520,0.496,0.682,0.000029,8,0.0589,-4.095,1,0.0294,167.060,4,0.474
1,In The End,66,216933,0.010300,0.542,0.853,0.000000,3,0.1080,-6.407,0,0.0498,105.256,4,0.370
2,Seven Nation Army,76,231733,0.008170,0.737,0.463,0.447000,0,0.2550,-7.828,1,0.0792,123.881,4,0.324
3,By The Way,74,216933,0.026400,0.451,0.970,0.003550,0,0.1020,-4.938,1,0.1070,122.444,4,0.198
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.000000,10,0.1130,-5.065,1,0.0313,172.011,4,0.574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,China Grove,67,195306,0.319000,0.623,0.910,0.000448,9,0.0810,-8.448,1,0.0384,145.624,4,0.745
496,Foreplay / Long Time,64,467640,0.009750,0.436,0.657,0.007480,5,0.0931,-8.868,1,0.0541,118.693,4,0.210
497,Come Sail Away,59,365000,0.160000,0.287,0.562,0.000000,0,0.1030,-7.549,1,0.0304,147.622,4,0.126
498,Give A Little Bit,72,248173,0.069400,0.531,0.818,0.009600,2,0.2630,-5.358,1,0.0452,90.767,4,0.471


In [None]:
# Cleaning the data: remove two columns that are either not significant
# for this research or merely repetitive.
songs_cleaned = songs.drop(columns=['audio_mode', 'time_signature'])
songs_cleaned

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,audio_valence
0,Boulevard of Broken Dreams,73,262333,0.005520,0.496,0.682,0.000029,8,0.0589,-4.095,0.0294,167.060,0.474
1,In The End,66,216933,0.010300,0.542,0.853,0.000000,3,0.1080,-6.407,0.0498,105.256,0.370
2,Seven Nation Army,76,231733,0.008170,0.737,0.463,0.447000,0,0.2550,-7.828,0.0792,123.881,0.324
3,By The Way,74,216933,0.026400,0.451,0.970,0.003550,0,0.1020,-4.938,0.1070,122.444,0.198
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.000000,10,0.1130,-5.065,0.0313,172.011,0.574
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,China Grove,67,195306,0.319000,0.623,0.910,0.000448,9,0.0810,-8.448,0.0384,145.624,0.745
496,Foreplay / Long Time,64,467640,0.009750,0.436,0.657,0.007480,5,0.0931,-8.868,0.0541,118.693,0.210
497,Come Sail Away,59,365000,0.160000,0.287,0.562,0.000000,0,0.1030,-7.549,0.0304,147.622,0.126
498,Give A Little Bit,72,248173,0.069400,0.531,0.818,0.009600,2,0.2630,-5.358,0.0452,90.767,0.471


In [None]:
# Danceability and audio valence have very similar values, so they are
# combined (added together) into a single new column.
songs_cleaned['danceable_valence'] = songs_cleaned['danceability'].add(songs_cleaned['audio_valence'])
songs_cleaned

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,audio_valence,danceable_valence
0,Boulevard of Broken Dreams,73,262333,0.005520,0.496,0.682,0.000029,8,0.0589,-4.095,0.0294,167.060,0.474,0.970
1,In The End,66,216933,0.010300,0.542,0.853,0.000000,3,0.1080,-6.407,0.0498,105.256,0.370,0.912
2,Seven Nation Army,76,231733,0.008170,0.737,0.463,0.447000,0,0.2550,-7.828,0.0792,123.881,0.324,1.061
3,By The Way,74,216933,0.026400,0.451,0.970,0.003550,0,0.1020,-4.938,0.1070,122.444,0.198,0.649
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.000000,10,0.1130,-5.065,0.0313,172.011,0.574,1.021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,China Grove,67,195306,0.319000,0.623,0.910,0.000448,9,0.0810,-8.448,0.0384,145.624,0.745,1.368
496,Foreplay / Long Time,64,467640,0.009750,0.436,0.657,0.007480,5,0.0931,-8.868,0.0541,118.693,0.210,0.646
497,Come Sail Away,59,365000,0.160000,0.287,0.562,0.000000,0,0.1030,-7.549,0.0304,147.622,0.126,0.413
498,Give A Little Bit,72,248173,0.069400,0.531,0.818,0.009600,2,0.2630,-5.358,0.0452,90.767,0.471,1.002


## Wrangling data

#### One of my research questions concerns the genre of a song. Finding the genre of every song is not feasible: it is time consuming, and many titles are shared by multiple songs, so I cannot reliably tell them apart. Instead, I am going to take the 30 most popular songs and the 30 least popular songs, find their genres, and attempt to answer my question that way.

In [None]:
# The 30 songs with the highest popularity score in this dataset.
most_popular = (
    songs_cleaned
    .sort_values(by="song_popularity", ascending=False)
    .head(30)
)


In [None]:
# While looking up each song's genre individually, I found rows that duplicate
# the same song under different index labels, so those duplicates are removed
# to avoid false data. The song "Bleeding love" is also removed because many
# songs share that title and they cannot be told apart.
# NOTE(review): five index labels are dropped here - confirm which labels are
# the duplicates and which one is "Bleeding love".
new_most_popular = most_popular.drop(index=[353, 126, 155, 173, 364])


In [None]:
# Genres of the 25 remaining top songs, looked up by hand and listed in the
# same order as the rows of new_most_popular. They are stored in a new
# column titled "Genre".
Genre = [
    'Reggae', 'Hip Hop', 'Pop', 'Pop', 'Hip Hop',
    'Pop', 'Hip Hop', 'Reggae', 'Reggae', 'R&B',
    'Pop', 'Pop', 'Pop', 'Rock', 'Pop',
    'Rock', 'Pop', 'Rock', 'R&B', 'Rock',
    'Rock', 'Metal', 'Rock', 'Rock', 'R&B',
]
new_most_popular['Genre'] = Genre


In [None]:
# Keep only the columns used in the analysis, with the song name and genre first.
# Plain column selection is used instead of `reindex` so that a misspelled
# column name raises a KeyError rather than silently producing an all-NaN
# column (previously 'spechiness' and the comma-less 'tempo' 'audio_valence'
# did exactly that, losing speechiness, tempo and audio_valence).
new_most_popular_2 = new_most_popular[[
    'song_name', 'Genre', 'song_popularity', 'song_duration_ms',
    'acousticness', 'danceability', 'energy', 'key', 'liveness',
    'loudness', 'speechiness', 'tempo', 'audio_valence', 'danceable_valence',
]]


#### Now I am going to take the 30 least popular songs in the dataset so that I can compare the top songs with the bottom songs.


In [None]:
# The 30 songs with the lowest popularity score in this dataset.
least_popular = (
    songs_cleaned
    .sort_values(by="song_popularity", ascending=True)
    .head(30)
)


In [None]:
# Same procedure as for the most popular songs: the hand-collected genres,
# listed in the same order as the rows of least_popular, are stored in a
# "Genre" column.
Genre = [
    'Latin Pop', 'Son cubano', 'Bolero', 'Hip Hop', 'Rock',
    'R&B', 'Rock', 'Rock', 'Rock', 'Rock',
    'Rock', 'Rock', 'Rock', 'Rock', 'Rock',
    'Salsa', 'Rock', 'Pop', 'R&B', 'Rock',
    'Pop', 'R&B', 'Rock', 'Rock', 'Country',
    'R&B', 'Pop', 'Rock', 'Rock', 'Hip Hop',
]
least_popular['Genre'] = Genre

In [None]:
# Keep only the columns used in the analysis, with the song name and genre first.
# Plain column selection is used instead of `reindex` so that a misspelled
# column name raises a KeyError rather than silently producing an all-NaN
# column (previously 'spechiness' and the comma-less 'tempo' 'audio_valence'
# did exactly that, losing speechiness, tempo and audio_valence).
new_least_popular = least_popular[[
    'song_name', 'Genre', 'song_popularity', 'song_duration_ms',
    'acousticness', 'danceability', 'energy', 'key', 'liveness',
    'loudness', 'speechiness', 'tempo', 'audio_valence', 'danceable_valence',
]]

In [None]:
# Stack the most popular and the least popular songs into a single frame;
# ignore_index=True replaces the original row labels with a fresh 0..n-1 index.
conjoined_df = pd.concat(objs=[new_most_popular_2, new_least_popular], ignore_index=True)

## Method Chain building 

In [None]:
# Method chain reproducing the steps above for the most popular songs:
# rank by popularity, keep the first 30, drop the five excluded rows, add genres.
Genre = [
    'Reggae', 'Hip Hop', 'Pop', 'Pop', 'Hip Hop',
    'Pop', 'Hip Hop', 'Reggae', 'Reggae', 'R&B',
    'Pop', 'Pop', 'Pop', 'Rock', 'Pop',
    'Rock', 'Pop', 'Rock', 'R&B', 'Rock',
    'Rock', 'Metal', 'Rock', 'Rock', 'R&B',
]

new_most_popular = (
    songs_cleaned
    .sort_values(by="song_popularity", ascending=False)
    .head(30)
    .drop(index=[353, 126, 155, 173, 364])
    .assign(Genre=Genre)
)


In [None]:
# Method chain reproducing the steps above for the least popular songs:
# rank by popularity (lowest first), keep the first 30, add genres.
Genre = [
    'Latin Pop', 'Son cubano', 'Bolero', 'Hip Hop', 'Rock',
    'R&B', 'Rock', 'Rock', 'Rock', 'Rock',
    'Rock', 'Rock', 'Rock', 'Rock', 'Rock',
    'Salsa', 'Rock', 'Pop', 'R&B', 'Rock',
    'Pop', 'R&B', 'Rock', 'Rock', 'Country',
    'R&B', 'Pop', 'Rock', 'Rock', 'Hip Hop',
]

new_least_popular = (
    songs_cleaned
    .sort_values(by="song_popularity", ascending=True)
    .head(30)
    .assign(Genre=Genre)
)


## Wrapping Method Chain into function

In [None]:
# Wraps the method chains for both the most and the least popular songs into one function.
def process_songs(songs_cleaned):
    """Build one table of the most and least popular songs, labelled with genres.

    The frame is ranked by ``song_popularity`` twice: in descending order,
    keeping the first 30 rows and then dropping five hard-coded index labels,
    and in ascending order, keeping the first 30 rows. Each selection receives
    a ``Genre`` column from a hand-compiled list, so the lists must line up
    row-for-row with the ranked selections (25 and 30 entries respectively).

    Parameters
    ----------
    songs_cleaned : pandas.DataFrame
        Must contain a ``song_popularity`` column, and the index labels
        353, 126, 155, 173 and 364 must be among its 30 most popular rows
        (dropping a missing label raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        The most popular rows followed by the least popular rows, in the
        same column order, with a fresh 0..n-1 index.
    """
    top_genres = [
        'Reggae', 'Hip Hop', 'Pop', 'Pop', 'Hip Hop',
        'Pop', 'Hip Hop', 'Reggae', 'Reggae', 'R&B',
        'Pop', 'Pop', 'Pop', 'Rock', 'Pop',
        'Rock', 'Pop', 'Rock', 'R&B', 'Rock',
        'Rock', 'Metal', 'Rock', 'Rock', 'R&B',
    ]
    bottom_genres = [
        'Latin Pop', 'Son cubano', 'Bolero', 'Hip Hop', 'Rock',
        'R&B', 'Rock', 'Rock', 'Rock', 'Rock',
        'Rock', 'Rock', 'Rock', 'Rock', 'Rock',
        'Salsa', 'Rock', 'Pop', 'R&B', 'Rock',
        'Pop', 'R&B', 'Rock', 'Rock', 'Country',
        'R&B', 'Pop', 'Rock', 'Rock', 'Hip Hop',
    ]
    # Rows removed from the top selection before the genres are attached.
    excluded_labels = [353, 126, 155, 173, 364]

    ranked_desc = songs_cleaned.sort_values(by="song_popularity", ascending=False)
    top_songs = ranked_desc.head(30).drop(index=excluded_labels).assign(Genre=top_genres)

    ranked_asc = songs_cleaned.sort_values(by="song_popularity", ascending=True)
    bottom_songs = ranked_asc.head(30).assign(Genre=bottom_genres)

    # Put the top songs in the bottom songs' column order before stacking.
    frames = [top_songs[bottom_songs.columns], bottom_songs]
    return pd.concat(frames, ignore_index=True)
# Run the wrapped pipeline on the cleaned song data and keep the combined table.
processed_songs = process_songs(songs_cleaned)