In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns


In [None]:
# Load the raw song dataset from the zipped CSV (relative path into the project's data/raw folder).
songdata = pd.read_csv("../../data/raw/song_data.csv.zip")

### Cleaning Data 


In [None]:
# Keep only the rows labelled 1000 through 1510 as the working sample.
# `.loc` slices by label and includes BOTH endpoints, so this selects 511 rows
# (not 500), and the song titles in this range are not guaranteed to be unique;
# duplicates are handled in a later cleaning step.
songdata= songdata.loc [1000:1510]
songdata

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
1000,You Get What You Give,75,300773,0.17100,0.618,0.896,0.000000,2,0.0875,-5.357,1,0.0307,113.961,4,0.711
1001,Lump,70,134200,0.00468,0.499,0.873,0.000000,6,0.1650,-3.902,1,0.0386,142.726,4,0.864
1002,Someday - Remastered,63,243333,0.01080,0.743,0.670,0.070200,1,0.2200,-4.547,0,0.0293,110.832,4,0.934
1003,All I Want,60,196333,0.05580,0.600,0.811,0.000007,5,0.1090,-7.693,1,0.0788,124.303,4,0.622
1004,Flagpole Sitta,64,217266,0.00184,0.351,0.899,0.000116,7,0.2840,-4.119,1,0.0432,145.015,4,0.508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1506,little light - acoustic version,65,256415,0.68500,0.670,0.294,0.000000,7,0.1470,-8.114,1,0.0341,119.856,4,0.515
1507,I Will Spend My Whole Life Loving You,68,227619,0.88700,0.637,0.129,0.000000,11,0.0719,-17.082,1,0.0346,125.802,4,0.298
1508,Best Days - Acoustic,49,179063,0.91900,0.655,0.127,0.000000,5,0.1050,-11.036,1,0.0467,150.407,4,0.436
1509,Love on the Weekend,68,212626,0.54700,0.729,0.546,0.002870,7,0.0651,-9.305,1,0.0285,119.941,4,0.406


In [None]:
# audio_mode and time_signature are redundant for this analysis, so they are left out.
songdata_cleaned = songdata.drop(columns=['audio_mode', 'time_signature'])

In [None]:
# Remove rows that contain missing values. This builds on `songdata_cleaned`
# (not the raw `songdata` slice) so that the removal of the audio_mode and
# time_signature columns is preserved instead of being silently undone.
songdata_cleaned = songdata_cleaned.dropna(axis=0)
songdata_cleaned.shape

(511, 15)

In [None]:
# Column labels of `songdata` (the sliced frame). The cleaning steps assign their
# results to `songdata_cleaned`, so no column has been removed from this frame.
songdata.columns

Index(['song_name', 'song_popularity', 'song_duration_ms', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'audio_mode', 'speechiness', 'tempo', 'time_signature',
       'audio_valence'],
      dtype='object')

In [None]:
# Keep a single row per song title so that every song analysed is unique
# (the first occurrence of each title is kept). This builds on
# `songdata_cleaned` so the earlier cleaning steps are not discarded.
songdata_cleaned = songdata_cleaned.drop_duplicates(subset='song_name')

In [None]:
# Count the distinct values in every column; comparing these counts with the
# number of rows reveals which columns still contain repeated values.
songdata_cleaned.nunique()

song_name           492
song_popularity      89
song_duration_ms    494
acousticness        445
danceability        329
energy              377
instrumentalness    264
key                  12
liveness            302
loudness            486
audio_mode            2
speechiness         323
tempo               500
time_signature        4
audio_valence       371
dtype: int64

## Processing Data

In [None]:
# Rank the songs from most to least popular. reset_index() renumbers the rows
# from 0 and keeps the previous row labels in a new 'index' column.
songdata_sorted = songdata_cleaned.sort_values(by='song_popularity', ascending=False).reset_index()
songdata_sorted

Unnamed: 0,index,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,1229,I Love It (& Lil Pump),99,127946,0.01140,0.901,0.522,0.000000,2,0.2590,-8.304,1,0.3300,104.053,4,0.329
1,1230,Falling Down,97,196400,0.01750,0.669,0.574,0.002940,4,0.1460,-6.442,0,0.0286,120.013,4,0.273
2,1250,SICKO MODE,97,312820,0.00513,0.834,0.730,0.000000,8,0.1240,-3.714,1,0.2220,155.008,4,0.446
3,1251,Lucky You (feat. Joyner Lucas),96,244679,0.06270,0.876,0.786,0.000000,10,0.1270,-4.884,0,0.3060,153.068,4,0.575
4,1258,FEFE (feat. Nicki Minaj & Murda Beatz),96,179404,0.08800,0.931,0.387,0.000000,1,0.1360,-9.127,1,0.4120,125.978,4,0.376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,1330,Insoportablemente Bella,3,219293,0.30900,0.315,0.370,0.000050,9,0.0642,-10.912,0,0.0307,66.547,4,0.256
488,1273,Payaso,2,196226,0.76400,0.765,0.498,0.000014,9,0.1390,-7.293,1,0.0280,92.024,4,0.510
489,1272,No Me Platiques Mas,1,192967,0.69500,0.245,0.355,0.000000,8,0.1250,-13.039,1,0.0427,85.560,4,0.398
490,1233,Transformer (feat. Nicki Minaj),0,196333,0.00151,0.753,0.616,0.000007,2,0.2910,-7.340,1,0.1650,156.830,4,0.287


In [None]:
def multiply_columns(row):
    """Return the product of the 'energy' and 'loudness' entries of ``row``.

    ``row`` can be anything that supports label lookup with ``[]``.
    """
    energy = row['energy']
    loudness = row['loudness']
    return energy * loudness
# New column: each song's energy multiplied by its loudness. The helper only
# does label lookups and a multiplication, so handing it the whole frame
# computes the column in one vectorised step instead of one Python call per row.
songdata_sorted['Loud_Enrgy_Prod'] = multiply_columns(songdata_sorted)
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
songdata_sorted1 = songdata_sorted


In [None]:
def multiply_columns(row):
    """Return the product of the 'audio_valence' and 'danceability' entries of ``row``.

    ``row`` can be anything that supports label lookup with ``[]``.
    """
    valence = row['audio_valence']
    danceability = row['danceability']
    return valence * danceability
# New column: each song's valence multiplied by its danceability. The helper
# only does label lookups and a multiplication, so handing it the whole frame
# computes the column in one vectorised step instead of one Python call per row.
songdata_sorted['Vale_Dance_Prod'] = multiply_columns(songdata_sorted)
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
songdata_sorted2 = songdata_sorted


In this processing step, I have ranked my data by song popularity in descending order and created a new variable, Loud_Enrgy_Prod, that is the product of energy and loudness. I chose to combine these two variables because I found in my initial analysis that loudness and energy had the strongest correlation.

Because valence is a measure of the positive feelings conveyed by a track, and my understanding of danceability is that it is influenced by one's feelings toward a song, I thought it natural to combine the two metrics. This is supported by the correlation data provided above, in which valence and danceability have a moderately strong correlation coefficient of 0.36. In doing this, I am then able to conduct some further exploration of this new "aggregate" metric, Vale_Dance_Prod, with respect to song popularity.

# Wrangling 

In [None]:
# The 50 most popular songs, taken as an independent copy so that adding a
# column to it later does not raise pandas' SettingWithCopyWarning.
songdata_top = songdata_sorted.head(50).copy()

In [None]:
# Hand-assigned genre labels, matched to rows by position: the i-th label goes to
# the i-th row of songdata_top, so the list must have one entry per row and stay
# in the same order as that frame.
songdata_top ['genre'] = ['Trap', 'Pop Rock', 'Trap', 'Hip Hop', 'Hip Hop', 'Pop', 'Pop Rock', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Trap', 'Pop', 'Trap', 'Hip Hop', 'Trap', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Indie', 'Pop Rock', 'Pop', 'Hip Hop', 'Pop', 'Electronic', 'Acoustic Pop', 'Pop', 'Hip Hop', 'Alternative Rock', 'Pop', 'Hip Hop', 'Alternative Rock', 'Rock', 'Hip Hop', 'Pop Rock', 'Pop', 'Pop', 'Hip Hop', 'Trap', 'Pop Rock', 'Grunge', 'Hip Hop', 'Hip Hop', 'Pop', 'Alternative Rock', 'Pop Rock']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songdata_top ['genre'] = ['Trap', 'Pop Rock', 'Trap', 'Hip Hop', 'Hip Hop', 'Pop', 'Pop Rock', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Trap', 'Pop', 'Trap', 'Hip Hop', 'Trap', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Indie', 'Pop Rock', 'Pop', 'Hip Hop', 'Pop', 'Electronic', 'Acoustic Pop', 'Pop', 'Hip Hop', 'Alternative Rock', 'Pop', 'Hip Hop', 'Alternative Rock', 'Rock', 'Hip Hop', 'Pop Rock', 'Pop', 'Pop', 'Hip Hop', 'Trap', 'Pop Rock', 'Grunge', 'Hip Hop', 'Hip Hop', 'Pop', 'Alternative Rock', 'Pop Rock']


In [None]:
# Put song_name and genre first and keep only the columns used in the analysis;
# any column that is not listed here is dropped.
songdata_top = songdata_top.reindex(
    columns=[
        'song_name', 'genre', 'song_popularity', 'song_duration_ms',
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'key', 'liveness', 'loudness', 'speechiness',
        'tempo', 'audio_valence', 'Loud_Enrgy_Prod', 'Vale_Dance_Prod',
    ]
)

In [None]:
# The 50 least popular songs (the tail of the descending ranking), re-sorted so
# that the least popular song comes first.
songdata_bottom = songdata_sorted.tail(50).sort_values(by=['song_popularity'], ascending = True)

In [None]:
# Hand-assigned genre labels, matched to rows by position: the i-th label goes to
# the i-th row of songdata_bottom, so the list must have one entry per row and
# stay in the same order as that frame.
songdata_bottom ['genre'] = ['Alternative Rock', 'Hip Hop', 'Bolero', 'Latin Pop', 'Latin Pop', 'Alternative Rock', 'Bolero', 'Alternative Rock', 'Latin Pop', 'Bolero', 'Alternative Rock', 'Bolero', 'Bolero', 'Rock', 'Alternative Rock', 'Alternative Rock', 'Bolero', 'Norteño', 'Alternative Rock', 'Rock', 'Alternative Rock', 'Latin Pop', 'Rock', 'Alternative Rock', 'Pop', 'Electronic', 'Hip Hop', 'Latin Pop', 'Bachata', 'Alternative Rock', 'Pop', 'Pop', 'Norteño', 'Grunge', 'Acoustic Pop', 'Gospel', 'Bolero', 'Pop', 'Indie', 'Alternative Rock', 'Alternative Rock', 'Reggae', 'Pop', 'Indie', 'Pop', 'R&B', 'Pop Rock', 'Indie', 'Pop', 'Pop']

In [None]:
# Put song_name and genre first and keep only the columns used in the analysis;
# any column that is not listed here is dropped.
songdata_bottom = songdata_bottom.reindex(
    columns=[
        'song_name', 'genre', 'song_popularity', 'song_duration_ms',
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'key', 'liveness', 'loudness', 'speechiness',
        'tempo', 'audio_valence', 'Loud_Enrgy_Prod', 'Vale_Dance_Prod',
    ]
)


In [None]:
# Stack the 50 most popular songs on top of the 50 least popular songs of the
# sample; every row keeps the index label it already had.
concatenated_df = pd.concat([songdata_top, songdata_bottom])

In this wrangling step, I have created method chains in which I separate the 50 most popular songs and the 50 least popular songs, add a new column that classifies the songs by music genre, and reindex them. Lastly, I create a new dataframe (concatenated_df) that combines the 50 most popular songs and the 50 least popular songs.

# Method Chains 

In [None]:
# The whole cleaning/processing pipeline as one chain:
# load -> keep rows 1000..1510 -> drop unused columns -> drop incomplete rows ->
# de-duplicate song titles -> rank by popularity -> add the two product columns.
songdata_cleaned = (
    pd.read_csv("../../data/raw/song_data.csv.zip")
    .loc[1000:1510]
    .drop(columns=['audio_mode', 'time_signature'])
    .dropna()
    .drop_duplicates(subset='song_name')
    .sort_values('song_popularity', ascending=False)
    .reset_index()
    .assign(
        Loud_Enrgy_Prod=lambda df: df['energy'] * df['loudness'],
        Vale_Dance_Prod=lambda df: df['audio_valence'] * df['danceability'],
    )
)

In [None]:
# DataFrame.append is deprecated and was removed in pandas 2.0, so the two halves
# are combined with pd.concat, which produces the same frame (row labels kept).
concatenated_df = pd.concat(
    [
        # 50 most popular songs with their hand-assigned (positional) genre labels.
        songdata_sorted
        .head(50)
        .assign(genre=['Trap', 'Pop Rock', 'Trap', 'Hip Hop', 'Hip Hop', 'Pop', 'Pop Rock', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Trap', 'Pop', 'Trap', 'Hip Hop', 'Trap', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Indie', 'Pop Rock', 'Pop', 'Hip Hop', 'Pop', 'Electronic', 'Acoustic Pop', 'Pop', 'Hip Hop', 'Alternative Rock', 'Pop', 'Hip Hop', 'Alternative Rock', 'Rock', 'Hip Hop', 'Pop Rock', 'Pop', 'Pop', 'Hip Hop', 'Trap', 'Pop Rock', 'Grunge', 'Hip Hop', 'Hip Hop', 'Pop', 'Alternative Rock', 'Pop Rock'])
        .reindex(columns=['song_name', 'genre', 'song_popularity', 'song_duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo', 'audio_valence', 'Loud_Enrgy_Prod', 'Vale_Dance_Prod']),
        # 50 least popular songs, least popular first, with their genre labels.
        songdata_sorted
        .tail(50)
        .sort_values(by=['song_popularity'], ascending=True)
        .assign(genre=['Alternative Rock', 'Hip Hop', 'Bolero', 'Latin Pop', 'Latin Pop', 'Alternative Rock', 'Bolero', 'Alternative Rock', 'Latin Pop', 'Bolero', 'Alternative Rock', 'Bolero', 'Bolero', 'Rock', 'Alternative Rock', 'Alternative Rock', 'Bolero', 'Norteño', 'Alternative Rock', 'Rock', 'Alternative Rock', 'Latin Pop', 'Rock', 'Alternative Rock', 'Pop', 'Electronic', 'Hip Hop', 'Latin Pop', 'Bachata', 'Alternative Rock', 'Pop', 'Pop', 'Norteño', 'Grunge', 'Acoustic Pop', 'Gospel', 'Bolero', 'Pop', 'Indie', 'Alternative Rock', 'Alternative Rock', 'Reggae', 'Pop', 'Indie', 'Pop', 'R&B', 'Pop Rock', 'Indie', 'Pop', 'Pop'])
        .reindex(columns=['song_name', 'genre', 'song_popularity', 'song_duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo', 'audio_valence', 'Loud_Enrgy_Prod', 'Vale_Dance_Prod']),
    ]
)


  .append(


# Wrapping the 2 method chains into a function 

In [None]:
def clean_and_concat(songdata):
    """Build the top-50 / bottom-50 popularity table with hand-assigned genres.

    Reads the raw song CSV, keeps the rows labelled 1000 through 1510, drops the
    ``audio_mode`` and ``time_signature`` columns, removes rows with missing
    values and duplicate song names, ranks the songs by popularity (descending)
    and adds the ``Loud_Enrgy_Prod`` (energy * loudness) and ``Vale_Dance_Prod``
    (audio_valence * danceability) columns. The 50 most popular and the 50 least
    popular songs are then given genre labels and stacked into one frame.

    Parameters
    ----------
    songdata
        Currently unused: the data is always re-read from
        "../../data/raw/song_data.csv.zip". The parameter is kept so existing
        calls keep working.
        NOTE(review): presumably meant to be the input frame or file path - confirm.

    Returns
    -------
    pandas.DataFrame
        100 rows: the 50 most popular songs (most popular first) followed by the
        50 least popular songs (least popular first), with ``song_name`` and
        ``genre`` as the leading columns. Rows keep the index labels they had in
        the ranked frame.

    Notes
    -----
    The genre lists are positional and hold exactly 50 labels each, so the
    assignment fails if fewer than 50 rows survive cleaning.
    """
    column_order = [
        'song_name', 'genre', 'song_popularity', 'song_duration_ms',
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'key', 'liveness', 'loudness', 'speechiness',
        'tempo', 'audio_valence', 'Loud_Enrgy_Prod', 'Vale_Dance_Prod',
    ]
    # Hand-assigned labels; the i-th label belongs to the i-th row of each half.
    top_genres = ['Trap', 'Pop Rock', 'Trap', 'Hip Hop', 'Hip Hop', 'Pop', 'Pop Rock', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Hip Hop', 'Trap', 'Pop', 'Trap', 'Hip Hop', 'Trap', 'Trap', 'Hip Hop', 'Trap', 'Hip Hop', 'Indie', 'Pop Rock', 'Pop', 'Hip Hop', 'Pop', 'Electronic', 'Acoustic Pop', 'Pop', 'Hip Hop', 'Alternative Rock', 'Pop', 'Hip Hop', 'Alternative Rock', 'Rock', 'Hip Hop', 'Pop Rock', 'Pop', 'Pop', 'Hip Hop', 'Trap', 'Pop Rock', 'Grunge', 'Hip Hop', 'Hip Hop', 'Pop', 'Alternative Rock', 'Pop Rock']
    bottom_genres = ['Alternative Rock', 'Hip Hop', 'Bolero', 'Latin Pop', 'Latin Pop', 'Alternative Rock', 'Bolero', 'Alternative Rock', 'Latin Pop', 'Bolero', 'Alternative Rock', 'Bolero', 'Bolero', 'Rock', 'Alternative Rock', 'Alternative Rock', 'Bolero', 'Norteño', 'Alternative Rock', 'Rock', 'Alternative Rock', 'Latin Pop', 'Rock', 'Alternative Rock', 'Pop', 'Electronic', 'Hip Hop', 'Latin Pop', 'Bachata', 'Alternative Rock', 'Pop', 'Pop', 'Norteño', 'Grunge', 'Acoustic Pop', 'Gospel', 'Bolero', 'Pop', 'Indie', 'Alternative Rock', 'Alternative Rock', 'Reggae', 'Pop', 'Indie', 'Pop', 'R&B', 'Pop Rock', 'Indie', 'Pop', 'Pop']

    songdata_cleaned = (
        pd.read_csv("../../data/raw/song_data.csv.zip")
        .loc[1000:1510]
        .drop(['audio_mode', 'time_signature'], axis=1)
        .dropna(axis=0)
        .drop_duplicates(subset='song_name')
        .sort_values(by='song_popularity', ascending=False)
        .reset_index()
        .assign(Loud_Enrgy_Prod=lambda x: x['energy'] * x['loudness'])
        .assign(Vale_Dance_Prod=lambda x: x['audio_valence'] * x['danceability'])
    )

    top_50 = (
        songdata_cleaned
        .head(50)
        .assign(genre=top_genres)
        .reindex(columns=column_order)
    )
    bottom_50 = (
        songdata_cleaned
        .tail(50)
        .sort_values(by=['song_popularity'], ascending=True)
        .assign(genre=bottom_genres)
        .reindex(columns=column_order)
    )

    # DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
    # stacks the two halves the same way and keeps their row labels.
    concatenated_df = pd.concat([top_50, bottom_50])
    return concatenated_df
