# Data Exploration and Cleaning

###### 1. Import Libraries and dependencies 

In [176]:
# Importing the Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import datetime as dt
from datetime import date

###### 2. Load data into the Dataframe from Excel /CSV files 

In [177]:
#Reading data from CSV and creating a Pandas Data Frame
spotify_data_1 ="data/processed_data_1.csv" 
df1 = pd.read_csv(spotify_data_1)
spotify_data_2 ="data/processed_data_2.csv" 
df2 = pd.read_csv(spotify_data_2)
spotify_data_3 ="data/processed_data_3.csv" 
df3 = pd.read_csv(spotify_data_3)
spotify_data_4 ="data/processed_data_4.csv" 
df4 = pd.read_csv(spotify_data_4)
spotify_df= pd.concat([df1,df2,df3,df4], ignore_index=True)

###### 3. Preliminary Data Examination

In [178]:
#Checking the data in the data frame and its structure
spotify_df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,Stream count
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,36892097
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,26063851
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,70546338
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,11862340
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,63578139


In [179]:
#Checking for the datatypes of the values in the df
spotify_df.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_id                  object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
Stream count                 object
dtype: object

In [180]:
#check for missing values
print(spotify_df.isnull().sum())

track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
Stream count                0
dtype: int64


###### 4. Data Cleaning

In [181]:
#Rename Columns
spotify_df.rename(columns= {'Stream count' : 'stream_count'},inplace=True)            

In [182]:
# Converting Stream count into float 
spotify_df['stream_count']=spotify_df['stream_count'].str.replace(',','')

In [183]:
spotify_df['stream_count']=spotify_df['stream_count'].astype(float)

In [184]:
spotify_df.shape

(32829, 24)

In [185]:
spotify_df.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_id                  object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
stream_count                float64
dtype: object

In [186]:
#drop null values
spotify_new= spotify_df.dropna()

In [187]:
# Return number of rows x columns
spotify_new.shape

(32824, 24)

In [188]:
#converting duration from ms to seconds
spotify_new['duration_sec']= spotify_new['duration_ms'].apply(lambda x: x/1000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [189]:
spotify_new.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,stream_count,duration_sec
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,36892097.0,194.754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,26063851.0,162.6
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,70546338.0,176.616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,11862340.0,169.093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,63578139.0,189.052


In [190]:
# drop unnecessary columns
spotify_new = spotify_new.drop(['duration_ms'],axis=1)
spotify_new.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,stream_count,duration_sec
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,36892097.0,194.754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,26063851.0,162.6
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,70546338.0,176.616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,11862340.0,169.093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,63578139.0,189.052


In [193]:
from datetime import datetime

In [196]:
# Get today's date
today = date.today()

# Calculate months elapsed (consider year difference too)
spotify_new['year_val'] = pd.to_datetime(spotify_new['track_album_release_date'], format = '%Y-%m-%d', errors = 'coerce')
spotify_new['year_val'] = spotify_new['year_val'].combine_first(pd.to_datetime(spotify_new['track_album_release_date'], format = '%m/%d/%Y', errors = 'coerce'))
spotify_new['year_val'] = spotify_new['year_val'].combine_first(pd.to_datetime(spotify_new['track_album_release_date'], format = '%m/%d/%y', errors = 'coerce')).dt.year
spotify_new['months'] = (today.year - spotify_new['year_val']) * 12 + \
                             (today.month - pd.to_datetime(spotify_new['track_album_release_date']).dt.month)



spotify_new.drop(columns=['year_val'], inplace = True)

spotify_new = spotify_new.drop(spotify_new[spotify_new['months']<0].index)

spotify_new.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,stream_count,duration_sec,months
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0583,0.102,0.0,0.0653,0.518,122.036,36892097.0,194.754,57
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,26063851.0,162.6,51
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,70546338.0,176.616,56
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.102,0.0287,9e-06,0.204,0.277,121.956,11862340.0,169.093,56
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,63578139.0,189.052,60


In [197]:
# drop columns not needed
#dropping track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id
spotify_new = spotify_new.drop(['track_album_id','track_album_name','playlist_name','playlist_id'], axis=1)

In [198]:
spotify_new.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_release_date,playlist_genre,playlist_subgenre,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,stream_count,duration_sec,months
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2019-06-14,pop,dance pop,0.748,0.916,6,...,1,0.0583,0.102,0.0,0.0653,0.518,122.036,36892097.0,194.754,57
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,2019-12-13,pop,dance pop,0.726,0.815,11,...,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,26063851.0,162.6,51
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,2019-07-05,pop,dance pop,0.675,0.931,1,...,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,70546338.0,176.616,56
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,2019-07-19,pop,dance pop,0.718,0.93,7,...,1,0.102,0.0287,9e-06,0.204,0.277,121.956,11862340.0,169.093,56
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,2019-03-05,pop,dance pop,0.65,0.833,1,...,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,63578139.0,189.052,60


In [199]:
spotify_new.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_release_date     object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
stream_count                float64
duration_sec                float64
months                        int64
dtype: object

In [201]:
spotify_final = spotify_new.drop(['track_album_release_date'], axis=1)

In [202]:
spotify_final.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,stream_count,duration_sec,months
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,36892097.0,194.754,57
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,26063851.0,162.6,51
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,70546338.0,176.616,56
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,11862340.0,169.093,56
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,63578139.0,189.052,60


In [203]:
spotify_final.shape

(32746, 20)

In [204]:
#Save cleaned data to csv
spotify_final.to_csv('data/processed_data_cleaned_12March.csv',index=False)