In [1]:
import pandas as pd

# NOTE(review): `datetime` appears unused in this notebook - the date column is parsed with pd.to_datetime below; confirm before removing
from datetime import datetime

### Load 'uk_top_singles_chart.csv' to Pandas dataframe

In [5]:
# Read the UK Top 40 singles chart export into a DataFrame.
top_singles_csv = '../datasets/uk_top_singles_chart.csv'
df_top_singles = pd.read_csv(top_singles_csv)
# Bare last expression -> rich display of the loaded frame.
df_top_singles

Unnamed: 0,date,position,title,artist,label,isrc
0,27 April 2018,1,ONE KISS,CALVIN HARRIS & DUA LIPA,COLUMBIA/WARNER BROS,GBARL1800368
1,27 April 2018,2,NO TEARS LEFT TO CRY,ARIANA GRANDE,REPUBLIC RECORDS,USUM71805289
2,27 April 2018,3,NICE FOR WHAT,DRAKE,CASH MONEY/REPUBLIC RECORDS,USCM51800077
3,27 April 2018,4,FREAKY FRIDAY,LIL DICKY FT CHRIS BROWN,BMG/COMMISSION,QMRSZ1800283
4,27 April 2018,5,PARADISE,GEORGE EZRA,COLUMBIA,GBARL1701373
...,...,...,...,...,...,...
10435,21 April 2023,36,SURE THING,MIGUEL,JIVE,USJI10800160
10436,21 April 2023,37,CEILINGS,LIZZY MCALPINE,HARBOUR ARTISTS & MUSIC,GBKPL2280163
10437,21 April 2023,38,HERE,TOM GRENNAN,INSANITY,GBARL2202424
10438,21 April 2023,39,SNOOZE,SZA,RCA/TOP DAWG,USRC12204591


The dataset includes records of the Top 40 UK singles from 27th April 2018 to 21st April 2023. There are 10440 rows and 6 columns in this dataframe. We are not looking into record companies, so the 'label' column is not needed and can be removed.

In [6]:
# Record labels are outside the scope of this analysis: keep every column
# except 'label' and rebind the name to the trimmed frame.
df_top_singles = df_top_singles.drop(columns=['label'])
df_top_singles

Unnamed: 0,date,position,title,artist,isrc
0,27 April 2018,1,ONE KISS,CALVIN HARRIS & DUA LIPA,GBARL1800368
1,27 April 2018,2,NO TEARS LEFT TO CRY,ARIANA GRANDE,USUM71805289
2,27 April 2018,3,NICE FOR WHAT,DRAKE,USCM51800077
3,27 April 2018,4,FREAKY FRIDAY,LIL DICKY FT CHRIS BROWN,QMRSZ1800283
4,27 April 2018,5,PARADISE,GEORGE EZRA,GBARL1701373
...,...,...,...,...,...
10435,21 April 2023,36,SURE THING,MIGUEL,USJI10800160
10436,21 April 2023,37,CEILINGS,LIZZY MCALPINE,GBKPL2280163
10437,21 April 2023,38,HERE,TOM GRENNAN,GBARL2202424
10438,21 April 2023,39,SNOOZE,SZA,USRC12204591


Look at the datatypes of each column

In [7]:
# Print each column's dtype and non-null count for the chart dataframe.
df_top_singles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10440 entries, 0 to 10439
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      10440 non-null  object
 1   position  10440 non-null  int64 
 2   title     10440 non-null  object
 3   artist    10440 non-null  object
 4   isrc      10440 non-null  object
dtypes: int64(1), object(4)
memory usage: 407.9+ KB


The 'date' column has dtype object (plain strings). We need to convert it to a datetime type.

In [8]:
# Dates arrive as strings such as '27 April 2018' (day, full month name, year);
# parse them with an explicit format and store the result back in 'date'.
raw_dates = df_top_singles['date']
df_top_singles['date'] = pd.to_datetime(raw_dates, format='%d %B %Y')
# Display the converted column to confirm the new dtype.
df_top_singles['date']

0       2018-04-27
1       2018-04-27
2       2018-04-27
3       2018-04-27
4       2018-04-27
           ...    
10435   2023-04-21
10436   2023-04-21
10437   2023-04-21
10438   2023-04-21
10439   2023-04-21
Name: date, Length: 10440, dtype: datetime64[ns]

Check if there are any missing values in the dataframe

In [9]:
# Count missing (NaN/NaT/None) entries per column.
df_top_singles.isna().sum()

date        0
position    0
title       0
artist      0
isrc        0
dtype: int64

Great! There are no missing values. 

### Load 'track_audio_features_plus_mood_analysis.csv' to Pandas dataframe

In [22]:
# Read the per-track audio features and mood labels into a DataFrame.
moods_csv = '../datasets/track_audio_features_plus_mood_analysis.csv'
df_moods = pd.read_csv(moods_csv)
# Bare last expression -> rich display of the loaded frame.
df_moods

Unnamed: 0.1,Unnamed: 0,isrc,track_name,track_spotify_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,mood
0,0,GBARL1900429,FIND U AGAIN,1HpzOCZbNWzxvvXfSGtSrX,0.005480,0.605,176417,0.664,0.000003,8,0.2040,-7.162,1,0.0316,103.997,4,0.164,agitated
1,1,GB2DY2000067,RAIN,3WgEwAp8HIMNCqkA3gpRqb,0.026600,0.870,183066,0.870,0.000002,9,0.1090,-4.797,1,0.2020,150.145,4,0.830,happy
2,2,GBARL1900043,BALLY,1AvLAAXMtq7IGi48x9TrD7,0.236000,0.818,165189,0.692,0.000000,5,0.0784,-3.887,0,0.2770,106.396,4,0.919,happy
3,3,USUG12207230,CALIFORNIA BREEZE,6ug9fUi5oLLgQgOF1G8WkM,0.003610,0.738,177789,0.666,0.000000,5,0.1070,-6.970,0,0.4550,161.734,4,0.215,agitated
4,4,USRC12203886,NEVER GONNA NOT DANCE AGAIN,6sZo5nJIsFWXefRCCexpx0,0.027300,0.668,225789,0.802,0.000000,5,0.1630,-4.005,0,0.0917,113.796,4,0.769,happy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1265,1265,GBUM72107771,FLOWERS (SAY MY NAME),4UpONN1cC0MHoJUuIGOXG4,0.489000,0.700,158601,0.738,0.000000,5,0.5300,-6.345,1,0.2520,142.937,4,0.771,happy
1266,1266,GBUM72201012,COME & GO,2Y0wPrPQBrGhoLn14xRYCG,0.017200,0.625,205485,0.814,0.000000,0,0.1580,-5.181,1,0.0657,144.991,4,0.535,happy
1267,1267,GBK3W1800737,GIVE YOURSELF A TRY,4rmIfFUZhhi9sS5IYtpkXw,0.000032,0.334,196589,0.786,0.000000,11,0.5170,-5.002,1,0.0592,182.933,4,0.901,happy
1268,1268,GBARL1900666,STRIKE A POSE,23GvTfcGK454ppLsts3W44,0.010700,0.531,214204,0.581,0.000016,1,0.1010,-5.801,1,0.1010,137.776,5,0.591,happy


Notice that the first column is unnamed and not needed, so we delete it.

In [23]:
# Remove the leftover CSV index column by NAME rather than by position.
# Dropping `df_moods.columns[0]` in place is not idempotent: running the cell
# a second time would silently delete 'isrc', the key used for the merge below.
# pandas labels header-less CSV columns 'Unnamed: <n>', so select on that
# prefix; re-running this cell is then a harmless no-op.
# NOTE(review): this drops every 'Unnamed...' column; the displayed frame shows
# only one ('Unnamed: 0') - confirm if the CSV ever gains more.
unnamed_cols = [col for col in df_moods.columns if str(col).startswith('Unnamed')]
df_moods = df_moods.drop(columns=unnamed_cols)
df_moods

Unnamed: 0,isrc,track_name,track_spotify_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,mood
0,GBARL1900429,FIND U AGAIN,1HpzOCZbNWzxvvXfSGtSrX,0.005480,0.605,176417,0.664,0.000003,8,0.2040,-7.162,1,0.0316,103.997,4,0.164,agitated
1,GB2DY2000067,RAIN,3WgEwAp8HIMNCqkA3gpRqb,0.026600,0.870,183066,0.870,0.000002,9,0.1090,-4.797,1,0.2020,150.145,4,0.830,happy
2,GBARL1900043,BALLY,1AvLAAXMtq7IGi48x9TrD7,0.236000,0.818,165189,0.692,0.000000,5,0.0784,-3.887,0,0.2770,106.396,4,0.919,happy
3,USUG12207230,CALIFORNIA BREEZE,6ug9fUi5oLLgQgOF1G8WkM,0.003610,0.738,177789,0.666,0.000000,5,0.1070,-6.970,0,0.4550,161.734,4,0.215,agitated
4,USRC12203886,NEVER GONNA NOT DANCE AGAIN,6sZo5nJIsFWXefRCCexpx0,0.027300,0.668,225789,0.802,0.000000,5,0.1630,-4.005,0,0.0917,113.796,4,0.769,happy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1265,GBUM72107771,FLOWERS (SAY MY NAME),4UpONN1cC0MHoJUuIGOXG4,0.489000,0.700,158601,0.738,0.000000,5,0.5300,-6.345,1,0.2520,142.937,4,0.771,happy
1266,GBUM72201012,COME & GO,2Y0wPrPQBrGhoLn14xRYCG,0.017200,0.625,205485,0.814,0.000000,0,0.1580,-5.181,1,0.0657,144.991,4,0.535,happy
1267,GBK3W1800737,GIVE YOURSELF A TRY,4rmIfFUZhhi9sS5IYtpkXw,0.000032,0.334,196589,0.786,0.000000,11,0.5170,-5.002,1,0.0592,182.933,4,0.901,happy
1268,GBARL1900666,STRIKE A POSE,23GvTfcGK454ppLsts3W44,0.010700,0.531,214204,0.581,0.000016,1,0.1010,-5.801,1,0.1010,137.776,5,0.591,happy


### Join df_moods dataframe with df_top_singles dataframe to create one dataset to contain information on dates of popular songs and their moods in one place.

In [25]:
# Left-join the mood/audio-feature columns onto the chart rows using the
# shared 'isrc' column as the key; every chart row is kept.
result = df_top_singles.merge(df_moods, on=['isrc'], how="left")
result

Unnamed: 0,date,position,title,artist,isrc,track_name,track_spotify_id,acousticness,danceability,duration_ms,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,mood
0,2018-04-27,1,ONE KISS,CALVIN HARRIS & DUA LIPA,GBARL1800368,ONE KISS,7ef4DlsgrMEH11cDZd32M6,0.03700,0.791,214847,...,0.000022,9,0.0814,-3.240,0,0.1100,123.994,4,0.592,happy
1,2018-04-27,2,NO TEARS LEFT TO CRY,ARIANA GRANDE,USUM71805289,NO TEARS LEFT TO CRY,2qT1uLXPVPzGgFOx4jtEuo,0.04000,0.699,205920,...,0.000003,9,0.2940,-5.507,0,0.0594,121.993,4,0.354,agitated
2,2018-04-27,3,NICE FOR WHAT,DRAKE,USCM51800077,NICE FOR WHAT,3CA9pLiwRIGtUBiMjbZmRw,0.08910,0.585,210747,...,0.000097,8,0.1190,-6.474,1,0.0707,93.372,4,0.758,happy
3,2018-04-27,4,FREAKY FRIDAY,LIL DICKY FT CHRIS BROWN,QMRSZ1800283,FREAKY FRIDAY,10Igtw8bSDyyFs7KIsKngZ,0.14700,0.755,216632,...,0.000000,8,0.1090,-5.042,1,0.2240,133.123,4,0.755,happy
4,2018-04-27,5,PARADISE,GEORGE EZRA,GBARL1701373,PARADISE,38zwkK6TtTjIW9tpYBfZ3D,0.20900,0.642,222045,...,0.000000,11,0.2100,-3.537,1,0.0393,138.030,4,0.913,happy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10435,2023-04-21,36,SURE THING,MIGUEL,USJI10800160,SURE THING,0JXXNGljqupsJaZsgSbMZV,0.02670,0.684,195373,...,0.000307,11,0.1910,-8.127,0,0.1000,81.001,4,0.498,agitated
10436,2023-04-21,37,CEILINGS,LIZZY MCALPINE,GBKPL2280163,CEILINGS,2L9N0zZnd37dwF0clgxMGI,0.47300,0.516,182888,...,0.001940,9,0.2150,-11.762,1,0.0292,148.005,3,0.261,sad
10437,2023-04-21,38,HERE,TOM GRENNAN,GBARL2202424,HERE,5yuGEEvvqMHvoLIxEFnaDT,0.07910,0.522,195145,...,0.000000,1,0.1940,-5.073,1,0.0462,120.469,4,0.343,agitated
10438,2023-04-21,39,SNOOZE,SZA,USRC12204591,SNOOZE,4iZ4pt7kvcaH6Yo8UoZ4s2,0.14100,0.559,201800,...,0.000000,5,0.1100,-7.231,1,0.1320,143.008,4,0.392,agitated


Notice that 'title' and 'track_name' are duplicate columns. We do not need 'track_spotify_id' either, so we remove both 'track_name' and 'track_spotify_id'.

In [26]:
# 'track_name' repeats 'title' and the Spotify id is not used, so keep
# everything except those two columns.
redundant_cols = ['track_name', 'track_spotify_id']
result = result.drop(columns=redundant_cols)
result

Unnamed: 0,date,position,title,artist,isrc,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,mood
0,2018-04-27,1,ONE KISS,CALVIN HARRIS & DUA LIPA,GBARL1800368,0.03700,0.791,214847,0.862,0.000022,9,0.0814,-3.240,0,0.1100,123.994,4,0.592,happy
1,2018-04-27,2,NO TEARS LEFT TO CRY,ARIANA GRANDE,USUM71805289,0.04000,0.699,205920,0.713,0.000003,9,0.2940,-5.507,0,0.0594,121.993,4,0.354,agitated
2,2018-04-27,3,NICE FOR WHAT,DRAKE,USCM51800077,0.08910,0.585,210747,0.909,0.000097,8,0.1190,-6.474,1,0.0707,93.372,4,0.758,happy
3,2018-04-27,4,FREAKY FRIDAY,LIL DICKY FT CHRIS BROWN,QMRSZ1800283,0.14700,0.755,216632,0.599,0.000000,8,0.1090,-5.042,1,0.2240,133.123,4,0.755,happy
4,2018-04-27,5,PARADISE,GEORGE EZRA,GBARL1701373,0.20900,0.642,222045,0.881,0.000000,11,0.2100,-3.537,1,0.0393,138.030,4,0.913,happy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10435,2023-04-21,36,SURE THING,MIGUEL,USJI10800160,0.02670,0.684,195373,0.607,0.000307,11,0.1910,-8.127,0,0.1000,81.001,4,0.498,agitated
10436,2023-04-21,37,CEILINGS,LIZZY MCALPINE,GBKPL2280163,0.47300,0.516,182888,0.322,0.001940,9,0.2150,-11.762,1,0.0292,148.005,3,0.261,sad
10437,2023-04-21,38,HERE,TOM GRENNAN,GBARL2202424,0.07910,0.522,195145,0.681,0.000000,1,0.1940,-5.073,1,0.0462,120.469,4,0.343,agitated
10438,2023-04-21,39,SNOOZE,SZA,USRC12204591,0.14100,0.559,201800,0.551,0.000000,5,0.1100,-7.231,1,0.1320,143.008,4,0.392,agitated


Check if there are any missing values or empty strings in this dataframe.

In [27]:
# Count missing entries per column in the merged frame; a non-zero count in
# the mood/feature columns would mean a chart track had no match in df_moods.
result.isna().sum()

date                0
position            0
title               0
artist              0
isrc                0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
mood                0
dtype: int64

In [28]:
# Count cells per column that hold an empty string (these are not NaN, so the
# missing-value check does not catch them).
(result == '').sum()

date                0
position            0
title               0
artist              0
isrc                0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
mood                0
dtype: int64

### Completion
The dataframe looks good, with no missing values. At this stage, we are not sure whether all the features will be used for analysis, so we keep them in for now. Our data cleaning is complete, and the result can be output to a CSV file ready for analysis.

In [31]:
# Write the cleaned, merged frame to disk as UTF-8 CSV, leaving out the
# positional index so it does not come back as an unnamed column on reload.
output_csv = '../datasets/popular_tracks_and_moods.csv'
result.to_csv(output_csv, encoding='utf-8', index=False)