### Import Essential Libraries

In [3]:
# import libraries
import pandas as pd
import re
import numpy as np
import json
from pandas.io.json import json_normalize

## Data Sources

In [4]:
# Spotify playlists used as data sources:
# 1. https://open.spotify.com/playlist/1cx0Gbqhb7rT3aHUQbQiTQ
# 2. https://open.spotify.com/playlist/0iAm4XiG8zb6q3lWi4qtiF

### Load Dataframes

#### Song Names Dataframe

In [33]:
# create a dataframe from "names.csv" - holds names of the songs
# (row order matters: the merge step combines the frames on their row index)
song_names = pd.read_csv('data/mandopop/names.csv')

In [34]:
# check the work: preview the song-name frame (single `name` column)
song_names

Unnamed: 0,name
0,晴天
1,零
2,寶貝 (In a Day)
3,雨愛
4,掉了
...,...
2405,"对不起,我爱你"
2406,都给我
2407,药引
2408,反方向的鐘


#### Song Artists Dataframe

In [35]:
# create a dataframe from "artists.csv" - holds artists of the songs
# (row order matters: the merge step combines the frames on their row index)
song_artists = pd.read_csv('data/mandopop/artists.csv')

In [36]:
# check the work: preview the artist frame (single `artists` column)
song_artists

Unnamed: 0,artists
0,Jay Chou
1,Alan Kuo
2,Deserts Chang
3,Rainie Yang
4,A-Mei Chang
...,...
2405,龙飞龙泽
2406,嘿人李逵Noisemakers
2407,胡66
2408,Jay Chou


#### Song Popularity Dataframe

In [37]:
# create a dataframe from "popularity.csv" - holds popularity scores of the songs
song_popularity = pd.read_csv('data/mandopop/popularity.csv')

In [38]:
# check the work: preview the popularity frame (single `popularity` column)
song_popularity

Unnamed: 0,popularity
0,61
1,42
2,44
3,55
4,0
...,...
2405,31
2406,33
2407,31
2408,42


#### Song Dates Dataframe

In [39]:
# create a dataframe from "release_date.csv" - holds release dates of the songs
song_dates = pd.read_csv('data/mandopop/release_date.csv')

In [40]:
# check the work: preview the release-date frame
# NOTE: precision is mixed - some rows hold a full date, others only a year (e.g. "2009")
song_dates

Unnamed: 0,release_date
0,2003-07-31
1,2005-08-12
2,2006-06-06
3,2009-12-29
4,2009
...,...
2405,2018-07-04
2406,2020-01-16
2407,2021-05-29
2408,2000-11-06


#### Song Features Dataframe

In [41]:
# read "features.csv" - holds the audio features of the songs
song_features = pd.read_csv('data/mandopop/features.csv')

# keep only the columns that end up in the final dataset; reset_index(drop=True)
# renumbers the rows 0..n-1 without first creating a throwaway "index" column
song_features = (
    song_features
    .reset_index(drop=True)
    .drop(columns=['type', 'id', 'uri', 'track_href', 'analysis_url', 'time_signature'])
)

In [42]:
# check the work: preview the audio-feature frame after dropping the unused columns
song_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.547,0.567,7,-7.295,1,0.0242,0.2760,0.000548,0.1040,0.3990,137.130,269747
1,0.494,0.565,3,-4.958,0,0.0291,0.0610,0.000000,0.1210,0.0989,120.026,279893
2,0.827,0.160,0,-12.729,1,0.0483,0.8870,0.000000,0.1050,0.3880,119.891,145440
3,0.422,0.657,4,-5.274,1,0.0292,0.2140,0.000000,0.1290,0.2180,159.957,261560
4,0.547,0.475,1,-6.613,1,0.0278,0.8110,0.000000,0.0722,0.1420,161.965,239560
...,...,...,...,...,...,...,...,...,...,...,...,...
2405,0.464,0.310,5,-11.567,1,0.0334,0.8470,0.000000,0.1130,0.2400,143.505,340454
2406,0.704,0.720,4,-3.514,0,0.0525,0.3050,0.000000,0.0972,0.4580,149.969,322242
2407,0.658,0.503,11,-9.205,1,0.0377,0.7990,0.000000,0.2400,0.3210,121.999,171636
2408,0.570,0.887,7,-7.993,0,0.0448,0.0908,0.000003,0.1970,0.1820,94.607,256733


### Merge Dataframes

In [43]:
# combine the per-attribute frames side by side, matching rows on the row index:
# names and artists are inner-merged, the remaining frames are left-joined
df = (
    song_names
    .merge(song_artists, how='inner', left_index=True, right_index=True)
    .join(song_popularity)
    # .join(song_dates)  # release dates are currently left out of the merge
    .join(song_features)
)

In [44]:
# keep only the first row for each song name, then drop every row that still
# has a missing value in any column, and renumber the rows 0..n-1
df = (
    df
    .drop_duplicates(subset='name', keep='first')
    .dropna()
    .reset_index(drop=True)
)

In [45]:
# # convert dates to datetime
# df['dates'] = pd.to_datetime(df['dates'], errors='coerce')

# # extract year
# df['year'] = pd.to_datetime(df['dates']).dt.to_period('Y')

# # drop dates columns
# df = df.drop(columns='dates')

#### Data Codebook:
* name: Name of the song.
* artists: Artist(s) of the song.
* popularity: Popularity of the song on Spotify, from 0 (least popular) to 100 (most popular).
* danceability: Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.
* energy: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.
* key: Key is the major or minor scale around which a piece of music revolves. It is encoded as an integer pitch class (0 = C, 1 = C♯/D♭, 2 = D, and so on).
* loudness: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track. Values typically range between -60 and 0 dB.
* mode: Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor by 0.
* speechiness: Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value.
* acousticness: A measure from 0.0 to 1.0 of whether the track is acoustic.
* instrumentalness: Predicts whether a track contains no vocals. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content.
* liveness: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live.
* valence: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).
* tempo: The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.
* duration_ms: Duration of the song in milliseconds.

In [46]:
# check the work
print(df.shape)
df.head()

(2039, 15)


Unnamed: 0,name,artists,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,晴天,Jay Chou,61,0.547,0.567,7,-7.295,1,0.0242,0.276,0.000548,0.104,0.399,137.13,269747
1,零,Alan Kuo,42,0.494,0.565,3,-4.958,0,0.0291,0.061,0.0,0.121,0.0989,120.026,279893
2,寶貝 (In a Day),Deserts Chang,44,0.827,0.16,0,-12.729,1,0.0483,0.887,0.0,0.105,0.388,119.891,145440
3,雨愛,Rainie Yang,55,0.422,0.657,4,-5.274,1,0.0292,0.214,0.0,0.129,0.218,159.957,261560
4,掉了,A-Mei Chang,0,0.547,0.475,1,-6.613,1,0.0278,0.811,0.0,0.0722,0.142,161.965,239560


Since the dataset is built from multiple playlists created by Spotify users, we should expect that some users mistakenly added songs that are not in Mandarin. We can remove some of them during the Exploratory Data Analysis.

### Export to CSV

In [47]:
# write the cleaned frame to a csv file; index=False leaves the row index out of the file
df.to_csv('data/mandopop/mandopop_songs.csv', index=False)