### Import Essential Libraries

In [1]:
# import libraries
import pandas as pd
import re
import numpy as np
import json

### Load Dataframes

#### Song Names Dataframe

In [2]:
# create a dataframe from "names.txt" - holds names of the songs
song_names = pd.read_csv('names.txt', delimiter="\',|\",", engine='python')
song_names = song_names.T
song_names = song_names.reset_index()
song_names = song_names.rename(columns={'index': 'names'})

In [3]:
# remove strange characters in the song names 
chars_to_remove = ['[', ']', "'", '"']
regular_expression = '[' + re.escape (''. join (chars_to_remove)) + ']'
song_names['names'] = song_names['names'].str.replace(regular_expression, '', regex=True)

In [4]:
# check the work
song_names

Unnamed: 0,names
0,Cho Mình Em
1,Chỉ Là Không Cùng Nhau (Live Version)
2,Sài Gòn Đau Lòng Quá
3,Chúng Ta Của Hiện Tại
4,Phải Chăng Em Đã Yêu?
...,...
295,Ở Trong Thành Phố
296,Xin
297,Bùa Yêu
298,Vỡ (Siêu Sao Siêu Ngố OST)


#### Song Artists Dataframe

In [5]:
# create a dataframe from "artists.txt" - holds artists of the songs
song_artists = pd.read_csv('artists.txt', delimiter="\',", engine='python') 
song_artists = song_artists.T
song_artists = song_artists.reset_index()
song_artists = song_artists.rename(columns={'index': 'artists'})

In [6]:
# remove strange characters in the artist names
chars_to_remove = ['[', ']', "'", "."]
regular_expression = '[' + re.escape (''. join (chars_to_remove)) + ']'
song_artists['artists'] = song_artists['artists'].str.replace(regular_expression, '', regex=True)

In [7]:
# check the work
song_artists

Unnamed: 0,artists
0,Binz
1,Tăng Phúc
2,Hứa Kim Tuyền
3,Sơn Tùng M-TP
4,Juky San
...,...
295,Masew5
296,Masew6
297,Bich Phuong10
298,Đức Phúc1


#### Song Lyrics Dataframe

In [8]:
# create a dataframe from "lyrics.txt" - holds lyrics of the songs
song_lyrics = pd.read_csv('lyrics.txt', sep="\',|\",", engine='python')
song_lyrics = song_lyrics.T
song_lyrics = song_lyrics.reset_index()
song_lyrics = song_lyrics.rename(columns={'index': 'lyrics'})

In [9]:
# remove strange characters in the lyrics
chars_to_remove = ['[', ']', "'", '"']
regular_expression = '[' + re.escape (''. join (chars_to_remove)) + ']'
song_lyrics['lyrics'] = song_lyrics['lyrics'].str.replace(regular_expression, '', regex=True)
# song_lyrics['lyrics'].replace(r'\\n', '.', regex=True, inplace=True) 

In [10]:
# get the row where index is 87 since the lyric is not available for this song
song_lyrics.loc[87]

lyrics     \n          \n            Lyrics for this son...
Name: 87, dtype: object

In [11]:
# remove it and reset index
song_lyrics = song_lyrics.drop(87)
song_lyrics = song_lyrics.reset_index()

In [12]:
# check the work
song_lyrics = song_lyrics.drop(columns='index')
song_lyrics

Unnamed: 0,lyrics
0,"Binz:\nAnh từng xem một người là thế giới, anh..."
1,
2,Mình đã từng hứa\nBên nhau hết tháng năm dài\...
3,The Official Audio Lyrics of Chúng Ta Của Hiệ...
4,Chorus: Juky SanPhải chăng em đã yêu ngay từ ...
...,...
295,None.152
296,Hoh hoh\nCó lẽ đôi mình giờ chỉ là một lối số...
297,None.153
298,None.154


#### Song Genres Dataframe

In [13]:
# create a dataframe from "genres.txt" - holds genres of the songs
song_genres = pd.read_csv('genres.txt', delimiter="\],|\.", engine='python') 
song_genres = song_genres.T
song_genres = song_genres.reset_index()
song_genres = song_genres.rename(columns={'index': 'genres'})

In [14]:
# remove strange characters in the song names 
chars_to_remove = ['[', ']', "'", '"']
regular_expression = '[' + re.escape (''. join (chars_to_remove)) + ']'
song_genres['genres'] = song_genres['genres'].str.replace(regular_expression, '', regex=True)

In [15]:
song_genres

Unnamed: 0,genres
0,"v-pop, vietnamese hip hop"
1,v-pop
2,
3,v-pop.1
4,v-pop.2
...,...
295,v-pop.166
296,v-pop.167
297,v-pop.168
298,v-pop.169


#### Song Features Dataframe

In [16]:
# create a dataframe from "features.json" - holds features of the songs
with open("features.json") as f:
    contents = json.loads(f.read())
song_features = pd.concat([pd.DataFrame(i) for i in contents])
song_features = song_features.reset_index()
song_features = song_features.drop(columns=['index', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'time_signature'])

In [17]:
song_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.603,0.412,0,-10.359,1,0.0579,0.8910,0.021200,0.1150,0.371,99.886,206400
1,0.481,0.484,0,-6.556,0,0.0345,0.7920,0.000000,0.1100,0.383,127.238,226000
2,0.418,0.382,3,-7.778,1,0.0319,0.7860,0.000001,0.3860,0.261,163.878,308689
3,0.569,0.660,2,-5.268,1,0.0358,0.0675,0.000000,0.2020,0.497,155.907,301538
4,0.690,0.615,3,-4.532,1,0.0327,0.4090,0.000001,0.2480,0.371,96.002,190000
...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.526,0.758,5,-1.353,0,0.3480,0.8470,0.000000,0.1130,0.490,179.685,185443
296,0.790,0.501,8,-8.451,0,0.0916,0.7470,0.000000,0.0920,0.401,128.040,271222
297,0.774,0.578,5,-6.227,0,0.0306,0.5520,0.000434,0.0985,0.397,110.018,241650
298,0.599,0.350,0,-7.494,0,0.0259,0.6770,0.000001,0.1060,0.146,95.051,245047


### Merge Dataframes

In [22]:
# merge above dataframes
df = pd.merge(song_names, song_artists, how='inner', left_index=True, right_index=True)
df = df.join(song_lyrics)
df = df.join(song_genres)
df = df.join(song_features)

In [19]:
# remove rows that have no lyrics
df = df[~df['lyrics'].str.contains('None')]
df = df.reset_index(drop=True)

#### Data Codebook:
* names: Name of the song.
* artists: Artist(s) of the song.
* lyrics: Lyrics of the song.
* genres: Genre of the song.
* danceability: Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.
* energy: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.
* key: Key is the major or minor scale around which a piece of music revolves.
* loudness: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track. Values typical range between -60 and 0 db. 
* mode: Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived.
* speechiness: Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value.
* acousticness: A measure from 0.0 to 1.0 of whether the track is acoustic.
* instrumentalness: Predicts whether a track contains no vocals. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content.
* liveness: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live.
* valence: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).
* tempo: The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.
* duration_ms: Duration of the song in millisecond.

In [20]:
# check the work
df

Unnamed: 0,names,artists,lyrics,genres,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Cho Mình Em,Binz,"Binz:\nAnh từng xem một người là thế giới, anh...","v-pop, vietnamese hip hop",0.603,0.412,0,-10.359,1,0.0579,0.8910,0.021200,0.115,0.371,99.886,206400
1,Sài Gòn Đau Lòng Quá,Hứa Kim Tuyền,Mình đã từng hứa\nBên nhau hết tháng năm dài\...,,0.418,0.382,3,-7.778,1,0.0319,0.7860,0.000001,0.386,0.261,163.878,308689
2,Chúng Ta Của Hiện Tại,Sơn Tùng M-TP,The Official Audio Lyrics of Chúng Ta Của Hiệ...,v-pop.1,0.569,0.660,2,-5.268,1,0.0358,0.0675,0.000000,0.202,0.497,155.907,301538
3,Phải Chăng Em Đã Yêu?,Juky San,Chorus: Juky SanPhải chăng em đã yêu ngay từ ...,v-pop.2,0.690,0.615,3,-4.532,1,0.0327,0.4090,0.000001,0.248,0.371,96.002,190000
4,Sinh Ra Đã Là Thứ Đối Lập Nhau,Emcee L (Da LAB),Em như là đại dương xanh ngắt khiến bao người...,vietnamese hip hop.1,0.494,0.395,6,-8.653,1,0.0301,0.8870,0.000014,0.058,0.179,141.464,234168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,So far,Binz6,"Verse 1\nBạn em thấy, tối hôm đó anh bước bên...","v-pop, vietnamese hip hop.24",0.353,0.460,5,-11.307,1,0.0847,0.6230,0.005910,0.132,0.314,182.155,204696
140,Chuyện Như Chưa Bắt Đầu,Mỹ Tâm1,Mưa rơi trên nỗi đau nào\nLòng chợt thấy nghẹ...,"v-pop, vietnamese singer-songwriter.17",0.587,0.487,4,-5.517,0,0.0272,0.7850,0.000000,0.130,0.225,106.888,269227
141,Túy Âm.1,Masew2,Rót đến tràn lу\nAnh chìm đắm trong men caу đ...,v-pop.163,0.706,0.731,0,-7.797,0,0.0605,0.3400,0.000012,0.727,0.303,74.487,201433
142,Xin,Masew6,Hoh hoh\nCó lẽ đôi mình giờ chỉ là một lối số...,v-pop.167,0.790,0.501,8,-8.451,0,0.0916,0.7470,0.000000,0.092,0.401,128.040,271222


In [23]:
# write to csv file
df.to_csv('vpop.csv')