# Dataset Documentation and Description

In [1]:
import ast
import os
from typing import *  # type: ignore

import pandas as pd

In [8]:
# Verify that the dataset folder exists before any file is read.
# NOTE: the constant keeps its existing spelling ("DATSET") because the cells
# below reference it under that name.
DATSET_PATH: str = "../dataset"
if not os.path.exists(DATSET_PATH):
    # Raise instead of calling exit(): the exception stops execution at this
    # cell with a readable message rather than ending the session.
    raise FileNotFoundError(
        f"Dataset path is not correct: {os.path.abspath(DATSET_PATH)!r} does not exist"
    )
print("Dataset path is correct")

Dataset path is correct


In [9]:
# Walk the dataset folder recursively and record the path of every file in it.
file_paths: List[str] = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(DATSET_PATH)
    for name in names
]

# Sanity check: the dataset is expected to consist of exactly 24 files.
assert len(file_paths) == 24, f"Expected 24 files, but found {len(file_paths)}"
print(f"Total files found: {len(file_paths)}")

Total files found: 24


In [10]:
# Strip the directory part so that only the bare file names remain.
filenames: List[str] = list(map(os.path.basename, file_paths))
filenames

['id_blf_correlation_mmsr.tsv',
 'id_blf_deltaspectral_mmsr.tsv',
 'id_blf_logfluc_mmsr.tsv',
 'id_blf_spectralcontrast_mmsr.tsv',
 'id_blf_spectral_mmsr.tsv',
 'id_blf_vardeltaspectral_mmsr.tsv',
 'id_genres_mmsr.tsv',
 'id_incp_mmsr.tsv',
 'id_information_mmsr.tsv',
 'id_ivec1024_mmsr.tsv',
 'id_ivec256_mmsr.tsv',
 'id_ivec512_mmsr.tsv',
 'id_lyrics_bert_mmsr.tsv',
 'id_lyrics_tf-idf_mmsr.tsv',
 'id_lyrics_word2vec_mmsr.tsv',
 'id_metadata_mmsr.tsv',
 'id_mfcc_bow_mmsr.tsv',
 'id_mfcc_stats_mmsr.tsv',
 'id_musicnn_mmsr.tsv',
 'id_resnet_mmsr.tsv',
 'id_tags_dict.tsv',
 'id_total_listens.tsv',
 'id_url_mmsr.tsv',
 'id_vgg19_mmsr.tsv']

In [11]:
# Load the basic information table ("id_information_mmsr.tsv") from the dataset folder.
id_information_mmsr_file = os.path.join(DATSET_PATH, "id_information_mmsr.tsv")
assert os.path.exists(id_information_mmsr_file), f"{id_information_mmsr_file} not found"

# The file is tab-separated, so parse it with a tab delimiter.
base_infos: pd.DataFrame = pd.read_csv(  # type: ignore
    id_information_mmsr_file,
    sep="\t",
)

In [12]:
# Preview the first five rows of the information table.
base_infos.head(n=5)

Unnamed: 0,id,artist,song,album_name
0,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones
1,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te
2,04OjszRi9rC5BlHC,Grizzly Bear,Knife,Yellow House
3,04iitW3ffa0mhpx3,Ne-Yo,Miss Independent,Year Of The Gentleman (Bonus Track Edition)
4,04xUDjAYC14jsHyH,Jawbreaker,Jinx Removing,24 Hour Revenge Therapy (Remastered)


In [13]:
# Load the lyrics tf-idf table ("id_lyrics_tf-idf_mmsr.tsv") from the dataset folder.
id_lyrics_tf_idf_mmsr_file = os.path.join(DATSET_PATH, "id_lyrics_tf-idf_mmsr.tsv")
assert os.path.exists(id_lyrics_tf_idf_mmsr_file), f"{id_lyrics_tf_idf_mmsr_file} not found"

# The file is tab-separated, so parse it with a tab delimiter.
lyrics_tf_idf: pd.DataFrame = pd.read_csv(  # type: ignore
    id_lyrics_tf_idf_mmsr_file,
    sep="\t",
)

In [14]:
# Preview the first five rows of the tf-idf table.
lyrics_tf_idf.head(n=5)

Unnamed: 0,id,abl,accept,across,act,addict,afraid,age,ago,ah,...,yea,yeah,year,yellow,yes,yesterday,yet,yo,young,youth
0,h48f46ZsT9h0Z5Dm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.149783,0.0,0.0,0.0,0.0
1,PV5EXN6AIVBqvsLO,0.0,0.0,0.0,0.0,0.0,0.327025,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,eFEY5JiDF3ZLpXBZ,0.0,0.0,0.143314,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.042526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,VAWiymoCIYxhae3J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109514,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2H91WLAd7ZZJvAiw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.084732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Inspect the genre file.
genre_file = os.path.join(DATSET_PATH, "id_genres_mmsr.tsv")
assert os.path.exists(genre_file), f"{genre_file} not found"

# The "genre" column holds each genre list as Python literal text
# (e.g. "['rock', 'pop punk']"), so it is converted to a real list while reading.
# ast.literal_eval is used instead of eval: it only accepts literals, so the
# content of the file can never execute arbitrary code.
genres: pd.DataFrame = pd.read_csv(genre_file, sep="\t", converters={"genre": ast.literal_eval})  # type: ignore
genres["genre"][0]


['rock', 'pop punk']

In [16]:
# Turn the lyrics tf-idf table into a plain NumPy matrix.
# `lyrics_tf_idf` was already loaded from "id_lyrics_tf-idf_mmsr.tsv" in an
# earlier cell, so it is reused here instead of parsing the same file again.

# Drop the "id" column so that only the tf-idf columns remain, then convert.
# The result is a NumPy array (not a DataFrame), hence no DataFrame annotation.
lyrics_tf_idf_values = lyrics_tf_idf.drop(columns=["id"]).to_numpy()
lyrics_tf_idf_values.shape

(5148, 1000)

In [None]:
# Inspect the columns of the "id_lyrics_bert_mmsr.tsv" file.
# The path is built from DATSET_PATH (like the other cells) instead of a
# hardcoded absolute path that only exists on one machine. pandas is already
# imported in the first cell, so it is not imported again here.
file_path = os.path.join(DATSET_PATH, "id_lyrics_bert_mmsr.tsv")
assert os.path.exists(file_path), f"{file_path} not found"

# Load the file
df = pd.read_csv(file_path, sep="\t")

# Print the column names
print("Columns in the file:", df.columns)

Columns in the file: Index(['id', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '758', '759', '760', '761', '762', '763', '764', '765', '766', '767'],
      dtype='object', length=769)
