# Extraindo 20 mil músicas da base de 1 milhão

## Preparação

### Imports

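São necessários o módulo pandas e o nosso código customizado para extrair as músicas. Os módulos requests, tqdm e tarfile são usados apenas nos passos opcionais de download e descompressão.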

In [17]:
import pandas as pd
import requests  # para baixar a base/amostra para sua máquina
from tqdm.notebook import tqdm # para mostrar barra de progresso
import tarfile  # para descomprimir eventuais arquivos .tar.gz baixados
from libs.extraction_helpers import load_song_data


### Extensões

A extensão abaixo precisa ser ativada para que uma barra de progresso seja mostrada durante a operação.

In [10]:
# Enable the widgetsnbextension notebook extension (installed under --sys-prefix);
# the notebook relies on it to render the tqdm.notebook progress bars.
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


### Localização do diretório que contém a base alvo (seja a base inteira ou o subset)


In [13]:
# Relative path of the directory holding the target dataset (the full base or the subset).
PATH_R = "../datasets/MillionSongSubset"

### Baixando o MillionSongSubset para simular a existência da base neste computador (opcional)

In [4]:
# Stream the archive to disk while showing a progress bar.
# https://stackoverflow.com/a/37573701

url = "http://labrosa.ee.columbia.edu/~dpwe/tmp/millionsongsubset.tar.gz"
output_filename = f"{PATH_R}.tar.gz"
block_size = 1024  # bytes requested per chunk (1 KiB)

# The (connect, read) timeout keeps the cell from hanging forever on a stalled
# connection. The `with` blocks release the HTTP connection, the output file and
# the progress bar even when the download is interrupted or raises.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    # Raise HTTPError for any unsuccessful HTTP status
    response.raise_for_status()

    # Falls back to 0 when the server does not send a Content-Length header
    total_size_in_bytes = int(response.headers.get('content-length', 0))

    with open(output_filename, 'wb') as file, tqdm(
        total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)

# A byte count different from the announced size means the download was truncated
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    print("ERRO: algo deu errado!")

  0%|          | 0.00/1.98G [00:00<?, ?iB/s]

#### Descomprimir o arquivo baixado (opcional)

In [14]:
# Unpack the downloaded archive into ../datasets/.
# The archive path is rebuilt from PATH_R (the same f"{PATH_R}.tar.gz" location the
# download cell writes to), so this cell does not need the optional download cell
# to have run in the current kernel when the .tar.gz is already on disk.
archive_path = f"{PATH_R}.tar.gz"

# NOTE(review): extractall() trusts the member paths stored in the archive. The
# file comes from an external server, so on a Python version that supports
# tarfile extraction filters consider passing filter="data" — confirm the
# interpreter version before enabling it.
# The `with` block closes the archive even if extraction fails midway.
with tarfile.open(archive_path) as archive:
    archive.extractall("../datasets/")

In [15]:
# Location of a sample HDF5 file, used to extract the column names etc.
PATH_HDF5 = "../sample_data/TRAXLZU12903D05F94.h5"

In [18]:
# Build the songs DataFrame from the dataset under PATH_R, with PATH_HDF5 as the sample file.
# NOTE(review): the meaning of letter="*" and half=None is defined by load_song_data
# (libs.extraction_helpers) — confirm there before changing them.
df = load_song_data(
    dataset_root_dir=PATH_R,
    sample_hdf5_file_path=PATH_HDF5,
    letter="*",
    half=None,
    max_songs=100,
)

  0%|          | 0/10000 [00:00<?, ?it/s]

Informações básicas sobre o df (nesta execução limitado a 100 músicas, por causa de `max_songs=100`):

In [19]:
# Print the index range plus, for every column (verbose=True), its non-null count and dtype.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 54 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   analysis_sample_rate        100 non-null    int32  
 1   artist_7digitalid           100 non-null    int32  
 2   artist_familiarity          100 non-null    float64
 3   artist_hotttnesss           100 non-null    float64
 4   artist_id                   100 non-null    object 
 5   artist_latitude             100 non-null    float64
 6   artist_location             100 non-null    object 
 7   artist_longitude            100 non-null    float64
 8   artist_mbid                 100 non-null    object 
 9   artist_mbtags               100 non-null    object 
 10  artist_mbtags_count         100 non-null    object 
 11  artist_name                 100 non-null    object 
 12  artist_playmeid             100 non-null    int32  
 13  artist_terms                100 non-

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_latitude,artist_longitude,artist_playmeid,danceability,duration,end_of_fade_in,...,mode,mode_confidence,release_7digitalid,song_hotttnesss,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_7digitalid,year
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,22050.0,47875.68,0.669029,0.465172,38.129662,-50.648945,21094.04,0.0,231.36242,0.98003,...,0.66,0.49622,289474.69,0.461796,223.00695,128.33648,3.79,0.57317,3267322.0,1995.54
std,0.0,85154.874295,0.11178,0.106817,19.671481,51.327864,35001.766838,0.0,93.980866,1.879673,...,0.476095,0.193497,231530.91598,0.223831,92.137416,37.169436,1.174605,0.384708,2539364.0,10.934534
min,22050.0,158.0,0.407925,0.267757,-36.35484,-123.2079,-1.0,0.0,77.60934,0.0,...,0.0,0.0,205.0,0.0,72.15,66.41,1.0,0.0,2133.0,1963.0
25%,22050.0,4225.75,0.605931,0.402167,34.05349,-91.07571,-1.0,0.0,165.37424,0.087,...,0.0,0.3675,79744.25,0.334707,154.18225,97.37125,4.0,0.1555,1078504.0,1990.0
50%,22050.0,15189.5,0.66295,0.444521,40.71739,-73.54045,6302.5,0.0,216.48934,0.235,...,1.0,0.5,279023.5,0.50174,209.937,123.4245,4.0,0.6845,3110612.0,1998.0
75%,22050.0,44305.5,0.732276,0.505097,51.490705,-1.252925,20363.0,0.0,276.64281,0.602,...,1.0,0.6045,398235.0,0.630779,267.21825,150.99325,4.0,0.95475,4422651.0,2004.0
max,22050.0,466819.0,0.934935,0.908203,57.15382,146.32611,192660.0,0.0,600.37179,12.179,...,1.0,0.988,812979.0,0.984347,588.121,237.466,7.0,1.0,8984739.0,2010.0


### Extraindo o dataframe pra arquivo

Se quisermos extrair o df como um JSON comprimido (.json.gz):

In [21]:
# Write the DataFrame as gzip-compressed JSON, one record per row.
# The gzip options request maximum compression (level 9) and pin the header
# timestamp (mtime=1) instead of using the current time.
df.to_json(
    "../datasets/20kSongs.json.gz",
    orient="records",
    compression={"method": "gzip", "compresslevel": 9, "mtime": 1},
)

Se quisermos extrair o df como um arquivo pickle comprimido (.pickle.gz):

In [16]:
# Write the DataFrame as a gzip-compressed pickle.
# The gzip options request maximum compression (level 9) and pin the header
# timestamp (mtime=1) instead of using the current time.
df.to_pickle(
    "../datasets/20kSongs.pickle.gz",
    compression={"method": "gzip", "compresslevel": 9, "mtime": 1},
)

### Teste de carregamento do arquivo extraído

In [24]:
# Testar carregar o arquivo que acabamos de extrair, ainda compresso:
df_extracted = pd.read_pickle("../datasets/20kSongs.pickle.gz", compression={'method': 'gzip', 'compresslevel': 9, 'mtime': 1})

KeyboardInterrupt: 

In [25]:
# Show the summary of the reloaded frame, for comparison with the df.info() output earlier in the notebook.
df_extracted.info()

NameError: name 'df_extracted' is not defined