# Explorando os dados

## Imports necessários

In [1]:
import gzip  # descomprimir arquivos .gz
import pandas as pd
import requests # para baixar a base/amostra para sua máquina
from tqdm.notebook import tqdm # para mostrar barra de progresso


In [2]:
# Habilitar barra de progresso
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [8]:
# Variáveis pra guardar nome(s) / url(s) de arquivo(s) a manipular

url = "https://projeto-puc-ai-ml-2022.s3.amazonaws.com/20kSongs.pickle.gz"
# Descomente a linha abaixo e comente a linha acima, caso sua máquina não tenha memória
# url = "https://projeto-puc-ai-ml-2022.s3.amazonaws.com/10kSongs_1.pickle.gz"

downloaded_filename = url.split("/")[-1]
output_filename = f"../datasets/{downloaded_filename}"
decompressed_filename = output_filename.replace(".gz", "")
decompressed_filename

'../datasets/20kSongs.pickle'

## Baixando o dataset (opcional, caso já o tenha)

In [4]:
# Escrever o arquivo em disco, mostrando barra de progresso
# https://stackoverflow.com/a/37573701

response = requests.get(url, stream=True)

# Levantar HTTPError pra qualquer resposta http não esperada
response.raise_for_status()

# Escrever e mostrar progresso
total_size_in_bytes= int(response.headers.get('content-length', 0))
block_size = 1024 # 1 Kb
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(output_filename, 'wb') as file:
    for data in response.iter_content(block_size):
        progress_bar.update(len(data))
        file.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    print("ERRO: algo deu errado!")

  0%|          | 0.00/1.63G [00:00<?, ?iB/s]

### Descomprimindo o arquivo baixado (opcional, caso já o tenha)

In [7]:
import shutil
with gzip.open(output_filename, 'rb') as f_in:
    with open(decompressed_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/20kSongs.pickle.gz'

## Carregando dataset na memória

Antes de rodar a próxima célula, certifique-se de descomprimir o arquivo do dataset antes. 

Alternativa: decomprimir ao mesmo tempo em que carrega na memória

`df = pd.read_pickle("../datasets/10kSongs_1.pickle.gz", compress={"method": "gzip", "compresslevel": 9})`

In [9]:
df = pd.read_pickle(decompressed_filename)
# df = pd.read_pickle("../datasets/10kSongs_1.pickle.gz", compress={"method": "gzip", "compresslevel": 9})


## Explorando

In [10]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 54 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   analysis_sample_rate        20000 non-null  int32  
 1   artist_7digitalid           20000 non-null  int32  
 2   artist_familiarity          20000 non-null  float64
 3   artist_hotttnesss           20000 non-null  float64
 4   artist_id                   20000 non-null  object 
 5   artist_latitude             20000 non-null  float64
 6   artist_location             20000 non-null  object 
 7   artist_longitude            20000 non-null  float64
 8   artist_mbid                 20000 non-null  object 
 9   artist_mbtags               20000 non-null  object 
 10  artist_mbtags_count         20000 non-null  object 
 11  artist_name                 20000 non-null  object 
 12  artist_playmeid             20000 non-null  int32  
 13  artist_terms                200

In [11]:
df.describe()

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_latitude,artist_longitude,artist_playmeid,danceability,duration,end_of_fade_in,...,mode,mode_confidence,release_7digitalid,song_hotttnesss,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_7digitalid,year
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,...,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,22050.0,59736.39915,0.658526,0.451967,41.729697,-45.6169,24905.71815,0.0,244.758911,0.90754,...,0.6743,0.480916,317453.31095,0.466749,235.178549,124.897426,3.6019,0.503718,3528192.0,1996.89865
std,0.0,94337.284936,0.103677,0.082851,16.117322,56.359308,43470.713262,0.0,115.179208,2.146168,...,0.468648,0.19265,233725.794951,0.207622,112.629457,34.99819,1.231825,0.370862,2599764.0,11.119836
min,22050.0,-1.0,0.0,0.0,-45.8745,-159.54054,-1.0,0.0,2.29832,0.0,...,0.0,0.0,39.0,0.0,2.298,0.0,0.0,0.0,443.0,1929.0
25%,22050.0,5681.5,0.595096,0.397687,37.221938,-87.63241,-1.0,0.0,181.65506,0.0,...,0.0,0.365,110126.75,0.34912,173.377,99.0775,3.0,0.103,1156470.0,1992.0
50%,22050.0,18875.0,0.648199,0.444541,42.28474,-73.80817,2993.0,0.0,228.75383,0.212,...,1.0,0.493,286337.0,0.496175,219.0045,121.7985,4.0,0.542,3221752.0,2000.0
75%,22050.0,65104.0,0.72152,0.495863,51.50632,-0.12714,30756.0,0.0,283.92444,0.462,...,1.0,0.611,511268.75,0.612041,272.933,145.939,4.0,0.85,5674990.0,2006.0
max,22050.0,603509.0,0.94829,0.969246,69.65102,175.47131,242965.0,0.0,2502.73914,62.764,...,1.0,1.0,823409.0,1.0,2495.054,252.943,7.0,1.0,9088514.0,2010.0
