# Explorando os dados

## Imports necessários

In [1]:
import gzip  # descomprimir arquivos .gz
import os
import shutil

import pandas as pd
import requests # para baixar a base/amostra para sua máquina
from tqdm.notebook import tqdm # para mostrar barra de progresso


In [3]:
# Enable the ipywidgets notebook extension (widgetsnbextension) so the
# tqdm.notebook progress bar used by the download cell can render as a widget.
# --sys-prefix applies the setting to the active Python environment's prefix.
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


## Baixando o dataset (opcional, caso já o tenha)

In [2]:
# Stream the dataset archive to disk while showing a progress bar.
# Approach adapted from https://stackoverflow.com/a/37573701

# The smaller 10k-song sample is active by default. If your machine has enough
# memory for the larger sample, comment the 10k line and uncomment the 20k line.
# url = "https://projeto-puc-ai-ml-2022.s3.amazonaws.com/20kSongs.pickle.gz"
url = "https://projeto-puc-ai-ml-2022.s3.amazonaws.com/10kSongs_1.pickle.gz"

# The local file keeps the last path segment of the URL as its name.
downloaded_filename = url.split("/")[-1]
output_filename = f"../datasets/{downloaded_filename}"

# Make sure the target directory exists so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)

# 1 MiB chunks: a 1 KiB chunk size means hundreds of thousands of Python-level
# iterations (and progress-bar updates) for a file of several hundred MB.
block_size = 1024 * 1024

# timeout=(connect, read) in seconds: without it the request can block forever
# on a stalled connection. The `with` releases the connection even if writing fails.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    # Raise HTTPError for any unsuccessful HTTP status
    response.raise_for_status()

    # content-length may be absent; 0 then means "total size unknown"
    total_size_in_bytes = int(response.headers.get('content-length', 0))

    with open(output_filename, 'wb') as file:
        # Using tqdm as a context manager closes the bar even on error
        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)

# If the server announced a size, the number of bytes written must match it;
# a mismatch means the download was truncated.
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    print("ERRO: algo deu errado!")

  0%|          | 0.00/816M [00:00<?, ?iB/s]

### Descomprimindo o arquivo baixado (opcional, caso já o tenha)

In [4]:
# Decompress the downloaded gzip archive into a sibling file whose name is the
# archive name without its trailing ".gz".
# Requires `output_filename` to be defined (it is set by the download cell).
if not output_filename.endswith(".gz"):
    # Without the suffix the output path would equal the input path, and
    # opening it with 'wb' would truncate the archive while it is being read.
    raise ValueError(f"expected a .gz file, got: {output_filename}")

# Strip only the trailing suffix: str.replace(".gz", "") would also remove
# any ".gz" occurring elsewhere in the path.
decompressed_filename = output_filename[:-len(".gz")]

with gzip.open(output_filename, 'rb') as f_in:
    with open(decompressed_filename, 'wb') as f_out:
        # copyfileobj streams in chunks, so the whole archive is never held in memory
        shutil.copyfileobj(f_in, f_out)

## Carregando dataset na memória

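Antes de rodar a próxima célula, certifique-se de que o arquivo do dataset já foi descomprimido. A próxima célula lê o caminho da variável `decompressed_filename`, definida na célula de descompressão acima; se você pulou aquela célula, defina essa variável manualmente. Exemplo de descompressão pela linha de comando: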

`gzip -d 10kSongs_1.pickle.gz`

Alternativa: descomprimir ao mesmo tempo em que carrega na memória

`df = pd.read_pickle("../datasets/10kSongs_1.pickle.gz", compression="gzip")`

In [5]:
# Load the decompressed pickle into a DataFrame. `decompressed_filename` is
# defined by the decompression cell above.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# obtained from a source you trust.
df = pd.read_pickle(decompressed_filename)
# Alternative: read the .gz archive directly, decompressing on the fly
# (the keyword argument is `compression`, not `compress`):
# df = pd.read_pickle("../datasets/10kSongs_1.pickle.gz", compression="gzip")


## Explorando

In [6]:
# Print the full per-column summary: column name, non-null count and dtype.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 54 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   analysis_sample_rate        10000 non-null  int32  
 1   artist_7digitalid           10000 non-null  int32  
 2   artist_familiarity          10000 non-null  float64
 3   artist_hotttnesss           10000 non-null  float64
 4   artist_id                   10000 non-null  object 
 5   artist_latitude             10000 non-null  float64
 6   artist_location             10000 non-null  object 
 7   artist_longitude            10000 non-null  float64
 8   artist_mbid                 10000 non-null  object 
 9   artist_mbtags               10000 non-null  object 
 10  artist_mbtags_count         10000 non-null  object 
 11  artist_name                 10000 non-null  object 
 12  artist_playmeid             10000 non-null  int32  
 13  artist_terms                1000

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): in the saved output below, `danceability` is 0.0 for every
# statistic and `analysis_sample_rate` is constant at 22050, so in this sample
# those columns carry no information.
df.describe()

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_latitude,artist_longitude,artist_playmeid,danceability,duration,end_of_fade_in,...,mode,mode_confidence,release_7digitalid,song_hotttnesss,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_7digitalid,year
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,22050.0,58996.1008,0.65751,0.451753,41.681794,-45.486389,24945.6411,0.0,245.463986,0.898147,...,0.6703,0.480704,316255.6831,0.464064,235.715078,125.154325,3.5933,0.501146,3514958.0,1996.8477
std,0.0,92707.901847,0.10358,0.081716,16.207656,56.611602,43464.204468,0.0,113.451018,2.069717,...,0.470128,0.191856,234072.533995,0.207677,110.661045,35.244338,1.233145,0.373282,2604464.0,11.067916
min,22050.0,-1.0,0.0,0.0,-45.8745,-157.85762,-1.0,0.0,2.29832,0.0,...,0.0,0.0,39.0,0.0,2.298,0.0,0.0,0.0,443.0,1951.0
25%,22050.0,5571.0,0.595042,0.397626,37.16793,-87.63241,-1.0,0.0,181.877097,0.0,...,0.0,0.366,109861.0,0.345802,173.5965,99.4255,3.0,0.091,1142070.0,1992.0
50%,22050.0,18914.0,0.648044,0.44303,42.31256,-73.66658,2635.0,0.0,229.58975,0.2085,...,1.0,0.492,285079.5,0.49449,219.8435,122.114,4.0,0.534,3206906.0,2000.0
75%,22050.0,64232.0,0.720908,0.495863,51.50632,-0.12714,30909.0,0.0,285.171792,0.45925,...,1.0,0.608,511260.5,0.609281,273.87675,146.019,4.0,0.85525,5674424.0,2005.0
max,22050.0,603509.0,0.94829,0.969246,69.65102,175.47131,242965.0,0.0,2488.86812,39.66,...,1.0,1.0,823409.0,0.997604,2485.899,252.943,7.0,1.0,9088514.0,2010.0
