# Tratamento de Dados
## Dados Pessoais de Uso do Spotify
### Dashboard Analítico

In [5]:
# Imports

import datetime
import json
import numpy as np
import pandas as pd
from genre_scrapy import genreScrapy


In [6]:
# Reading Data
# Each Spotify streaming-history export is loaded into its own raw DataFrame.

df1, df2 = (
    pd.read_json(history_file)
    for history_file in ('StreamingHistory0.json', 'StreamingHistory1.json')
)


In [7]:
# Functions

def to_turns(value):
    """Return the period-of-day label for a ``datetime.time`` value.

    Each period is bounded by an exclusive upper limit:
    before 06:00 -> 'Madrugada', before 12:00 -> 'Manhã',
    before 18:00 -> 'Tarde', anything later -> 'Noite'.
    """
    upper_limits = (
        (datetime.time(6, 0), 'Madrugada'),
        (datetime.time(12, 0), 'Manhã'),
        (datetime.time(18, 0), 'Tarde'),
    )
    # Limits are in ascending order, so the first one above `value` wins.
    for limit, label in upper_limits:
        if value < limit:
            return label
    return 'Noite'


In [9]:
# Manipulation Data
# Builds the cleaned listening-history frame `df` from the two raw frames.

# Concatenate Dataframes
df = pd.concat([df1, df2])

# Convert Data Type
# The datetime dtype carries an explicit unit: pandas 2.x raises a TypeError
# for the unit-less 'datetime64' alias when it is passed to astype.
df = df.astype({'endTime': 'datetime64[ns]', 'artistName': 'string', 'trackName': 'string'})

# Rename Column (frees the 'endTime' name for the time-of-day column below)
df = df.rename(columns={'endTime': 'endDatetime'})

# Create Columns
df['minutesPlayed'] = (df['msPlayed'] / 60000).round(2)  # milliseconds -> minutes
df['endDate'] = df['endDatetime'].dt.date
df['endTime'] = df['endDatetime'].dt.time
df['turns'] = df['endTime'].map(to_turns)  # period-of-day label per row
# Running row number; concat above keeps each source frame's own index labels.
df['indexes'] = np.arange(len(df))

# Convert Data Type
df = df.astype({'turns': 'string'})

# Order Columns
df = df.loc[:, ['indexes', 'endDatetime', 'endDate', 'endTime', 'msPlayed', 'minutesPlayed', 'turns', 'trackName', 'artistName']]

# Data Filtering (keep only rows that ended before 2022-01-01)
df = df[df['endDate'] < datetime.date(2022, 1, 1)]

# Indexing
df = df.set_index(['indexes'])


In [10]:
# Exploratory Analysis

display(df)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.describe())


Unnamed: 0_level_0,endDatetime,endDate,endTime,msPlayed,minutesPlayed,turns,trackName,artistName
indexes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2021-01-18 20:41:00,2021-01-18,20:41:00,18610,0.31,Noite,Come A Little Bit Closer,Jay & The Americans
1,2021-01-21 01:58:00,2021-01-21,01:58:00,292853,4.88,Madrugada,Touch,Cigarettes After Sex
2,2021-01-21 02:03:00,2021-01-21,02:03:00,285147,4.75,Madrugada,Hentai,Cigarettes After Sex
3,2021-01-21 02:07:00,2021-01-21,02:07:00,256800,4.28,Madrugada,Cry,Cigarettes After Sex
4,2021-01-21 02:12:00,2021-01-21,02:12:00,245840,4.10,Madrugada,Falling In Love,Cigarettes After Sex
...,...,...,...,...,...,...,...,...
14264,2021-12-31 22:55:00,2021-12-31,22:55:00,294050,4.90,Noite,Mantra (Feat. Emicida),Rubel
14265,2021-12-31 22:56:00,2021-12-31,22:56:00,73215,1.22,Noite,Passagem,Rubel
14266,2021-12-31 23:01:00,2021-12-31,23:01:00,295994,4.93,Noite,Explodir,Rubel
14267,2021-12-31 23:05:00,2021-12-31,23:05:00,257295,4.29,Noite,Sapato,Rubel


<class 'pandas.core.frame.DataFrame'>
Int64Index: 14269 entries, 0 to 14268
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   endDatetime    14269 non-null  datetime64[ns]
 1   endDate        14269 non-null  object        
 2   endTime        14269 non-null  object        
 3   msPlayed       14269 non-null  int64         
 4   minutesPlayed  14269 non-null  float64       
 5   turns          14269 non-null  string        
 6   trackName      14269 non-null  string        
 7   artistName     14269 non-null  string        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2), string(3)
memory usage: 1003.3+ KB


None

Unnamed: 0,msPlayed,minutesPlayed
count,14269.0,14269.0
mean,221515.3,3.691888
std,109415.2,1.823728
min,0.0,0.0
25%,177440.0,2.96
50%,213667.0,3.56
75%,257467.0,4.29
max,2947344.0,49.12


In [None]:
# Disabled cell: everything below is a single bare string literal, so none of
# the code inside it runs.
# The code in the string calls genreScrapy(name, language) for every unique
# artist name (with 'portuguese' first, then 'english' as the fallback) and
# writes the resulting artist -> genre mapping to dict_genres.json, the file
# that the Genres Analysis cell reads.
# NOTE(review): if this is re-enabled, `json = json.dumps(dict_genres)` rebinds
# the `json` module name, so the later `json.load(f)` call would fail. The bare
# `except:` clauses also turn any exception into a retry or a 'Not found' entry.
"""
# Artist Names (Uniques)

artistName = list(set(df.artistName))
artistName.sort()

# Get Genres

genreList = list()

for name in artistName:
    try:
        language = 'portuguese'
        genre = genreScrapy(name, language)
        if genre == 'Not found':
            try:
                language = 'english'
                genre = genreScrapy(name, language)
            except:
                genre = 'Not found'
    except:
        try:
            language = 'english'
            genre = genreScrapy(name, language)
        except:
            genre = 'Not found'
            
    genreList.append(genre)

# Data Manipulation

genreList = [i.title() for i in genreList]
dict_genres = dict(zip(artistName, genreList))

# Export JSON

# Create json object from dictionary
json = json.dumps(dict_genres)

# Open file for writing, "w" 
f = open("dict_genres.json","w")

# Write json object to file
f.write(json)

# Close file
f.close()
"""

In [11]:
# Genres Analysis

# New Dataframe (a copy, so `df` itself is left unchanged)
df_genres = df.copy()

# Import JSON File
# The context manager closes the file handle as soon as the mapping is loaded;
# the original left it open.
with open('dict_genres.json') as f:
    artist_genres = json.load(f)

# Modify Dataframe
# Artists that are not keys of the mapping get a missing value in 'genres'.
df_genres['genres'] = df_genres['artistName'].map(artist_genres)
display(df_genres)

# Export Dataframe
df_genres.to_csv('data.csv')


Unnamed: 0_level_0,endDatetime,endDate,endTime,msPlayed,minutesPlayed,turns,trackName,artistName,genres
indexes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2021-01-18 20:41:00,2021-01-18,20:41:00,18610,0.31,Noite,Come A Little Bit Closer,Jay & The Americans,Pop Rock
1,2021-01-21 01:58:00,2021-01-21,01:58:00,292853,4.88,Madrugada,Touch,Cigarettes After Sex,Dream Pop
2,2021-01-21 02:03:00,2021-01-21,02:03:00,285147,4.75,Madrugada,Hentai,Cigarettes After Sex,Dream Pop
3,2021-01-21 02:07:00,2021-01-21,02:07:00,256800,4.28,Madrugada,Cry,Cigarettes After Sex,Dream Pop
4,2021-01-21 02:12:00,2021-01-21,02:12:00,245840,4.10,Madrugada,Falling In Love,Cigarettes After Sex,Dream Pop
...,...,...,...,...,...,...,...,...,...
14264,2021-12-31 22:55:00,2021-12-31,22:55:00,294050,4.90,Noite,Mantra (Feat. Emicida),Rubel,Mpb
14265,2021-12-31 22:56:00,2021-12-31,22:56:00,73215,1.22,Noite,Passagem,Rubel,Mpb
14266,2021-12-31 23:01:00,2021-12-31,23:01:00,295994,4.93,Noite,Explodir,Rubel,Mpb
14267,2021-12-31 23:05:00,2021-12-31,23:05:00,257295,4.29,Noite,Sapato,Rubel,Mpb
