## Análise de dados com Pandas: Reshaping de dados e Pivoting

### Victor Hugo Negrisoli - Desenvolvedor de Software Full-Stack & Analista de Dados

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

#### Reshaping de dados

In [2]:
# Seed NumPy's global RNG so the random values drawn below (and by any later
# np.random call in the notebook) are the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Six consecutive daily timestamps starting on 2020-01-01; used as the row index.
datas = pd.date_range('20200101', periods = 6)

# 6 x 4 frame of standard-normal samples: one row per date, columns A-D.
df_datas = pd.DataFrame(np.random.randn(6, 4), index = datas, columns = ['A', 'B', 'C', 'D'])

df_datas

Unnamed: 0,A,B,C,D
2020-01-01,1.566057,-0.295134,1.25512,0.615924
2020-01-02,0.109825,0.933261,-1.0272,-0.340283
2020-01-03,-0.177423,-0.924933,-0.3355,0.150686
2020-01-04,-0.556527,0.105546,2.034085,0.44583
2020-01-05,0.171412,0.20376,0.143053,-0.557302
2020-01-06,0.35481,0.255871,-0.557364,-1.45262


In [3]:
# Dimensions of the frame as (rows, columns).
df_datas.shape

(6, 4)

In [4]:
# Transpose the frame so the dates become the column labels and A-D become
# the row index.
df_datas_transposto = df_datas.transpose()

df_datas_transposto

Unnamed: 0,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06
A,1.566057,0.109825,-0.177423,-0.556527,0.171412,0.35481
B,-0.295134,0.933261,-0.924933,0.105546,0.20376,0.255871
C,1.25512,-1.0272,-0.3355,2.034085,0.143053,-0.557364
D,0.615924,-0.340283,0.150686,0.44583,-0.557302,-1.45262


In [5]:
# After transposing, rows and columns are swapped relative to df_datas.
df_datas_transposto.shape

(4, 6)

In [7]:
# Extract the underlying NumPy array (labels are dropped, only the numbers remain).
valores = df_datas_transposto.to_numpy()

valores

array([[ 1.56605694,  0.1098247 , -0.17742342, -0.55652733,  0.17141196,
         0.35481001],
       [-0.29513441,  0.93326092, -0.9249327 ,  0.10554582,  0.20375976,
         0.25587143],
       [ 1.25512   , -1.02719965, -0.33549956,  2.03408451,  0.14305324,
        -0.55736369],
       [ 0.61592358, -0.34028268,  0.15068638,  0.44582996, -0.55730219,
        -1.45262027]])

In [11]:
# Reshape the NumPy array `valores`: the same 24 numbers are now laid out as
# 2 rows by 12 columns. The result is only displayed, not stored.
np.reshape(valores, (2, 12))

array([[ 1.56605694,  0.1098247 , -0.17742342, -0.55652733,  0.17141196,
         0.35481001, -0.29513441,  0.93326092, -0.9249327 ,  0.10554582,
         0.20375976,  0.25587143],
       [ 1.25512   , -1.02719965, -0.33549956,  2.03408451,  0.14305324,
        -0.55736369,  0.61592358, -0.34028268,  0.15068638,  0.44582996,
        -0.55730219, -1.45262027]])

#### Pivoteamento de dados com a função pivot()

In [12]:
# Twelve consecutive calendar days starting on 2019-01-01 (daily frequency).
dias = pd.date_range(start = '20190101', periods = 12, freq = 'D')

dias

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
               '2019-01-09', '2019-01-10', '2019-01-11', '2019-01-12'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# Pool of names that the random draws in the following cells pick from.
pessoas = ['George', 'Victor', 'Lucas']
pessoas

['George', 'Victor', 'Lucas']

In [34]:
# Draw a single name at random from `pessoas` (uses NumPy's global RNG).
np.random.choice(pessoas)

In [50]:
# Draw 12 (person, expense) pairs. Within each pair the name is drawn first and
# the amount second, so the sequence of RNG calls alternates choice/rand.
# The amount is a uniform value in [0, 100) rounded to 2 decimal places.
sorteios = [
    (np.random.choice(pessoas), np.round(np.random.rand() * 100, 2))
    for _ in range(12)
]

# Split the pairs into the two parallel lists used by the following cells.
nome = [pessoa for pessoa, _ in sorteios]
gasto = [valor for _, valor in sorteios]

In [51]:
# Inspect the 12 randomly drawn names.
nome

['George',
 'George',
 'Lucas',
 'George',
 'Victor',
 'Lucas',
 'Victor',
 'George',
 'Lucas',
 'Lucas',
 'George',
 'George']

In [52]:
# Inspect the 12 randomly drawn expense amounts.
gasto

[33.35,
 30.29,
 93.56,
 96.48,
 42.84,
 37.07,
 96.96,
 88.57,
 47.45,
 80.13,
 57.53,
 6.97]

In [55]:
# Long-format table: one record per day with the date, the person and the
# amount spent. The three inputs must all have the same length.
colunas_gastos = {
    'Data': dias,
    'Nome': nome,
    'Gasto': gasto,
}
df_nomes_gastos = pd.DataFrame(colunas_gastos)

df_nomes_gastos

Unnamed: 0,Data,Nome,Gasto
0,2019-01-01,George,33.35
1,2019-01-02,George,30.29
2,2019-01-03,Lucas,93.56
3,2019-01-04,George,96.48
4,2019-01-05,Victor,42.84
5,2019-01-06,Lucas,37.07
6,2019-01-07,Victor,96.96
7,2019-01-08,George,88.57
8,2019-01-09,Lucas,47.45
9,2019-01-10,Lucas,80.13


In [58]:
# Pivot with DataFrame.pivot(): one row per date, one column per person, and
# the expense as the cell value. Date/person combinations that have no record
# show up as NaN.
gastos_por_pessoa = df_nomes_gastos.pivot(
    index = 'Data',
    columns = 'Nome',
    values = 'Gasto',
)

gastos_por_pessoa

Nome,George,Lucas,Victor
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,33.35,,
2019-01-02,30.29,,
2019-01-03,,93.56,
2019-01-04,96.48,,
2019-01-05,,,42.84
2019-01-06,,37.07,
2019-01-07,,,96.96
2019-01-08,88.57,,
2019-01-09,,47.45,
2019-01-10,,80.13,


#### Criando uma tabela pivô com a função pivot_table()

In [66]:
# Five car-sale records that all share one date, with some salespeople
# appearing more than once.
vendedores = ['George', 'Vagner', 'Pedro', 'Vagner', 'George']
qtd_carro_vendida = [7, 4, 3, 2, 8]

# start == end with periods = 5 yields five identical 2019-01-01 timestamps.
# NOTE(review): this rebinds `dias`, which an earlier cell set to a 12-day
# range; re-running that earlier DataFrame cell after this one would see 5 dates.
dias = pd.date_range(start = '20190101', end = '20190101', periods = 5)

colunas_venda = {
    'Qtd Vendas Carros': qtd_carro_vendida,
    'Data': dias,
    'Vendedor': vendedores,
}
df_venda_carros = pd.DataFrame(colunas_venda)

df_venda_carros

Unnamed: 0,Qtd Vendas Carros,Data,Vendedor
0,7,2019-01-01,George
1,4,2019-01-01,Vagner
2,3,2019-01-01,Pedro
3,2,2019-01-01,Vagner
4,8,2019-01-01,George


In [70]:
"""
Build the pivot table.

Note: pivot() alone would not work here, because pivot() does not accept
repeated entries and this frame repeats the same 'Data' value on every row
(with some salespeople repeated as well). pivot_table() aggregates the
duplicates instead.
"""

# 'mean' is pivot_table's default aggregation; it is spelled out here so this
# cell reads in parallel with the sum / max / min / std variants.
df_venda_carros.pivot_table(
    index = 'Data',
    columns = 'Vendedor',
    values = 'Qtd Vendas Carros',
    aggfunc = 'mean',
)

Vendedor,George,Pedro,Vagner
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,7.5,3.0,3.0


In [71]:
# Same pivot table, but aggregating duplicates with the sum: total cars sold
# per salesperson on each date.
df_venda_carros.pivot_table(
    index = 'Data',
    columns = 'Vendedor',
    values = 'Qtd Vendas Carros',
    aggfunc = 'sum',
)

Vendedor,George,Pedro,Vagner
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,15,3,6


In [72]:
# Same pivot table, but keeping the largest value among the duplicates:
# each salesperson's best single sale count per date.
df_venda_carros.pivot_table(
    index = 'Data',
    columns = 'Vendedor',
    values = 'Qtd Vendas Carros',
    aggfunc = 'max',
)

Vendedor,George,Pedro,Vagner
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,8,3,4


In [73]:
# Same pivot table, but keeping the smallest value among the duplicates:
# each salesperson's lowest single sale count per date.
df_venda_carros.pivot_table(
    index = 'Data',
    columns = 'Vendedor',
    values = 'Qtd Vendas Carros',
    aggfunc = 'min',
)

Vendedor,George,Pedro,Vagner
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,7,3,2


In [74]:
# Same pivot table, but aggregating duplicates with the standard deviation.
# A salesperson with a single record has no defined (sample) deviation, so that
# column comes out all-NaN and is left out of the displayed result.
df_venda_carros.pivot_table(
    index = 'Data',
    columns = 'Vendedor',
    values = 'Qtd Vendas Carros',
    aggfunc = 'std',
)

Vendedor,George,Vagner
Data,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,0.707107,1.414214
