In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as sts

In [2]:
# Load the per-invoice sales CSV (comma-delimited, the read_csv default)
vendas = pd.read_csv('Dados/vendas-por-fatura.csv')
# Preview the first ten rows
vendas.head(10)

Unnamed: 0,N° da fatura,Data da fatura,ID Cliente,País,Quantidade,Valor
0,548370,3/30/2021 16:14:00,15528.0,United Kingdom,123,22933
1,575767,11/11/2021 11:11:00,17348.0,United Kingdom,163,20973
2,C570727,10/12/2021 11:32:00,12471.0,Germany,-1,-145
3,549106,4/6/2021 12:08:00,17045.0,United Kingdom,1,3995
4,573112,10/27/2021 15:33:00,16416.0,United Kingdom,357,34483
5,576630,11/16/2021 8:38:00,13816.0,Germany,91,19998
6,538125,12/9/2020 15:46:00,18225.0,United Kingdom,16,3000
7,544354,2/18/2021 10:42:00,13489.0,United Kingdom,64,7728
8,546369,3/11/2021 11:41:00,15513.0,United Kingdom,10,6750
9,570651,10/11/2021 13:34:00,14911.0,EIRE,86,32135


In [3]:
# Summarise the non-numeric columns (count, unique, top, freq)
vendas.describe(exclude='number')

Unnamed: 0,N° da fatura,Data da fatura,País,Valor
count,25953,25953,25953,25953
unique,25900,23260,38,17540
top,550333,5/10/2021 15:05:00,United Kingdom,0
freq,2,6,23542,2105


In [4]:
# Check the frame's shape as (rows, columns)
vendas.shape

(25953, 6)

In [5]:
# Check each column's dtype and non-null count
vendas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25953 entries, 0 to 25952
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   N° da fatura    25953 non-null  object 
 1   Data da fatura  25953 non-null  object 
 2   ID Cliente      22229 non-null  float64
 3   País            25953 non-null  object 
 4   Quantidade      25953 non-null  int64  
 5   Valor           25953 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.2+ MB


In [6]:
# Count missing values per column
vendas.isna().sum()

N° da fatura         0
Data da fatura       0
ID Cliente        3724
País                 0
Quantidade           0
Valor                0
dtype: int64

In [7]:
# Inspect a sample of the rows whose 'ID Cliente' is missing
linhas_nulas = vendas.loc[vendas['ID Cliente'].isnull()]

# A fixed seed makes Restart & Run All show the same rows every time, and the
# min() cap avoids a ValueError if fewer than 20 rows are missing an ID.
# Left as the cell's last expression so the frame renders as a rich table.
linhas_nulas.sample(min(20, len(linhas_nulas)), random_state=42)

      N° da fatura       Data da fatura  ID Cliente            País  \
1433        558904    7/4/2021 16:18:00         NaN  United Kingdom   
9385        547329   3/22/2021 11:38:00         NaN  United Kingdom   
11235       537002   12/3/2020 15:33:00         NaN  United Kingdom   
8942        551579    5/3/2021 11:20:00         NaN  United Kingdom   
6491        573137  10/27/2021 17:15:00         NaN  United Kingdom   
16216       559378    7/8/2021 10:40:00         NaN  United Kingdom   
7603        573979   11/2/2021 11:02:00         NaN  United Kingdom   
2899        560218   7/15/2021 15:12:00         NaN  United Kingdom   
23911       577947  11/22/2021 12:08:00         NaN  United Kingdom   
11669       561271   7/26/2021 12:18:00         NaN  United Kingdom   
10440      C571501  10/17/2021 15:14:00         NaN  United Kingdom   
2888        539750  12/21/2020 15:40:00         NaN  United Kingdom   
14421      C557664   6/21/2021 18:00:00         NaN            EIRE   
19701 

In [8]:
# Convert 'Valor' from text that uses a comma as decimal separator to float64.
# The dtype guard keeps this cell idempotent: once the column is numeric, a
# second run would otherwise fail because .str only works on string values.
if not pd.api.types.is_numeric_dtype(vendas['Valor']):
    vendas['Valor'] = (
        vendas['Valor']
        .str.replace(',', '.', regex=False)  # literal swap, not a regex pattern
        .astype(float)
    )

# Spot-check a few rows to confirm the decimal separator changed
# (seeded so the displayed rows are reproducible)
vendas.sample(5, random_state=42)

Unnamed: 0,N° da fatura,Data da fatura,ID Cliente,País,Quantidade,Valor
12195,580771,12/6/2021 10:58:00,16015.0,United Kingdom,276,203.1
10130,571069,10/13/2021 15:15:00,13668.0,United Kingdom,97,188.94
4957,543993,2/15/2021 10:40:00,17612.0,United Kingdom,178,305.46
12738,C542793,2/1/2021 10:52:00,14113.0,United Kingdom,-78,-331.5
15062,571204,10/14/2021 12:34:00,17677.0,United Kingdom,6,59.7


In [9]:
# Confirm that 'Valor' was converted to the expected dtype
vendas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25953 entries, 0 to 25952
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   N° da fatura    25953 non-null  object 
 1   Data da fatura  25953 non-null  object 
 2   ID Cliente      22229 non-null  float64
 3   País            25953 non-null  object 
 4   Quantidade      25953 non-null  int64  
 5   Valor           25953 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 1.2+ MB


In [10]:
# Keep only the rows that have no customer ID, then print the frame
valores_em_branco_ID_Cliente = vendas.loc[vendas['ID Cliente'].isna()]
print(valores_em_branco_ID_Cliente)

      N° da fatura       Data da fatura  ID Cliente            País  \
11          539736  12/21/2020 15:18:00         NaN  United Kingdom   
12          537823   12/8/2020 14:25:00         NaN  United Kingdom   
16          542418   1/27/2021 17:39:00         NaN  United Kingdom   
21          550759   4/20/2021 12:09:00         NaN  United Kingdom   
23          556427   6/10/2021 13:23:00         NaN  United Kingdom   
...            ...                  ...         ...             ...   
25930       539300  12/16/2020 17:31:00         NaN  United Kingdom   
25932       565303    9/2/2021 12:17:00         NaN     Unspecified   
25937       543533    2/9/2021 13:00:00         NaN  United Kingdom   
25939      C572450  10/24/2021 12:35:00         NaN  United Kingdom   
25952       537999   12/9/2020 11:44:00         NaN  United Kingdom   

       Quantidade    Valor  
11            143  1172.46  
12           1484  6073.71  
16            -17     0.00  
21             -7     0.00  
23

In [11]:
# Show the first five rows again as a reference point
vendas.head()

Unnamed: 0,N° da fatura,Data da fatura,ID Cliente,País,Quantidade,Valor
0,548370,3/30/2021 16:14:00,15528.0,United Kingdom,123,229.33
1,575767,11/11/2021 11:11:00,17348.0,United Kingdom,163,209.73
2,C570727,10/12/2021 11:32:00,12471.0,Germany,-1,-1.45
3,549106,4/6/2021 12:08:00,17045.0,United Kingdom,1,39.95
4,573112,10/27/2021 15:33:00,16416.0,United Kingdom,357,344.83


In [12]:
# Parse 'Data da fatura' into datetime64 using the explicit month/day/year format
invoice_ts = pd.to_datetime(
    vendas['Data da fatura'],
    format='%m/%d/%Y %H:%M:%S',
)
vendas['Data da fatura'] = invoice_ts

# Derive separate calendar-date and time-of-day columns from the parsed values
vendas['Data'] = invoice_ts.dt.date
vendas['Horario'] = invoice_ts.dt.time

# Print the original timestamp next to the two derived columns
print(vendas.loc[:, ['Data da fatura', 'Data', 'Horario']])

           Data da fatura        Data   Horario
0     2021-03-30 16:14:00  2021-03-30  16:14:00
1     2021-11-11 11:11:00  2021-11-11  11:11:00
2     2021-10-12 11:32:00  2021-10-12  11:32:00
3     2021-04-06 12:08:00  2021-04-06  12:08:00
4     2021-10-27 15:33:00  2021-10-27  15:33:00
...                   ...         ...       ...
25948 2021-06-13 10:53:00  2021-06-13  10:53:00
25949 2021-05-13 10:44:00  2021-05-13  10:44:00
25950 2021-11-17 13:58:00  2021-11-17  13:58:00
25951 2021-04-14 12:39:00  2021-04-14  12:39:00
25952 2020-12-09 11:44:00  2020-12-09  11:44:00

[25953 rows x 3 columns]


In [13]:
# Re-check the column dtypes after the changes
vendas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25953 entries, 0 to 25952
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   N° da fatura    25953 non-null  object        
 1   Data da fatura  25953 non-null  datetime64[ns]
 2   ID Cliente      22229 non-null  float64       
 3   País            25953 non-null  object        
 4   Quantidade      25953 non-null  int64         
 5   Valor           25953 non-null  float64       
 6   Data            25953 non-null  object        
 7   Horario         25953 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 1.6+ MB


In [14]:
# Show the first five rows again to see how the DataFrame looks now
vendas.head()

Unnamed: 0,N° da fatura,Data da fatura,ID Cliente,País,Quantidade,Valor,Data,Horario
0,548370,2021-03-30 16:14:00,15528.0,United Kingdom,123,229.33,2021-03-30,16:14:00
1,575767,2021-11-11 11:11:00,17348.0,United Kingdom,163,209.73,2021-11-11,11:11:00
2,C570727,2021-10-12 11:32:00,12471.0,Germany,-1,-1.45,2021-10-12,11:32:00
3,549106,2021-04-06 12:08:00,17045.0,United Kingdom,1,39.95,2021-04-06,12:08:00
4,573112,2021-10-27 15:33:00,16416.0,United Kingdom,357,344.83,2021-10-27,15:33:00
