# Anotações de Estudo sobre a biblioteca Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
#series são uma lista de valores no qual o pandas criará automaticamente um indice 
pd.Series([1,2,3,np.nan,6,8])

0    1.0
1    2.0
2    3.0
3    NaN
4    6.0
5    8.0
dtype: float64

np.random.randn(n_linhas,n_colunas,index=list_index,columns=list())

In [3]:
# dataframes são estruturas multidimensionais com linhas e colunas como tabelas de excel, bancos de dados etc,
# no exemplo abaixo passamos um array criado pela biblioteca numpy que tem a data como indice usando a função date_range()
dates = pd.date_range("20240101",periods=6)
dates

# em seguida criamos um datafram que tem a variavel dates como indice, uma lista de A a D como colunas e os dados serão gerados utilizando a função random.randn(numero_linhas,numero_colunas)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2024-01-01,0.75904,-0.834354,0.444715,0.634345
2024-01-02,-1.299071,1.759429,1.9934,-0.692261
2024-01-03,-2.06106,-1.084371,2.016134,0.946107
2024-01-04,-0.601836,-0.442779,-0.481264,-1.032713
2024-01-05,-0.41121,-1.403413,-0.083258,-0.025262
2024-01-06,-0.246921,1.074901,0.438909,0.100279


In [4]:
#criando um dataframe passando um dicionário de objetos quando a chave é a coluna e os valoes são os valores das colunas
df2 = pd.DataFrame({
    "A":1.0,
    "B":pd.Timestamp("20240102"),
    "C":pd.Series(1,index=list(range(4)),dtype="float32"),
    "D":np.array([3] * 4,dtype="int32"),
    "E": pd.Categorical(["test","train","item 5","train"]),
    "F":"foo",
})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2024-01-02,1.0,3,test,foo
1,1.0,2024-01-02,1.0,3,train,foo
2,1.0,2024-01-02,1.0,3,item 5,foo
3,1.0,2024-01-02,1.0,3,train,foo


In [5]:
#as colunas tem diferentes tipos ou "dtypes"
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [6]:
#visualização de dados
#utilize DataFrame.head() and DataFrame.tail() para vwe o topo e a base das linhas do respectivo Dataframe
df.tail(3)

Unnamed: 0,A,B,C,D
2024-01-04,-0.601836,-0.442779,-0.481264,-1.032713
2024-01-05,-0.41121,-1.403413,-0.083258,-0.025262
2024-01-06,-0.246921,1.074901,0.438909,0.100279


In [7]:
#DataFrame.index mostra o índice e DataFrame.columns mostra o conteúdo das colunas

df.index

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [9]:
df.to_numpy()

array([[ 0.75904037, -0.83435383,  0.44471547,  0.63434537],
       [-1.29907054,  1.75942946,  1.99340031, -0.69226082],
       [-2.06106005, -1.0843709 ,  2.01613448,  0.9461068 ],
       [-0.60183595, -0.44277908, -0.48126442, -1.03271283],
       [-0.41120971, -1.40341262, -0.08325832, -0.0252618 ],
       [-0.24692056,  1.07490133,  0.43890851,  0.10027876]])

In [10]:
#Arrays NumPy tem on dtypes pra cada array inteiro enquanto Pandas DataFrames tem um dtypes por coluna 
#Quando chamamos DataFrame.to_numpy(), pandas vai procurar o dtype que mais se adequa a todos os dtypes do DataFrame, O mais comum é o type 'objetc'
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [11]:
df2.to_numpy()

array([[1.0, Timestamp('2024-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2024-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2024-01-02 00:00:00'), 1.0, 3, 'item 5', 'foo'],
       [1.0, Timestamp('2024-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [12]:
#DataFrame.describe() mostra um sumário estatístico rápido dos dados
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.643509,-0.155098,0.721439,-0.011584
std,0.96118,1.276268,1.053048,0.755397
min,-2.06106,-1.403413,-0.481264,-1.032713
25%,-1.124762,-1.021867,0.047283,-0.525511
50%,-0.506523,-0.638566,0.441812,0.037508
75%,-0.287993,0.695481,1.606229,0.500829
max,0.75904,1.759429,2.016134,0.946107


In [13]:
#DataFrame.T realiza a transposição da matriz de dados, as colunas viram linhas e vice versa
df.T

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05,2024-01-06
A,0.75904,-1.299071,-2.06106,-0.601836,-0.41121,-0.246921
B,-0.834354,1.759429,-1.084371,-0.442779,-1.403413,1.074901
C,0.444715,1.9934,2.016134,-0.481264,-0.083258,0.438909
D,0.634345,-0.692261,0.946107,-1.032713,-0.025262,0.100279


In [14]:
#DataFrame.sort_index() organiza pelo índice
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2024-01-01,0.634345,0.444715,-0.834354,0.75904
2024-01-02,-0.692261,1.9934,1.759429,-1.299071
2024-01-03,0.946107,2.016134,-1.084371,-2.06106
2024-01-04,-1.032713,-0.481264,-0.442779,-0.601836
2024-01-05,-0.025262,-0.083258,-1.403413,-0.41121
2024-01-06,0.100279,0.438909,1.074901,-0.246921


In [15]:
#organiza a tabela por um valor específico
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2024-01-05,-0.41121,-1.403413,-0.083258,-0.025262
2024-01-03,-2.06106,-1.084371,2.016134,0.946107
2024-01-01,0.75904,-0.834354,0.444715,0.634345
2024-01-04,-0.601836,-0.442779,-0.481264,-1.032713
2024-01-06,-0.246921,1.074901,0.438909,0.100279
2024-01-02,-1.299071,1.759429,1.9934,-0.692261


In [16]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.75904,-0.834354,0.444715,0.634345
2024-01-02,-1.299071,1.759429,1.9934,-0.692261
2024-01-03,-2.06106,-1.084371,2.016134,0.946107
2024-01-04,-0.601836,-0.442779,-0.481264,-1.032713
2024-01-05,-0.41121,-1.403413,-0.083258,-0.025262
2024-01-06,-0.246921,1.074901,0.438909,0.100279


In [17]:
#seleção
#Getitem([]) seleciona apenas a coluna correspondente equivalente a df
df["B"]

2024-01-01   -0.834354
2024-01-02    1.759429
2024-01-03   -1.084371
2024-01-04   -0.442779
2024-01-05   -1.403413
2024-01-06    1.074901
Freq: D, Name: B, dtype: float64

In [18]:
#seleciona um intervalo de linhas
df[0:3]

Unnamed: 0,A,B,C,D
2024-01-01,0.75904,-0.834354,0.444715,0.634345
2024-01-02,-1.299071,1.759429,1.9934,-0.692261
2024-01-03,-2.06106,-1.084371,2.016134,0.946107


In [19]:
df["20240101":"20240103"]

Unnamed: 0,A,B,C,D
2024-01-01,0.75904,-0.834354,0.444715,0.634345
2024-01-02,-1.299071,1.759429,1.9934,-0.692261
2024-01-03,-2.06106,-1.084371,2.016134,0.946107


In [20]:
#seleção por label 
#DataFrame.loc(linhas, colunas)
df.loc[dates[0]]

A    0.759040
B   -0.834354
C    0.444715
D    0.634345
Name: 2024-01-01 00:00:00, dtype: float64

In [21]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2024-01-01,0.75904,-0.834354
2024-01-02,-1.299071,1.759429
2024-01-03,-2.06106,-1.084371
2024-01-04,-0.601836,-0.442779
2024-01-05,-0.41121,-1.403413
2024-01-06,-0.246921,1.074901


In [22]:
df.iloc[3]

A   -0.601836
B   -0.442779
C   -0.481264
D   -1.032713
Name: 2024-01-04 00:00:00, dtype: float64

In [23]:
df.iloc[3:5]

Unnamed: 0,A,B,C,D
2024-01-04,-0.601836,-0.442779,-0.481264,-1.032713
2024-01-05,-0.41121,-1.403413,-0.083258,-0.025262


In [24]:
#usando funções definidas
#DataFrame.agg()
df.agg(lambda x: np.mean(x) * 5.6)

A   -3.603653
B   -0.868547
C    4.040060
D   -0.064871
dtype: float64

In [25]:
df.transform(lambda x: x * 101.2)

Unnamed: 0,A,B,C,D
2024-01-01,76.814886,-84.436607,45.005206,64.195751
2024-01-02,-131.465939,178.054261,201.732111,-70.056795
2024-01-03,-208.579277,-109.738335,204.032809,95.746008
2024-01-04,-60.905798,-44.809243,-48.703959,-104.510539
2024-01-05,-41.614423,-142.025358,-8.425742,-2.556494
2024-01-06,-24.988361,108.780015,44.417542,10.14821


In [26]:
#gerando uma lista de valores com np.random.randint entre 0 e 7 com valor 10 de tamanho
s = pd.Series(np.random.randint(0,7,size=10))
s

0    6
1    5
2    0
3    6
4    5
5    3
6    5
7    4
8    2
9    4
dtype: int32

In [27]:
s.value_counts()
#DataFrame.value_counts() conta quantos valores existem de cada item do índice e organiza da seguinte maneira, 'nome':'quantidade'

5    3
6    2
4    2
0    1
3    1
2    1
Name: count, dtype: int64

In [28]:
#configurando uma nova coluna automaticamente alinhando pelo índice
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range("20230201",periods=6))

In [29]:
df["F"] = s1
#adicionando um valor em uma linha e coluna específica de uma serie ou lista
df.at[dates[0],"A"] = 4
#adicionando um valor
df.iat[0,1] = 0
#adicionando um NumPy Array
df.loc[:, "D"] = np.array([5] * len(df))

df
#obs: alterações são "permanentes" na execução do código até o kernel ser reiniciado

Unnamed: 0,A,B,C,D,F
2024-01-01,4.0,0.0,0.444715,5.0,
2024-01-02,-1.299071,1.759429,1.9934,5.0,
2024-01-03,-2.06106,-1.084371,2.016134,5.0,
2024-01-04,-0.601836,-0.442779,-0.481264,5.0,
2024-01-05,-0.41121,-1.403413,-0.083258,5.0,
2024-01-06,-0.246921,1.074901,0.438909,5.0,


In [30]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2024-01-01,-4.0,0.0,-0.444715,-5.0,
2024-01-02,-1.299071,-1.759429,-1.9934,-5.0,
2024-01-03,-2.06106,-1.084371,-2.016134,-5.0,
2024-01-04,-0.601836,-0.442779,-0.481264,-5.0,
2024-01-05,-0.41121,-1.403413,-0.083258,-5.0,
2024-01-06,-0.246921,-1.074901,-0.438909,-5.0,


In [31]:
#tratamento de dados faltantes
#em NumPy np.nan representa um dado faltando, e por padrão não é incluido nos calculos

df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1],"E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2024-01-01,4.0,0.0,0.444715,5.0,,1.0
2024-01-02,-1.299071,1.759429,1.9934,5.0,,1.0
2024-01-03,-2.06106,-1.084371,2.016134,5.0,,
2024-01-04,-0.601836,-0.442779,-0.481264,5.0,,


In [32]:
#DataFrame.fillna(value=5)

#pd.set_option('future.no_silent_downcasting', True)
#o codigo acima evita que seja mostrado o codigo e deprecated em fillna()
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2024-01-01,4.0,0.0,0.444715,5.0,5.0,1.0
2024-01-02,-1.299071,1.759429,1.9934,5.0,5.0,1.0
2024-01-03,-2.06106,-1.084371,2.016134,5.0,5.0,5.0
2024-01-04,-0.601836,-0.442779,-0.481264,5.0,5.0,5.0


In [33]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2024-01-01,False,False,False,False,True,False
2024-01-02,False,False,False,False,True,False
2024-01-03,False,False,False,False,True,True
2024-01-04,False,False,False,False,True,True


In [34]:
#operações 
#DataFrame.mean() exibe a média de cada coluna

df.mean()

A   -0.103349
B   -0.016039
C    0.721439
D    5.000000
F         NaN
dtype: float64

In [35]:
#calcula a media de cada linha
df.mean(axis=1)

2024-01-01    2.361179
2024-01-02    1.863440
2024-01-03    0.967676
2024-01-04    0.868530
2024-01-05    0.775530
2024-01-06    1.566722
Freq: D, dtype: float64

In [36]:
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
s

2024-01-01    NaN
2024-01-02    NaN
2024-01-03    1.0
2024-01-04    3.0
2024-01-05    5.0
2024-01-06    NaN
Freq: D, dtype: float64

In [37]:
df.sub(s,axis='index')

Unnamed: 0,A,B,C,D,F
2024-01-01,,,,,
2024-01-02,,,,,
2024-01-03,-3.06106,-2.084371,1.016134,4.0,
2024-01-04,-3.601836,-3.442779,-3.481264,2.0,
2024-01-05,-5.41121,-6.403413,-5.083258,0.0,
2024-01-06,,,,,
