## Sequência de estudo de Python
### Seguindo o tutorial de Pandas
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

##### Pandas em 10 minutos

In [1]:
import numpy as np
import pandas as pd

criando uma série

In [3]:
# Build a Series from a plain list; pandas assigns a default 0..5 integer index.
# The single NaN makes the whole Series float64 (see the dtype in the output).
s = pd.Series(
    data=[1, 3, 5, np.nan, 5, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    5.0
5    8.0
dtype: float64

Criando um DataFrame passando uma matriz NumPy, com um índice datetime e colunas rotuladas:

In [6]:
# Six consecutive daily timestamps starting at 2021-01-01; used as the row index of df.
datas = pd.date_range('20210101', periods=6)
display(datas)
# Seeded generator: without a fixed seed every kernel restart produces different
# numbers, so none of the outputs derived from df could be reproduced.
rng = np.random.default_rng(42)
# 6x4 frame of standard-normal samples, indexed by date, with columns A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)), index=datas, columns=list("ABCD"))
df

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

Unnamed: 0,A,B,C,D
2021-01-01,1.12658,0.330994,1.178763,-0.378324
2021-01-02,0.169089,-1.887792,-0.61724,0.337129
2021-01-03,-0.665805,0.897332,0.960721,-0.505955
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979


Criando um DataFrame, passando um dicionário de objetos que podem ser convertidos em séries.

In [11]:
# Build a frame from a mapping of column name -> column content.
# Scalars (A, B, F) are repeated on every row; the length-4 columns (C, D, E)
# give the frame its four rows. Each column keeps its own dtype.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20210101'),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['teste', 'treino'] * 2),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-01-01,1.0,3,teste,foo
1,1.0,2021-01-01,1.0,3,treino,foo
2,1.0,2021-01-01,1.0,3,teste,foo
3,1.0,2021-01-01,1.0,3,treino,foo


tipos de dados

In [15]:
# One dtype per column, returned as a Series keyed by column name.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

Visualizando dados

In [16]:
# First rows of the frame; with no argument the output shows five rows.
df.head()

Unnamed: 0,A,B,C,D
2021-01-01,1.12658,0.330994,1.178763,-0.378324
2021-01-02,0.169089,-1.887792,-0.61724,0.337129
2021-01-03,-0.665805,0.897332,0.960721,-0.505955
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842


In [19]:
# Last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979


Exibindo índices e colunas

In [20]:
# Row labels: the daily DatetimeIndex the frame was built with.
df.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Column labels as an Index object.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

Converter para NumPy

In [22]:
# Underlying values as a 2-D NumPy array; index and column labels are dropped.
df.to_numpy()

array([[ 1.1265797 ,  0.33099412,  1.17876309, -0.37832381],
       [ 0.16908872, -1.88779185, -0.61724011,  0.33712931],
       [-0.66580475,  0.8973324 ,  0.96072085, -0.50595489],
       [-0.77556043, -2.06927252, -1.4681335 ,  0.24970196],
       [-0.15427275,  0.70828723, -0.06340674, -0.66684165],
       [-0.25205635, -1.11545893, -0.92481033, -0.7639794 ]])

In [23]:
# df2 mixes floats, timestamps, categories and strings, so the resulting
# array falls back to dtype=object (see the output).
df2.to_numpy()

array([[1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'teste', 'foo'],
       [1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'treino', 'foo'],
       [1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'teste', 'foo'],
       [1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'treino', 'foo']],
      dtype=object)

Criando um resumo

In [24]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.092004,-0.522652,-0.155684,-0.288045
std,0.689798,1.331716,1.054774,0.470245
min,-0.77556,-2.069273,-1.468133,-0.763979
25%,-0.562368,-1.694709,-0.847918,-0.62662
50%,-0.203165,-0.392232,-0.340323,-0.442139
75%,0.088248,0.613964,0.704689,0.092696
max,1.12658,0.897332,1.178763,0.337129


Transpondo

In [25]:
# Transpose: columns A-D become rows and the dates become columns.
df.T

Unnamed: 0,2021-01-01,2021-01-02,2021-01-03,2021-01-04,2021-01-05,2021-01-06
A,1.12658,0.169089,-0.665805,-0.77556,-0.154273,-0.252056
B,0.330994,-1.887792,0.897332,-2.069273,0.708287,-1.115459
C,1.178763,-0.61724,0.960721,-1.468133,-0.063407,-0.92481
D,-0.378324,0.337129,-0.505955,0.249702,-0.666842,-0.763979


In [28]:
# Sort along axis=1, i.e. by column label, in descending order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2021-01-01,-0.378324,1.178763,0.330994,1.12658
2021-01-02,0.337129,-0.61724,-1.887792,0.169089
2021-01-03,-0.505955,0.960721,0.897332,-0.665805
2021-01-04,0.249702,-1.468133,-2.069273,-0.77556
2021-01-05,-0.666842,-0.063407,0.708287,-0.154273
2021-01-06,-0.763979,-0.92481,-1.115459,-0.252056


In [29]:
# Sort along axis=0, i.e. by row label, in descending order (latest date first).
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702
2021-01-03,-0.665805,0.897332,0.960721,-0.505955
2021-01-02,0.169089,-1.887792,-0.61724,0.337129
2021-01-01,1.12658,0.330994,1.178763,-0.378324


Classificar por valor

In [31]:
# Reorder rows by the values in column B, smallest first.
df.sort_values(by=['B'])

Unnamed: 0,A,B,C,D
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702
2021-01-02,0.169089,-1.887792,-0.61724,0.337129
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979
2021-01-01,1.12658,0.330994,1.178763,-0.378324
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842
2021-01-03,-0.665805,0.897332,0.960721,-0.505955


#### Seleções
Selecionar uma única coluna

In [32]:
# Attribute-style column access; yields the same Series as df['A'] in the next cell.
df.A

2021-01-01    1.126580
2021-01-02    0.169089
2021-01-03   -0.665805
2021-01-04   -0.775560
2021-01-05   -0.154273
2021-01-06   -0.252056
Freq: D, Name: A, dtype: float64

In [33]:
# Bracket access with a column label returns that column as a Series.
df['A']

2021-01-01    1.126580
2021-01-02    0.169089
2021-01-03   -0.665805
2021-01-04   -0.775560
2021-01-05   -0.154273
2021-01-06   -0.252056
Freq: D, Name: A, dtype: float64

Slices

In [34]:
# A plain slice inside [] selects rows by position: here the first three.
df[:3]

Unnamed: 0,A,B,C,D
2021-01-01,1.12658,0.330994,1.178763,-0.378324
2021-01-02,0.169089,-1.887792,-0.61724,0.337129
2021-01-03,-0.665805,0.897332,0.960721,-0.505955


In [35]:
# Negative stop works as in Python lists: every row except the last one.
df[:-1]

Unnamed: 0,A,B,C,D
2021-01-01,1.12658,0.330994,1.178763,-0.378324
2021-01-02,0.169089,-1.887792,-0.61724,0.337129
2021-01-03,-0.665805,0.897332,0.960721,-0.505955
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842


### Seleção por rótulo

In [44]:
# A date string is accepted as a row label on the DatetimeIndex;
# a single label returns that row as a Series indexed by column name.
df.loc['2021-01-01']

A    1.126580
B    0.330994
C    1.178763
D   -0.378324
Name: 2021-01-01 00:00:00, dtype: float64

In [47]:
# Same row as the previous cell, addressed with the first Timestamp in `datas`.
df.loc[datas[0]]

A    1.126580
B    0.330994
C    1.178763
D   -0.378324
Name: 2021-01-01 00:00:00, dtype: float64

In [43]:
# All rows (":"), restricted to columns A and B.
df.loc[:,['A', 'B']]

Unnamed: 0,A,B
2021-01-01,1.12658,0.330994
2021-01-02,0.169089,-1.887792
2021-01-03,-0.665805,0.897332
2021-01-04,-0.77556,-2.069273
2021-01-05,-0.154273,0.708287
2021-01-06,-0.252056,-1.115459


Slice de rótulo 

In [50]:
# Label slices include BOTH endpoints: the output contains 2021-01-01 and 2021-01-02.
df.loc['2021-01-01': '2021-01-02',['A', 'C']]

Unnamed: 0,A,C
2021-01-01,1.12658,1.178763
2021-01-02,0.169089,-0.61724


In [51]:
# A single row label plus a list of columns reduces the result to a Series.
df.loc['2021-01-01',['A', 'C']]

A    1.126580
C    1.178763
Name: 2021-01-01 00:00:00, dtype: float64

Retornando o valor escalar

In [56]:
# One row label and one column label yield a single scalar value.
df.loc[datas[0], 'A']

1.1265796988260075

In [57]:
# .at addresses exactly one cell by label; returns the same scalar as the .loc call above.
df.at[datas[0], "A"]

1.1265796988260075

#### Seleção por posição

In [59]:
# Integer position 3 is the fourth row (positions are 0-based), returned as a Series.
df.iloc[3]

A   -0.775560
B   -2.069273
C   -1.468133
D    0.249702
Name: 2021-01-04 00:00:00, dtype: float64

In [60]:
# Positional slices exclude the stop: rows 3-4 and columns 0-1 (A, B).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2021-01-04,-0.77556,-2.069273
2021-01-05,-0.154273,0.708287


In [66]:
# Slicing on both axes keeps the DataFrame type, here a 1x1 frame rather than a scalar.
df.iloc[0:1, 0:1]

Unnamed: 0,A
2021-01-01,1.12658


In [68]:
# First three rows by position, all columns.
df.iloc[0:3, :]

Unnamed: 0,A,B,C,D
2021-01-01,1.12658,0.330994,1.178763,-0.378324
2021-01-02,0.169089,-1.887792,-0.61724,0.337129
2021-01-03,-0.665805,0.897332,0.960721,-0.505955


Fatiando colunas

In [69]:
# All rows, columns at positions 1 and 2 (B and C).
df.iloc[:, 1:3]

Unnamed: 0,B,C
2021-01-01,0.330994,1.178763
2021-01-02,-1.887792,-0.61724
2021-01-03,0.897332,0.960721
2021-01-04,-2.069273,-1.468133
2021-01-05,0.708287,-0.063407
2021-01-06,-1.115459,-0.92481


Valor explícito

In [73]:
# Two integer positions select a single cell: first row, first column.
df.iloc[0,0]

1.1265796988260075

In [74]:
# The extracted scalar comes back as numpy.float64.
type(df.iloc[0,0])

numpy.float64

### Indexação booleana

In [77]:
# Boolean Series as a row filter: keep only the rows where column A is positive.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2021-01-01,1.12658,0.330994,1.178763,-0.378324
2021-01-02,0.169089,-1.887792,-0.61724,0.337129


In [78]:
# Element-wise mask: the shape is preserved and every value that is not
# greater than zero shows up as missing in the output.
df[df > 0]

Unnamed: 0,A,B,C,D
2021-01-01,1.12658,0.330994,1.178763,
2021-01-02,0.169089,,,0.337129
2021-01-03,,0.897332,0.960721,
2021-01-04,,,,0.249702
2021-01-05,,0.708287,,
2021-01-06,,,,


In [82]:
# Work on a copy of df so that adding a column does not touch df itself.
# NOTE: this rebinds `df2`, which earlier in the notebook held the dict-built example frame.
df2 = df.copy()

In [84]:
# Add a string column E; the list supplies one value per row (six in total).
df2['E'] = ['um', 'um', 'dois', 'tres', 'quatro', 'tres']
df2

Unnamed: 0,A,B,C,D,E
2021-01-01,1.12658,0.330994,1.178763,-0.378324,um
2021-01-02,0.169089,-1.887792,-0.61724,0.337129,um
2021-01-03,-0.665805,0.897332,0.960721,-0.505955,dois
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702,tres
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842,quatro
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979,tres


In [85]:
# isin() builds a boolean mask: keep the rows whose E value is 'dois' or 'quatro'.
df2[df2['E'].isin(['dois', 'quatro'])]

Unnamed: 0,A,B,C,D,E
2021-01-03,-0.665805,0.897332,0.960721,-0.505955,dois
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842,quatro


### Setting

In [97]:
# Integer Series 1..6 labelled with six daily timestamps starting 2021-01-01.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start="20210101", periods=6),
)
s1

2021-01-01    1
2021-01-02    2
2021-01-03    3
2021-01-04    4
2021-01-05    5
2021-01-06    6
Freq: D, dtype: int64

In [98]:
# Add column F to df in place from the Series s1. s1 carries the same six
# daily labels as df's index, so each value lands on its matching date row.
df['F'] = s1

In [99]:
# Display df to confirm the new column F.
df

Unnamed: 0,A,B,C,D,F
2021-01-01,1.12658,0.330994,1.178763,-0.378324,1
2021-01-02,0.169089,-1.887792,-0.61724,0.337129,2
2021-01-03,-0.665805,0.897332,0.960721,-0.505955,3
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702,4
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842,5
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979,6


Definindo valores por rótulo:

In [103]:
# Overwrite a single cell by label: first date, column A.
# NOTE(review): the saved output below already shows B == 0.0 on the first row,
# which this cell does not set. It looks like the cell was re-run after the
# df.iat[0, 1] = 0 cell; Restart & Run All to refresh the outputs.
df.at[datas[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2021-01-01,0.0,0.0,1.178763,-0.378324,1
2021-01-02,0.169089,-1.887792,-0.61724,0.337129,2
2021-01-03,-0.665805,0.897332,0.960721,-0.505955,3
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702,4
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842,5
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979,6


In [104]:
# Overwrite a single cell by integer position: row 0, column 1 (column B).
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2021-01-01,0.0,0.0,1.178763,-0.378324,1
2021-01-02,0.169089,-1.887792,-0.61724,0.337129,2
2021-01-03,-0.665805,0.897332,0.960721,-0.505955,3
2021-01-04,-0.77556,-2.069273,-1.468133,0.249702,4
2021-01-05,-0.154273,0.708287,-0.063407,-0.666842,5
2021-01-06,-0.252056,-1.115459,-0.92481,-0.763979,6
