# **TRATAMENTO DE DADOS COM PYTHON + PANDAS**






## **ANÁLISE EXPLORATÓRIA DE DADOS**





In [20]:
import pandas as pd
import numpy as np


### **SERIES** 

**Criando uma Serie**

In [10]:
base = [1, 3, 5, 7, 9, 12, 15, 18, 21, 24]

In [11]:
data = pd.Series(base)

In [12]:
print(data)

0     1
1     3
2     5
3     7
4     9
5    12
6    15
7    18
8    21
9    24
dtype: int64


In [13]:
# Build a Series straight from a list literal; no index is passed, so the
# entries are labelled with the default integer positions.
data_2 = pd.Series([
    'Ana', 'Carlos', 'Diego',
    'Fernando', 'Maria', 'Paulo',
])
print(data_2)

0         Ana
1      Carlos
2       Diego
3    Fernando
4       Maria
5       Paulo
dtype: object


**Visualizando as primeiras linhas com `head()`**

In [17]:
data_2.head()

0         Ana
1      Carlos
2       Diego
3    Fernando
4       Maria
dtype: object

**Definindo uma origem personalizada**

In [18]:
# Keep the source values in their own variable and hand them to the
# constructor through the explicit ``data`` keyword.
nomes = [
    'Ana', 'Carlos', 'Diego',
    'Fernando', 'Maria', 'Paulo',
]
data_3 = pd.Series(data=nomes)
print(data_3)

0         Ana
1      Carlos
2       Diego
3    Fernando
4       Maria
5       Paulo
dtype: object


**Definindo um índice personalizado**

In [19]:
# Values first, then the labels that replace the default 0-based index.
nomes = ['Ana', 'Carlos', 'Diego', 'Fernando', 'Maria', 'Paulo']
indice = list(range(1, 7))  # 1..6, one label per name

data_4 = pd.Series(index=indice, data=nomes)
print(data_4)

1         Ana
2      Carlos
3       Diego
4    Fernando
5       Maria
6       Paulo
dtype: object


**Integrando um array NumPy em uma Serie**




In [21]:
# Wrap the names in a NumPy array first; the array is then used as the
# Series' ``data``.
nomes = np.array(
    ['Ana', 'Carlos', 'Diego', 'Fernando', 'Maria', 'Paulo']
)
data_5 = pd.Series(data=nomes)
print(data_5)

0         Ana
1      Carlos
2       Diego
3    Fernando
4       Maria
5       Paulo
dtype: object


**Qualquer estrutura de Dados como elemento de uma Serie**

In [22]:
# The elements do not have to be numbers or text: here two callables are
# stored, and the Series holds them as generic Python objects.
funcoes = [
    input,
    print,
]
data_6 = pd.Series(data=funcoes)
print(data_6)

0    <bound method Kernel.raw_input of <google.cola...
1                            <built-in function print>
dtype: object


**Integrando funções com Series**

In [23]:
# The index labels come from a NumPy function call instead of a literal list.
paises = [
    'Argentina', 'Brasil', 'Canada',
    'Estados Unidos', 'Italia', 'Mexico',
]
data_7 = pd.Series(index=np.arange(6), data=paises)
print(data_7)

0         Argentina
1            Brasil
2            Canada
3    Estados Unidos
4            Italia
5            Mexico
dtype: object


**Verificando o índice (e o tamanho) de uma Serie**

In [24]:
print(data.index)

RangeIndex(start=0, stop=10, step=1)


In [25]:
print(data_7.index)

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')


**Criando uma Serie a partir de um dicionário**

In [26]:
# A dict maps straight onto a Series: the keys become the index labels and
# the values become the elements.
dicionario = {
    'Nome': 'Fernando',
    'Idade': 33,
    'Altura': 1.90,
}
serie_dicio = pd.Series(data=dicionario)
print(serie_dicio)

Nome      Fernando
Idade           33
Altura         1.9
dtype: object


In [27]:
# Integer dict keys work the same way: they end up as the Series index.
meses = {
    1: 'Janeiro',
    2: 'Fevereiro',
    3: 'Março',
    4: 'Abril',
}
serie_meses = pd.Series(data=meses)
print(serie_meses)

1      Janeiro
2    Fevereiro
3        Março
4        Abril
dtype: object


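**Somando os elementos de duas Series (alinhamento pelo índice)**

Os valores são somados rótulo a rótulo; rótulos presentes em apenas uma das Series resultam em `NaN`.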

In [28]:
# Two Series with the same values but different name labels.
nomes_1 = ['Ana', 'Carlos', 'Betina', 'Maria', 'Rafael']
data_1 = pd.Series(data=[1, 2, 3, 4, 5], index=nomes_1)

nomes_2 = ['Ana', 'Alberto', 'Maria', 'Paulo', 'Tania']
data_2 = pd.Series(data=[1, 2, 3, 4, 5], index=nomes_2)

# ``+`` pairs the operands by label, not by position: only 'Ana' and 'Maria'
# exist on both sides, so every other label comes out as NaN.
data_3 = data_1 + data_2
print(data_3)


Alberto    NaN
Ana        2.0
Betina     NaN
Carlos     NaN
Maria      7.0
Paulo      NaN
Rafael     NaN
Tania      NaN
dtype: float64


### **DATAFRAME**

**Criando um DataFrame**

In [36]:
# Each dict key becomes a column and each list supplies that column's rows.
base = {
    'Nome': ['Ana', 'Carla', 'Gabriela', 'Maria'],
    'Fones': [991384562, 981128449, 999510014, 991120991],
}
data = pd.DataFrame(data=base)

print(data)
print(type(data))



       Nome      Fones
0       Ana  991384562
1     Carla  981128449
2  Gabriela  999510014
3     Maria  991120991
<class 'pandas.core.frame.DataFrame'>


In [35]:
# DataFrame.info() writes the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Nome    4 non-null      object
 1   Fones   4 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 192.0+ bytes
None


In [37]:
print(data.describe())

              Fones
count  4.000000e+00
mean   9.907860e+08
std    7.524343e+06
min    9.811284e+08
25%    9.886229e+08
50%    9.912528e+08
75%    9.934159e+08
max    9.995100e+08


In [39]:
# 6x5 frame of standard-normal samples with custom row labels and column names.
# The generator is seeded so that Restart & Run All reproduces the same values
# instead of drawing a fresh table on every run.
data = pd.DataFrame(data = np.random.default_rng(seed=0).standard_normal((6, 5)),
                    index = [1, 2, 3, 4, 5, 6],
                    columns = ['a', 'b', 'c', 'd', 'e'])
print(data)

          a         b         c         d         e
1 -0.002710 -0.101999  1.448035  1.078594  1.003622
2 -1.139177  0.722332  1.047775  0.844145  1.480687
3  1.134303  0.331226 -0.477422 -1.541971  0.646919
4  0.469897  0.183558 -2.316865  0.105734  1.313826
5  0.558933  0.361848 -0.135541 -1.143316 -1.272361
6 -0.275952  0.412808  0.029831 -0.749313 -0.623828


In [40]:
print(data.columns)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


In [41]:
print(data.index)

Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')


In [42]:
print(data['c'])

1    1.448035
2    1.047775
3   -0.477422
4   -2.316865
5   -0.135541
6    0.029831
Name: c, dtype: float64


In [43]:
print(type(data))

<class 'pandas.core.frame.DataFrame'>


In [44]:
print(type(data['c']))

<class 'pandas.core.series.Series'>


**Extraindo dados de uma coluna específica**

In [45]:
print(data['c'])

1    1.448035
2    1.047775
3   -0.477422
4   -2.316865
5   -0.135541
6    0.029831
Name: c, dtype: float64


In [46]:
print(data.d)

1    1.078594
2    0.844145
3   -1.541971
4    0.105734
5   -1.143316
6   -0.749313
Name: d, dtype: float64


In [52]:
print(data[['c', 'e']])

          c         e
1  1.448035  1.003622
2  1.047775  1.480687
3 -0.477422  0.646919
4 -2.316865  1.313826
5 -0.135541 -1.272361
6  0.029831 -0.623828


**Criando colunas manualmente**

******

In [54]:
# Assigning to a label that does not exist yet adds a new column; here 'f'
# holds the row-by-row sum of columns 'a' and 'e'.
data['f'] = data['a'] + data['e']
print(data)

          a         b         c         d         e         f
1 -0.002710 -0.101999  1.448035  1.078594  1.003622  1.000911
2 -1.139177  0.722332  1.047775  0.844145  1.480687  0.341510
3  1.134303  0.331226 -0.477422 -1.541971  0.646919  1.781222
4  0.469897  0.183558 -2.316865  0.105734  1.313826  1.783723
5  0.558933  0.361848 -0.135541 -1.143316 -1.272361 -0.713428
6 -0.275952  0.412808  0.029831 -0.749313 -0.623828 -0.899780


**Removendo colunas manualmente**

In [55]:
# drop() with axis=1 targets a column label; the call returns a frame without
# 'f', which is bound back to ``data``.
# NOTE(review): not re-runnable as is — after the first run there is no 'f'
# column left, and drop() raises KeyError for a missing label by default.
data = data.drop('f', axis=1)
print(data)

          a         b         c         d         e
1 -0.002710 -0.101999  1.448035  1.078594  1.003622
2 -1.139177  0.722332  1.047775  0.844145  1.480687
3  1.134303  0.331226 -0.477422 -1.541971  0.646919
4  0.469897  0.183558 -2.316865  0.105734  1.313826
5  0.558933  0.361848 -0.135541 -1.143316 -1.272361
6 -0.275952  0.412808  0.029831 -0.749313 -0.623828


In [58]:
# Same removal, but inplace=True makes drop() modify ``data`` itself, so the
# result is not reassigned.
# NOTE(review): running this cell twice fails the second time because 'e' is
# already gone (drop() raises KeyError for a missing label by default).
data.drop('e', axis=1, inplace=True)
print(data)

          a         b         c         d
1 -0.002710 -0.101999  1.448035  1.078594
2 -1.139177  0.722332  1.047775  0.844145
3  1.134303  0.331226 -0.477422 -1.541971
4  0.469897  0.183558 -2.316865  0.105734
5  0.558933  0.361848 -0.135541 -1.143316
6 -0.275952  0.412808  0.029831 -0.749313


In [59]:
# Third way to remove a column: the ``del`` statement on the column key,
# which changes ``data`` directly.
del data ['b']
print(data)

          a         c         d
1 -0.002710  1.448035  1.078594
2 -1.139177  1.047775  0.844145
3  1.134303 -0.477422 -1.541971
4  0.469897 -2.316865  0.105734
5  0.558933 -0.135541 -1.143316
6 -0.275952  0.029831 -0.749313


**Ordenando as linhas pelos valores de uma coluna**

In [62]:
# Fresh 6x5 frame of standard-normal samples. The generator is seeded so the
# table (and therefore the sorted order below) is the same on every run.
data = pd.DataFrame(data = np.random.default_rng(seed=1).standard_normal((6, 5)),
                    index = [1, 2, 3, 4, 5, 6],
                    columns = ['a', 'b', 'c', 'd', 'e'])
print(data)

# Reorder the rows by ascending 'b'. The sorted frame is bound back to ``data``
# (rather than using inplace=True) because the following cells read ``data``
# in this sorted order.
data = data.sort_values(by='b')

print(data)

          a         b         c         d         e
1  0.405410  1.701362 -1.189334  2.580226 -0.178199
2  0.594878 -0.040698 -0.401316  2.280369 -0.750733
3 -0.758969 -0.192155  1.352977  1.138434 -0.355265
4 -0.205358  0.932668 -0.291526 -0.557929  0.504637
5 -0.777103 -0.409082 -1.191475 -0.835074 -0.085402
6 -0.004783 -0.150579  1.153054 -0.968388  1.639793
          a         b         c         d         e
5 -0.777103 -0.409082 -1.191475 -0.835074 -0.085402
3 -0.758969 -0.192155  1.352977  1.138434 -0.355265
6 -0.004783 -0.150579  1.153054 -0.968388  1.639793
2  0.594878 -0.040698 -0.401316  2.280369 -0.750733
4 -0.205358  0.932668 -0.291526 -0.557929  0.504637
1  0.405410  1.701362 -1.189334  2.580226 -0.178199


**Extraindo dados de uma linha específica**

In [63]:
print(data.loc[3])

a   -0.758969
b   -0.192155
c    1.352977
d    1.138434
e   -0.355265
Name: 3, dtype: float64


**Extraindo dados de um elemento específico**

In [64]:
print(data.loc[2,'b'])

-0.04069842042714153


**Extraindo dados de múltiplos elementos**

In [65]:
print(data.loc[[2, 3], ['a', 'b', 'c']])

          a         b         c
2  0.594878 -0.040698 -0.401316
3 -0.758969 -0.192155  1.352977


In [66]:
print(data.iloc[1:3, 0:3])

          a         b         c
3 -0.758969 -0.192155  1.352977
6 -0.004783 -0.150579  1.153054


**Buscando elementos via condicionais**

In [67]:
print(data > 0)

       a      b      c      d      e
5  False  False  False  False  False
3  False  False   True   True  False
6  False  False   True  False   True
2   True  False  False   True  False
4  False   True  False  False   True
1   True   True  False   True  False


### **OPERAÇÕES MATEMÁTICAS COM DATAFRAMES**

### Usando as funções embutidas do sistema

In [68]:
# New 6x5 frame of standard-normal samples; seeded so the printed table and
# the column total below are reproducible across runs.
data = pd.DataFrame(data = np.random.default_rng(seed=2).standard_normal((6, 5)),
                    index = [1, 2, 3, 4, 5, 6],
                    columns = ['a', 'b', 'c', 'd', 'e'])
print(data)
# Built-in reduction: total of column 'b'.
print(data['b'].sum())

          a         b         c         d         e
1 -0.988852 -0.834780  0.256398 -0.436760  1.276765
2 -0.116293 -0.851245 -1.694078 -0.074108  1.228629
3  1.553157 -0.460080  0.521084  1.280147 -0.620589
4  0.991122 -0.023036  2.322866  1.195785  0.700802
5 -0.885315 -0.378932  0.262429  0.665807  1.261484
6  0.819821 -1.128112 -0.524242  0.073433  0.831690
-3.6761858679705175


**Aplicando uma operação matemática a todos os elementos**

In [70]:
# Adding a scalar to a DataFrame adds it to every element.
# NOTE(review): ``data`` is rebound to ``data + 1``, so each additional run of
# this cell shifts all values by one more.
data = data + 1
print(data)

          a         b         c         d         e
1  1.011148  1.165220  2.256398  1.563240  3.276765
2  1.883707  1.148755  0.305922  1.925892  3.228629
3  3.553157  1.539920  2.521084  3.280147  1.379411
4  2.991122  1.976964  4.322866  3.195785  2.700802
5  1.114685  1.621068  2.262429  2.665807  3.261484
6  2.819821  0.871888  1.475758  2.073433  2.831690


**Usando funções matemáticas personalizadas**

In [71]:
def soma(x):
    """Return ``x`` added to itself (``x + x``)."""
    dobro = x + x
    return dobro

print(data['b'].apply(soma))

1    2.330440
2    2.297509
3    3.079839
4    3.953928
5    3.242137
6    1.743775
Name: b, dtype: float64


In [73]:
def ao_quadrado(num):
    """Return ``num`` raised to the second power."""
    quadrado = pow(num, 2)
    return quadrado

print(data['a'].apply(ao_quadrado))

1     1.022419
2     3.548353
3    12.624922
4     8.946813
5     1.242523
6     7.951388
Name: a, dtype: float64


In [74]:
# ``ao_quadrado`` is already defined in an earlier cell of this notebook;
# it is reused here instead of redefining an identical copy.
# The slice keeps a DataFrame (first row, second column), and DataFrame.apply
# passes each column of that slice to the function.
print(data.iloc[0:1, 1:2].apply(ao_quadrado))

          b
1  1.357738


In [76]:
print(data['d'].apply(lambda x: x ** 2))

1     2.443718
2     3.709060
3    10.759364
4    10.213045
5     7.106529
6     4.299125
Name: d, dtype: float64


## ESTRUTURAS CONDICIONAIS APLICADAS A DATAFRAME

### Aplicação de estruturas condicionais simples

In [77]:
# 6x5 frame of standard-normal samples; seeded so the boolean mask printed at
# the end is the same on every run.
data = pd.DataFrame(data = np.random.default_rng(seed=3).standard_normal((6, 5)),
                    index = [1, 2, 3, 4, 5, 6],
                    columns = ['a', 'b', 'c', 'd', 'e'])

print(data)
print(data['a'])
# Comparing a column with a scalar yields a boolean Series, one flag per row.
print(data['a'] < 0)

          a         b         c         d         e
1  1.650826 -0.060209 -0.955408 -2.629787 -0.417305
2 -0.511651  1.139733  0.143984  0.392282 -0.377518
3  0.050747 -0.174837 -1.371638  1.382765  0.098071
4 -0.935038  0.916168  0.423858  0.349603 -0.078913
5 -0.195384 -0.529635 -0.373391 -0.388076 -1.831269
6 -1.904913 -1.966915 -0.393172  0.547573  1.069692
1    1.650826
2   -0.511651
3    0.050747
4   -0.935038
5   -0.195384
6   -1.904913
Name: a, dtype: float64
1    False
2     True
3    False
4     True
5     True
6     True
Name: a, dtype: bool


In [78]:
# 6x5 frame of standard-normal samples; seeded so the filtered result below
# is reproducible across runs.
data = pd.DataFrame(data = np.random.default_rng(seed=4).standard_normal((6, 5)),
                    index = [1, 2, 3, 4, 5, 6],
                    columns = ['a', 'b', 'c', 'd', 'e'])

print(data)
print(data['a'])

# Indexing with the boolean mask keeps only the rows where 'a' is negative.
data_2 = data[data['a'] < 0 ]
print(data_2)


          a         b         c         d         e
1 -0.029785  2.045855  0.649910  1.652737 -1.546180
2 -2.311785 -0.945822 -0.653832  0.651246 -0.795065
3 -0.405029 -0.823602  0.383544 -0.928993  0.469297
4 -0.560397 -0.380717 -0.342607 -0.216142  0.366240
5  1.555387  0.109748  0.816427  0.856225  0.916686
6 -0.279185 -0.177108  1.069417  1.003541  0.085977
1   -0.029785
2   -2.311785
3   -0.405029
4   -0.560397
5    1.555387
6   -0.279185
Name: a, dtype: float64
          a         b         c         d         e
1 -0.029785  2.045855  0.649910  1.652737 -1.546180
2 -2.311785 -0.945822 -0.653832  0.651246 -0.795065
3 -0.405029 -0.823602  0.383544 -0.928993  0.469297
4 -0.560397 -0.380717 -0.342607 -0.216142  0.366240
6 -0.279185 -0.177108  1.069417  1.003541  0.085977


### Aplicação de estruturas condicionais compostas

In [80]:
# 6x5 frame of standard-normal samples; seeded so the rows selected by the
# compound condition are the same on every run.
data = pd.DataFrame(data = np.random.default_rng(seed=5).standard_normal((6, 5)),
                    index = [1, 2, 3, 4, 5, 6],
                    columns = ['a', 'b', 'c', 'd', 'e'])

print(data)
# Two masks combined element-wise with ``&``; each comparison is parenthesised
# so it is evaluated before the ``&``.
print(data[(data['a'] < 0 ) & (data['b'] > 1)])

          a         b         c         d         e
1 -0.837050  1.284069  0.432692  0.029935  0.759913
2  0.815898 -0.863198  0.875479 -2.237198  2.342362
3  0.283892 -2.029100 -1.734918  0.782177 -0.073595
4 -0.266564 -1.478100 -0.208054 -0.991177  0.164653
5 -0.305058 -0.785639  0.889679 -0.360523 -0.258076
6 -0.452276  0.884344 -0.422986  0.132644  0.001766
         a         b         c         d         e
1 -0.83705  1.284069  0.432692  0.029935  0.759913
