### Imports

In [43]:
import pandas as pd
import numpy as np
import random
from string import ascii_uppercase
import os
from faker import Faker

### Series creation

In [44]:
data = [num for num in range(10, 110, 10)]
print(data)

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]


In [45]:
data_serie = pd.Series(data)
print(data_serie)

0     10
1     20
2     30
3     40
4     50
5     60
6     70
7     80
8     90
9    100
dtype: int64


In [46]:
print(type(data_serie))

<class 'pandas.core.series.Series'>


In [47]:
int_array = [num for num in range(10, 60, 10)]
print(type(int_array))
indexes = ['A B C D E'.split(' ')]
data_series = pd.Series(int_array, indexes)
print(data_series)

<class 'list'>
A    10
B    20
C    30
D    40
E    50
dtype: int64


In [48]:
np_array = np.array(int_array)
print(np_array)
print(type(np_array))

[10 20 30 40 50]
<class 'numpy.ndarray'>


In [49]:
data_series = pd.Series(np_array)
print(data_series)

0    10
1    20
2    30
3    40
4    50
dtype: int64


In [50]:
print(f"Lines: {data_series.shape}")
print(f'Dimension: {data_series.ndim}')
print(f'Size: {data_series.size}')

Lines: (5,)
Dimension: 1
Size: 5


In [51]:
data_series.index = [random.choice(ascii_uppercase) for _ in range(5)]
print(data_series)

R    10
M    20
K    30
U    40
I    50
dtype: int64


In [52]:
values = np.random.random(10)
indexes = np.arange(0, 10)

print(f'Values: {values}')
print(f'Indexes: {indexes}')

data_series = pd.Series(values, indexes)

print(data_series)

Values: [0.51951398 0.9826249  0.35442075 0.17006225 0.01819155 0.20724628
 0.67313204 0.92779672 0.46798119 0.32155718]
Indexes: [0 1 2 3 4 5 6 7 8 9]
0    0.519514
1    0.982625
2    0.354421
3    0.170062
4    0.018192
5    0.207246
6    0.673132
7    0.927797
8    0.467981
9    0.321557
dtype: float64


In [53]:
dict_data = {random.choice(ascii_uppercase):random.randint(1, 10) for _ in range(5)}

print(dict_data)

data_series = pd.Series(dict_data)

print(data_series)

{'P': 3, 'N': 1, 'B': 1, 'J': 6, 'H': 4}
P    3
N    1
B    1
J    6
H    4
dtype: int64


### Slicing

In [54]:
data_serie = pd.Series(data=np.random.random(10))

print(data_serie)

0    0.854650
1    0.223496
2    0.599356
3    0.038017
4    0.902643
5    0.916263
6    0.305195
7    0.697807
8    0.097400
9    0.740242
dtype: float64


In [55]:
data_serie[:]

0    0.854650
1    0.223496
2    0.599356
3    0.038017
4    0.902643
5    0.916263
6    0.305195
7    0.697807
8    0.097400
9    0.740242
dtype: float64

In [56]:
data_serie[-1::]

9    0.740242
dtype: float64

In [57]:
data_serie[-1:]

9    0.740242
dtype: float64

### Copying, converting and concatenating

In [58]:
# Copy
data_serie2 = data_serie.copy()

In [59]:
data_serie2

0    0.854650
1    0.223496
2    0.599356
3    0.038017
4    0.902643
5    0.916263
6    0.305195
7    0.697807
8    0.097400
9    0.740242
dtype: float64

In [60]:
# Convert from float
data_serie2.dtype

dtype('float64')

In [61]:
# To int
data_serie3 = data_serie2.astype(int)
data_serie3

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [62]:
# Concat

# Two dicts
dic1 = {'João': 22, 'Alice': 34, 'Gustavo': 29, 'Pedro': 21}
dic2 = {'Gustavo': 17, 'Alana': 30}

# for to Series
data_serie4 = pd.Series(data=dic1)
data_serie5 = pd.Series(data=dic2)


In [63]:
# Concat two series
data_serie6 = pd.concat([data_serie4, data_serie5])
data_serie6

João       22
Alice      34
Gustavo    29
Pedro      21
Gustavo    17
Alana      30
dtype: int64

### Acesso aos dados com iLoc
 - Acessar elementos pelo índice

In [64]:
filename = 'census.csv'
dataset = pd.read_csv(f"{os.path.join(os.path.dirname(os.getcwd()), 'data', filename)}")

In [65]:
print(type(dataset))

<class 'pandas.core.frame.DataFrame'>


In [66]:
dataset.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [67]:
serie_age = dataset['age']
print(serie_age)
print(type(serie_age))
print(serie_age.values, type(serie_age.values))

0        39
1        50
2        38
3        53
4        28
         ..
32556    27
32557    40
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64
<class 'pandas.core.series.Series'>
[39 50 38 ... 58 22 52] <class 'numpy.ndarray'>


In [68]:
print(serie_age.iloc[0])
print(serie_age.iloc[32])
print(serie_age.iloc[-1])

39
45
52


In [69]:
print(serie_age.iloc[2:10])

2    38
3    53
4    28
5    37
6    49
7    52
8    31
9    42
Name: age, dtype: int64


In [70]:
for i in serie_age.iloc[2:10]:
    print(i)

38
53
28
37
49
52
31
42


In [71]:
menor_que_30 = list(map(lambda value: value < 30, serie_age.iloc[2:10]))
menor_que_30

[False, False, True, False, False, False, False, False]

In [72]:
# Pesquisando por índices
serie_age.iloc[[2,10,18]]

2     38
10    37
18    38
Name: age, dtype: int64

In [73]:
ages_list = []

for age in serie_age.items():
    # print(age)
    # print(f"Indice: {age[0]} - Idade: {age[1]}")
    if age[1] < 30:
        # salvando as idades
        # ages_list.append(age[1])
        # salvando os índices
        ages_list.append(age[0])

In [74]:
print(len(ages_list))

serie_age.iloc[ages_list]

9711


4        28
12       23
16       25
26       19
30       23
         ..
32529    29
32535    22
32555    22
32556    27
32559    22
Name: age, Length: 9711, dtype: int64

### Acessando os dados com Loc
- Acessar elementos com "string"

In [75]:
fake = Faker('pt-BR')

indexes_name = []

for _ in range(len(serie_age)):
    indexes_name.append(fake.name())
    
print(indexes_name[0:11])

['Ana Júlia Oliveira', 'Vitor Jesus', 'Benício Freitas', 'Dra. Ana Sophia Almeida', 'Marcos Vinicius Casa Grande', 'Kamilly Ferreira', 'Renan Oliveira', 'Ravy Nascimento', 'Ana Luiza Martins', 'Lorenzo da Mota', 'Luiz Miguel Nunes']


In [76]:
serie_name_age = pd.Series(serie_age.values, index=indexes_name)
serie_name_age[0:11]

Ana Júlia Oliveira             39
Vitor Jesus                    50
Benício Freitas                38
Dra. Ana Sophia Almeida        53
Marcos Vinicius Casa Grande    28
Kamilly Ferreira               37
Renan Oliveira                 49
Ravy Nascimento                52
Ana Luiza Martins              31
Lorenzo da Mota                42
Luiz Miguel Nunes              37
dtype: int64

In [77]:
serie_name_age["Antony Costa"]

np.int64(45)

In [78]:
serie_name_age_filter = serie_name_age.drop_duplicates()
serie_name_age_filter.size

73

In [79]:
serie_name_age_filter

Ana Júlia Oliveira             39
Vitor Jesus                    50
Benício Freitas                38
Dra. Ana Sophia Almeida        53
Marcos Vinicius Casa Grande    28
                               ..
Danilo Duarte                  83
Levi Mendes                    84
Emanuella Machado              85
Ana Luiza da Mata              86
Agatha Cassiano                87
Length: 73, dtype: int64

In [81]:
print("Sem LOC")
print(serie_name_age_filter[1:3])
print("Com LOC")
print(serie_name_age_filter.loc["Ana Júlia Oliveira":"Emanuella Machado"])
print("Lista")
print(serie_name_age_filter.loc["Ana Júlia Oliveira":"Emanuella Machado"])
print("Tamanho")
print(len(serie_name_age_filter[1:3]))

Sem LOC
Vitor Jesus        50
Benício Freitas    38
dtype: int64
Com LOC
Ana Júlia Oliveira             39
Vitor Jesus                    50
Benício Freitas                38
Dra. Ana Sophia Almeida        53
Marcos Vinicius Casa Grande    28
                               ..
Daniel Melo                    88
Enzo Gabriel Garcia            82
Danilo Duarte                  83
Levi Mendes                    84
Emanuella Machado              85
Length: 71, dtype: int64
Lista
Ana Júlia Oliveira             39
Vitor Jesus                    50
Benício Freitas                38
Dra. Ana Sophia Almeida        53
Marcos Vinicius Casa Grande    28
                               ..
Daniel Melo                    88
Enzo Gabriel Garcia            82
Danilo Duarte                  83
Levi Mendes                    84
Emanuella Machado              85
Length: 71, dtype: int64
Tamanho
2


In [82]:
reset_index = serie_name_age_filter.copy()
reset_index.reset_index(drop=True, inplace=True)
reset_index

0     39
1     50
2     38
3     53
4     28
      ..
68    83
69    84
70    85
71    86
72    87
Length: 73, dtype: int64

In [83]:
reset_index.index

RangeIndex(start=0, stop=73, step=1)

### Ordenação

In [84]:
# Ordenando idades em ordem crescente
serie_name_age.sort_values()

Cauã Caldeira           17
Nicolas Ramos           17
Nathan Vasconcelos      17
Ana Beatriz Andrade     17
Maria Laura Gomes       17
                        ..
Luna Marques            90
Sr. Augusto Teixeira    90
Esther Sales            90
Dra. Isabelly Abreu     90
Luiz Gustavo Costa      90
Length: 32561, dtype: int64

In [85]:
# Ordenando idades em ordem decrescente
serie_name_age.sort_values(ascending=False)

Sr. Enzo Souza              90
Davi Miguel Cardoso         90
Dante Vargas                90
Leonardo Almeida            90
Maria Julia Almeida         90
                            ..
Dra. Aurora Gonçalves       17
Luiz Gustavo Porto          17
Bruna da Rocha              17
Sra. Ana Vitória Moreira    17
Luiz Felipe Ribeiro         17
Length: 32561, dtype: int64

In [86]:
# Ordenando os índices em ordem crescente
serie_name_age.sort_index()

Agatha Abreu      26
Agatha Abreu      29
Agatha Alves      35
Agatha Alves      38
Agatha Andrade    29
                  ..
Ísis da Luz       30
Ísis da Mata      66
Ísis da Mata      30
Ísis da Paz       68
Ísis da Paz       45
Length: 32561, dtype: int64

In [87]:
# Ordenando os índices em ordem decrescente
serie_name_age.sort_index(ascending=False)

Ísis da Paz       45
Ísis da Paz       68
Ísis da Mata      66
Ísis da Mata      30
Ísis da Luz       30
                  ..
Agatha Andrade    29
Agatha Alves      38
Agatha Alves      35
Agatha Abreu      26
Agatha Abreu      29
Length: 32561, dtype: int64

In [88]:
# Retornando as dez pessoas com a maior idade da série
serie_name_age.sort_values(ascending=False).iloc[0:11]

Sr. Enzo Souza          90
Davi Miguel Cardoso     90
Dante Vargas            90
Leonardo Almeida        90
Maria Julia Almeida     90
Sabrina Rezende         90
Marina Vasconcelos      90
Ana Carolina Andrade    90
João Miguel Melo        90
Luna Marques            90
Bruno Melo              90
dtype: int64

### Contagem

In [89]:
serie_name_age.size

32561

In [90]:
serie_name_age.value_counts()

36    898
31    888
34    886
23    877
35    876
     ... 
83      6
88      3
85      3
86      1
87      1
Name: count, Length: 73, dtype: int64

In [91]:
# Porcentagem
serie_name_age.value_counts(normalize=True, sort = True) # ordenando

36    0.027579
31    0.027272
34    0.027210
23    0.026934
35    0.026903
        ...   
83    0.000184
88    0.000092
85    0.000092
86    0.000031
87    0.000031
Name: proportion, Length: 73, dtype: float64

In [92]:
# Dividindo por faixas
serie_name_age.value_counts(bins=10)

(38.9, 46.2]      6163
(31.6, 38.9]      6048
(24.3, 31.6]      5890
(16.926, 24.3]    5570
(46.2, 53.5]      3967
(53.5, 60.8]      2591
(60.8, 68.1]      1595
(68.1, 75.4]       496
(75.4, 82.7]       174
(82.7, 90.0]        67
Name: count, dtype: int64

### Filtragem

In [93]:
indices_paises = []
for _ in range(len(serie_age)):
    indices_paises.append(fake.country())

indices_paises[0:11]

['Bahamas',
 'Gana',
 'Micronésia',
 'Botsuana',
 'Anguila',
 'Guiné',
 'Svalbard e Jan Mayen',
 'Bahamas',
 'Clipperton Island',
 'Butão',
 'Belize']

In [94]:
serie_pais = pd.Series(serie_age.values, index=indices_paises)
serie_pais.head()

Bahamas       39
Gana          50
Micronésia    38
Botsuana      53
Anguila       28
dtype: int64

In [95]:
serie_pais.loc[serie_pais > 50]

Botsuana         53
Bahamas          52
Micronésia       54
Eslovênia        59
Jibuti           56
                 ..
Liechtenstein    72
Domínica         65
Senegal          53
Guiné-Bissau     58
Moldávia         52
Length: 6460, dtype: int64

In [96]:
print(serie_pais.loc[(serie_pais > 50) & (serie_pais.index == 'Brasil')])
print(len(serie_pais.loc[(serie_pais > 50) & (serie_pais.index == 'Brasil')]))

Brasil    58
Brasil    56
Brasil    57
Brasil    51
Brasil    59
Brasil    61
Brasil    69
Brasil    66
Brasil    58
Brasil    54
Brasil    51
Brasil    56
Brasil    64
Brasil    52
Brasil    53
Brasil    55
Brasil    53
Brasil    51
Brasil    62
Brasil    54
Brasil    54
Brasil    53
Brasil    51
Brasil    55
Brasil    69
Brasil    53
Brasil    51
Brasil    53
Brasil    56
Brasil    79
Brasil    69
Brasil    51
Brasil    69
Brasil    60
dtype: int64
34


In [97]:
serie_pais.index.isin(["India", "Brasil"])

array([False, False, False, ..., False, False, False], shape=(32561,))

### Operações matemáticas

In [98]:
serie_pais

Bahamas         39
Gana            50
Micronésia      38
Botsuana        53
Anguila         28
                ..
Austrália       27
Bermudas        40
Guiné-Bissau    58
São Marinho     22
Moldávia        52
Length: 32561, dtype: int64

In [99]:
### Comando python
serie_pais + 2

Bahamas         41
Gana            52
Micronésia      40
Botsuana        55
Anguila         30
                ..
Austrália       29
Bermudas        42
Guiné-Bissau    60
São Marinho     24
Moldávia        54
Length: 32561, dtype: int64

In [100]:
### Comando pandas
serie_pais.add(2)

Bahamas         41
Gana            52
Micronésia      40
Botsuana        55
Anguila         30
                ..
Austrália       29
Bermudas        42
Guiné-Bissau    60
São Marinho     24
Moldávia        54
Length: 32561, dtype: int64

In [101]:
serie_pais.sub(2)

Bahamas         37
Gana            48
Micronésia      36
Botsuana        51
Anguila         26
                ..
Austrália       25
Bermudas        38
Guiné-Bissau    56
São Marinho     20
Moldávia        50
Length: 32561, dtype: int64

In [102]:
serie_pais.mul(2)

Bahamas          78
Gana            100
Micronésia       76
Botsuana        106
Anguila          56
               ... 
Austrália        54
Bermudas         80
Guiné-Bissau    116
São Marinho      44
Moldávia        104
Length: 32561, dtype: int64

In [103]:
serie_pais.div(2)

Bahamas         19.5
Gana            25.0
Micronésia      19.0
Botsuana        26.5
Anguila         14.0
                ... 
Austrália       13.5
Bermudas        20.0
Guiné-Bissau    29.0
São Marinho     11.0
Moldávia        26.0
Length: 32561, dtype: float64

In [104]:
s1 = pd.Series([10, 20, 30])
s2 = pd.Series([1, 2, 3])
s1, s2

(0    10
 1    20
 2    30
 dtype: int64,
 0    1
 1    2
 2    3
 dtype: int64)

In [106]:
s1.add(s2)

0    11
1    22
2    33
dtype: int64

In [107]:
s1.sub(s2)

0     9
1    18
2    27
dtype: int64

In [108]:
s1.mul(2)

0    20
1    40
2    60
dtype: int64

In [109]:
s1.div(2)

0     5.0
1    10.0
2    15.0
dtype: float64