# Manipulações de Dados com Pandas P1

In [1]:
# Importando bibliotecas necessárias
import pandas as pd
import numpy as np

In [8]:
# Print the installed pandas and NumPy versions, separated by ' | '
print(f'{pd.__version__} | {np.__version__}')

0.23.4 | 1.17.2


In [58]:
# Load the Gapminder dataset from GitHub.
# The file is tab-separated (.tsv), hence sep='\t'.
df = pd.read_csv('https://raw.githubusercontent.com/chendaniely/scipy-2019-pandas/master/data/gapminder.tsv', sep='\t')

In [59]:
# Preview the first 8 rows of the DataFrame
df.head(8)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945


In [60]:
# Check the type of the variable df:
# it is a pandas DataFrame
type(df)

pandas.core.frame.DataFrame

In [61]:
df.shape  # (rows, columns)

(1704, 6)

In [62]:
df.info()  # Basic information: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [63]:
df.tail()  # Last 5 entries of the DataFrame

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [64]:
df.columns  # The column labels of the DataFrame

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [69]:
df.index  # The row index of the DataFrame (a RangeIndex here)

RangeIndex(start=0, stop=1704, step=1)

In [70]:
# The DataFrame's contents as a NumPy array of values (dtype=object here, as the columns have different dtypes).
# NOTE(review): newer pandas documentation recommends DataFrame.to_numpy() over .values — confirm against the pandas version in use.
df.values

array([['Afghanistan', 'Asia', 1952, 28.801, 8425333, 779.4453145],
       ['Afghanistan', 'Asia', 1957, 30.331999999999997, 9240934,
        820.8530296],
       ['Afghanistan', 'Asia', 1962, 31.997, 10267083, 853.1007099999999],
       ...,
       ['Zimbabwe', 'Africa', 1997, 46.809, 11404948, 792.4499602999999],
       ['Zimbabwe', 'Africa', 2002, 39.989000000000004, 11926563,
        672.0386227000001],
       ['Zimbabwe', 'Africa', 2007, 43.486999999999995, 12311143,
        469.70929810000007]], dtype=object)

In [72]:
df.dtypes  # Data type of each column of the DataFrame

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [14]:
country = df['country']  # Select a single column (single brackets)

In [73]:
# Selecting a single column yields a Series rather than a DataFrame.
# A Series is a one-dimensional data structure.
type(country)

pandas.core.series.Series

In [74]:
# First 5 values of the Series
country.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [75]:
countrydf = df[['country']]  # A list of column names selects a DataFrame, even for a single column

In [76]:
# Confirm that the double-bracket selection is a DataFrame
type(countrydf)

pandas.core.frame.DataFrame

In [77]:
# First 5 rows of the one-column DataFrame
countrydf.head()

Unnamed: 0,country
0,Afghanistan
1,Afghanistan
2,Afghanistan
3,Afghanistan
4,Afghanistan


In [80]:
# Drop the 'continent' and 'country' columns and preview the first 10 rows.
# The result is only displayed: it is not assigned back, so df keeps all its columns.
(
    df
    .drop(labels=['continent', 'country'], axis='columns')
    .head(n=10)
)

Unnamed: 0,year,lifeExp,pop,gdpPercap
0,1952,28.801,8425333,779.445314
1,1957,30.332,9240934,820.85303
2,1962,31.997,10267083,853.10071
3,1967,34.02,11537966,836.197138
4,1972,36.088,13079460,739.981106
5,1977,38.438,14880372,786.11336
6,1982,39.854,12881816,978.011439
7,1987,40.822,13867957,852.395945
8,1992,41.674,16317921,649.341395
9,1997,41.763,22227415,635.341351


In [81]:
df.loc[0]  # Row whose index label is 0 (the first entry of this DataFrame)

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

In [82]:
df.loc[[0,1,2]]  # Rows labelled 0, 1 and 2 (the first three entries of this DataFrame)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071


In [83]:
df.iloc[[0, 1, -1]]  # By integer position: the first two entries and the last one (-1)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [84]:
# Build a sub-DataFrame from the original DataFrame,
# taking all rows (:) and selecting the columns by name
subset = df.loc[:, ['year', 'pop']]

In [85]:
# Preview the name-based selection
subset.head()

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460


In [26]:
# Build a sub-DataFrame from the original DataFrame,
# taking all rows (:) and selecting the columns by integer position (2 -> 'year', 4 -> 'pop')
subset_2 = df.iloc[:, [2,4]]

In [27]:
# Preview the position-based selection
subset_2.head()

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460


### Filtrando Linhas por Valores de Colunas

In [86]:
# Keep only the rows where country == 'United States' (boolean mask passed to .loc)
df.loc[df['country'] == 'United States']

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1608,United States,Americas,1952,68.44,157553000,13990.48208
1609,United States,Americas,1957,69.49,171984000,14847.12712
1610,United States,Americas,1962,70.21,186538000,16173.14586
1611,United States,Americas,1967,70.76,198712000,19530.36557
1612,United States,Americas,1972,71.34,209896000,21806.03594
1613,United States,Americas,1977,73.38,220239000,24072.63213
1614,United States,Americas,1982,74.65,232187835,25009.55914
1615,United States,Americas,1987,75.02,242803533,29884.35041
1616,United States,Americas,1992,76.09,256894189,32003.93224
1617,United States,Americas,1997,76.81,272911760,35767.43303


In [87]:
# Keep only the rows where country is 'United States' AND year is 1982 or later.
# Both conditions are element-wise boolean Series combined with &.
df.loc[
    df['country'].eq('United States')
    & df['year'].ge(1982)
]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1614,United States,Americas,1982,74.65,232187835,25009.55914
1615,United States,Americas,1987,75.02,242803533,29884.35041
1616,United States,Americas,1992,76.09,256894189,32003.93224
1617,United States,Americas,1997,76.81,272911760,35767.43303
1618,United States,Americas,2002,77.31,287675526,39097.09955
1619,United States,Americas,2007,78.242,301139947,42951.65309


In [88]:
df.groupby(['year'])  # Group the data by year; this returns a DataFrameGroupBy object, not a result table

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f3bfc3fbf90>

In [89]:
df.groupby(['year'])['lifeExp'].mean()  # Mean lifeExp for each year

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [90]:
# The same per-year mean can be written with agg(), naming the aggregation as a string.
# Passing the NumPy callable (np.mean) gives the same numbers, but pandas >= 2.1 emits a
# FutureWarning for it and asks for the string alias 'mean' instead.
df.groupby(['year'])['lifeExp'].agg('mean')

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [95]:
# Group by year and continent, then compute the mean of lifeExp and gdpPercap.
# The aggregation is named by string ('mean'): passing np.mean is deprecated in
# pandas >= 2.1 (FutureWarning) and the string alias yields the same result.
# head(10) limits the display to the first 10 rows.
(
    df
    .groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']]
    .agg('mean')
    .head(10)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,Africa,39.1355,1252.572466
1952,Americas,53.27984,4079.062552
1952,Asia,46.314394,5195.484004
1952,Europe,64.4085,5661.057435
1952,Oceania,69.255,10298.08565
1957,Africa,41.266346,1385.236062
1957,Americas,55.96028,4616.043733
1957,Asia,49.318544,5787.73294
1957,Europe,66.703067,6963.012816
1957,Oceania,70.295,11598.522455


In [98]:
# Same aggregation, followed by reset_index() so that the group keys
# (year, continent) become ordinary columns again instead of a MultiIndex.
# The aggregation is named by string ('mean'): passing np.mean is deprecated in
# pandas >= 2.1 (FutureWarning) and the string alias yields the same result.
(
    df
    .groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']]
    .agg('mean')
    .reset_index()
    .head(10)
)

Unnamed: 0,year,continent,lifeExp,gdpPercap
0,1952,Africa,39.1355,1252.572466
1,1952,Americas,53.27984,4079.062552
2,1952,Asia,46.314394,5195.484004
3,1952,Europe,64.4085,5661.057435
4,1952,Oceania,69.255,10298.08565
5,1957,Africa,41.266346,1385.236062
6,1957,Americas,55.96028,4616.043733
7,1957,Asia,49.318544,5787.73294
8,1957,Europe,66.703067,6963.012816
9,1957,Oceania,70.295,11598.522455


# Pandas Tidy Data

![img](https://raw.githubusercontent.com/the-akira/DataScience/master/imagens/TidyData.png)

Hadley Wickham definiu "Tidy Data" como conjuntos de dados organizados de modo que cada variável seja uma coluna e cada observação seja uma linha.

**Importante**

A organização dos dados é uma consideração importante no processamento de dados, mas não deve ser confundida com a tarefa também importante de limpeza de dados.

Vejamos alguns exemplos de como tratar dados desorganizados:

In [9]:
# Untidy case: the column headers hold values (income brackets), not variable names
pew = pd.read_csv('https://raw.githubusercontent.com/chendaniely/scipy-2019-pandas/master/data/pew.csv')

In [10]:
# Preview the wide table: one column per income bracket
pew.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


In [11]:
pew.shape  # (rows, columns) of the wide table

(18, 11)

In [12]:
# Melt every column except 'religion' into rows; without explicit names the
# two new columns are called 'variable' and 'value'
pew.melt(id_vars='religion').head(10)

Unnamed: 0,religion,variable,value
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
5,Evangelical Prot,<$10k,575
6,Hindu,<$10k,1
7,Historically Black Prot,<$10k,228
8,Jehovah's Witness,<$10k,20
9,Jewish,<$10k,19


In [13]:
# Tidy version of pew: 'religion' is kept as the identifier, the former column
# headers go into 'income' and the cell values into 'count'.
pew_tidy = pd.melt(
    pew,
    id_vars='religion',
    var_name='income',
    value_name='count',
)

In [14]:
# Preview the tidy table: one row per (religion, income) pair
pew_tidy.head()

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15


## Outro Exemplo: Billboard Dataset

In [15]:
# Load the Billboard dataset; it has one column per chart week (wk1 ... wk76)
billboard = pd.read_csv('https://raw.githubusercontent.com/chendaniely/scipy-2019-pandas/master/data/billboard.csv')

In [16]:
# Preview the wide table
billboard.head()

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,


In [17]:
# Melt the weekly columns into rows: the song metadata columns are kept as
# identifiers, the week label goes into 'week' and the chart position into 'rank'.
(
    billboard
    .melt(
        id_vars=['year', 'artist', 'track', 'time', 'date.entered'],
        var_name='week',
        value_name='rank',
    )
    .head(10)
)

Unnamed: 0,year,artist,track,time,date.entered,week,rank
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,wk1,57.0
5,2000,98^0,Give Me Just One Nig...,3:24,2000-08-19,wk1,51.0
6,2000,A*Teens,Dancing Queen,3:44,2000-07-08,wk1,97.0
7,2000,Aaliyah,I Don't Wanna,4:15,2000-01-29,wk1,84.0
8,2000,Aaliyah,Try Again,4:03,2000-03-18,wk1,59.0
9,2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,wk1,76.0


In [18]:
# Mean chart rank per artist, computed on the long (melted) table.
# Only the first 10 artists are displayed.
(
    pd.melt(
        billboard,
        id_vars=['year', 'artist', 'track', 'time', 'date.entered'],
        var_name='week',
        value_name='rank',
    )
    .groupby('artist')['rank']
    .mean()
    .head(10)
)

artist
2 Pac                  85.428571
2Ge+her                90.000000
3 Doors Down           37.602740
504 Boyz               56.222222
98^0                   37.650000
A*Teens                97.000000
Aaliyah                30.269231
Adams, Yolanda         67.750000
Adkins, Trace          76.272727
Aguilera, Christina    21.089552
Name: rank, dtype: float64

In [19]:
# Load the Ebola country time series; its column names combine a status and a
# country (e.g. Cases_Guinea, Deaths_Liberia)
ebola = pd.read_csv('https://raw.githubusercontent.com/chendaniely/scipy-2019-pandas/master/data/country_timeseries.csv')

In [20]:
# Preview the wide table
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


In [21]:
# Melt to long format, keeping Date and Day as identifiers; the former column
# names go into 'cd_country' and their values into 'count'
ebola_long = ebola.melt(id_vars=['Date','Day'], var_name='cd_country', value_name='count')

In [22]:
# Preview the long table
ebola_long.head()

Unnamed: 0,Date,Day,cd_country,count
0,1/5/2015,289,Cases_Guinea,2776.0
1,1/4/2015,288,Cases_Guinea,2775.0
2,1/3/2015,287,Cases_Guinea,2769.0
3,1/2/2015,286,Cases_Guinea,
4,12/31/2014,284,Cases_Guinea,2730.0


In [23]:
# Plain Python illustration: splitting a string on '_' returns a list of its parts
'hello_world'.split('_')

['hello', 'world']

In [24]:
# Split every 'cd_country' value on '_'; with expand=True the pieces come back
# as a DataFrame whose columns are labelled 0 and 1
ebola_split = ebola_long['cd_country'].str.split('_', expand=True)

In [25]:
# Preview the split pieces
ebola_split.head()

Unnamed: 0,0,1
0,Cases,Guinea
1,Cases,Guinea
2,Cases,Guinea
3,Cases,Guinea
4,Cases,Guinea


In [26]:
# Assigning a scalar to a new column name adds that column with the same value on every row.
# NOTE(review): 'test' looks like a scratch/demo column and it stays in ebola_long afterwards — confirm it is intended.
ebola_long['test'] = 1

In [27]:
# Preview ebola_long with the newly added column
ebola_long.head()

Unnamed: 0,Date,Day,cd_country,count,test
0,1/5/2015,289,Cases_Guinea,2776.0,1
1,1/4/2015,288,Cases_Guinea,2775.0,1
2,1/3/2015,287,Cases_Guinea,2769.0,1
3,1/2/2015,286,Cases_Guinea,,1
4,12/31/2014,284,Cases_Guinea,2730.0,1


In [28]:
# Add the two split pieces to ebola_long as the new columns 'status' and 'country'
# (this modifies ebola_long in place)
ebola_long[['status','country']] = ebola_split

In [29]:
# Preview the final table, now with separate 'status' and 'country' columns
ebola_long.head()

Unnamed: 0,Date,Day,cd_country,count,test,status,country
0,1/5/2015,289,Cases_Guinea,2776.0,1,Cases,Guinea
1,1/4/2015,288,Cases_Guinea,2775.0,1,Cases,Guinea
2,1/3/2015,287,Cases_Guinea,2769.0,1,Cases,Guinea
3,1/2/2015,286,Cases_Guinea,,1,Cases,Guinea
4,12/31/2014,284,Cases_Guinea,2730.0,1,Cases,Guinea
