# Revisão Pandas

## Lendo uma base de dados

In [34]:
# Load the Super Bowl results from a local comma-separated file into a DataFrame.
import pandas as pd
df = pd.read_csv('superbowl.csv', sep=',')
# Preview the first five rows.
df.head()

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
0,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens,Florida
1,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta,Georgia
2,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis,Minnesota
3,Feb 5 2017,LI (51),New England Patriots,34,Atlanta Falcons,28,Tom Brady,NRG Stadium,Houston,Texas
4,Feb 7 2016,50,Denver Broncos,24,Carolina Panthers,10,Von Miller,Levi's Stadium,Santa Clara,California


## Entendendo as colunas

In [4]:
# List the data type pandas assigned to each column.
df.dtypes

Date          object
SB            object
Winner        object
Winner Pts     int64
Loser         object
Loser Pts      int64
MVP           object
Stadium       object
City          object
State         object
dtype: object

In [5]:
# (number of rows, number of columns) of the DataFrame.
df.shape

(54, 10)

## Manipulando as colunas

In [10]:
# Selecting a single column by name returns a Series; show its first five values.
df['Date'].head()

0    Feb 2 2020
1    Feb 3 2019
2    Feb 4 2018
3    Feb 5 2017
4    Feb 7 2016
Name: Date, dtype: object

In [11]:
# First five values of the winners' score column.
df['Winner Pts'].head()

0    31
1    13
2    41
3    34
4    24
Name: Winner Pts, dtype: int64

In [12]:
# Basic aggregations over the winners' scores, printed as label/value pairs.
pontos = df['Winner Pts']
resumo = (
    ('Soma:', pontos.sum()),
    ('Média:', pontos.mean()),
    ('Contagem: ', pontos.count()),
)
for rotulo, valor in resumo:
    print(rotulo, valor)

Soma: 1626
Média: 30.11111111111111
Contagem:  54


In [19]:
# Take the entry labelled 0 in the Date column and show it.
data = df['Date'][0]
print(data)

# Everything before the first space is the month abbreviation.
mes = data.split(' ', maxsplit=1)[0]
print(mes)

Feb 2 2020
Feb


In [28]:
# Extract the month abbreviation (text before the first space) from every date
# and wrap the resulting list in a Series.
meses = pd.Series([linha.split(' ')[0] for linha in df['Date']])
meses.head()

0    Feb
1    Feb
2    Feb
3    Feb
4    Feb
dtype: object

### Método Cabreira

In [29]:
def dividir_data(data_string):
    """Return the part of ``data_string`` that comes before its first space.

    If the string contains no space, the whole string is returned.
    """
    primeira_parte, _separador, _resto = data_string.partition(' ')
    return primeira_parte

# Series.apply calls dividir_data once per value and collects the results
# into a new Series, replacing the explicit loop.
meses = df['Date'].apply(dividir_data)
meses.head()

0    Feb
1    Feb
2    Feb
3    Feb
4    Feb
Name: Date, dtype: object

### Método mais cabreira ainda

In [30]:
# Same extraction as before, with the helper written inline as a lambda:
# keep the text before the first space of each date.
meses = df['Date'].apply(lambda x: x.split(' ')[0])
meses.head()

0    Feb
1    Feb
2    Feb
3    Feb
4    Feb
Name: Date, dtype: object

In [31]:
# Assigning a scalar to a new column name creates the column with that
# value repeated on every row.
df['Contagem'] = 1
df.head()

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State,Contagem
0,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens,Florida,1
1,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta,Georgia,1
2,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis,Minnesota,1
3,Feb 5 2017,LI (51),New England Patriots,34,Atlanta Falcons,28,Tom Brady,NRG Stadium,Houston,Texas,1
4,Feb 7 2016,50,Denver Broncos,24,Carolina Panthers,10,Von Miller,Levi's Stadium,Santa Clara,California,1


In [33]:
# Store the month abbreviations computed earlier (`meses`) as a new column.
df['Mes'] = meses
df.head()

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State,Contagem,Mes
0,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens,Florida,1,Feb
1,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta,Georgia,1,Feb
2,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis,Minnesota,1,Feb
3,Feb 5 2017,LI (51),New England Patriots,34,Atlanta Falcons,28,Tom Brady,NRG Stadium,Houston,Texas,1,Feb
4,Feb 7 2016,50,Denver Broncos,24,Carolina Panthers,10,Von Miller,Levi's Stadium,Santa Clara,California,1,Feb


## Ordenando os valores

- Quais 10 edições do SuperBowl tiveram o maior número de pontos do time vencedor?

In [42]:
# Sort by the winner's score, highest first, and show the top ten rows.
# NOTE: the ranking uses 'Winner Pts' only; the loser's points are not added in.
ordenado = df.sort_values(by='Winner Pts', ascending=False)
ordenado.head(10)

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
30,Jan 28 1990,XXIV (24),San Francisco 49ers,55,Denver Broncos,10,Joe Montana+,Louisiana Superdome,New Orleans,Louisiana
27,Jan 31 1993,XXVII (27),Dallas Cowboys,52,Buffalo Bills,17,Troy Aikman+,Rose Bowl,Pasadena,California
25,Jan 29 1995,XXIX (29),San Francisco 49ers,49,San Diego Chargers,26,Steve Young+,Joe Robbie Stadium,Miami Gardens,Florida
17,Jan 26 2003,XXXVII (37),Tampa Bay Buccaneers,48,Oakland Raiders,21,Dexter Jackson,Qualcomm Stadium,San Diego,California
34,Jan 26 1986,XX (20),Chicago Bears,46,New England Patriots,10,Richard Dent+,Louisiana Superdome,New Orleans,Louisiana
6,Feb 2 2014,XLVIII (48),Seattle Seahawks,43,Denver Broncos,8,Malcolm Smith,MetLife Stadium,East Rutherford,New Jersey
32,Jan 31 1988,XXII (22),Washington Redskins,42,Denver Broncos,10,Doug Williams,Jack Murphy Stadium,San Diego,California
2,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis,Minnesota
33,Jan 25 1987,XXI (21),New York Giants,39,Denver Broncos,20,Phil Simms,Rose Bowl,Pasadena,California
36,Jan 22 1984,XVIII (18),Los Angeles Raiders,38,Washington Redskins,9,Marcus Allen+,Tampa Stadium,Tampa,Florida


- Ordene os nomes dos MVPs de todas as edições do superbowl em ordem alfabética (crescente)

In [45]:
# Sort rows alphabetically (A to Z) by MVP name and show the first ten.
ordenado = df.sort_values(by='MVP', ascending=True)
ordenado.head(10)

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
9,Feb 6 2011,XLV (45),Green Bay Packers,31,Pittsburgh Steelers,25,Aaron Rodgers,Cowboys Stadium,Arlington,Texas
53,Jan 15 1967,I (1),Green Bay Packers,35,Kansas City Chiefs,10,Bart Starr+,Memorial Coliseum,Los Angeles,California
52,Jan 14 1968,II (2),Green Bay Packers,33,Oakland Raiders,14,Bart Starr+,Orange Bowl,Miami,Florida
49,Jan 17 1971,V (5),Baltimore Colts,16,Dallas Cowboys,13,Chuck Howley,Orange Bowl,Miami,Florida
15,Feb 6 2005,XXXIX (39),New England Patriots,24,Philadelphia Eagles,21,Deion Branch,Alltel Stadium,Jacksonville,Florida
23,Jan 26 1997,XXXI (31),Green Bay Packers,35,New England Patriots,21,Desmond Howard,Louisiana Superdome,New Orleans,Louisiana
17,Jan 26 2003,XXXVII (37),Tampa Bay Buccaneers,48,Oakland Raiders,21,Dexter Jackson,Qualcomm Stadium,San Diego,California
32,Jan 31 1988,XXII (22),Washington Redskins,42,Denver Broncos,10,Doug Williams,Jack Murphy Stadium,San Diego,California
10,Feb 7 2010,XLIV (44),New Orleans Saints,31,Indianapolis Colts,17,Drew Brees,Sun Life Stadium,Miami Gardens,Florida
12,Feb 3 2008,XLII (42),New York Giants,17,New England Patriots,14,Eli Manning,University of Phoenix Stadium,Glendale,Arizona


## Operações de filtro

In [46]:
# Quick reminder of the table layout before filtering.
df.head()

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
0,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens,Florida
1,Feb 3 2019,LIII (53),New England Patriots,13,Los Angeles Rams,3,Julian Edelman,Mercedes-Benz Stadium,Atlanta,Georgia
2,Feb 4 2018,LII (52),Philadelphia Eagles,41,New England Patriots,33,Nick Foles,U.S. Bank Stadium,Minneapolis,Minnesota
3,Feb 5 2017,LI (51),New England Patriots,34,Atlanta Falcons,28,Tom Brady,NRG Stadium,Houston,Texas
4,Feb 7 2016,50,Denver Broncos,24,Carolina Panthers,10,Von Miller,Levi's Stadium,Santa Clara,California


- Quantas superbowls foram realizadas na Orange Bowl (Florida)?

In [48]:
# Boolean mask: True for the rows whose stadium is the Orange Bowl.
jogos_no_orange_bowl = df['Stadium'] == 'Orange Bowl'
# Indexing with the mask keeps only the matching rows.
df[jogos_no_orange_bowl]

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
41,Jan 21 1979,XIII (13),Pittsburgh Steelers,35,Dallas Cowboys,31,Terry Bradshaw+,Orange Bowl,Miami,Florida
44,Jan 18 1976,X (10),Pittsburgh Steelers,21,Dallas Cowboys,17,Lynn Swann+,Orange Bowl,Miami,Florida
49,Jan 17 1971,V (5),Baltimore Colts,16,Dallas Cowboys,13,Chuck Howley,Orange Bowl,Miami,Florida
51,Jan 12 1969,III (3),New York Jets,16,Baltimore Colts,7,Joe Namath+,Orange Bowl,Miami,Florida
52,Jan 14 1968,II (2),Green Bay Packers,33,Oakland Raiders,14,Bart Starr+,Orange Bowl,Miami,Florida


- Quantas superbowls foram realizadas na Florida?

In [49]:
# Boolean mask: True for the rows whose state is Florida.
jogos_na_florida = df['State'] == 'Florida'
# Indexing with the mask keeps only the matching rows.
df[jogos_na_florida]

Unnamed: 0,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City,State
0,Feb 2 2020,LIV (54),Kansas City Chiefs,31,San Francisco 49ers,20,Patrick Mahomes,Hard Rock Stadium,Miami Gardens,Florida
10,Feb 7 2010,XLIV (44),New Orleans Saints,31,Indianapolis Colts,17,Drew Brees,Sun Life Stadium,Miami Gardens,Florida
11,Feb 1 2009,XLIII (43),Pittsburgh Steelers,27,Arizona Cardinals,23,Santonio Holmes,Raymond James Stadium,Tampa,Florida
13,Feb 4 2007,XLI (41),Indianapolis Colts,29,Chicago Bears,17,Peyton Manning,Dolphin Stadium,Miami Gardens,Florida
15,Feb 6 2005,XXXIX (39),New England Patriots,24,Philadelphia Eagles,21,Deion Branch,Alltel Stadium,Jacksonville,Florida
19,Jan 28 2001,XXXV (35),Baltimore Ravens,34,New York Giants,7,Ray Lewis+,Raymond James Stadium,Tampa,Florida
21,Jan 31 1999,XXXIII (33),Denver Broncos,34,Atlanta Falcons,19,John Elway+,Pro Player Stadium,Miami Gardens,Florida
25,Jan 29 1995,XXIX (29),San Francisco 49ers,49,San Diego Chargers,26,Steve Young+,Joe Robbie Stadium,Miami Gardens,Florida
29,Jan 27 1991,XXV (25),New York Giants,20,Buffalo Bills,19,Ottis Anderson,Tampa Stadium,Tampa,Florida
31,Jan 22 1989,XXIII (23),San Francisco 49ers,20,Cincinnati Bengals,16,Jerry Rice+,Joe Robbie Stadium,Miami Gardens,Florida


## Agrupamento

- Quantas superbowls foram realizadas em cada estado?

In [53]:
# Group rows by state and count the non-null values of every other column.
# The group key (State) becomes the index of the result.
agrupado = df.groupby(by='State').count()

# Double brackets select a one-column DataFrame; the 'SB' count is the number
# of games per state (assuming 'SB' has no missing values).
agrupado[['SB']]

Unnamed: 0_level_0,SB
State,Unnamed: 1_level_1
Arizona,3
California,12
Florida,16
Georgia,3
Indiana,1
Louisiana,10
Michigan,2
Minnesota,2
New Jersey,1
Texas,4


### Resetando o índice

In [55]:
# as_index=False keeps 'State' as a regular column instead of turning it
# into the index of the grouped result.
df.groupby(by='State', as_index=False).count()

Unnamed: 0,State,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City
0,Arizona,3,3,3,3,3,3,3,3,3
1,California,12,12,12,12,12,12,12,12,12
2,Florida,16,16,16,16,16,16,16,16,16
3,Georgia,3,3,3,3,3,3,3,3,3
4,Indiana,1,1,1,1,1,1,1,1,1
5,Louisiana,10,10,10,10,10,10,10,10,10
6,Michigan,2,2,2,2,2,2,2,2,2
7,Minnesota,2,2,2,2,2,2,2,2,2
8,New Jersey,1,1,1,1,1,1,1,1,1
9,Texas,4,4,4,4,4,4,4,4,4


In [56]:
# Alternative: reset_index() moves the 'State' index back into a regular column.
# The result is returned as a new DataFrame; `agrupado` is not reassigned here.
agrupado.reset_index()

Unnamed: 0,State,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium,City
0,Arizona,3,3,3,3,3,3,3,3,3
1,California,12,12,12,12,12,12,12,12,12
2,Florida,16,16,16,16,16,16,16,16,16
3,Georgia,3,3,3,3,3,3,3,3,3
4,Indiana,1,1,1,1,1,1,1,1,1
5,Louisiana,10,10,10,10,10,10,10,10,10
6,Michigan,2,2,2,2,2,2,2,2,2
7,Minnesota,2,2,2,2,2,2,2,2,2
8,New Jersey,1,1,1,1,1,1,1,1,1
9,Texas,4,4,4,4,4,4,4,4,4


## Misturando tudo

- Quais estados realizam o maior número de superbowls?

In [62]:
# Count games per state, then rank the states from most to fewest games.
mistura = (
    df.groupby(by='State', as_index=False)
    .count()
    .sort_values(by='SB', ascending=False)
)
# Keep only the state name and its game count for display.
mistura[['State', 'SB']]

Unnamed: 0,State,SB
2,Florida,16
1,California,12
5,Louisiana,10
9,Texas,4
0,Arizona,3
3,Georgia,3
6,Michigan,2
7,Minnesota,2
4,Indiana,1
8,New Jersey,1


- Quais cidades da Florida mais realizaram edições do superbowl?

In [64]:
# Filter to Florida, count games per (state, city) pair, and rank the cities
# from most to fewest games.
mistura = (
    df[df['State'] == 'Florida']
    .groupby(by=['State', 'City'], as_index=False)
    .count()
    .sort_values(by='SB', ascending=False)
)
mistura

Unnamed: 0,State,City,Date,SB,Winner,Winner Pts,Loser,Loser Pts,MVP,Stadium
2,Florida,Miami Gardens,6,6,6,6,6,6,6,6
1,Florida,Miami,5,5,5,5,5,5,5,5
3,Florida,Tampa,4,4,4,4,4,4,4,4
0,Florida,Jacksonville,1,1,1,1,1,1,1,1


- Quais são os times que mais ganharam o superbowl?

In [70]:
# Count titles per winning team, then rank the teams from most to fewest wins.
mistura = (
    df.groupby(by='Winner', as_index=False)
    .count()
    .sort_values(by='SB', ascending=False)
)
# Keep only the team name and its number of wins for display.
mistura[['Winner', 'SB']]

Unnamed: 0,Winner,SB
16,Pittsburgh Steelers,6
10,New England Patriots,6
3,Dallas Cowboys,5
17,San Francisco 49ers,5
5,Green Bay Packers,4
12,New York Giants,4
21,Washington Redskins,3
4,Denver Broncos,3
7,Kansas City Chiefs,2
9,Miami Dolphins,2


## Exercícios 

In [None]:
# Quantas superbowls existem?

In [None]:
# Quantas superbowls foram realizadas na California?

In [None]:
# Quais são os estádios mais utilizados para Superbowl na California?

In [None]:
# Quantas superbowls foram realizadas em Fevereiro?

In [None]:
# Quem é o MVP de todo o superbowl e em quantas edições ele foi eleito MVP?

# Problemas de Formatação

## Formato de data

In [None]:
# Desired format: dd/month/year
# Current format: year-month-day

# Load the AAPL data from a local comma-separated file and preview it.
import pandas as pd
apple = pd.read_csv('AAPL.csv', sep=',')
apple.head()

In [None]:
# Split just the first date into its three '-'-separated parts to check
# that the unpacking works before converting the whole column.
for data in apple['Date'].head(1):
    ano, mes, dia = data.split('-')
    print(ano, mes, dia)

In [None]:
# Rebuild every date as day/month/year. Unpacking into exactly three names
# raises ValueError if a value does not split into three '-'-separated parts.
novas_datas = [
    dia + '/' + mes + '/' + ano
    for ano, mes, dia in (data.split('-') for data in apple['Date'])
]

novas_datas[0]

In [None]:
# Overwrite the Date column with the reformatted strings.
# NOTE: after this runs, the values no longer contain '-', so re-running the
# conversion cell will fail on the unpacking; reload the CSV first.
apple['Date'] = novas_datas
apple.head()

## Produtos e-commerce

In [None]:
# Import the libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Page to scrape
url = 'https://www.kalunga.com.br/depto/gamers/13?menuID=109&menuCat=all'

# Download the page and build a soup object from it. The timeout keeps the
# cell from hanging on a stalled connection, and raise_for_status() stops us
# from parsing an HTTP error page as if it were the product listing.
resposta = requests.get(url, timeout=30)
resposta.raise_for_status()
html = resposta.text
soup = BeautifulSoup(html, 'html.parser')

# Collect one [title, price, category] row per product heading found
catalogo = []
for resultado in soup.find_all('h1', attrs={'class': 'text-left mt-2 text-dark'}):
    # The product card is four levels above the heading
    bloco = resultado.parent.parent.parent.parent

    titulo = resultado.text

    # find() returns None when the card has no price tag; skip such cards
    # instead of crashing on `.text`
    etiqueta_preco = bloco.find('span', attrs={'class': 'text-primary h6'})
    if etiqueta_preco is None:
        continue

    # Keep only the text before 'à vista'
    preco = etiqueta_preco.text.split('à vista')[0].strip()
    # Use the first three space-separated words of the title as the category
    categoria = ' '.join(titulo.split(' ')[0:3])

    catalogo.append([titulo, preco, categoria])

# Store the data
produtos = pd.DataFrame(catalogo, columns=['Título', 'Preço', 'Categoria'])
produtos

In [None]:
# While 'Preço' still holds the scraped text, sum() is expected to join the
# strings together rather than add numbers -- the values must be converted first.
produtos['Preço'].sum()

In [None]:
# How many products fall under each distinct category label.
produtos['Categoria'].value_counts()

In [None]:
# Naive attempt: strip the currency symbol and convert straight to float.
# float() rejects text it cannot parse (e.g. a decimal comma), so the failure
# is caught and displayed instead of aborting a full "Run All".
for preco in produtos['Preço']:
    try:
        print(float(preco.replace('R$', '')))
    except ValueError as erro:
        print('Falha na conversão:', erro)
        break

In [None]:
# Turn each price string into a float: drop the currency symbol, remove the
# '.' characters, then swap the ',' for a '.' so float() can parse the result.
precos = [
    float(preco.replace('R$', '').replace('.', '').replace(',', '.'))
    for preco in produtos['Preço']
]

produtos['Preço'] = precos
produtos['Preço'].sum()

In [None]:
# Keep only the text before the first '-' of every category label.
categorias = [categoria.split('-')[0] for categoria in produtos['Categoria']]

produtos['Categoria'] = categorias
produtos['Categoria'].value_counts()

# Valores duplicados

In [None]:
from random import choice

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.google.com/search?q=ciencia+de+dados'

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() avoids parsing an HTTP error page as search results.
resposta = requests.get(url, timeout=30)
resposta.raise_for_status()
html = resposta.text
soup = BeautifulSoup(html, 'html.parser')

resultados = []
for resultado in soup.find_all('div', attrs={'class': 'BNeawe vvjwJb AP7Wnd'}):
    # The result container is four levels above the title element
    bloco = resultado.parent.parent.parent.parent

    # `bloco.a` and find() are None when the element is missing; skip such
    # incomplete blocks instead of crashing on them
    ancora = bloco.a
    bloco_descricao = bloco.find('div', attrs={'class': 'BNeawe s3v9rd AP7Wnd'})
    if ancora is None or bloco_descricao is None:
        continue

    titulo = resultado.text
    # Named `link` so the search `url` defined above is not overwritten
    link = ancora['href']
    descricao = bloco_descricao.text

    resultados.append([titulo, link, descricao])

    # Randomly append the same row a second time so the table ends up with
    # duplicated entries to clean afterwards
    duplicar = choice([False, True])
    if duplicar:
        resultados.append([titulo, link, descricao])


buscas = pd.DataFrame(resultados, columns=['Título', 'URL', 'Descrição'])
buscas

In [None]:
# Remove fully repeated rows, then renumber the index from 0 (drop=True
# discards the old index instead of keeping it as a column).
# The result is only displayed; `buscas` itself is not reassigned.
buscas.drop_duplicates().reset_index(drop=True)

# Atividade

## Exercício 1

In [None]:
# Reload the Super Bowl data from the CSV so the exercise starts from the
# original columns.
import pandas as pd
df = pd.read_csv('superbowl.csv', sep=',')
df.head()

1. Criar 3 novas colunas: dia, mês, ano

2. Transformar a coluna mês em número

3. Colocar a coluna Date no formato dia/mes/ano 

## Exercício 2

In [None]:
# pandas is imported here too, so the cell runs on a fresh kernel
# (pd.DataFrame is used at the end).
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.youtube.com/results?search_query=ciencia+de+dados'

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() avoids parsing an HTTP error page as search results.
resposta = requests.get(url, timeout=30)
resposta.raise_for_status()
html = resposta.text
soup = BeautifulSoup(html, 'html.parser')

resultados = []
for video in soup.find_all('div', attrs={'class': 'yt-lockup-content'}):
    # Look the title heading up once; it carries the link and the duration span
    cabecalho = video.find('h3', attrs={'class': 'yt-lockup-title'})
    # Skip entries missing the heading, its link or its span instead of crashing
    if cabecalho is None or cabecalho.a is None or cabecalho.span is None:
        continue

    titulo = cabecalho.a.text
    duracao = cabecalho.span.text
    # Named `link` so the search `url` defined above is not overwritten
    link = cabecalho.a['href']
    resultados.append([titulo, duracao, link])

videos = pd.DataFrame(resultados, columns=['Título', 'Duração', 'URL'])
videos.head()

1. Formatar a coluna de duração para o formato mm:ss

2. Criar duas novas colunas: Minutos, Segundos

3. Adicionar 'https://www.youtube.com' antes dos valores da coluna URL

## Exercício 3

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.google.com/search?q=ciencia+de+dados'

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() avoids parsing an HTTP error page as search results.
resposta = requests.get(url, timeout=30)
resposta.raise_for_status()
html = resposta.text
soup = BeautifulSoup(html, 'html.parser')

resultados = []
for resultado in soup.find_all('div', attrs={'class': 'BNeawe vvjwJb AP7Wnd'}):
    # The result container is four levels above the title element
    bloco = resultado.parent.parent.parent.parent

    # `bloco.a` and find() are None when the element is missing; skip such
    # incomplete blocks instead of crashing on them
    ancora = bloco.a
    bloco_descricao = bloco.find('div', attrs={'class': 'BNeawe s3v9rd AP7Wnd'})
    if ancora is None or bloco_descricao is None:
        continue

    titulo = resultado.text
    # Named `link` so the search `url` defined above is not overwritten
    link = ancora['href']
    descricao = bloco_descricao.text

    resultados.append([titulo, link, descricao])

buscas = pd.DataFrame(resultados, columns=['Título', 'URL', 'Descrição'])
buscas.head()

1. Eliminar a string "/url?q=" do campo URL