In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the per-country drinks dataset into a DataFrame and list its column labels.
# The path is kept in `file` because a later cell re-reads the same CSV.
file = "drinks.csv"
drinks = pd.read_csv(file)
drinks.columns

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'population', 'continent'],
      dtype='object')

In [3]:
# How do we normally use a string method? Call it directly on a plain Python str.
'data science'.upper()

'DATA SCIENCE'

## Verificando os tipos do Dataset

In [4]:
# Inspect the dtype of every column through the `dtypes` attribute
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

In [5]:
# Preview the first rows of the drinks dataset
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,population,continent
0,Afghanistan,0,0,0,0.0,34.660.000,Asia
1,Albania,89,132,54,4.9,2.876.000,Europe
2,Algeria,25,0,14,0.7,40.061.000,Africa
3,Andorra,245,138,312,12.4,77.281.000,Europe
4,Angola,217,57,45,5.9,28.081.000,Africa


In [6]:
# Change the dtype of an existing Series with astype() and write it back to the frame
drinks['beer_servings'] = drinks['beer_servings'].astype(float)

In [7]:
# Preview the rows again to see beer_servings after the astype(float) conversion
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,population,continent
0,Afghanistan,0.0,0,0,0.0,34.660.000,Asia
1,Albania,89.0,132,54,4.9,2.876.000,Europe
2,Algeria,25.0,0,14,0.7,40.061.000,Africa
3,Andorra,245.0,138,312,12.4,77.281.000,Europe
4,Angola,217.0,57,45,5.9,28.081.000,Africa


In [8]:
# Confirm the new dtype of beer_servings
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

In [9]:
# Alternatively, choose a column's dtype while the file is being read.
# Re-reading replaces `drinks`, so the earlier in-memory beer_servings cast is discarded.
drinks = pd.read_csv(
    file,
    dtype={'spirit_servings': float},
)
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                 float64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

## Lendo outro dataset

In [10]:
# Load the tab-separated sandwich orders file and preview its first rows
arquivo = "sanduiches.txt"
vendas = pd.read_csv(arquivo, sep="\t")
vendas.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [11]:
# Note the item_price column: it was read as object (text), not as a number
vendas.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

In [12]:
# Look at the first 10 order lines
vendas.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


## O preço do sanduíche é uma string e está com o caractere $

#### Como podemos converter em float e trazer a média com uma linha de código? Com astype!

In [20]:
# Convert the price strings to float and compute the mean in a single chain.
# regex=False removes '$' as a literal character: as a regular expression '$' is
# the end-of-string anchor, and the default value of `regex` differs between
# pandas versions, so the intent is made explicit here.
vendas.item_price.str.replace('$', '', regex=False).astype(float).mean()

7.464335785374397

## Mais um pouco sobre booleanos

In [26]:
# We have already seen str.contains: flag the rows whose item name includes 'Chicken'
vendas['item_name'].str.contains('Chicken').head(10)

0    False
1    False
2    False
3    False
4     True
5     True
6    False
7    False
8    False
9    False
Name: item_name, dtype: bool

In [31]:
# Cast the boolean Series to integers (False -> 0, True -> 1)
vendas['item_name'].str.contains('Chicken').astype(int).head(10)

0    0
1    0
2    0
3    0
4    1
5    1
6    0
7    0
8    0
9    0
Name: item_name, dtype: int64

In [32]:
# The same conversion also works when the result is stored in a variable
vendas_int = vendas['item_name'].str.contains('Chicken').astype(int)
vendas_int.head(10)

0    0
1    0
2    0
3    0
4    1
5    1
6    0
7    0
8    0
9    0
Name: item_name, dtype: int64

In [33]:
# How many order lines have 'Tomato' in the item name? Summing the 0/1 flags counts them.
vendas_int = vendas['item_name'].str.contains('Tomato').astype(int)
vendas_int.sum()

111

In [34]:
# Does this restaurant sell Coke? Count the item names that contain 'Coke'
vendas['item_name'].str.contains('Coke').astype(int).sum()

0

In [35]:
# Check the sales orders. Preview the first 10 rows instead of dumping the
# whole DataFrame into the notebook output.
vendas.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [36]:
# ERROR demonstration: choice_description contains NaN values, so str.contains
# yields NaN for those rows and the cast to int fails.
# The exception is caught and printed so the cell still shows the error
# without aborting a "Restart & Run All".
try:
    print(vendas.choice_description.str.contains('Coke').astype(int).sum())
except ValueError as erro:
    print(f"{type(erro).__name__}: {erro}")

ValueError: cannot convert float NaN to integer