# Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [6]:
import numpy as np
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [7]:
chipo = pd.read_csv("https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv",sep = '\t')
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


### Step 4. See the first 10 entries

In [14]:
chipo.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 5. What is the number of observations in the dataset?

In [16]:
chipo.shape

(4622, 5)

In [17]:
chipo.shape[0]

4622

In [18]:
len(chipo)

4622

In [20]:
chipo.count()

order_id              4622
quantity              4622
item_name             4622
choice_description    3376
item_price            4622
dtype: int64

### Step 6. What is the number of columns in the dataset?

In [21]:
chipo.shape[1]

5

### Step 7. Print the name of all the columns.

In [22]:
chipo.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

### Step 8. How is the dataset indexed?

In [23]:
chipo.index

RangeIndex(start=0, stop=4622, step=1)

### Step 9. Which was the most ordered item?

In [24]:
chipo['item_name'].value_counts().index[0]

'Chicken Bowl'

In [25]:
chipo['item_name'].value_counts(ascending=True).loc[chipo['item_name'].value_counts(ascending=True) == 1]

Carnitas Salad                       1
Chips and Mild Fresh Tomato Salsa    1
Veggie Crispy Tacos                  1
Name: item_name, dtype: int64

In [26]:
chipo['item_name'].value_counts(normalize = True)

Chicken Bowl                             0.157075
Chicken Burrito                          0.119645
Chips and Guacamole                      0.103635
Steak Burrito                            0.079619
Canned Soft Drink                        0.065123
Steak Bowl                               0.045651
Chips                                    0.045651
Bottled Water                            0.035050
Chicken Soft Tacos                       0.024881
Chips and Fresh Tomato Salsa             0.023799
Chicken Salad Bowl                       0.023799
Canned Soda                              0.022501
Side of Chips                            0.021852
Veggie Burrito                           0.020554
Barbacoa Burrito                         0.019688
Veggie Bowl                              0.018390
Carnitas Bowl                            0.014712
Barbacoa Bowl                            0.014280
Carnitas Burrito                         0.012765
Steak Soft Tacos                         0.011900


In [28]:
counts_group = chipo.groupby('item_name')
counts_group = counts_group.sum()
counts_group = counts_group.sort_values(['quantity'],ascending=False)
counts_group.head(1)

  counts_group = counts_group.sum()


Unnamed: 0_level_0,order_id,quantity
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicken Bowl,713926,761


### Step 10. How many items were ordered?

In [29]:
chipo.quantity.sum()

4972

### Step 11. What was the most ordered item in the choice_description column?

In [43]:
chipo.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


In [45]:
chipo.groupby('item_name').sum().sort_values(['quantity'],ascending=False).head(1)

  chipo.groupby('item_name').sum().sort_values(['quantity'],ascending=False).head(1)


Unnamed: 0_level_0,order_id,quantity,item_price
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chicken Bowl,713926,761,7342.73


### Step 12. How many items were orderd in total?

In [36]:
chipo.quantity.sum()

4972

### Step 13. Turn the item price into a float

In [46]:
chipo.item_price.dtype


dtype('float64')

In [None]:
chipo['item_price'] = chipo['item_price'].apply(lambda x: x.replace('$',''))
chipo['item_price'] = chipo['item_price'].astype('float')


In [48]:
chipo.item_price.dtype

dtype('float64')

### Step 14. How much was the revenue for the period in the dataset?

In [50]:
chipo.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


In [51]:
chipo['revenue'] = chipo['quantity']*chipo['item_price']
chipo['revenue'].sum()

39237.02

### Step 15. How many orders were made in the period?

In [52]:
chipo['order_id'].value_counts().count()


1834

In [54]:
chipo['order_id'].nunique()

1834

In [55]:
chipo['order_id'].unique()

array([   1,    2,    3, ..., 1832, 1833, 1834], dtype=int64)

### Step 16. What is the average amount per order?

In [56]:
chipo.groupby('order_id')['revenue'].agg('mean')

order_id
1        2.890000
2       33.960000
3        6.335000
4       10.500000
5        6.850000
          ...    
1830    11.500000
1831     4.300000
1832     6.600000
1833    11.750000
1834     9.583333
Name: revenue, Length: 1834, dtype: float64

In [57]:
chipo['revenue'].sum() / chipo.shape[0]


8.48918649935093

### Step 17. How many different items are sold?

In [58]:
chipo.item_name.value_counts().count()


50

In [59]:
chipo['item_name'].nunique()


50