In [1]:
import pandas as pd

In [None]:
# Pandas has very convenient features to aggregate data. That is, to compute summary statistics about datasets
# either as a whole or after dividing them into subsets based on data values.
# I will show you how to describe data this way, how to group them, and how to create multidimensional groupings
# known as pivot tables.
# I will also show you how easy it is to load a data frame from a standard comma-separated-value text file.
# As I said, pandas is very good at reading data from many different types of files: JSON, text, Excel, etc.

In [2]:
# Let's have a look at the first few lines of the file. We open it for reading inside a `with` block (so the file
# is closed again as soon as we are done), call readlines, and use slicing to select the first 10 rows.
# As you can see, the first line specifies the names of data columns. The following lines give the values, separated by commas.
with open('tips.csv', 'r') as tips_file:
    first_rows = tips_file.readlines()[:10]
first_rows

['"total_bill","tip","sex","smoker","day","time","size"\n',
 '16.99,1.01,"Female","No","Sun","Dinner",2\n',
 '10.34,1.66,"Male","No","Sun","Dinner",3\n',
 '21.01,3.5,"Male","No","Sun","Dinner",3\n',
 '23.68,3.31,"Male","No","Sun","Dinner",2\n',
 '24.59,3.61,"Female","No","Sun","Dinner",4\n',
 '25.29,4.71,"Male","No","Sun","Dinner",4\n',
 '8.77,2,"Male","No","Sun","Dinner",2\n',
 '26.88,3.12,"Male","No","Sun","Dinner",4\n',
 '15.04,1.96,"Male","No","Sun","Dinner",2\n']

In [15]:
# In this case, we will load a data frame from the comma-separated-value file tips.csv.
# Pandas can read such a file with its read_csv function.
# It will do its best to guess data types, but in some cases, you may need to give it some guidance.
# For instance, you could tell it to ignore certain columns or lines.
# You could specify the names of columns if they're not given.
# You could also prescribe how to handle missing data.
tips = pd.read_csv('tips.csv')
# Let's see the top of the resulting data frame.
# It gives the total amount of the bill, the tip, the sex of the diner, whether the diner was a smoker,
# the day and time, and the size of the party.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [12]:
# One easy thing that we can do is to apply the aggregation function: mean, which will compute average values
# for all the columns for which it's meaningful to do so.
# Since pandas 2.0, mean no longer silently skips text columns (sex, smoker, day, time) and raises a TypeError
# instead, so we pass numeric_only=True to restrict the average to the numeric columns.
tips.mean(numeric_only=True)

total_bill    19.785943
tip            2.998279
size           2.569672
dtype: float64

In [5]:
# If we want more information, we can ask pandas to describe the dataset.
# In this case, pandas reports the count (that is, how many rows there were), the mean, the standard deviation,
# the minimum and maximum values, and the 25%, 50%, and 75% quantiles.
tips.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [14]:
# Let's talk about grouping.
# Let's say you want to know how well men tip versus women.
# For that, we can tell pandas to group the data frame based on the value of the column: sex
# using the function groupby.
# And then, we can take the mean. We pass numeric_only=True so that the text columns (smoker, day, time)
# are left out; since pandas 2.0 they would otherwise make mean raise a TypeError.
# We see that in this dataset, men left larger tips than women on average (their average bill was larger, too).
tips.groupby('sex').mean(numeric_only=True)

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,18.056897,2.833448,2.45977
Male,20.744076,3.089618,2.630573


In [21]:
# We could also group the data based on two columns by giving a list to groupby.
# For instance, sex and smoker, which specifies whether the diner was a smoker.
# And again, we take the mean, restricted to the numeric columns with numeric_only=True
# (since pandas 2.0 the remaining text columns, day and time, would otherwise raise a TypeError).
# The result is indexed by a pandas multidimensional index with one level per grouping column.
tips.groupby(['sex', 'smoker']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,18.105185,2.773519,2.592593
Female,Yes,17.977879,2.931515,2.242424
Male,No,19.791237,3.113402,2.71134
Male,Yes,22.2845,3.051167,2.5


In [22]:
# A similar idea is the so-called pivot table.
# pivot_table is used to summarize and aggregate the data inside a data frame.
# Here, we create groups and assign them to both index values and columns, so that they represent a multidimensional
# analysis of the data in tabular format.
# For instance, we'll create a pivot table for our tips data frame showing the average total bill, grouped by sex and
# smoker status.
# The positional arguments after the data frame are, in order: values, index, and columns.
pd.pivot_table(tips,'total_bill','sex','smoker')

smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,18.105185,17.977879
Male,19.791237,22.2845


In [26]:
# Alternatively, you can say exactly the same thing more declaratively: call the data frame's own
# pivot_table method and name every argument, including the aggregation function.
tips.pivot_table(values='total_bill', index='sex', columns='smoker', aggfunc='mean')

smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,18.105185,17.977879
Male,19.791237,22.2845


In [9]:
# We can also drill down and add one more level of grouping in each dimension.
# Again, we call pivot_table on the tips data frame for the total bill column, this time grouping by
# sex and smoker along the rows and by day and time along the columns.
pd.pivot_table(
    tips,
    values='total_bill',
    index=['sex', 'smoker'],
    columns=['day', 'time'],
)

Unnamed: 0_level_0,day,Fri,Fri,Sat,Sun,Thur,Thur
Unnamed: 0_level_1,time,Dinner,Lunch,Dinner,Dinner,Dinner,Lunch
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,No,22.75,15.98,19.003846,20.824286,18.78,15.899167
Female,Yes,12.2,13.26,20.266667,16.54,,19.218571
Male,No,17.475,,19.929063,20.403256,,18.4865
Male,Yes,25.892,11.386667,21.837778,26.141333,,19.171


In [None]:
# You can see how this type of analytics can be very effective in spotting trends and features of a dataset.