In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# check all available datasets in seaborn

datasets = sns.get_dataset_names()
print(datasets)

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


In [4]:
tips_data = sns.load_dataset('tips')
print(tips_data)

     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]


In [5]:
tips_data.shape

(244, 7)

In [6]:
tips_data.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [7]:
tips_data.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [8]:
tips_data.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
tips_data.select_dtypes(include='number')

Unnamed: 0,total_bill,tip,size
0,16.99,1.01,2
1,10.34,1.66,3
2,21.01,3.50,3
3,23.68,3.31,2
4,24.59,3.61,4
...,...,...,...
239,29.03,5.92,3
240,27.18,2.00,2
241,22.67,2.00,2
242,17.82,1.75,2


In [14]:
tips_data.select_dtypes(include='object')

0
1
2
3
4
...
239
240
241
242
243


## measures of central tendency

In [15]:
tips_data.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [18]:
tips_data.tip

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64

In [20]:
tips_tip_mean = tips_data.tip.mean()
print(tips_tip_mean)

2.99827868852459


In [24]:
tip_mean = tips_data['tip'].mean()
tip_median = tips_data['tip'].median()
tip_mode = tips_data['tip'].mode().values[0]


print(tip_mean)
print(tip_median)
print(tip_mode)

2.99827868852459
2.9
2.0


In [25]:
tips_data.day.value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [26]:
tips_data.time.value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [27]:
tips_data.sex.value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [28]:
tips_data.smoker.value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

# measures of dispersion

In [37]:
#  highest and lowest tips

max_tip = tips_data.tip.max()
min_tip = tips_data.tip.min()
tip_range = max_tip - min_tip

print("Minimum Tip:-", min_tip)
print("Maximum Tip:-", max_tip)
print("Tip Range:-", tip_range)



Minimum Tip:- 1.0
Maximum Tip:- 10.0
Tip Range:- 9.0


In [36]:
#  highest and lowest bills

max_bill = tips_data.total_bill.max()
min_bill = tips_data.total_bill.min()
bill_range = max_bill - min_bill

print("Minimum Bill Amount:-", min_bill)
print("Maximum Bill Amount:-", max_bill)
print("Bill Range:-", bill_range)


Minimum Bill Amount:- 3.07
Maximum Bill Amount:- 50.81
Bill Range:- 47.74


In [39]:
#  range for all numerical columns together

tips_num_cols = tips_data.select_dtypes(include="number")

tips_num_cols

Unnamed: 0,total_bill,tip,size
0,16.99,1.01,2
1,10.34,1.66,3
2,21.01,3.50,3
3,23.68,3.31,2
4,24.59,3.61,4
...,...,...,...
239,29.03,5.92,3
240,27.18,2.00,2
241,22.67,2.00,2
242,17.82,1.75,2


In [46]:

all_max = tips_num_cols.max()
all_min = tips_num_cols.min()
all_range = all_max - all_min

print("\n\nColumn-wise max\n", all_max)
print("\n\nColumn-wise min\n", all_min)
print("\n\nColumn-wise Range\n", all_range)



Column-wise max
 total_bill    50.81
tip           10.00
size           6.00
dtype: float64


Column-wise min
 total_bill    3.07
tip           1.00
size          1.00
dtype: float64


Column-wise Range
 total_bill    47.74
tip            9.00
size           5.00
dtype: float64


In [48]:
# variance for indevidual columns

tip_variance = tips_data["tip"].var()
bill_variance = tips_data["total_bill"].var()


print("Tip variance:-", tip_variance)
print("Bill variance:-", bill_variance)

Tip variance:- 1.914454638062471
Bill variance:- 79.25293861397827


In [49]:
#  variance for all columns

tips_num_cols.var()

total_bill    79.252939
tip            1.914455
size           0.904591
dtype: float64

In [53]:
#  variance with numpy

np_tip_var = np.var(tips_data["tip"])
np_bill_var = np.var(tips_data.total_bill)

print("NP tip variance:- ", np_tip_var)
print("NP bill variance:- ", np_bill_var)

NP tip variance:-  1.9066085124966412
NP bill variance:-  78.92813148851114


In [54]:
#  comparing variance and standard deviation

tips_data.tip.var(), tips_data.tip.std()**2

(1.914454638062471, 1.9144546380624712)

# measures of location

In [56]:
#  Quartiles for bills

np.quantile(tips_data.total_bill, 0.25), np.quantile(tips_data.total_bill, 0.50), np.quantile(tips_data.total_bill, 0.75)

(13.3475, 17.795, 24.127499999999998)

In [57]:
#  deciles for tips

np.quantile(tips_data.tip, 0.1), np.quantile(tips_data.tip, 0.2), np.quantile(tips_data.tip, 0.3)

(1.5, 2.0, 2.0)

In [60]:
q1_bills = np.quantile(tips_data.total_bill, 0.25)
q2_bills = np.quantile(tips_data.total_bill, 0.50)
q3_bills = np.quantile(tips_data.total_bill, 0.75)

percentile_25 = np.percentile(tips_data.total_bill, 25)
percentile_50 = np.percentile(tips_data.total_bill, 50)
percentile_75 = np.percentile(tips_data.total_bill, 75)

bill_IQR = q3_bills - q1_bills

print("Q1:- ", q1_bills)
print("Q2 (median):- ", q2_bills)
print("Q3:- ", q3_bills)
print("25th Percentile:- ", percentile_25)
print("50th Percentile:- ", percentile_50)
print("75th Percentile:- ", percentile_75)
print("IQR:- ", bill_IQR)

Q1:-  13.3475
Q2 (median):-  17.795
Q3:-  24.127499999999998
25th Percentile:-  13.3475
50th Percentile:-  17.795
75th Percentile:-  24.127499999999998
IQR:-  10.779999999999998
