### Using NumPy to calculate statistics and pandas to filter, sort, and summarize a small dataset.

In [1]:
import numpy as np
import pandas as pd
path = "C:\\Users\\tkasiror\\Downloads\\tips.csv"  

In [2]:
#This dataset comes from a restaurant and includes data about customer tips. It contains 244 rows and 7 columns
data = pd.read_csv(path)

In [3]:
data.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [4]:
data.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [5]:
#total_bilL column as a ndarray
total_bill_np = data.total_bill.to_numpy()
total_bill_np

array([16.99, 10.34, 21.01, 23.68, 24.59, 25.29,  8.77, 26.88, 15.04,
       14.78, 10.27, 35.26, 15.42, 18.43, 14.83, 21.58, 10.33, 16.29,
       16.97, 20.65, 17.92, 20.29, 15.77, 39.42, 19.82, 17.81, 13.37,
       12.69, 21.7 , 19.65,  9.55, 18.35, 15.06, 20.69, 17.78, 24.06,
       16.31, 16.93, 18.69, 31.27, 16.04, 17.46, 13.94,  9.68, 30.4 ,
       18.29, 22.23, 32.4 , 28.55, 18.04, 12.54, 10.29, 34.81,  9.94,
       25.56, 19.49, 38.01, 26.41, 11.24, 48.27, 20.29, 13.81, 11.02,
       18.29, 17.59, 20.08, 16.45,  3.07, 20.23, 15.01, 12.02, 17.07,
       26.86, 25.28, 14.73, 10.51, 17.92, 27.2 , 22.76, 17.29, 19.44,
       16.66, 10.07, 32.68, 15.98, 34.83, 13.03, 18.28, 24.71, 21.16,
       28.97, 22.49,  5.75, 16.32, 22.75, 40.17, 27.28, 12.03, 21.01,
       12.46, 11.35, 15.38, 44.3 , 22.42, 20.92, 15.36, 20.49, 25.21,
       18.24, 14.31, 14.  ,  7.25, 38.07, 23.95, 25.71, 17.31, 29.93,
       10.65, 12.43, 24.08, 11.69, 13.42, 14.26, 15.95, 12.48, 29.8 ,
        8.52, 14.52,

### Statistics with Numpy

In [6]:
#findinng the percentiles of the data
percentile = np.percentile(total_bill_np, q= [25,50,75])
print(percentile)

[13.3475 17.795  24.1275]


In [7]:
# can also be evaluated using pd.DataFrame methods
data.total_bill.describe().loc[["25%","50%","75%"]].to_numpy()

array([13.3475, 17.795 , 24.1275])

In [8]:
#finding the standard deviation of the total_bill column
"{:.2f}".format(float(total_bill_np.std()))

'8.88'

In [9]:
# Demonstrating NumPy broadcasting
top_20 = total_bill_np[:20]
top_20 

array([16.99, 10.34, 21.01, 23.68, 24.59, 25.29,  8.77, 26.88, 15.04,
       14.78, 10.27, 35.26, 15.42, 18.43, 14.83, 21.58, 10.33, 16.29,
       16.97, 20.65])

In [10]:
result = top_20 + 100 # Add 100 to each element using broadcasting
result

array([116.99, 110.34, 121.01, 123.68, 124.59, 125.29, 108.77, 126.88,
       115.04, 114.78, 110.27, 135.26, 115.42, 118.43, 114.83, 121.58,
       110.33, 116.29, 116.97, 120.65])

In [11]:
result//20 # broadcasting operations between arrays and scalars allowed for many single operators (e.g., +, -, *, /, //, %, **)


array([5., 5., 6., 6., 6., 6., 5., 6., 5., 5., 5., 6., 5., 5., 5., 6., 5.,
       5., 5., 6.])

In [12]:
20 + result

array([136.99, 130.34, 141.01, 143.68, 144.59, 145.29, 128.77, 146.88,
       135.04, 134.78, 130.27, 155.26, 135.42, 138.43, 134.83, 141.58,
       130.33, 136.29, 136.97, 140.65])

In [13]:
next_20 = total_bill_np[20:40]

In [14]:
next_20

array([17.92, 20.29, 15.77, 39.42, 19.82, 17.81, 13.37, 12.69, 21.7 ,
       19.65,  9.55, 18.35, 15.06, 20.69, 17.78, 24.06, 16.31, 16.93,
       18.69, 31.27])

In [15]:
top_20 + next_20 # Add corresponding elements of top_20 and next_20 using NumPy element-wise addition

array([34.91, 30.63, 36.78, 63.1 , 44.41, 43.1 , 22.14, 39.57, 36.74,
       34.43, 19.82, 53.61, 30.48, 39.12, 32.61, 45.64, 26.64, 33.22,
       35.66, 51.92])

In [16]:
#old way of normalizing arrays 
def normalize(arr: np.ndarray) -> np.ndarray:
    '''
    Inputs a ndarray
    Returns the normalization of the array
    '''
    maxim_val = arr.max()
    minim_val = arr.min()
    width = maxim_val - minim_val
    return (arr- minim_val)/width
    
normalize(top_20)


array([0.31030578, 0.05926765, 0.46206116, 0.56285391, 0.59720649,
       0.62363156, 0.        , 0.68365421, 0.23669309, 0.22687807,
       0.05662514, 1.        , 0.25103813, 0.36466591, 0.22876557,
       0.48357871, 0.05889015, 0.28388071, 0.30955077, 0.44847112])

In [17]:
# normalizing with vectorization
normalized = (next_20- np.min(next_20)) / (np.max(next_20) - np.min(next_20))
normalized

array([0.28021426, 0.35955809, 0.20823569, 1.        , 0.34382323,
       0.27653164, 0.12788751, 0.1051222 , 0.40676264, 0.3381319 ,
       0.        , 0.29460998, 0.18446602, 0.37294945, 0.27552728,
       0.48577168, 0.22631403, 0.24707064, 0.30599263, 0.72715099])

In [18]:
# Vectorized standardization 
standardized = (top_20 - top_20.mean())/(top_20.std())

In [19]:
standardized

array([-0.21354539, -1.24258661,  0.40852162,  0.82168554,  0.96250171,
        1.07082183, -1.48553318,  1.31686327, -0.51529432, -0.55552751,
       -1.25341862,  2.61360994, -0.45649197,  0.00928458, -0.54779036,
        0.49672516, -1.24413404, -0.32186552, -0.21664026,  0.35281413])

### Filtering, sorting, and summarizing with pandas

In [20]:
# Filter rows where tip amount is greater than or equal to $2 using boolean indexing
tip_2or_more = data[data.tip >= 2]
tip_2or_more.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
5,25.29,4.71,No,Sun,Dinner,4
6,8.77,2.0,No,Sun,Dinner,2


In [21]:
# Filtering by smokers
smokers_bool_series = data["smoker"] == "Yes"
smokers_bool_series.head()

0    False
1    False
2    False
3    False
4    False
Name: smoker, dtype: bool

In [22]:
smokers_df = data[smokers_bool_series].reset_index()

data.index[smokers_bool_series]

Int64Index([ 56,  58,  60,  61,  62,  63,  67,  69,  72,  73,  76,  80,  83,
             90,  92,  93,  95,  96,  97,  98, 100, 101, 102, 103, 105, 106,
            107, 109, 138, 164, 168, 169, 170, 171, 172, 173, 174, 175, 176,
            177, 178, 179, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190,
            191, 192, 193, 194, 196, 197, 198, 199, 200, 201, 202, 203, 204,
            205, 206, 207, 208, 209, 210, 211, 213, 214, 215, 216, 217, 218,
            219, 220, 221, 222, 224, 225, 226, 229, 230, 231, 234, 236, 237,
            240, 241],
           dtype='int64')

In [23]:
smokers_df = data[smokers_bool_series].reset_index(drop = True)
smokers_df.head(4)

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,38.01,3.0,Yes,Sat,Dinner,4
1,11.24,1.76,Yes,Sat,Dinner,2
2,20.29,3.21,Yes,Sat,Dinner,2
3,13.81,2.0,Yes,Sat,Dinner,2


In [24]:
# Sorting the DataFrame by tips given
data = data.sort_values(by="tip",ascending=True)
data.head(10)

Unnamed: 0,total_bill,tip,smoker,day,time,size
67,3.07,1.0,Yes,Sat,Dinner,1
236,12.6,1.0,Yes,Sat,Dinner,2
92,5.75,1.0,Yes,Fri,Dinner,2
111,7.25,1.0,No,Sat,Dinner,1
0,16.99,1.01,No,Sun,Dinner,2
215,12.9,1.1,Yes,Sat,Dinner,2
237,32.83,1.17,Yes,Sat,Dinner,2
235,10.07,1.25,No,Sat,Dinner,2
75,10.51,1.25,No,Sat,Dinner,2
135,8.51,1.25,No,Thur,Lunch,2


In [25]:
# Sort by Multiple Columns (e.g., day then tip descending)
sorted_df = data.sort_values(by=['day', 'tip'], ascending=[True, False])
sorted_df.head(20)

Unnamed: 0,total_bill,tip,smoker,day,time,size
95,40.17,4.73,Yes,Fri,Dinner,4
93,16.32,4.3,Yes,Fri,Dinner,2
96,27.28,4.0,Yes,Fri,Dinner,2
91,22.49,3.5,No,Fri,Dinner,2
221,13.42,3.48,Yes,Fri,Lunch,2
94,22.75,3.25,No,Fri,Dinner,2
223,15.98,3.0,No,Fri,Lunch,3
98,21.01,3.0,Yes,Fri,Dinner,2
90,28.97,3.0,Yes,Fri,Dinner,2
101,15.38,3.0,Yes,Fri,Dinner,2


### Summarizing the dataframe

In [26]:
# Number of records for each day of the week
data['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [27]:
#Average tip for each day of the week
data.groupby('day')["tip"].mean()
data.groupby("smoker")["tip"].max()

smoker
No      9.0
Yes    10.0
Name: tip, dtype: float64

In [29]:
# pct of total tip to the total bills for each day 
data.groupby('day')['tip'].sum()/data.groupby('day')['total_bill'].sum()

day
Fri     0.159445
Sat     0.146424
Sun     0.152038
Thur    0.156732
dtype: float64