In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn.feature_selection import f_regression

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/sklearn warnings raised by
# later cells will not be shown.
warnings.filterwarnings(action='ignore', category=Warning)

# Display settings: never truncate columns; show up to 250 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 250

In [2]:
# Load the 'tips' dataset through pydataset's `data` loader.
tips_df = data('tips')

# Column dtypes / non-null counts, followed by a preview of the first rows.
tips_df.info()
tips_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
tips_df.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [4]:
# Report the mean bill and mean tip, formatted as dollars with two decimals.
mean_total_bill = tips_df['total_bill'].mean()
mean_tip = tips_df['tip'].mean()

print(f'Average total bill is: ${mean_total_bill:.2f}')
print(f'Average tip amount is: ${mean_tip:.2f}')

Average total bill is: $19.79
Average tip amount is: $3.00


In [5]:
# Per-column count of missing values.
# NOTE: pandas documents `isna` as an alias of `isnull`, so both reports below
# are expected to show the same counts.
null_counts = tips_df.isnull().sum()
nan_counts = tips_df.isna().sum()

print(f'Are there null values in the dataset?\n{null_counts}')
print('--------------')
print(f'Are there NaNs in the dataset?\n{nan_counts}')

Are there null values in the dataset?
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64
--------------
Are there NaNs in the dataset?
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64


In [7]:
# Two-stage split with a fixed seed: 20% of all rows are held out as `test`,
# then 30% of the remaining rows become `validate`; the rest stay in `train`.
train, test = train_test_split(tips_df, test_size=0.2, random_state=302)
train, validate = train_test_split(train, test_size=0.3, random_state=302)

# Target frames: a single-column DataFrame holding `tip` for each split.
y_train = train[['tip']]
y_validate = validate[['tip']]
y_test = test[['tip']]

# Feature frames: numeric columns only (to feed the model), minus the target.
X_train = train.select_dtypes(include='number').drop(columns=['tip'])
X_validate = validate.select_dtypes(include='number').drop(columns=['tip'])
X_test = test.select_dtypes(include='number').drop(columns=['tip'])


In [8]:
# Sanity check: (rows, columns) of the train / validate / test splits, in that order.
print(*(split.shape for split in (train, validate, test)))

(136, 7) (59, 7) (49, 7)


In [9]:
# Inspect the numeric feature frame before narrowing it down.
# (The former bare `X_train.columns` line was removed: it was not the last
# expression of the cell, so its value was never displayed.)
X_train.info()

# Remove `size` from every split so the model is fit on the remaining feature(s).
# errors='ignore' keeps this cell idempotent: re-running it after `size` has
# already been dropped is a no-op instead of raising KeyError.
X_train = X_train.drop(columns=['size'], errors='ignore')
X_validate = X_validate.drop(columns=['size'], errors='ignore')
X_test = X_test.drop(columns=['size'], errors='ignore')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 81 to 33
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  136 non-null    float64
 1   size        136 non-null    int64  
dtypes: float64(1), int64(1)
memory usage: 3.2 KB


In [11]:
# Fit an ordinary least-squares model of `tip` on the features in X_train.
# The target is selected explicitly as the single column `tip`: this cell adds
# derived columns to y_train below, so fitting on the whole frame would silently
# become a multi-target fit (and break the assignments) if the cell were re-run.
lr1 = LinearRegression().fit(X_train, y_train[['tip']])

# Predictions on the training rows; 2-D (n_rows, 1) because the target is 2-D.
yhat = lr1.predict(X_train)

# Build the evaluation frame with `.assign`, which returns a new DataFrame
# instead of writing columns into a frame that was sliced out of `train`.
#   yhat              - model prediction
#   residual          - actual minus predicted (tip - yhat)
#   baseline          - mean training tip, i.e. a constant-prediction model
#   residual_baseline - baseline minus actual (mean tip - tip)
# NOTE(review): `residual` and `residual_baseline` use opposite sign
# conventions. This is kept as-is to preserve existing values; it has no effect
# on squared-error metrics, but confirm before comparing the signed columns.
y_train = y_train[['tip']].assign(
    yhat=yhat[:, 0],
    residual=lambda df: df.tip - df.yhat,
    baseline=lambda df: df.tip.mean(),
    residual_baseline=lambda df: df.baseline - df.tip,
)

y_train.head()


Unnamed: 0,tip,yhat,residual,baseline,residual_baseline
81,3.0,2.979305,0.020695,3.114559,0.114559
147,1.36,2.896373,-1.536373,3.114559,1.754559
88,4.0,2.859053,1.140947,3.114559,-0.885441
32,2.5,2.866309,-0.366309,3.114559,0.614559
207,3.41,3.720517,-0.310517,3.114559,-0.295441
