In [1]:
# os functions
import os

# local files
from env import host, user, password
import wrangle as w

# df manipulations
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from scipy import stats
from math import sqrt

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import SelectKBest, RFE, SequentialFeatureSelector

import warnings

# Silence every warning category for the rest of the session so library
# deprecation chatter does not clutter cell output.
warnings.simplefilter("ignore")

# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
np.random.seed(123)

In [2]:
# 1. Load the tips dataset that ships with pydataset
from pydataset import data
tips = data('tips')

# A. Add price_per_person: the bill split evenly across the party
#    (total_bill / size), rounded to two decimals.
#    'size' must be accessed with brackets because DataFrame.size is an attribute.
tips['price_per_person'] = tips['total_bill'].div(tips['size']).round(2)

In [3]:
# Quick look at the first rows together with column dtypes / non-null counts.
# NOTE(review): info() prints its report and returns None, so the displayed
# tuple ends with None and head() is rendered as plain text; consider calling
# the two in separate statements.
tips.head(), tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 17.2+ KB


(   total_bill   tip     sex smoker  day    time  size  price_per_person
 1       16.99  1.01  Female     No  Sun  Dinner     2              8.49
 2       10.34  1.66    Male     No  Sun  Dinner     3              3.45
 3       21.01  3.50    Male     No  Sun  Dinner     3              7.00
 4       23.68  3.31    Male     No  Sun  Dinner     2             11.84
 5       24.59  3.61  Female     No  Sun  Dinner     4              6.15,
 None)

In [4]:
# Count how many parties were recorded on each day of the week
tips['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [5]:
# Encode the categorical columns as integer codes.
# NOTE: `day` receives ordered codes (Thur=0 ... Sun=3), so any model that
# treats the column numerically will read it as an ordered scale.
category_codes = {
    'sex': {'Male': 0, 'Female': 1},
    'smoker': {'No': 0, 'Yes': 1},
    'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3},
    'time': {'Lunch': 0, 'Dinner': 1},
}

cols = list(category_codes)
for col in cols:
    # Make the cell safe to re-run: mapping a column that is already encoded
    # would turn every value into NaN and make astype(int) raise.
    if pd.api.types.is_numeric_dtype(tips[col]):
        continue
    # An unexpected category still maps to NaN here, so astype(int) raises
    # instead of silently producing a wrong code.
    tips[col] = tips[col].map(category_codes[col]).astype(int)

In [6]:
# Re-check dtypes after encoding: sex, smoker, day and time should now be integers
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    int64  
 3   smoker            244 non-null    int64  
 4   day               244 non-null    int64  
 5   time              244 non-null    int64  
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
dtypes: float64(3), int64(5)
memory usage: 17.2 KB


In [7]:
# Data split, done in two passes with a fixed seed so it is reproducible:
#   1) hold out 20% of the rows as the final test set
#   2) split the remaining 80% into 70% train / 30% validate
train_validate, test = train_test_split(
    tips,
    train_size=0.8,
    random_state=123,
)
train, validate = train_test_split(
    train_validate,
    train_size=0.7,
    random_state=123,
)

In [8]:
# Sanity-check the training split: row count, dtypes and non-null counts
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 19 to 167
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        136 non-null    float64
 1   tip               136 non-null    float64
 2   sex               136 non-null    int64  
 3   smoker            136 non-null    int64  
 4   day               136 non-null    int64  
 5   time              136 non-null    int64  
 6   size              136 non-null    int64  
 7   price_per_person  136 non-null    float64
dtypes: float64(3), int64(5)
memory usage: 9.6 KB


B. Before using any of the methods discussed in the lesson, which features do you think would be most important for 
predicting tip amount?

I think total_bill, time, and size will have the biggest impact on tip.

In [10]:
# C. Use select K best to select the top 2 features for predicting tip amount, what are they?

# Separate predictors (everything except tip) from the target (tip) for each split
train_x, train_y = train.drop(columns='tip'), train['tip']
validate_x, validate_y = validate.drop(columns='tip'), validate['tip']
test_x, test_y = test.drop(columns='tip'), test['tip']

# Score each feature against the target with f_regression and keep the best two.
# The selector is fit on the training split only.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(train_x, train_y)

SelectKBest(k=2, score_func=<function f_regression at 0x7fa60c3553a0>)

In [15]:
# Boolean mask over train_x's columns marking the features SelectKBest kept
feature_mask = kbest.get_support()
# Turn the mask into a plain list of column names
features = train_x.columns[feature_mask].tolist()
features

['total_bill', 'size']

In [35]:
# D. Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [None]:
# E. Why do you think select k best and recursive feature elimination might give different answers for the top 
# features? Does this change as you change the number of features you are selecting?


In [42]:
# NOTE(review): the output recorded below still shows the original string
# categories (e.g. 'Female', 'Sun') rather than the integer codes assigned in
# the encoding cell — presumably `tips` was reloaded before this cell ran.
# Confirm with Restart Kernel -> Run All.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15
