Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import env

from pydataset import data

import wrangle as w

1) Load the ```tips``` dataset.

In [2]:
tips = data('tips')

In [67]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [68]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 17.2+ KB


a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [69]:
#Per-person cost = total bill divided by party size.
#'size' is accessed with brackets because DataFrame.size is already a pandas attribute.
tips['price_per_person'] = tips['total_bill'].div(tips['size'])

In [70]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [71]:
#Split the tips data into train / validate / test subsets using the project's wrangle helper.
#NOTE(review): the split proportions and any random seed live inside w.split_data, which is not shown here.
tips_train, tips_validate, tips_test = w.split_data(tips)

b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

I think price_per_person will probably be the best predictor, followed by total_bill.

c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [72]:
#Initialize select kbest

from sklearn.feature_selection import SelectKBest, f_regression

In [73]:
f_selector = SelectKBest(f_regression, k=2)

In [74]:
#One-hot encode the categorical columns of the training split.
#drop_first=True drops the first level of each column (that level is implied
#when all of the column's remaining indicators are 0).
dummy_df = pd.get_dummies(
    tips_train[['sex', 'smoker', 'day', 'time']],
    drop_first=True,
)

dummy_df

Unnamed: 0,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
225,1,1,0,0,0,1
182,1,1,0,1,0,0
103,0,1,1,0,0,0
165,0,1,0,1,0,0
74,0,1,1,0,0,0
175,1,1,0,1,0,0
4,1,0,0,1,0,0
150,1,0,0,0,1,1
86,0,0,0,0,1,1
41,1,0,1,0,0,0


In [75]:
#Min-max scale the numeric predictors of the training split.
from sklearn.preprocessing import MinMaxScaler

cols = ['total_bill', 'size', 'price_per_person']

scaler = MinMaxScaler()

#Work on a copy so the unscaled training split stays available.
tips_train_scaled = tips_train.copy()

#The scaler is fit on the training split only, then the scaled values replace the originals.
tips_train_scaled[cols] = scaler.fit_transform(tips_train[cols])

In [76]:
#Combining the scaled frame with the dummy columns.
#Any dummy columns left over from a previous run of this cell are dropped first, so that
#re-running the cell does not append duplicate copies of the encoded columns.
tips_train_scaled = pd.concat(
    [tips_train_scaled.drop(columns=dummy_df.columns, errors='ignore'), dummy_df],
    axis=1,
)

In [77]:
#Check to see that encoding and scaling worked for the variables I wanted.
tips_train_scaled.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
225,0.228679,1.58,Male,Yes,Fri,Lunch,0.2,0.211566,1,1,0,0,0,1
182,0.447636,5.65,Male,Yes,Sun,Dinner,0.2,0.499564,1,1,0,1,0,0
103,0.910959,2.5,Female,Yes,Sat,Dinner,0.4,0.679841,0,1,1,0,0,0
165,0.319046,3.0,Female,Yes,Sun,Dinner,0.2,0.330427,0,1,0,1,0,0
74,0.49072,5.0,Female,Yes,Sat,Dinner,0.2,0.556234,0,1,1,0,0,0


In [78]:
#Predictor columns handed to the feature selectors:
#the scaled numeric columns followed by the dummy-encoded columns.
modelcols = [
    'total_bill', 'size', 'price_per_person',
    'sex_Male', 'smoker_Yes', 'time_Lunch',
    'day_Sat', 'day_Sun', 'day_Thur',
]

tts_train = tips_train_scaled.loc[:, modelcols]

In [79]:
#Fitting the model
f_selector.fit(tts_train, tips_train_scaled.tip)

SelectKBest(k=2, score_func=<function f_regression at 0x7fb408cc9b80>)

In [80]:
#Returning boolean of top two predictive features
f_selector.get_support()

array([ True,  True, False, False, False, False, False, False, False])

In [81]:
#Turn the selector's boolean support mask into column names.
#The mask has one entry per column in modelcols, so the frame is indexed by [modelcols]
#to guarantee the mask and the columns have the same length and order.
feature_mask = f_selector.get_support()

f_feature = tts_train[modelcols].columns[feature_mask].tolist()

In [82]:
#What's the top two.
f_feature

['total_bill', 'size']

The top two features are total_bill and size.

In [83]:
#Multicollinearity check: how strongly are the two SelectKBest picks correlated with each other?

import scipy.stats as stats

r, p = stats.pearsonr(tips_train_scaled['total_bill'], tips_train_scaled['size'])

(r, p)

(0.6247777073262855, 3.529359992494182e-17)

d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [85]:
#Recursive feature elimination around a linear regression, keeping 2 features.
#NOTE: the name `rfe` is rebound to a helper function (def rfe) later in this notebook,
#so the cells that use this object only work when the notebook is run top to bottom.
rfe = RFE(LinearRegression(), n_features_to_select=2)

In [86]:
rfe.fit(tts_train, tips_train_scaled.tip)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [87]:
rfe.support_

array([ True, False,  True, False, False, False, False, False, False])

In [89]:
tts_train.iloc[:, rfe.support_].columns.tolist()

['total_bill', 'price_per_person']

The top two features for tip amount are total_bill and price_per_person.

e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

RFE is based on a multivariate linear regression, whereas SelectKBest is based on univariate regressions. SelectKBest looks at each variable individually, scores how strongly it correlates with the tip, and then selects the top performers. RFE fits all of the features at the same time, repeatedly drops the feature with the smallest coefficient in the multivariate linear regression model it has produced, and keeps the features that remain.

2) Write a function named ```select_kbest``` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the ```SelectKBest``` class. Test your function with the ```tips``` dataset. You should see the same results as when you did the process manually.

In [91]:
def select_kbest(X_train, y_train, n=2):
    '''Return the names of the n columns of X_train chosen by SelectKBest.

    X_train: DataFrame of predictors (column names are read from it).
    y_train: target values aligned with the rows of X_train.
    n: number of features to keep (default=2).

    Features are scored with f_regression; the result is a list of column names
    in the same order as they appear in X_train.
    '''
    selector = SelectKBest(score_func=f_regression, k=n)
    selector.fit(X_train, y_train)
    #Boolean mask with one entry per column of X_train.
    keep_mask = selector.get_support()
    return X_train.columns[keep_mask].tolist()

In [92]:
select_kbest(tts_train, tips_train_scaled.tip)

['total_bill', 'size']

3) Write a function named ```rfe``` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the ```RFE``` class. Test your function with the ```tips``` dataset. You should see the same results as when you did the process manually.

In [59]:
def rfe(X_train, y_train, n=2):
    '''Return the names of the n columns of X_train kept by recursive feature elimination.

    X_train: DataFrame of predictors (column names are read from it).
    y_train: target values aligned with the rows of X_train.
    n: number of features to keep (default=2).

    RFE is run around a LinearRegression estimator; the result is a list of
    column names in the same order as they appear in X_train.
    '''
    #Local name differs from the function name so the function is not shadowed inside its own body.
    eliminator = RFE(estimator=LinearRegression(), n_features_to_select=n)
    eliminator.fit(X_train, y_train)
    return X_train.columns[eliminator.support_].tolist()

In [60]:
#Test the function on the same scaled/encoded training predictors and training target
#used for the manual RFE run above, so the two results can be compared directly.
rfe(tts_train, tips_train_scaled.tip)

['total_bill', 'price_per_person']

4) Load the ```swiss``` dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [93]:
swiss = data('swiss')

In [94]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [95]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [51]:
swiss.columns

Index(['Fertility', 'Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality'],
      dtype='object')

In [54]:
#Min-max scale the swiss predictors.
#Note: this rebinds `scaler` and `cols`, which earlier in the notebook referred to the tips data.
cols = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']

scaler = MinMaxScaler()

#Fertility is not in cols, so the target column is left unscaled in the copy.
swiss_scaled = swiss.copy()

swiss_scaled[cols] = scaler.fit_transform(swiss[cols])

In [55]:
swiss_scaled.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,0.178531,0.352941,0.211538,0.079816,0.721519
Delemont,83.1,0.496045,0.088235,0.153846,0.845069,0.721519
Franches-Mnt,92.5,0.435028,0.058824,0.076923,0.93255,0.594937
Moutier,85.8,0.39887,0.264706,0.115385,0.323148,0.601266
Neuveville,76.9,0.477966,0.411765,0.269231,0.030761,0.620253


In [57]:
#No train/validate/test split here: swiss has only 47 rows, so splitting would leave samples
#too small to show reliable correlations. The selectors are fit on the full dataset instead.

select_kbest(swiss_scaled[cols], swiss_scaled.Fertility, n=3)

['Examination', 'Education', 'Catholic']

In [61]:
rfe(swiss_scaled[cols], swiss_scaled.Fertility, n=3)

['Agriculture', 'Education', 'Infant.Mortality']