In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data
import math

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Feature Engineering Exercises

1. Load the tips dataset.
    a. Create a column named price_per_person. This should be the total bill divided by the party size.
    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
    c. Use select k best to select the top 2 features for predicting tip amount. What are they?
    d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?
    e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
<br>

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.
<br>

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.
<br>

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [2]:
df= data('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


#### #1a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [5]:
df['tip_percentage'] = df['tip'] / df['total_bill']

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


#### #1b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [7]:
df['price_per_person'] = df['total_bill'] / df['size'] 

In [8]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


<hr style="border:2px solid gray">

#### SPLIT DATA

In [9]:
#split the data
train, test = train_test_split(df, train_size = 0.8, random_state = 123)
train, validate = train_test_split(train, train_size = 0.7, random_state = 123)

In [10]:
#check shape of each dataset
train.shape, validate.shape, test.shape

((136, 9), (59, 9), (49, 9))

In [11]:
#assign everything to X_train except tip and tip percentage
#X_train = train.drop(columns=['tip', 'smoker', 'day', 'sex', 'time'])

#assign y_train 
#y_train = train['tip']

# X and y splits
target = "tip"

X_train = train[['total_bill', 'size', 'tip', 'price_per_person']]
y_train = train[target]

X_validate = validate[['total_bill', 'size', 'tip', 'price_per_person']]
y_validate = validate[target]

X_test = test[['total_bill', 'size', 'tip', 'price_per_person']]
y_test = test[target]


In [12]:
X_train.head()

Unnamed: 0,total_bill,size,tip,price_per_person
19,16.97,3,3.5,5.656667
173,7.25,2,5.15,3.625
119,12.43,2,1.8,6.215
29,21.7,2,4.3,10.85
238,32.83,2,1.17,16.415


<hr style="border:2px solid gray">

#### SCALE DATA

In [13]:
#assign to variable #need scaler for each independent variable
#create it
scaler = MinMaxScaler()

In [14]:
#fit it
scaler.fit(X_train)
MinMaxScaler()

In [15]:
#use it
#make a new column within train
#use 'transform' instead of 'predict'
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

<hr style="border:2px solid gray">

#### #1c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [16]:
# I believe total_bill feature would be most important in predicting tip/tip percentage
df.corr()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
total_bill,1.0,0.675734,0.598315,-0.338624,0.647497
tip,0.675734,1.0,0.489299,0.34237,0.347393
size,0.598315,0.489299,1.0,-0.14286,-0.175412
tip_percentage,-0.338624,0.34237,-0.14286,1.0,-0.314156
price_per_person,0.647497,0.347393,-0.175412,-0.314156,1.0


In [17]:
#### #1d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?
#### RFE

In [18]:
lm = LinearRegression()

In [19]:
rfe = RFE(estimator=lm, n_features_to_select=2)

In [20]:
rfe.fit(X_train_scaled, y_train)

In [21]:
feature_mask_rank = rfe.ranking_
feature_mask_rank

array([1, 3, 1, 2])

In [22]:
rfe_columns = X_train.columns[rfe.support_].tolist()
rfe_columns

['total_bill', 'tip']

#### SelectKBest

In [23]:
#create the model
kbest = SelectKBest(score_func=f_regression, k=2)

In [24]:
#fit the model
kbest.fit(X_train_scaled, y_train)

In [25]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

#### #1e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [26]:
#assign everything to X_train except tip and tip percentage
X_train2 = train[['total_bill', 'size', 'tip', 'price_per_person']]
#assign y_train 
y_train2 = train['tip_percentage']

In [27]:
X_validate2 = validate[['total_bill', 'size', 'tip', 'price_per_person']]
y_validate2 = validate['tip_percentage']

In [28]:
X_test2 = test[['total_bill', 'size', 'tip', 'price_per_person']]
y_test2 = test['tip_percentage']

In [29]:
X_train2_scaled = scaler.transform(X_train2)
X_validate2_scaled = scaler.transform(X_validate2)
X_test2_scaled = scaler.transform(X_test2)

In [30]:
#create the model
kbest2 = SelectKBest(score_func=f_regression, k=2)
#fit the model
kbest2.fit(X_train2_scaled, y_train2)

#### RFE

In [31]:
lm = LinearRegression()

In [32]:
rfe = RFE(estimator=lm, n_features_to_select=2)


In [33]:
rfe.fit(X_train2, y_train2)

In [34]:
feature_mask_rank = rfe.ranking_
feature_mask_rank

array([1, 2, 1, 3])

In [35]:
pd.Series(dict(zip(X_train2.columns, rfe.ranking_))).sort_values()

total_bill          1
tip                 1
size                2
price_per_person    3
dtype: int64

#### #1f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

In [36]:
#RFE with 1 feature
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=1)
rfe.fit(X_train, y_train)

In [37]:
feature_mask_rank = rfe.ranking_
feature_mask_rank

array([3, 2, 1, 4])

In [38]:
pd.Series(dict(zip(X_train.columns, rfe.ranking_))).sort_values()

tip                 1
size                2
total_bill          3
price_per_person    4
dtype: int64

In [39]:
#SKB model with feature of 1
f_selector = SelectKBest(score_func=f_regression, k=1)
#fit the model
f_selector.fit(X_train, y_train)

In [40]:
mask = f_selector.get_support()
X_train.columns[mask]

Index(['tip'], dtype='object')

<hr style="border:2px solid gray">

#### #2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [43]:
#X- features, y- target, k-#of features
def select_kbest(X,y,k): 
    f_selector = SelectKBest(f_regression, k)
    f_selector.fit(X, y)
    k_features = X.columns[f_selector.get_support()]

    return k_features

In [46]:
select_kbest(X_train, y_train, 2)

TypeError: __init__() takes from 1 to 2 positional arguments but 3 were given

<hr style="border:2px solid gray">


#### #3 Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [None]:
def rfe(X, y, n):
    lm = LinearRegression()
    rfe = RFE(lm, n)
    rfe.fit(X, y)
    
    n_features = X.columns[rfe.support_]
    
    return n_features


In [None]:
rfe(X_train, y_train, 2)

<hr style="border:2px solid gray">

#### #4 Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [None]:
import wrangle
import evaluate

In [None]:
df_swiss= data('swiss')

In [None]:
df_swiss.head()

In [None]:
df_swiss.info()

In [None]:
#use train, validate, test function in wrangle
train, validate, test= wrangle.split_data(df_swiss)

In [None]:
#assign everything to X_train 
X_train = train.drop(columns=['Fertility'])
#assign y_train 
y_train = train['Fertility']

In [None]:
X_validate = validate.drop(columns='Fertility')
y_validate = validate.Fertility

In [None]:
X_test = test.drop(columns='Fertility')
y_test = test.Fertility

In [None]:
X_train.head()
y_train.head()

In [None]:
#select K best model
select_kbest(X_train, y_train,3)

In [None]:
#RFE model
rfe(X_train, y_train, 3)