# Feature Engineering Exercises
1.  Load the tips dataset.

In [1]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the tips dataset through pydataset's data() helper and preview the first rows.
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


- a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
# Stored as a fraction (tip / total_bill); it is not multiplied by 100.
df['tip_percentage'] = df.tip / df.total_bill
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
241,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584
242,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222
243,17.82,1.75,Male,No,Sat,Dinner,2,0.098204


- b.  Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
# Bracket access is needed for the party-size column: attribute access (df.size)
# resolves to the DataFrame's own `size` attribute rather than the column.
df['price_per_person'] = df['total_bill']/df['size']
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495000
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.50,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.139780,11.840000
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.147500
...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,0.203927,9.676667
241,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584,13.590000
242,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222,11.335000
243,17.82,1.75,Male,No,Sat,Dinner,2,0.098204,8.910000


- c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [5]:
# The features I think would be most important for predicting the tip amount are: 
# size and total_bill

- d. Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

### Select K Best

In [6]:
# Encode sex as a 0/1 indicator (Female=0, Male=1); df is modified in place.
df.replace({'sex':{'Male':1, 'Female':0}}, inplace=True)

In [7]:
# Encode smoker as a 0/1 indicator (No=0, Yes=1); df is modified in place.
df.replace({'smoker':{'Yes':1, 'No':0}}, inplace=True)

In [8]:
# Encode time as a 0/1 indicator (Lunch=0, Dinner=1); df is modified in place.
df.replace({'time':{'Lunch':0, 'Dinner':1}}, inplace=True)

In [9]:
# Encode day as integer codes; df is modified in place.
# NOTE(review): the codes jump from 1 (Fri) to 3 (Sat) -- 2 is never used, so the
# Fri->Sat step is twice as large as the other steps. This looks unintentional;
# confirm before changing it, since a different encoding may alter the feature
# selection results computed further down.
df.replace({'day':{'Thur':0, 'Fri':1, 'Sat':3, 'Sun':4}}, inplace=True)

In [10]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,0,0,4,1,2,0.059447,8.495
2,10.34,1.66,1,0,4,1,3,0.160542,3.446667
3,21.01,3.5,1,0,4,1,3,0.166587,7.003333
4,23.68,3.31,1,0,4,1,2,0.13978,11.84
5,24.59,3.61,0,0,4,1,4,0.146808,6.1475


In [13]:
# Target is tip. Drop the target itself and tip_percentage, which was computed from tip.
X = df.drop(columns=['tip', 'tip_percentage'])
y = df.tip

# Hold out 20% of the rows; random_state fixes the split so results are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
# Fit the scaler on the training rows only, then apply that same fit to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Select the 2 best features for tip using f_regression as the score function,
# fitted on the scaled training data only.
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fdc9294bc10>)

In [15]:
kbest.get_support()

array([ True, False, False, False, False,  True, False])

In [16]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [17]:
#The best features to use in our model using K best are "total_bill" and "size"

### Recursive Feature Elimination

In [19]:
# Recursive feature elimination with a LinearRegression estimator, keeping 2 features.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
# Boolean mask with one entry per column of X_train; True marks a selected feature.
rfe.get_support()

array([ True, False, False, False, False, False,  True])

In [20]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

In [21]:
pd.Series(rfe.ranking_, index=X_train.columns)

total_bill          1
sex                 5
smoker              6
day                 3
time                4
size                2
price_per_person    1
dtype: int64

In [22]:
#The best features to use in our model using RFE are "total_bill" and "price_per_person"

- e. Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

In [23]:
# Target is tip_percentage; every other column is kept as a predictor.
# NOTE(review): tip_percentage was computed as tip / total_bill, and both of those
# columns are still in X, so the selectors can pick the target's own inputs
# (target leakage). Consider also dropping 'tip' here -- confirm the intent first,
# because doing so will change the selections reported below.
X = df.drop(columns='tip_percentage')
y = df.tip_percentage

# Same 80/20 split and seed as the tip-amount analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
# Fit the scaler on the training rows only, then apply that same fit to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Using SelectKBest: select the 2 best features for tip_percentage with
# f_regression as the score function, fitted on the scaled training data only.
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fdc9294bc10>)

In [25]:
kbest.get_support()

array([False,  True, False, False, False, False, False,  True])

In [26]:
X_train.columns[kbest.get_support()]

Index(['tip', 'price_per_person'], dtype='object')

In [27]:
#The best features to use in our model for predicting tip percentage using K best are "tip" and "price_per_person"

In [28]:
# Using RFE: recursive feature elimination with a LinearRegression estimator,
# keeping 2 features for tip_percentage.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
# Boolean mask with one entry per column of X_train; True marks a selected feature.
rfe.get_support()

array([ True,  True, False, False, False, False, False, False])

In [29]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'tip'], dtype='object')

In [30]:
pd.Series(rfe.ranking_, index=X_train.columns)

total_bill          1
tip                 1
sex                 7
smoker              4
day                 5
time                6
size                2
price_per_person    3
dtype: int64

In [31]:
#The best features to use in our model for predicting tip percentage using RFE are "total_bill" and "tip"

- f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [32]:
# They give different answers for the top features because select k best looks at each feature individually, whereas RFE
# looks at all the features together.

#### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. 
Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [33]:
def select_kbest(X, y, k):
    """Return the names of the top ``k`` features chosen by SelectKBest.

    The rows are split 80/20 with ``random_state=123`` and only the training
    portion is used for selection: it is standardized with StandardScaler and
    scored with ``f_regression``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. Must expose ``.columns``, because the selected
        features are returned by column label.
    y : array-like
        Target values, one per row of ``X``.
    k : int
        Number of features to select.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in the same order they appear in ``X``.
    """
    # Only the training rows take part in feature selection; the held-out
    # rows are discarded here because nothing in this function uses them.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=.2, random_state=123)

    X_train_scaled = StandardScaler().fit_transform(X_train)

    selector = SelectKBest(f_regression, k=k)
    selector.fit(X_train_scaled, y_train)

    # get_support() is a boolean mask with one entry per column, so it can
    # index the column labels directly.
    return X_train.columns[selector.get_support()]

#### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. 

It should return the top k features based on the RFE class. Test your function with the tips dataset.  You should see the same results as when you did the process manually.

In [34]:
def rfe(X, y, k):
    """Return the names of the top ``k`` features chosen by recursive feature elimination.

    The rows are split 80/20 with ``random_state=123`` and only the training
    portion is used for selection: it is standardized with StandardScaler and
    passed to ``RFE`` with a ``LinearRegression`` estimator.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. Must expose ``.columns``, because the selected
        features are returned by column label.
    y : array-like
        Target values, one per row of ``X``.
    k : int
        Number of features to select.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in the same order they appear in ``X``.
    """
    # Only the training rows take part in feature selection; the held-out
    # rows are discarded here because nothing in this function uses them.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=.2, random_state=123)

    X_train_scaled = StandardScaler().fit_transform(X_train)

    # Named `selector` rather than `rfe` so the local does not shadow this
    # function's own name.
    selector = RFE(estimator=LinearRegression(), n_features_to_select=k)
    selector.fit(X_train_scaled, y_train)

    # get_support() is a boolean mask with one entry per column, so it can
    # index the column labels directly.
    return X_train.columns[selector.get_support()]

#### 4. Load the swiss dataset and use all the other features to predict Fertility. 

Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [35]:
# Load the swiss dataset with the `data` helper already imported in the
# notebook's first cell, rather than importing pydataset again mid-notebook.
# Note: this rebinds `df`, replacing the tips frame used in the earlier exercises.
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [36]:
# Fertility is the target; every other column is used as a predictor.
X = df.drop(columns='Fertility')
y = df.Fertility
# Number of features to select with each method.
k = 3

In [37]:
select_kbest(X, y, k)

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [38]:
rfe(X, y, k)

Index(['Agriculture', 'Education', 'Catholic'], dtype='object')