In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pydataset
import warnings
warnings.filterwarnings("ignore")

import split_scale

**1. Load the tips dataset from either pydataset or seaborn.**

In [2]:
df = pydataset.data("tips")

Let's peek at the data.

In [3]:
df.sample(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
104,22.42,3.48,Female,Yes,Sat,Dinner,2
53,34.81,5.2,Female,No,Sun,Dinner,4
58,26.41,1.5,Female,No,Sat,Dinner,2
80,17.29,2.71,Male,No,Thur,Lunch,2
169,10.59,1.61,Female,Yes,Sat,Dinner,2
85,15.98,2.03,Male,No,Thur,Lunch,2
127,8.52,1.48,Male,No,Thur,Lunch,2
166,24.52,3.48,Male,No,Sun,Dinner,3
200,13.51,2.0,Male,Yes,Thur,Lunch,2
183,45.35,3.5,Male,Yes,Sun,Dinner,3


In [4]:
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

**a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.**

In [5]:
# Tip as a fraction of the total bill.
df["tip_percentage"] = df["tip"].div(df["total_bill"])

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


**b. Create a column named price_per_person. This should be the total bill divided by the party size.**

In [7]:
# Bill split evenly across the party; "size" needs bracket access because
# df.size is the DataFrame's element-count attribute.
df["price_per_person"] = df["total_bill"].div(df["size"])

In [8]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


**c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?**

total_bill, price_per_person

**d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?**

First, I need to split my data into train and test sets, so that feature selection is fit on the training data only and the test set stays unseen for evaluation.

In [9]:
train, test = split_scale.split_my_data(df)

Now let's create a DataFrame containing only the numeric columns.

In [10]:
# Keep only the numeric columns of the training split (target "tip" last).
train_numeric = train.loc[
    :, ["total_bill", "size", "tip_percentage", "price_per_person", "tip"]
]

In [11]:
train_numeric.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person,tip
149,9.78,2,0.176892,4.89,1.73
214,13.27,2,0.188395,6.635,2.5
179,9.6,2,0.416667,4.8,4.0
42,17.46,2,0.145475,8.73,2.54
120,24.08,4,0.121262,6.02,2.92


In [12]:
# Same numeric columns, in the same order, for the test split.
test_numeric = test.loc[
    :, ["total_bill", "size", "tip_percentage", "price_per_person", "tip"]
]

Now I'm going to scale the data

In [13]:
scaler, train_scaled, test_scaled = split_scale.standard_scaler(train_numeric, test_numeric)

In [14]:
X_train_scaled = train_scaled[["total_bill", "size", "tip_percentage", "price_per_person"]]

In [15]:
y_train_scaled = train_scaled["tip"]

In [16]:
X_test_scaled = test_scaled[["total_bill", "size", "tip_percentage", "price_per_person"]]

In [17]:
y_test_scaled = test_scaled["tip"]

Take a peek

In [18]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
149,-1.134981,-0.564569,0.225705,-1.027708
214,-0.711168,-0.564569,0.396685,-0.416823
179,-1.15684,-0.564569,3.789623,-1.059215
42,-0.202349,-0.564569,-0.241253,0.316589
120,0.601561,1.667448,-0.601144,-0.632121


Initialize the linear regression object

**Recursive Feature Elimination - RFE**

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
lm = LinearRegression()

Now I'm going to initialize the RFE object

In [22]:
from sklearn.feature_selection import RFE

In [23]:
# Pass n_features_to_select by keyword: it is keyword-only in scikit-learn 1.0+,
# and the keyword form also works on older releases.
rfe = RFE(lm, n_features_to_select=2)

Fit the RFE object to my data.

In [24]:
rfe.fit(X_train_scaled, y_train_scaled)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=2, step=1, verbose=0)

Now I am going to get a list of the features selected.

In [25]:
mask = rfe.support_

In [26]:
# Index the column labels directly with the boolean support mask.
rfe_features = X_train_scaled.columns[mask].tolist()

In [27]:
print("The two features selected during Select RFE: ", rfe_features)

The two features selected during Select RFE:  ['total_bill', 'tip_percentage']


**Select K Best**

In [28]:
from sklearn.feature_selection import SelectKBest, f_regression

In [29]:
f_selector = SelectKBest(f_regression, k = 2)

In [30]:
f_selector.fit(X_train_scaled, y_train_scaled)

SelectKBest(k=2, score_func=<function f_regression at 0x134d175f0>)

In [31]:
X2 = f_selector.transform(X_train_scaled)

In [32]:
X2.shape

(170, 2)

Get a list of the features selected.

In [33]:
f_support = f_selector.get_support()

In [34]:
f_support

array([ True,  True, False, False])

In [35]:
# Index the column labels directly with the boolean support mask.
f_feature = X_train_scaled.columns[f_support].tolist()

In [36]:
print("The two features selected during Select K Best: ", f_feature)

The two features selected during Select K Best:  ['total_bill', 'size']


**e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?**

In [37]:
X_train = train_scaled.drop(columns="tip_percentage")

In [38]:
X_train.head()

Unnamed: 0,total_bill,size,price_per_person,tip
149,-1.134981,-0.564569,-1.027708,-0.942164
214,-0.711168,-0.564569,-0.416823,-0.320451
179,-1.15684,-0.564569,-1.059215,0.890679
42,-0.202349,-0.564569,0.316589,-0.288154
120,0.601561,1.667448,-0.632121,0.018666


In [39]:
y_train = train_scaled["tip_percentage"]

In [40]:
X_test = test_scaled.drop(columns="tip_percentage")

In [41]:
y_test = test_scaled["tip_percentage"]

**Select K Best**

from sklearn.feature_selection import SelectKBest

In [42]:
f_selector1 = SelectKBest(f_regression, k = 2)

In [43]:
f_selector1.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x134d175f0>)

In [44]:
# Use the selector fitted in this section (f_selector1). f_selector was fitted
# earlier against the tip target on a different set of predictor columns.
x_2 = f_selector1.transform(X_train)

In [45]:
f_support1 = f_selector1.get_support()

In [46]:
f_support1

array([ True, False, False,  True])

In [47]:
# Index the column labels directly with the boolean support mask.
f_feature1 = X_train.columns[f_support1].tolist()

In [48]:
print("The two features selected during Select K Best: ",f_feature1)

The two features selected during Select K Best:  ['total_bill', 'tip']


**RFE - Recursive Feature Elimination**

from sklearn.linear_model import LinearRegression

from sklearn.feature_selection import RFE

In [49]:
lm1 = LinearRegression()

In [50]:
# Pass n_features_to_select by keyword: it is keyword-only in scikit-learn 1.0+,
# and the keyword form also works on older releases.
rfe1 = RFE(lm1, n_features_to_select=2)

In [51]:
rfe1.fit(X_train, y_train)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=2, step=1, verbose=0)

In [52]:
X_RFE = rfe1.transform(X_train)

In [53]:
mask1 = rfe1.support_

In [54]:
# Index the column labels directly with the boolean support mask.
rfe_features1 = X_train.columns[mask1].tolist()

In [55]:
print("The two features selected during RFE: ", rfe_features1)

The two features selected during RFE:  ['price_per_person', 'tip']


**f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?**

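They rank features in different ways. `SelectKBest` with `f_regression` scores each feature on its own with a univariate F-test against the target, while RFE repeatedly fits the linear regression on the remaining features and drops the one with the weakest coefficient, so its choice depends on how the features behave together. The selected sets can therefore differ, and how much they overlap can change with the number of features requested; when that number equals the total number of features, both methods return every feature.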

**2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.**

In [56]:
def select_kbest(X_train, y_train, n_features=2):
    """
    Return the names of the top features chosen by scikit-learn's SelectKBest,
    scored with f_regression.

    X_train: DataFrame of predictors; its column labels are the feature names.
    y_train: target values handed to SelectKBest.fit.
    n_features: how many features to keep (forwarded as k); defaults to 2.

    Returns a list of the selected column labels, in their original column order.
    """
    f_selector = SelectKBest(f_regression, k=n_features)
    f_selector.fit(X_train, y_train)
    # Only the boolean support mask is needed to recover the names, so the
    # transformed array is not computed.
    f_support = f_selector.get_support()
    return X_train.columns[f_support].tolist()

Test the function.

In [57]:
select_kbest(X_train, y_train, 2)

['total_bill', 'tip']

**3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.**

In [58]:
def rfe(X_train, y_train, n_features=2):
    """
    Return the names of the features kept by recursive feature elimination (RFE)
    wrapped around a LinearRegression estimator.

    X_train: DataFrame of predictors; its column labels are the feature names.
    y_train: target values handed to RFE.fit.
    n_features: how many features to keep; defaults to 2.

    Returns a list of the selected column labels, in their original column order.
    """
    lm = LinearRegression()
    # n_features_to_select is keyword-only in scikit-learn 1.0+; the keyword
    # form also works on older releases. The local is not called `rfe` so it
    # does not shadow this function's own name.
    selector = RFE(lm, n_features_to_select=n_features)
    selector.fit(X_train, y_train)
    # The support mask alone identifies the kept columns; no transform is needed.
    mask = selector.support_
    return X_train.columns[mask].tolist()


Test the function, this time asking for the top 3 features instead of the 2 selected manually above.

In [59]:
rfe(X_train, y_train, 3)

['size', 'price_per_person', 'tip']

**4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).**

In [60]:
swiss = pydataset.data("swiss")

In [61]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [62]:
train, test = split_scale.split_my_data(swiss)

In [63]:
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Sierre,92.2,84.6,3,3,99.46,16.3
Moudon,65.0,55.1,14,3,4.52,22.4
Rolle,60.5,60.8,16,10,7.72,16.3
Echallens,68.3,72.6,18,2,24.2,21.2
La Chauxdfnd,65.7,7.7,29,11,13.79,20.5


In [64]:
scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test)

In [65]:
train_scaled.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Sierre,1.83402,1.519159,-1.82485,-0.989428,1.462955,-1.432499
Moudon,-0.400486,0.187726,-0.34819,-0.989428,-0.886844,1.055258
Rolle,-0.770165,0.444986,-0.079706,-0.123678,-0.807643,-1.432499
Echallens,-0.129388,0.977559,0.188778,-1.113106,-0.399757,0.565863
La Chauxdfnd,-0.34298,-1.951592,1.665438,0.0,-0.657408,0.280382


In [66]:
X_train = train_scaled.drop(columns="Fertility")

In [67]:
X_train.shape

(32, 5)

In [68]:
X_test = test_scaled.drop(columns="Fertility")

In [69]:
X_test.shape

(15, 5)

In [70]:
y_train = train_scaled["Fertility"]

In [71]:
y_train.shape

(32,)

In [72]:
# Test-set target for the swiss data. Assigning to y_test replaces the
# tips-dataset target that the name still held from the earlier section.
y_test = test_scaled["Fertility"]

In [73]:
X_train.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Sierre,1.519159,-1.82485,-0.989428,1.462955,-1.432499
Moudon,0.187726,-0.34819,-0.989428,-0.886844,1.055258
Rolle,0.444986,-0.079706,-0.123678,-0.807643,-1.432499
Echallens,0.977559,0.188778,-1.113106,-0.399757,0.565863
La Chauxdfnd,-1.951592,1.665438,0.0,-0.657408,0.280382


In [74]:
y_train.head()

Sierre          1.834020
Moudon         -0.400486
Rolle          -0.770165
Echallens      -0.129388
La Chauxdfnd   -0.342980
Name: Fertility, dtype: float64

In [75]:
select_kbest(X_train, y_train, 3)

['Examination', 'Education', 'Catholic']

In [76]:
rfe(X_train, y_train, 3)

['Agriculture', 'Education', 'Catholic']