In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from pydataset import data
from evaluate import get_splits

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 250)


### Load the tips dataset.

In [2]:
df = data('tips')

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
df['price_per_person'] = df['total_bill']/ df['size']

df.describe()


Unnamed: 0,total_bill,tip,size,price_per_person
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.88823
std,8.902412,1.383638,0.9511,2.91435
min,3.07,1.0,1.0,2.875
25%,13.3475,2.0,2.0,5.8025
50%,17.795,2.9,2.0,7.255
75%,24.1275,3.5625,3.0,9.39
max,50.81,10.0,6.0,20.275


In [4]:
train, validate, test = get_splits(df)

train.info()
print(train.shape, validate.shape, test.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 81 to 33
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        136 non-null    float64
 1   tip               136 non-null    float64
 2   sex               136 non-null    object 
 3   smoker            136 non-null    object 
 4   day               136 non-null    object 
 5   time              136 non-null    object 
 6   size              136 non-null    int64  
 7   price_per_person  136 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 9.6+ KB
(136, 8) (49, 8) (59, 8)


In [5]:
from evaluate import isolate_lm_target
X_train, y_train, X_validate, y_validate, X_test, y_test = isolate_lm_target(train, validate, test, 'tip')
f_selector = SelectKBest(f_regression, k=2)

### Use select k best to select the top 2 features for predicting tip amount. What are they?

In [6]:
X_train.head()
f_selector.fit(X_train, y_train)

f_mask = f_selector.get_support()
f_mask

f_feature = X_train.iloc[:,f_mask].columns.tolist()
f_feature

['total_bill', 'size']

### Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [7]:
lm = LinearRegression()

rfe = RFE(lm, n_features_to_select=2)
rfe.fit(X_train, y_train)

rfe_mask = rfe.support_
rfe_feature = X_train.iloc[:,rfe_mask].columns.tolist()
rfe_feature

['total_bill', 'size']

In [8]:
X_train.columns

Index(['total_bill', 'size', 'price_per_person'], dtype='object')

In [9]:
X_rfe = rfe.transform(X_train)
X_rfe = pd.DataFrame(X_rfe)
X_rfe.head()

Unnamed: 0,0,1
0,19.44,2.0
1,18.64,3.0
2,18.28,2.0
3,18.35,4.0
4,26.59,3.0


### Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset

In [9]:
def select_kbest(X, y, k):
    f_selector = SelectKBest(f_regression, k)
    f_selector.fit(X,y)
    f_mask = f_selector.get_support()
    f_feature = X.iloc[:,f_mask].columns.tolist()
    return f_feature

In [4]:
treino, exame = train_test_split(df, test_size=0.2, random_state=302)
treino, validar = train_test_split(df, test_size=0.3, random_state=302)

print(treino.shape, validar.shape, exame.shape)

(170, 7) (74, 7) (49, 7)


In [5]:
X_treino = treino.drop(columns= ['tip'])
y_treino = treino[['tip']]

X_validar = validar.drop(columns= ['tip'])
y_validar = validar[['tip']]

X_exame = exame.drop(columns= ['tip'])
y_exame = exame[['tip']]



In [6]:
X_treino.time.value_counts()

Dinner    126
Lunch      44
Name: time, dtype: int64

In [18]:
X_treino.day.value_counts()

Sat     60
Sun     57
Thur    40
Fri     13
Name: day, dtype: int64

In [7]:
X_treino_cats = X_treino.select_dtypes(exclude=np.number)
X_treino_cats.head()
X_treino_bobo = pd.get_dummies(X_treino_cats, dummy_na=False, drop_first=True)
X_treino_bobo.head()

Unnamed: 0,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
35,1,0,1,0,0,0
15,0,0,0,1,0,0
132,0,0,0,0,1,1
220,0,1,1,0,0,0
97,1,1,0,0,0,0


In [8]:
X_treino.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
35,17.78,Male,No,Sat,Dinner,2
15,14.83,Female,No,Sun,Dinner,2
132,20.27,Female,No,Thur,Lunch,2
220,30.14,Female,Yes,Sat,Dinner,4
97,27.28,Male,Yes,Fri,Dinner,2


In [9]:
X_treino = pd.concat([X_treino, X_treino_bobo], axis=1, ignore_index=False)
X_treino.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
35,17.78,Male,No,Sat,Dinner,2,1,0,1,0,0,0
15,14.83,Female,No,Sun,Dinner,2,0,0,0,1,0,0
132,20.27,Female,No,Thur,Lunch,2,0,0,0,0,1,1
220,30.14,Female,Yes,Sat,Dinner,4,0,1,1,0,0,0
97,27.28,Male,Yes,Fri,Dinner,2,1,1,0,0,0,0


In [10]:
X_treino_nums = X_treino.select_dtypes(include=np.number)

In [17]:
X_treino_nums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 35 to 95
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  170 non-null    float64
 1   size        170 non-null    int64  
 2   sex_Male    170 non-null    uint8  
 3   smoker_Yes  170 non-null    uint8  
 4   day_Sat     170 non-null    uint8  
 5   day_Sun     170 non-null    uint8  
 6   day_Thur    170 non-null    uint8  
 7   time_Lunch  170 non-null    uint8  
dtypes: float64(1), int64(1), uint8(6)
memory usage: 5.0 KB


In [32]:
y_treino.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 35 to 95
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tip     170 non-null    float64
dtypes: float64(1)
memory usage: 2.7 KB


In [13]:
f_feature = select_kbest(X_treino_nums, y_treino, 3)
f_feature

['total_bill', 'size', 'time_Lunch']

### Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset.

In [34]:
def rfe(X, y, k):
    lm = LinearRegression()
    rfe = RFE(lm, k)
    rfe.fit(X, y)

    rfe_mask = rfe.support_
    rfe_feature = X.iloc[:,rfe_mask].columns.tolist()
    var_ranks = rfe.ranking_
    var_names = X.columns.tolist()
    rfe_ranked = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
    return rfe_feature, rfe_ranked


In [35]:
rfe_feature, rfe_ranked = rfe(X_treino_nums, y_treino, 2)

rfe_feature, rfe_ranked

(['day_Thur', 'time_Lunch'],
           Var  Rank
 0  total_bill     2
 1        size     4
 2    sex_Male     7
 3  smoker_Yes     3
 4     day_Sat     5
 5     day_Sun     6
 6    day_Thur     1
 7  time_Lunch     1)

In [36]:
rfe_feature, rfe_ranked = rfe(X_train, y_train, 2)

rfe_feature, rfe_ranked

(['total_bill', 'size'],
           Var  Rank
 0  total_bill     1
 1        size     1)

In [37]:
X_train.columns

Index(['total_bill', 'size'], dtype='object')

### Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [19]:
swiss_df = data('swiss')

swiss_df.info()
swiss_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [23]:
swiss_train, swiss_val, swiss_test = get_splits(swiss_df)

print(swiss_train.shape, swiss_val.shape, swiss_test.shape)

(25, 6) (10, 6) (12, 6)


In [24]:
swiss_X_trn, swiss_y_trn, swiss_X_val, swiss_y_val, swiss_X_test, swiss_y_test = isolate_lm_target(swiss_train, swiss_val, swiss_test, 'Fertility')

In [10]:
swiss_features = select_kbest(swiss_X_trn, swiss_y_trn, 1)

swiss_features

NameError: name 'swiss_X_trn' is not defined

In [28]:
swiss_rfe_feats = rfe(swiss_X_trn, swiss_y_trn, 1)

swiss_rfe_feats

['Examination', 'Education', 'Infant.Mortality']