In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.model_selection
from pydataset import data

import split_scale

In [2]:
# Load the "tips" sample dataset through pydataset's data() loader.
tips = data('tips')

In [3]:
# Preview the first three rows to see which columns are available.
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3


In [4]:
# Rename `size` so the column can be read with attribute access;
# `tips.size` would resolve to DataFrame.size (the element count) instead.
tips = tips.rename({'size': 'party_size'}, axis='columns')

In [5]:
# Tip as a share of the bill. Stored as a fraction (e.g. 0.16), not 0-100.
tips['tip_percentage'] = tips['tip'] / tips['total_bill']

In [6]:
# Average spend per diner: the total bill split evenly across the party.
tips['price_per_person'] = tips['total_bill'] / tips['party_size']

In [7]:
# Confirm the renamed column and the two derived columns are present.
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333


In [8]:
# Inspect dtypes to see which columns are numeric and usable as features.
tips.dtypes

total_bill          float64
tip                 float64
sex                  object
smoker               object
day                  object
time                 object
party_size            int64
tip_percentage      float64
price_per_person    float64
dtype: object

In [9]:
# Number of features each selector should keep.
k = 2

# Initialize the sklearn objects.
# `k` and `n_features_to_select` are passed by keyword because current
# scikit-learn releases declare them keyword-only; passing them positionally
# raises a TypeError. Both selectors share `k` instead of repeating the literal.
lm = sklearn.linear_model.LinearRegression()
kbest = sklearn.feature_selection.SelectKBest(
    score_func=sklearn.feature_selection.f_regression, k=k
)
rfe = sklearn.feature_selection.RFE(estimator=lm, n_features_to_select=k)

In [10]:
# Prep our data: keep 80% of the rows for training and hold out the rest.
# The fixed random_state makes the split reproducible across runs.
train, test = sklearn.model_selection.train_test_split(
    tips, train_size=.80, random_state=123
)
# Sanity check: the two splits must partition the original frame.
assert len(train) + len(test) == len(tips)

# NOTE(review): `tip_percentage` is computed from `tip`, which is the target
# below, so using it as a feature leaks the target into X. It is kept here so
# the recorded selector outputs stay valid; confirm whether it should be dropped.
x_cols = ['total_bill', 'party_size', 'tip_percentage', 'price_per_person']

X_train = train[x_cols]
y_train = train.tip

X_test = test[x_cols]
y_test = test.tip

In [11]:
# Fit the k-best selector on the training split (fit() returns the selector
# itself) and list the columns whose support mask is True.
X_train.columns[kbest.fit(X_train, y_train).get_support()]

Index(['total_bill', 'party_size'], dtype='object')

In [12]:
# Fit recursive feature elimination on the training split (fit() returns the
# selector itself) and list the columns it retained.
X_train.columns[rfe.fit(X_train, y_train).support_]

Index(['total_bill', 'tip_percentage'], dtype='object')

In [13]:
def select_kbest(X, y, k):
    """Return the names of the k features that score best under f_regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; the result is a subset of ``X.columns``.
    y : array-like
        Regression target aligned with the rows of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in their original column order.
    """
    # `k` is keyword-only in current scikit-learn; passing it positionally
    # raises a TypeError.
    selector = sklearn.feature_selection.SelectKBest(
        score_func=sklearn.feature_selection.f_regression, k=k
    )
    selector.fit(X, y)
    return X.columns[selector.get_support()]

In [14]:
# The helper should reproduce the selection made by the standalone kbest object.
select_kbest(X=X_train, y=y_train, k=2)

Index(['total_bill', 'party_size'], dtype='object')

In [15]:
def select_rfe(X, y, k):
    """Return the names of the k features kept by recursive feature elimination.

    A fresh LinearRegression is used as the estimator whose coefficients rank
    the features at each elimination step.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; the result is a subset of ``X.columns``.
    y : array-like
        Regression target aligned with the rows of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.Index
        Labels of the retained columns, in their original column order.
    """
    estimator = sklearn.linear_model.LinearRegression()
    # `n_features_to_select` is keyword-only in current scikit-learn; passing
    # it positionally raises a TypeError.
    selector = sklearn.feature_selection.RFE(
        estimator=estimator, n_features_to_select=k
    )
    selector.fit(X, y)
    return X.columns[selector.support_]

In [16]:
# The helper should reproduce the selection made by the standalone rfe object.
select_rfe(X=X_train, y=y_train, k=2)

Index(['total_bill', 'tip_percentage'], dtype='object')

In [17]:
# Load the "swiss" sample dataset through pydataset's data() loader.
swiss = data('swiss')

In [18]:
# Preview the first rows (head() defaults to five).
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [19]:
# Check dtypes: every swiss column is numeric, so none needs encoding.
swiss.dtypes

Fertility           float64
Agriculture         float64
Examination           int64
Education             int64
Catholic            float64
Infant.Mortality    float64
dtype: object

In [20]:
# Split swiss into train/test frames with the project helper. This rebinds
# `train` and `test`, which held the tips split until now.
# NOTE(review): the split ratio and seed live inside split_scale.split_my_data;
# confirm they are fixed so the rows shown below are reproducible.
train, test = split_scale.split_my_data(swiss)

In [21]:
# Peek at the (still unscaled) training split.
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Broye,83.8,70.2,16,7,92.85,23.6
Paysd'enhaut,72.0,63.5,6,3,2.56,18.0
Avenches,68.9,60.7,19,12,4.43,22.7
Aubonne,66.9,67.5,14,7,2.27,19.1
Oron,72.5,71.2,12,1,2.4,21.0


In [22]:
# Keep three features when selecting from the swiss data.
k = 3

# Scale data using the project's standard-scaler helper.
# NOTE(review): `train`/`test` are rebound to the helper's output, so running
# this cell a second time feeds already-scaled frames back in and refits
# `scaler` on them. Whole frames are passed, so presumably the Fertility target
# is scaled along with the features; confirm both are intended.
scaler, train, test = split_scale.standard_scaler(train, test)

# Fertility is the target; every other column is a candidate feature.
# The double brackets keep y as a one-column DataFrame rather than a Series.
X_train, y_train = train.drop(columns='Fertility'), train[['Fertility']]
X_test, y_test = test.drop(columns='Fertility'), test[['Fertility']]

In [23]:
# Top-k features for Fertility according to the univariate f_regression score.
select_kbest(X=X_train, y=y_train, k=k)

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [26]:
# Top-k features for Fertility according to recursive feature elimination.
select_rfe(X=X_train, y=y_train, k=k)

Index(['Agriculture', 'Education', 'Catholic'], dtype='object')

In [25]:
# Fit the LinearRegression on all swiss features and show its coefficients
# (fit() returns the estimator itself). The values follow X_train's column
# order; the result is 2-D because y_train is a one-column DataFrame.
lm.fit(X_train, y_train).coef_

array([[-0.39033096, -0.24510239, -0.62477557,  0.41923544,  0.35488771]])