In [1]:
import pandas as pd
import numpy as np
from pydataset import data
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import viz

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")

import split_scale

In [3]:
tips = data('tips')

In [4]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3


In [5]:
tips['tip_percentage'] = tips.tip/tips.total_bill

In [6]:
tips['price_per_person'] = tips.total_bill/tips.size

In [7]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,0.008704
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,0.005297
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,0.010763


In [8]:
tips.dtypes

total_bill          float64
tip                 float64
sex                  object
smoker               object
day                  object
time                 object
size                  int64
tip_percentage      float64
price_per_person    float64
dtype: object

In [9]:
tips = tips.drop(columns=['sex','smoker','day','time'])
tips.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
1,16.99,1.01,2,0.059447,0.008704
2,10.34,1.66,3,0.160542,0.005297
3,21.01,3.5,3,0.166587,0.010763
4,23.68,3.31,2,0.13978,0.012131
5,24.59,3.61,4,0.146808,0.012597


In [10]:
train, test = split_scale.split_my_data(tips)

In [11]:
train.head(3)

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
24,39.42,7.58,4,0.192288,0.020195
191,15.69,1.5,2,0.095602,0.008038
210,12.76,2.23,2,0.174765,0.006537


In [12]:
# scale data using standard scaler
scaler, train, test = split_scale.standard_scaler(train, test)

# to return to original values
# scaler, train, test = scaling.my_inv_transform(scaler, train, test)

X_train = train.drop(columns='tip')
y_train = train[['tip']]
X_test = test.drop(columns='tip')
y_test = test[['tip']]

In [13]:
f_selector = SelectKBest(f_regression, k = 2)

In [14]:
# running correlation test between each x and y and returning the score, f-statistic

f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x12a57a440>)

In [15]:
# select the k best features
X2 = f_selector.transform(X_train)
print(X2.shape)
print(X_train.shape)

(195, 2)
(195, 4)


In [16]:
X2 = SelectKBest(f_regression, k = 2).fit_transform(X_train, y_train)
print(X2.shape)
X2[0:2]

(195, 2)


array([[ 2.22751067,  2.22751067],
       [-0.4404687 , -0.4404687 ]])

In [17]:
f_support = f_selector.get_support()
f_support

array([ True, False, False,  True])

In [18]:
f_feature = X_train.loc[:,f_support].columns.tolist()
f_feature

['total_bill', 'price_per_person']

In [19]:
lm = LinearRegression()

In [20]:
rfe = RFE(lm, 2)

In [21]:
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=2, step=1, verbose=0)

In [22]:
X_rfe = rfe.transform(X_train)
X_rfe[0:2]

array([[ 2.22751067,  2.22751067],
       [-0.4404687 , -0.4404687 ]])

In [23]:
lm.fit(X_rfe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
lm = LinearRegression()
rfe = RFE(lm, 2)
X_rfe = rfe.fit_transform(X_train, y_train)

In [25]:
mask = rfe.support_
rfe_features = X_train.loc[:,mask].columns.tolist()
rfe_features

['total_bill', 'price_per_person']

In [26]:
var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()

pd.DataFrame({'Feature': var_names, 'Rank': var_ranks})

Unnamed: 0,Feature,Rank
0,total_bill,1
1,size,3
2,tip_percentage,2
3,price_per_person,1


In [27]:
X_train = train.drop(columns='tip_percentage')
y_train = train[['tip_percentage']]
X_test = test.drop(columns='tip_percentage')
y_test = test[['tip_percentage']]

In [28]:
X2 = SelectKBest(f_regression, k = 2).fit_transform(X_train, y_train)
print(X2.shape)
X2[0:2]

(195, 2)


array([[ 3.10160771,  2.22751067],
       [-1.03535802, -0.4404687 ]])

In [29]:
f_feature = X_train.loc[:,f_support].columns.tolist()
f_feature

['total_bill', 'price_per_person']

In [30]:
lm = LinearRegression()
rfe = RFE(lm, 2)
X_rfe = rfe.fit_transform(X_train, y_train)

In [31]:
mask = rfe.support_
rfe_features = X_train.loc[:,mask].columns.tolist()
rfe_features

['total_bill', 'price_per_person']

In [36]:
def select_kbest_freg_scaled(X_train, y_train, k):
    '''
    Takes unscaled data (X_train, y_train) and number of features to select (k) as input
    and returns a list of the top k features
    '''
    f_selector = SelectKBest(f_regression, k).fit(X_train, y_train).get_support()
    f_feature = X_train.loc[:,f_selector].columns.tolist()
    return f_feature

In [37]:
k = 2
select_kbest_freg_scaled(X_train, y_train, k)

['tip', 'price_per_person']

In [38]:
def select_kbest_freg_unscaled(X_train, y_train, k):
    '''
    Takes unscaled data (X_train, y_train) and number of features to select (k) as input
    and returns a list of the top k features
    '''
    f_selector = SelectKBest(f_regression, k).fit(X_train, y_train).get_support()
    f_feature = X_train.loc[:,f_selector].columns.tolist()
    return f_feature

In [39]:
select_kbest_freg_scaled(X_train, y_train, k)

['tip', 'price_per_person']

In [40]:
def optimal_features(X_train, X_test, y_train, number_of_features):
    '''Taking the output of optimal_number_of_features, as n, and use that value to 
    run recursive feature elimination to find the n best features'''
    cols = list(X_train.columns)
    model = LinearRegression()
    
    #Initializing RFE model
    rfe = RFE(model, number_of_features)

    #Transforming data using RFE
    train_rfe = rfe.fit_transform(X_train,y_train)
    test_rfe = rfe.transform(X_test)
    
    #Fitting the data to model
    model.fit(train_rfe, y_train)
    temp = pd.Series(rfe.support_,index = cols)
    selected_features_rfe = temp[temp==True].index
    
    X_train_rfe = pd.DataFrame(train_rfe, columns=selected_features_rfe)
    X_test_rfe = pd.DataFrame(test_rfe, columns=selected_features_rfe)
    
    return selected_features_rfe, X_train_rfe, X_test_rfe

In [43]:
number_of_features = 2
optimal_features(X_train, X_test, y_train, number_of_features)

(Index(['total_bill', 'price_per_person'], dtype='object'),
      total_bill  price_per_person
 0      2.227511          2.227511
 1     -0.440469         -0.440469
 2     -0.769891         -0.769891
 3     -1.049843         -1.049843
 4     -1.041973         -1.041973
 ..          ...               ...
 190    0.157662          0.157662
 191   -0.837349         -0.837349
 192   -0.355021         -0.355021
 193   -1.246596         -1.246596
 194   -0.595623         -0.595623
 
 [195 rows x 2 columns],
     total_bill  price_per_person
 0     2.075729          2.075729
 1     0.117187          0.117187
 2     1.220132          1.220132
 3    -1.009368         -1.009368
 4    -0.141403         -0.141403
 5     1.184154          1.184154
 6     0.174527          0.174527
 7    -0.331411         -0.331411
 8     1.052610          1.052610
 9     0.023870          0.023870
 10   -1.086945         -1.086945
 11   -0.059329         -0.059329
 12   -0.440469         -0.440469
 13   -0.601245  

In [46]:
import statsmodels.api as sm
def ols_backward_elimination(X_train, y_train):
    '''
    Takes dataframe of features and dataframe of target variable as input,
    runs OLS, extracts each features p-value, removes the column with the highest p-value
    until there are no features remaining with a p-value > 0.05
    It then returns a list of the names of the selected features
    '''
    cols = list(X_train.columns)

    while (len(cols) > 0):
        # create a new dataframe that we will use to train the model...each time we loop through it will 
        # remove the feature with the highest p-value IF that p-value is greater than 0.05.
        # if there are no p-values > 0.05, then it will only go through the loop one time. 
        X_1 = X_train[cols]
        # fit the Ordinary Least Squares Model
        model = sm.OLS(y_train,X_1).fit()
        # create a series of the pvalues with index as the feature names
        p = pd.Series(model.pvalues)
        # get the max p-value
        pmax = max(p)
        # get the feature that has the max p-value
        feature_with_p_max = p.idxmax()
        # if the max p-value is >0.05, the remove the feature and go back to the start of the loop
        # else break the loop with the column names of all features with a p-value <= 0.05
        if(pmax>0.05):
            cols.remove(feature_with_p_max)
        else:
            break

    selected_features_BE = cols
    return selected_features_BE

In [47]:
ols_backward_elimination(X_train, y_train)

['total_bill', 'tip', 'price_per_person']

In [48]:
swiss = data('swiss')

In [49]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [52]:
swiss.dtypes

Fertility           float64
Agriculture         float64
Examination           int64
Education             int64
Catholic            float64
Infant.Mortality    float64
dtype: object

In [53]:
train, test = split_scale.split_my_data(swiss)

In [54]:
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Broye,83.8,70.2,16,7,92.85,23.6
Paysd'enhaut,72.0,63.5,6,3,2.56,18.0
Avenches,68.9,60.7,19,12,4.43,22.7
Aubonne,66.9,67.5,14,7,2.27,19.1
Oron,72.5,71.2,12,1,2.4,21.0


In [57]:
k = 3
# scale data using standard scaler
scaler, train, test = split_scale.standard_scaler(train, test)

# to return to original values
# scaler, train, test = scaling.my_inv_transform(scaler, train, test)

X_train = train.drop(columns='Fertility')
y_train = train[['Fertility']]
X_test = test.drop(columns='Fertility')
y_test = test[['Fertility']]

In [58]:
select_kbest_freg_scaled(X_train, y_train, k)

['Examination', 'Education', 'Catholic']