In [1]:
import pandas as pd

data = pd.read_csv('quidditch.csv')

In [2]:
data

Unnamed: 0,Student,House,Quidditch,Potion_Ave,Charm_Ave
0,Hermione Granger,Gryffindor,No,100.0,100.0
1,Anthony Goldstein,Ravenclaw,Yes,89.0,87.0
2,Harry Potter,Gryffindor,Yes,88.0,90.0
3,Lisa Turpin,Ravenclaw,No,86.5,84.0
4,Michael Corner,Ravenclaw,Yes,85.5,86.5
5,Draco Malfoy,Slytherin,Yes,84.5,81.0
6,Susan Bones,Hufflepuff,Yes,84.0,83.5
7,Ron Weasley,Gryffindor,Yes,83.0,87.5
8,Hannah Abbott,Hufflepuff,No,80.5,84.5
9,Ernie Macmillan,Hufflepuff,Yes,77.5,85.0


In [3]:
new_df = data['House'].values.reshape(-1, 1)
new_df

array([['Gryffindor'],
       ['Ravenclaw'],
       ['Gryffindor'],
       ['Ravenclaw'],
       ['Ravenclaw'],
       ['Slytherin'],
       ['Hufflepuff'],
       ['Gryffindor'],
       ['Hufflepuff'],
       ['Hufflepuff'],
       ['Slytherin'],
       ['Slytherin'],
       ['Gryffindor'],
       ['Slytherin'],
       ['Hufflepuff'],
       ['Ravenclaw']], dtype=object)

In [13]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse = False)

encoded_df = encoder.fit_transform(new_df)

In [14]:
encoded_df

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [6]:
encoder.get_feature_names()

array(['x0_Anthony Goldstein', 'x0_Harry Potter', 'x1_Gryffindor',
       'x1_Ravenclaw', 'x2_Yes', 'x3_88.0', 'x3_89.0', 'x4_87.0',
       'x4_90.0'], dtype=object)

In [7]:
features_df = pd.DataFrame(encoded_df, columns = encoder.get_feature_names())
features_df

Unnamed: 0,x0_Anthony Goldstein,x0_Harry Potter,x1_Gryffindor,x1_Ravenclaw,x2_Yes,x3_88.0,x3_89.0,x4_87.0,x4_90.0
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0


In [8]:
features = data.iloc[:, 1:3]

In [9]:
features

Unnamed: 0,House,Quidditch
0,Gryffindor,No
1,Ravenclaw,Yes
2,Gryffindor,Yes
3,Ravenclaw,No
4,Ravenclaw,Yes
5,Slytherin,Yes
6,Hufflepuff,Yes
7,Gryffindor,Yes
8,Hufflepuff,No
9,Hufflepuff,Yes


In [10]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse = True)
encoded_df = encoder.fit_transform(features)

In [11]:
encoded_df

<16x6 sparse matrix of type '<class 'numpy.float64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [12]:
encoder.get_feature_names()

array(['x0_Gryffindor', 'x0_Hufflepuff', 'x0_Ravenclaw', 'x0_Slytherin',
       'x1_No', 'x1_Yes'], dtype=object)

In [13]:
'''
Feature Discretization (Binning)
- splitting continuous variables into multiple features
- transforms the continuous variables into a discrete one
  that represents intervals spanning the range of the 
  variable's values
'''

"\nFeature Discretization (Binning)\n- splitting continuous variables into multiple features\n- transforms the continuous variables into a discrete one\n  that represents intervals spanning the range of the \n  variable's values\n"

In [14]:
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins = 3, encode = 'onehot-dense', strategy = 'quantile')
potion_discretized = discretizer.fit_transform(data['Potion_Ave'].values.reshape(-1, 1))

In [15]:
potion_discretized

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [16]:
potion_df = pd.DataFrame(potion_discretized, columns = ['potionBin1', 'potionBin2', 'potionBin3'])
potion_df

Unnamed: 0,potionBin1,potionBin2,potionBin3
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
5,0.0,1.0,0.0
6,0.0,1.0,0.0
7,0.0,1.0,0.0
8,0.0,1.0,0.0
9,1.0,0.0,0.0


In [17]:
discretizer.bin_edges_

array([array([ 60. ,  80.5,  85. , 100. ])], dtype=object)

In [18]:
'''
3. Polynomial Features
- can generate polynomial values
- using polynomial features together with linear regression
'''

'\n3. Polynomial Features\n- can generate polynomial values\n- using polynomial features together with linear regression\n'

In [19]:
from sklearn.preprocessing import PolynomialFeatures

In [20]:
# Includes polynomials up to x**10
# a degree of 10 yields 10 features
poly = PolynomialFeatures(degree = 10, include_bias = False)

In [21]:
potion_poly = poly.fit_transform(data['Potion_Ave'].values.reshape(-1, 1))
potion_poly

array([[1.00000000e+02, 1.00000000e+04, 1.00000000e+06, 1.00000000e+08,
        1.00000000e+10, 1.00000000e+12, 1.00000000e+14, 1.00000000e+16,
        1.00000000e+18, 1.00000000e+20],
       [8.90000000e+01, 7.92100000e+03, 7.04969000e+05, 6.27422410e+07,
        5.58405945e+09, 4.96981291e+11, 4.42313349e+13, 3.93658881e+15,
        3.50356404e+17, 3.11817199e+19],
       [8.80000000e+01, 7.74400000e+03, 6.81472000e+05, 5.99695360e+07,
        5.27731917e+09, 4.64404087e+11, 4.08675596e+13, 3.59634525e+15,
        3.16478382e+17, 2.78500976e+19],
       [8.65000000e+01, 7.48225000e+03, 6.47214625e+05, 5.59840651e+07,
        4.84262163e+09, 4.18886771e+11, 3.62337057e+13, 3.13421554e+15,
        2.71109644e+17, 2.34509842e+19],
       [8.55000000e+01, 7.31025000e+03, 6.25026375e+05, 5.34397551e+07,
        4.56909906e+09, 3.90657969e+11, 3.34012564e+13, 2.85580742e+15,
        2.44171535e+17, 2.08766662e+19],
       [8.45000000e+01, 7.14025000e+03, 6.03351125e+05, 5.09831701e+07,
   

In [22]:
poly_df = pd.DataFrame(potion_poly, columns = poly.get_feature_names())
poly_df

Unnamed: 0,x0,x0^2,x0^3,x0^4,x0^5,x0^6,x0^7,x0^8,x0^9,x0^10
0,100.0,10000.0,1000000.0,100000000.0,10000000000.0,1000000000000.0,100000000000000.0,1e+16,1e+18,1e+20
1,89.0,7921.0,704969.0,62742240.0,5584059000.0,496981300000.0,44231330000000.0,3936589000000000.0,3.503564e+17,3.118172e+19
2,88.0,7744.0,681472.0,59969540.0,5277319000.0,464404100000.0,40867560000000.0,3596345000000000.0,3.164784e+17,2.78501e+19
3,86.5,7482.25,647214.625,55984070.0,4842622000.0,418886800000.0,36233710000000.0,3134216000000000.0,2.711096e+17,2.345098e+19
4,85.5,7310.25,625026.375,53439760.0,4569099000.0,390658000000.0,33401260000000.0,2855807000000000.0,2.441715e+17,2.087667e+19
5,84.5,7140.25,603351.125,50983170.0,4308078000.0,364032600000.0,30760750000000.0,2599284000000000.0,2.196395e+17,1.855953e+19
6,84.0,7056.0,592704.0,49787140.0,4182119000.0,351298000000.0,29509030000000.0,2478759000000000.0,2.082157e+17,1.749012e+19
7,83.0,6889.0,571787.0,47458320.0,3939041000.0,326940400000.0,27136050000000.0,2252292000000000.0,1.869403e+17,1.551604e+19
8,80.5,6480.25,521660.125,41993640.0,3380488000.0,272129300000.0,21906410000000.0,1763466000000000.0,1.41959e+17,1.14277e+19
9,77.5,6006.25,465484.375,36075040.0,2795816000.0,216675700000.0,16792370000000.0,1301408000000000.0,1.008592e+17,7.816584e+18


In [23]:
'''
Univariate Nonlinear Transformation
'''

'\nUnivariate Nonlinear Transformation\n'

In [24]:
import numpy as np

X_train_log = np.log(data['Potion_Ave'].values + 1)

In [25]:
X_train_log

array([4.61512052, 4.49980967, 4.48863637, 4.47163879, 4.46014441,
       4.44851638, 4.44265126, 4.4308168 , 4.40060302, 4.36309862,
       4.2121276 , 4.17438727, 4.11087386, 4.29045944, 4.40671925,
       4.4543473 ])

In [26]:
'''
- Sklearn makes it easy to add new features and increase the dimensionality of the data
- Adding more features increases the complexity of models
- Increased likelihood of overfitting
- focus on the most important features and use a reduced number of features for
  simpler models that generalize better
  
1. Univariate Statistics
2. Model Based Selection
3. Iterative Selection
'''

'\n- Sklearn makes it easy to add new features and increase the dimensionality of the data\n- Adding more features increases the complexity of models\n- Increased likelihood of overfitting\n- focus on the most important features and use a reduced number of features for\n  simpler models that generalize better\n  \n1. Univariate Statistics\n2. Model Based Selection\n3. Iterative Selection\n'

In [27]:
'''
Univariate: individual realtionship between each feature
The features should be informative independantly
Performs an ANOVA
'''

'\nUnivariate: individual realtionship between each feature\nThe features should be informative independantly\nPerforms an ANOVA\n'

In [28]:
import pandas as pd

from sklearn.datasets import fetch_california_housing
california = fetch_california_housing()  # Bunch object

df = pd.DataFrame(california.data, columns=california.feature_names)
df["Value"] = california.target

features = df.drop("Value", axis=1)
target = df["Value"]

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=3000)

#define a selection method and specify the score function to be f_regression
select = SelectKBest(score_func = f_regression, k = 3)
select.fit(X_train, y_train)

#transform training and testing sets so only the selected features are retained
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

In [30]:
X_train_selected

array([[ 2.7483    ,  4.10650888, 34.09      ],
       [ 4.58      ,  6.00986193, 34.27      ],
       [ 1.3844    ,  3.45646438, 37.78      ],
       ...,
       [ 5.299     ,  7.21493213, 34.91      ],
       [ 7.0309    ,  5.43678161, 37.41      ],
       [ 2.8167    ,  6.08007812, 37.19      ]])

In [31]:
model = LinearRegression().fit(X=X_train, y=y_train)

print("Original results:")
print("\tR-squared value for training set: ", r2_score(y_train, model.predict(X_train)))
print("\tR-squared value for testing set: ", r2_score(y_test, model.predict(X_test)))


model = LinearRegression().fit(X=X_train_selected, y=y_train)

print("With selected features:")
print("\tR-squared value for training set: ", r2_score(y_train, model.predict(X_train_selected)))
print("\tR-squared value for testing set: ", r2_score(y_test, model.predict(X_test_selected)))

Original results:
	R-squared value for training set:  0.6095160399631113
	R-squared value for testing set:  0.5954462325232102
With selected features:
	R-squared value for training set:  0.4922565671674721
	R-squared value for testing set:  0.45712915487088823


In [32]:
print(features.columns)

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')


In [33]:
#returns a Boolean mask of selected features
select.get_support()

array([ True, False,  True, False, False, False,  True, False])

In [34]:
'''
MODEL BASED SELECTION

judge which features are most important
use an importance ranking
'''

'\nMODEL BASED SELECTION\n\njudge which features are most important\nuse an importance ranking\n'

In [35]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor

select = SelectFromModel(DecisionTreeRegressor(random_state = 3000), threshold = 'median')

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

select.fit(X_train, y_train)

#transform training and testing sets so only the selected features are retained
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

model = LinearRegression().fit(X=X_train, y=y_train)

print("Original results:")
print("\tR-squared value for training set: ", r2_score(y_train, model.predict(X_train)))
print("\tR-squared value for testing set: ", r2_score(y_test, model.predict(X_test)))


model = LinearRegression().fit(X=X_train_selected, y=y_train)

print("With selected features:")
print("\tR-squared value for training set: ", r2_score(y_train, model.predict(X_train_selected)))
print("\tR-squared value for testing set: ", r2_score(y_test, model.predict(X_test_selected)))

Original results:
	R-squared value for training set:  0.6095160399631113
	R-squared value for testing set:  0.5954462325232102
With selected features:
	R-squared value for training set:  0.5874746879530321
	R-squared value for testing set:  0.578093187285585


In [37]:
print(features.columns)

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')


In [38]:
select.get_support()

array([ True, False, False, False, False,  True,  True,  True])

In [39]:
# comes from the decision tree algorithm
select.threshold_

0.06946007914825762

In [40]:
#the importance ranking of features
select.estimator_.feature_importances_

array([0.52493621, 0.05335433, 0.04893749, 0.02843023, 0.03178195,
       0.13842815, 0.08856581, 0.08556583])

In [41]:
'''
Iterative Feature Selection

A series of models are built, with varying numbers of features
Recursive Feature Elimination (RFE)

Starts with all features, builds a model, and discards the least important feature according to the model
Then a new model is built using all but discarded feature, and so on
This is done until only a prespecified number of features are left
Use RFE's n_features_to_select parameter to set the number of features to select

The feature selection model needs to provide some measure of importance for each feature

so that they can be ranked by this measure
Computationally expensive (due to the recursive approach)
'''

"\nIterative Feature Selection\n\nA series of models are built, with varying numbers of features\nRecursive Feature Elimination (RFE)\n\nStarts with all features, builds a model, and discards the least important feature according to the model\nThen a new model is built using all but discarded feature, and so on\nThis is done until only a prespecified number of features are left\nUse RFE's n_features_to_select parameter to set the number of features to select\n\nThe feature selection model needs to provide some measure of importance for each feature\n\nso that they can be ranked by this measure\nComputationally expensive (due to the recursive approach)\n"

In [42]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

select = RFE(DecisionTreeRegressor(random_state = 3000), n_features_to_select = 3)

In [43]:
#fit the RFE selector to the training data
select.fit(X_train, y_train)

#transform training and testing sets so only the selected features are retained
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

model = LinearRegression().fit(X=X_train, y=y_train)

print("Original results:")
print("\tR-squared value for training set: ", r2_score(y_train, model.predict(X_train)))
print("\tR-squared value for testing set: ", r2_score(y_test, model.predict(X_test)))


model = LinearRegression().fit(X=X_train_selected, y=y_train)

print("With selected features:")
print("\tR-squared value for training set: ", r2_score(y_train, model.predict(X_train_selected)))
print("\tR-squared value for testing set: ", r2_score(y_test, model.predict(X_test_selected)))

Original results:
	R-squared value for training set:  0.6095160399631113
	R-squared value for testing set:  0.5954462325232102
With selected features:
	R-squared value for training set:  0.5863410269918616
	R-squared value for testing set:  0.5777114692521803


In [44]:
y_train

6793     2.188
3521     2.362
15743    1.775
9759     1.341
5800     2.023
         ...  
14937    2.256
1876     1.316
17333    2.268
18350    4.750
9208     0.961
Name: Value, Length: 15480, dtype: float64

In [45]:
print(features.columns)

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')


In [46]:
select.get_support()

array([ True, False, False, False, False, False,  True,  True])