In [1]:
import numpy as np
import pandas as pd


# Read the raw table, then derive the working frame without the 'skin' column.
# drop() returns a new frame, so master_dataframe itself is left unchanged.
master_dataframe = pd.read_csv("pima_diabetes.csv")
df = master_dataframe.drop(columns=['skin'])
df.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


In [2]:
# Split the frame by position: every column except the last becomes the
# feature matrix X, and the last column becomes the target vector y.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits so the test rows never influence the statistics.
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

### A Random Forest will identify feature importances
Feature importance rates how much each feature contributes to the Random Forest's predictions.

Feature importances will always sum to 1.

In [4]:
from sklearn.ensemble import RandomForestClassifier



# Fit a 10,000-tree forest on the standardized training data.
# n_jobs=-1 builds the trees on all available cores; random_state is fixed,
# so the fitted forest (and its importances) should match a single-core run.
rf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
rf.fit(X_train_std, y_train)

# One importance score per input column, in the same order as the columns of X.
feature_importances = rf.feature_importances_

# feature importances will always sum to 1
print("Feature importances:\n{}".format(feature_importances))

Feature importances:
[0.0849827  0.23775924 0.09717414 0.0707385  0.0736413  0.17363556
 0.1282392  0.13382937]


In [5]:
# feature_importances is a flat vector; wrapping it in a list makes pandas
# treat it as a single row, giving a one-row table with one labelled column
# per feature (every column of df except the last).
most_important_features = pd.DataFrame(
    [feature_importances],
    index=["importance"],
    columns=df.columns[:-1],
)
most_important_features

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age
importance,0.084983,0.237759,0.097174,0.070738,0.073641,0.173636,0.128239,0.133829


### Get the indices of the most important features sorted by importance

In [6]:
# argsort gives the original positions ordered from least to most important;
# flipping that result puts the position of the most important feature first.
feature_importance_inorder_mask = np.flip(np.argsort(feature_importances), axis=0)
feature_importance_inorder_mask

array([1, 5, 7, 6, 2, 0, 4, 3], dtype=int64)

In [7]:
# Hold the column labels in a NumPy array for positional lookup.
features = np.array(df.columns)

# Walk the ranked positions and collect the matching labels into a plain list.
best_features = [features[position] for position in feature_importance_inorder_mask]

# display the best features in order
print("The best features in order: \n{}".format(best_features))

The best features in order: 
['glucose_conc', 'bmi', 'age', 'diab_pred', 'diastolic_bp', 'num_preg', 'insulin', 'thickness']


### Get the indices of the 'n' most important features (unsorted) based upon a given threshold

In [20]:
from sklearn.feature_selection import SelectFromModel

# threshold: may be set to None (which defaults to mean), 'median', or a number between 0 and 1
#   e.g. SelectFromModel(rf, threshold='median', prefit=True)
# prefit=True tells it to use the model that we've already fit above
# (this statement must start at column 0; a stray leading space raises IndentationError)
sfm = SelectFromModel(rf, threshold=.08, prefit=True)

# Get the indices of the most important features
most_important_features = sfm.get_support(indices=True)
print("Indices of the most important features:", most_important_features)

Indices of the most important features: [0 1 2 5 6 7]


### Display a list of the 'n' most important features (unsorted) based upon a given threshold

In [15]:
# Hold the column labels in a NumPy array for positional lookup.
features = np.array(df.columns)

# Collect the labels found at the positions stored in most_important_features.
best_features = [features[position] for position in most_important_features]
num = len(best_features)

# display the best features, given a threshold, not necessarily in order
print("The {} best features (not necessarily in order): \n{}".format(num, best_features))

The 6 best features (not necessarily in order): 
['num_preg', 'glucose_conc', 'diastolic_bp', 'bmi', 'diab_pred', 'age']


In [17]:
# shape of the unscaled training matrix, as (rows, columns), before any feature selection is applied
X_train.shape

(537, 8)

### Transform the data to include just the 'n' most important features

In [21]:
# transform the data to just the most important features:
# pass the standardized training matrix through the selector and keep its output

X_train_most_important_features = sfm.transform(X_train_std)


In [22]:
# shape of data after transformation

print("Shape of X_train dataset with only the most important features: ", X_train_most_important_features.shape)


# You could now try the "most important feature" dataset to observe any change in the model's performance

# For example (cross_val_score and an estimator named svm are not defined in the cells shown; import/create them first):
# print("CV scores: {}".format(cross_val_score(svm, X_train_most_important_features, y_train, scoring='accuracy', cv=10).mean()))

Shape of X_train dataset with only the most important features:  (537, 6)
