In [1]:
import numpy as np
from xgboost import XGBClassifier 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import warnings

In [2]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Feature matrix and integer class labels.
X, y = iris.data, iris.target

# Display names for the four feature columns (assumed to follow the column
# order of iris.data; they are zipped against per-column scores later on).
feat_labels = [
    'Sepal Length',
    'Sepal Width',
    'Petal Length',
    'Petal Width',
]

In [3]:
# Peek at the first five rows of the feature matrix.
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [4]:
# Display the full label vector; the recorded output shows three classes (0, 1, 2).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
# Hold out 40% of the samples for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [6]:
# Reference classifier trained on all four features with default hyperparameters.
model = XGBClassifier()
# `fit` is left as the last expression so the fitted estimator's repr is displayed.
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [7]:
# Print each feature name alongside the importance score reported by the
# fitted XGBoost model.
# NOTE: these are not Gini importances. XGBoost derives `feature_importances_`
# from its own booster statistics (which one depends on the xgboost version
# and the `importance_type` setting); the recorded scores sum to 1.
for label_and_score in zip(feat_labels, model.feature_importances_):
    print(label_and_score)

('Sepal Length', 0.016870115)
('Sepal Width', 0.021230388)
('Petal Length', 0.7144931)
('Petal Width', 0.2474064)


In [10]:
# Silence DeprecationWarnings *before* fitting: a warnings filter only affects
# warnings raised after it is installed, so registering it after `fit` could
# not hide anything emitted by the fit itself. The filter is global and stays
# active for the cells that follow.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Keep only the features whose importance is at least 0.23. With the scores
# in the recorded output, that retains Petal Length and Petal Width.
sfm = SelectFromModel(model, threshold=0.23)
# The trailing semicolon keeps the cell output empty, as it was originally.
sfm.fit(X_train, y_train);

In [11]:
# List the names of the features that passed the selector's threshold.
# get_support(indices=True) yields column positions, which index feat_labels.
for kept_column in sfm.get_support(indices=True):
    print(feat_labels[kept_column])

Petal Length
Petal Width


In [12]:
# Reduce both splits to the columns chosen by the selector.
X_important_train, X_important_test = (
    sfm.transform(X_train),
    sfm.transform(X_test),
)

In [16]:
# Sanity check: the reduced training matrix should keep every training row and
# have one column per selected feature.
X_important_train.shape

(90, 2)

In [17]:
# Sanity check: the number of training labels should match the row count of
# the training matrix.
y_train.shape

(90,)

In [13]:
# Second classifier, same default hyperparameters, trained only on the
# selected features.
clf_important = XGBClassifier()
# `fit` is left as the last expression so the fitted estimator's repr is displayed.
clf_important.fit(X=X_important_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [14]:
# Score the reduced-feature classifier on the held-out split.
y_important_pred = clf_important.predict(X_important_test)
accuracy_score(y_true=y_test, y_pred=y_important_pred)


0.95

In [15]:
# Baseline for comparison: accuracy of the all-features model on the same
# held-out split.
y_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9333333333333333