In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

col_names = ['ID', 'Diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
            'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
            'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst']
wdbc = pd.read_csv('wdbc.data', sep = ',', names = col_names)
wdbc = wdbc.drop('ID', axis = 1)

In [5]:
# separate X and y data
diagnosis = wdbc['Diagnosis']
diagnosis = diagnosis.map({'M': 1, 'B': 0})
wdbc_x = wdbc.drop('Diagnosis', axis = 1)

# split into testing and training data
features_train,features_test,labels_train,labels_test = train_test_split(wdbc_x, diagnosis,test_size = 0.2, random_state = 42)

In [6]:
# train random forest, 10 trees
rf = RandomForestRegressor(n_estimators = 10, random_state = 42)

# Random Forest w/out optimal parameters
rf.fit(features_train, labels_train)
y_pred = rf.predict(features_test)
y_pred = [int(x) for x in y_pred]
print(rf.__class__.__name__, accuracy_score(labels_test, y_pred))

RandomForestRegressor 0.9122807017543859


In [8]:
# Get numerical feature importances
importances = list(rf.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(features_train.columns), importances)]# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: concave_points_worst Importance: 0.28
Variable: perimeter_worst      Importance: 0.23
Variable: area_worst           Importance: 0.17
Variable: radius_worst         Importance: 0.1
Variable: concave_points_mean  Importance: 0.09
Variable: texture_worst        Importance: 0.02
Variable: texture_mean         Importance: 0.01
Variable: smoothness_mean      Importance: 0.01
Variable: concavity_mean       Importance: 0.01
Variable: area_se              Importance: 0.01
Variable: smoothness_se        Importance: 0.01
Variable: concavity_se         Importance: 0.01
Variable: symmetry_se          Importance: 0.01
Variable: fractal_dimension_se Importance: 0.01
Variable: concavity_worst      Importance: 0.01
Variable: radius_mean          Importance: 0.0
Variable: perimeter_mean       Importance: 0.0
Variable: area_mean            Importance: 0.0
Variable: compactness_mean     Importance: 0.0
Variable: symmetry_mean        Importance: 0.0
Variable: fractal_dimension_mean Importance: 0

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [16]:
# isolate most important features
new_features_train = features_train[['concave_points_worst', 'perimeter_worst', 'area_worst', 'radius_worst', 'concave_points_mean', 
                                    'texture_worst', 'smoothness_mean', 'concavity_mean', 'area_se', 'smoothness_se', 'concavity_se']]
new_features_test = features_test[['concave_points_worst', 'perimeter_worst', 'area_worst', 'radius_worst', 'concave_points_mean', 
                                    'texture_worst', 'smoothness_mean', 'concavity_mean', 'area_se', 'smoothness_se', 'concavity_se']]

rf = RandomForestRegressor(n_estimators = 10, random_state = 42)

# Random Forest w/out optimal parameters
rf.fit(new_features_train, labels_train)
y_pred = rf.predict(new_features_test)
y_pred = [int(x) for x in y_pred]
print(rf.__class__.__name__, accuracy_score(labels_test, y_pred))

RandomForestRegressor 0.9385964912280702
