In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt

In [3]:
# Location of the pickled, cleaned dataset. Despite its name, this is the file
# path handed to pd.read_pickle, not a directory.
# The BREAST_CANCER_DATA_PATH environment variable overrides the default so the
# notebook is not tied to the original author's machine; when it is unset the
# original hard-coded path is used unchanged.
data_dir = os.environ.get(
    'BREAST_CANCER_DATA_PATH',
    r'C:\Users\Simas\Desktop\Insight\Data Challenges\Breast-Cancer-Challenge\data\CleanedData',
)

In [4]:
# Load the cleaned dataset from the pickle file located at `data_dir`.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle(data_dir)

In [16]:
# Names of the nine feature columns used as model inputs.
col_names = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]

# Full feature matrix and label vector (used for the first, unsplit model).
df_data, df_label = df[col_names], df['Class']

# Per-class subsets: rows labelled 0 are kept as "benign", rows labelled 1 as
# "malignant".
df_benign = df.loc[df_label == 0]
df_malignant = df.loc[df_label == 1]

# Features (x) and labels (y) of each class, so that each class can be split
# into train/test parts separately later on.
df_b_x, df_b_y = df_benign[col_names], df_benign['Class']
df_m_x, df_m_y = df_malignant[col_names], df_malignant['Class']

Try to fix the class imbalance by oversampling with SMOTE

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
# Resample the full dataset with SMOTE; the fixed seed keeps the synthetic
# samples reproducible between runs.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X=df_data, y=df_label)

Use random forest classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Shallow random forest (trees limited to depth 2, fixed seed) trained on the
# SMOTE-resampled data. `fit` returns the estimator itself, so the fitted model
# is bound to `clf` and then shown as the cell output.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(X=X_res, y=y_res)
clf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

Check feature importance:

In [10]:
# Print the fitted forest's per-feature importances. The values follow the
# column order of the training data, i.e. the order of `col_names`.
print(clf.feature_importances_)

[8.05498534e-02 4.16537756e-01 2.30034939e-01 3.75564259e-02
 1.37721444e-01 6.40840637e-02 3.14382668e-03 3.03597361e-02
 1.19556702e-05]


In [11]:
# Importances rescaled relative to the least important feature, so the
# weakest feature reads 1.0 and the others show how many times larger they are.
clf.feature_importances_ / clf.feature_importances_.min()

array([6.73737665e+03, 3.48401845e+04, 1.92406561e+04, 3.14130662e+03,
       1.15193411e+04, 5.36013979e+03, 2.62956959e+02, 2.53935878e+03,
       1.00000000e+00])

In [12]:
# Accuracy on the training data: the fraction of SMOTE-resampled samples whose
# predicted label matches the true label. These are the same samples the forest
# was fitted on, so this is not a held-out estimate.
train_predictions = clf.predict(X_res)
np.mean(train_predictions == y_res)

0.9829860195199156

In [13]:
import pickle

In [None]:
# Persist the forest trained on the full resampled dataset.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails; the original left the handle returned by open() unclosed.
with open('Trained_Forest', 'wb') as model_file:
    pickle.dump(clf, model_file)

Double-check that there is no overfitting: make a train/test split and see how the accuracy changes:

In [14]:
from sklearn.model_selection import train_test_split

In [17]:
# Split each class separately with the same settings (20% held out, fixed
# seed), so both the training and the test part contain rows of each class.
split_options = {'test_size': 0.2, 'random_state': 0}
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(df_b_x, df_b_y, **split_options)
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(df_m_x, df_m_y, **split_options)

In [18]:
# Stack the benign and malignant training parts (benign rows first) into a
# single training feature table and a matching label series.
X_train_split = pd.concat(objs=[X_train_b, X_train_m], axis=0)
y_train_split = pd.concat(objs=[y_train_b, y_train_m], axis=0)

In [19]:
# Resample only the training part, so no test rows influence the synthetic
# samples. Use the sampler created here (`sm2`); the original called the
# earlier `sm` object, leaving `sm2` unused and making this cell depend on
# state from a previous cell.
sm2 = SMOTE(random_state=42)
X_res_split, y_res_split = sm2.fit_resample(X_train_split, y_train_split)

In [20]:
# Same forest configuration as the first model (depth-2 trees, fixed seed),
# but fitted only on the resampled training part. `fit` returns the estimator,
# which is then shown as the cell output.
clf_split = RandomForestClassifier(random_state=0, max_depth=2).fit(X=X_res_split, y=y_res_split)
clf_split

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [21]:
# Stack the held-out benign and malignant parts (benign rows first) into a
# single test feature table and a matching label series.
X_test_split = pd.concat(objs=[X_test_b, X_test_m], axis=0)
y_test_split = pd.concat(objs=[y_test_b, y_test_m], axis=0)

In [22]:
# Accuracy of the split-trained forest on the held-out test rows of both
# classes; these rows were not part of the resampled training set.
clf_split.score(X_test_split, y_test_split)

0.9968

The accuracy is similar to the accuracy obtained without a held-out test split

In [23]:
# Accuracy restricted to the held-out benign (Class 0) rows only, i.e. the
# fraction of benign test samples that the forest labels as benign.
clf_split.score(X_test_b, y_test_b)

0.9130434782608695

91% accuracy on just the minority class (the held-out benign samples)!

In [24]:
# Persist the forest that was trained on the training part of the split.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails; the original left the handle returned by open() unclosed.
with open('Trained_Forest_CV', 'wb') as model_file:
    pickle.dump(clf_split, model_file)