## Decision Tree



In [None]:
import pandas as pd
from matplotlib import pyplot as plt

# Load the raw CSV; the path is relative to the notebook's working directory.
dataset = pd.read_csv("../data/Cancer_Data.csv")

# Drop the 'Unnamed: 32' column when present
# (presumably an empty artifact column from the CSV export -- confirm).
if 'Unnamed: 32' in dataset.columns:
    dataset = dataset.drop(columns='Unnamed: 32')

# Encode the target as integers: B = 0, M = 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the dataset['diagnosis'] selection: that chained
# in-place form is deprecated in recent pandas and does not modify the frame
# once copy-on-write is active.
dataset['diagnosis'] = dataset['diagnosis'].map({'B': 0, 'M': 1})
dataset

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target ('diagnosis') and the 'id' column.
x = dataset.drop(['diagnosis', 'id'], axis=1)
# Target: the encoded diagnosis column.
y = dataset['diagnosis']

# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Unpruned tree with default hyperparameters.
# random_state is pinned (same seed as the pruning sweep later in the notebook)
# because the tree builder permutes features while searching for splits, so an
# unseeded fit can differ from run to run.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train)


In [None]:
# Predicted labels for the held-out rows.
predictions = clf.predict(X=x_test)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
from sklearn.metrics import precision_score
# Precision on the held-out rows for the positive label (1 = M).
precision_score(y_true=y_test, y_pred=predictions)

In [None]:
# Effective alphas (and the matching total leaf impurities) along the minimal
# cost-complexity pruning path computed on the training data.
path = clf.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Fit one pruned tree per alpha.
# The trees are built inside a comprehension so that `clf` is NOT rebound: the
# previous for-loop reassigned `clf` on every iteration, leaving it pointing at
# the tree for the largest alpha (the most heavily pruned one). Later cells that
# read `clf` -- e.g. clf.feature_importances_ -- would then silently describe
# that tree instead of the unpruned model fitted earlier.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_train, y_train)
    for alpha in ccp_alphas
]

# Test-set accuracy of each pruned tree, in the same order as ccp_alphas.
acc_scores = [accuracy_score(y_test, pruned.predict(x_test)) for pruned in clfs]

# The last alpha/score pair is left out of the plot.
plt.figure(figsize=(10,  6))
plt.grid()
plt.plot(ccp_alphas[:-1], acc_scores[:-1])
plt.xlabel("effective alpha")
plt.ylabel("Accuracy scores")

Feature importance tells us how much each feature contributes to the model's predictions.

In [None]:
# One row per feature, labelled by column name, ordered from most to least important.
feature_importance = pd.DataFrame(
    {'%': clf.feature_importances_},
    index=x_test.columns,
).sort_values('%', ascending=False)
feature_importance

Let's try to separate the data. First, we are going to train our model using only the "worst" features.

In [None]:
# Columns holding the '*_worst' measurements, in the order they should appear.
worst_columns = [
    'fractal_dimension_worst',
    'symmetry_worst',
    'concave points_worst',
    'concavity_worst',
    'compactness_worst',
    'smoothness_worst',
    'area_worst',
    'perimeter_worst',
    'texture_worst',
    'radius_worst',
]
dataset_worst = dataset.filter(items=worst_columns, axis=1)
dataset_worst

In [None]:
# Features: the '*_worst' columns only; target: the diagnosis column.
x_worst = dataset_worst
y_worst = dataset['diagnosis']

# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and every score derived from it, reproducible across kernel restarts.
x_worst_train, x_worst_test, y_worst_train, y_worst_test = train_test_split(
    x_worst, y_worst, test_size=0.3, random_state=0
)

In [None]:
# Unpruned tree for the 'worst'-feature experiment. random_state is pinned so
# repeated fits on the same data produce the same tree.
clf_worst = DecisionTreeClassifier(random_state=0)

In [None]:
# Train on the 'worst' features and predict the held-out rows in one chained call
# (fit returns the estimator itself).
predictions_worst = clf_worst.fit(x_worst_train, y_worst_train).predict(x_worst_test)

accuracy_score(y_true=y_worst_test, y_pred=predictions_worst)

In [None]:
# Precision on the held-out rows for the 'worst'-feature model.
precision_score(y_true=y_worst_test, y_pred=predictions_worst)

In [None]:
# Effective alphas (and the matching total leaf impurities) along the minimal
# cost-complexity pruning path computed on the 'worst'-feature training data.
path = clf_worst.cost_complexity_pruning_path(x_worst_train, y_worst_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Fit one pruned tree per alpha.
# The trees are built inside a comprehension so that `clf_worst` is NOT rebound:
# the previous for-loop reassigned `clf_worst` on every iteration, leaving it
# pointing at the tree for the largest alpha (the most heavily pruned one). The
# later clf_worst.feature_importances_ cell would then describe that tree
# instead of the unpruned 'worst'-feature model.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_worst_train, y_worst_train)
    for alpha in ccp_alphas
]

# Test-set accuracy of each pruned tree, in the same order as ccp_alphas.
acc_scores = [accuracy_score(y_worst_test, pruned.predict(x_worst_test)) for pruned in clfs]

# The last alpha/score pair is left out of the plot.
plt.figure(figsize=(10,  6))
plt.grid()
plt.plot(ccp_alphas[:-1], acc_scores[:-1])
plt.xlabel("effective alpha")
plt.ylabel("Accuracy scores")

In [None]:
# Highest accuracy in acc_scores (all entries considered, including the last
# one that the plot leaves out).
max(acc_scores)

In [None]:
# Alpha at the first position where the accuracy reaches its maximum.
best_idx = acc_scores.index(max(acc_scores))
ccp_alphas[best_idx]

In [None]:
# One row per 'worst' feature, ordered from most to least important.
feature_importance_worst = pd.DataFrame(
    {'%': clf_worst.feature_importances_},
    index=x_worst_test.columns,
).sort_values('%', ascending=False)
feature_importance_worst

Mean Values

In [None]:
# Columns holding the '*_mean' measurements, in the order they should appear.
mean_columns = [
    'fractal_dimension_mean',
    'symmetry_mean',
    'concave points_mean',
    'concavity_mean',
    'compactness_mean',
    'smoothness_mean',
    'area_mean',
    'perimeter_mean',
    'texture_mean',
    'radius_mean',
]
dataset_mean = dataset.filter(items=mean_columns, axis=1)
dataset_mean

In [None]:
# Features: the '*_mean' columns only; target: the diagnosis column.
x_mean = dataset_mean
y_mean = dataset['diagnosis']

# Hold out 30% of the rows for testing. Both the split and the tree are seeded
# so the scores below are reproducible across kernel restarts.
x_mean_train, x_mean_test, y_mean_train, y_mean_test = train_test_split(
    x_mean, y_mean, test_size=0.3, random_state=0
)

clf_mean = DecisionTreeClassifier(random_state=0)
clf_mean.fit(x_mean_train, y_mean_train)
predictions_mean = clf_mean.predict(x_mean_test)

accuracy_score(y_mean_test, predictions_mean)

In [None]:
# Precision on the held-out rows for the 'mean'-feature model.
precision_score(y_true=y_mean_test, y_pred=predictions_mean)

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are true labels and columns are predicted labels, both in the order [0, 1].
confusion_matrix(y_true=y_mean_test, y_pred=predictions_mean, labels=[0, 1])

In [None]:
# One row per 'mean' feature, ordered from most to least important.
feature_importance_mean = pd.DataFrame(
    {'%': clf_mean.feature_importances_},
    index=x_mean_test.columns,
).sort_values('%', ascending=False)
feature_importance_mean

SE (Standard Error) Values

In [None]:
# Columns holding the '*_se' measurements, in the order they should appear.
se_columns = [
    'fractal_dimension_se',
    'symmetry_se',
    'concave points_se',
    'concavity_se',
    'compactness_se',
    'smoothness_se',
    'area_se',
    'perimeter_se',
    'texture_se',
    'radius_se',
]
dataset_se = dataset.filter(items=se_columns, axis=1)
dataset_se

In [None]:
# Features: the '*_se' columns only; target: the diagnosis column.
x_se = dataset_se
y_se = dataset['diagnosis']

# Hold out 30% of the rows for testing. Both the split and the tree are seeded
# so the scores below are reproducible across kernel restarts.
x_se_train, x_se_test, y_se_train, y_se_test = train_test_split(
    x_se, y_se, test_size=0.3, random_state=0
)

clf_se = DecisionTreeClassifier(random_state=0)
clf_se.fit(x_se_train, y_se_train)
predictions_se = clf_se.predict(x_se_test)

accuracy_score(y_se_test, predictions_se)

In [None]:
# Precision on the held-out rows for the SE-feature model.
precision_score(y_true=y_se_test, y_pred=predictions_se)

In [None]:
# One row per SE feature, ordered from most to least important.
feature_importance_se = pd.DataFrame(
    {'%': clf_se.feature_importances_},
    index=x_se_test.columns,
).sort_values('%', ascending=False)
feature_importance_se

Since the SE features gave the worst predictions, let's test without them.

In [None]:
# Everything that is not a model input for this experiment: the identifier,
# the target, and all '*_se' columns.
excluded_columns = [
    'id',
    'diagnosis',
    'fractal_dimension_se',
    'symmetry_se',
    'concave points_se',
    'concavity_se',
    'compactness_se',
    'smoothness_se',
    'area_se',
    'perimeter_se',
    'texture_se',
    'radius_se',
]
dataset_worstMean = dataset.drop(columns=excluded_columns)
dataset_worstMean

In [None]:
# Features: the columns kept in dataset_worstMean; target: the diagnosis column.
x_worstMean = dataset_worstMean
y_worstMean = dataset['diagnosis']

# Hold out 30% of the rows for testing. Both the split and the tree are seeded
# so the scores below are reproducible across kernel restarts.
x_worstMean_train, x_worstMean_test, y_worstMean_train, y_worstMean_test = train_test_split(
    x_worstMean, y_worstMean, test_size=0.3, random_state=0
)

clf_worstMean = DecisionTreeClassifier(random_state=0)
clf_worstMean.fit(x_worstMean_train, y_worstMean_train)
predictions_worstMean = clf_worstMean.predict(x_worstMean_test)

accuracy_score(y_worstMean_test, predictions_worstMean)

In [None]:
# Precision on the held-out rows for the model trained without the SE columns.
precision_score(y_true=y_worstMean_test, y_pred=predictions_worstMean)

In [None]:
# One row per retained feature, ordered from most to least important.
feature_importance_worstMean = pd.DataFrame(
    {'%': clf_worstMean.feature_importances_},
    index=x_worstMean_test.columns,
).sort_values('%', ascending=False)
feature_importance_worstMean

In [None]:
# Hand-picked subset of 'mean' and 'worst' columns, in the order they should appear.
custom_columns = [
    'radius_mean',
    'perimeter_mean',
    'area_mean',
    'concavity_mean',
    'concave points_mean',
    'radius_worst',
    'perimeter_worst',
    'area_worst',
    'concave points_worst',
]
dataset_custom = dataset.filter(items=custom_columns, axis=1)
dataset_custom

In [None]:
# Features: the hand-picked columns in dataset_custom; target: the diagnosis column.
x_custom = dataset_custom
y_custom = dataset['diagnosis']

# Hold out 30% of the rows for testing. Both the split and the tree are seeded
# so the scores below are reproducible across kernel restarts.
x_custom_train, x_custom_test, y_custom_train, y_custom_test = train_test_split(
    x_custom, y_custom, test_size=0.3, random_state=0
)

clf_custom = DecisionTreeClassifier(random_state=0)
clf_custom.fit(x_custom_train, y_custom_train)
predictions_custom = clf_custom.predict(x_custom_test)

accuracy_score(y_custom_test, predictions_custom)

In [None]:
# Precision on the held-out rows for the custom-feature model.
precision_score(y_true=y_custom_test, y_pred=predictions_custom)

In [None]:
# One row per custom feature, ordered from most to least important.
feature_importance_custom = pd.DataFrame(
    {'%': clf_custom.feature_importances_},
    index=x_custom_test.columns,
).sort_values('%', ascending=False)
feature_importance_custom

#### 