In [None]:
# Import dependencies
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
# from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
# import tensorflow as tf


In [None]:
# Load the breast cancer dataset and show its feature and target names
data = load_breast_cancer()
for names_attr in ("feature_names", "target_names"):
    print(getattr(data, names_attr))

In [None]:

# Build one dataframe holding every feature column plus the target labels,
# then preview the first rows
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
df.head()



In [None]:
# Count the non-null entries of every column within each target class
rows_by_target = df.groupby('target')
rows_by_target.count()

In [None]:
# Look at dataframe info: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Swap every space in the column headings for an underscore, then preview
df.columns = [heading.replace(' ', '_') for heading in df.columns]
df.head()

In [None]:
# Remove any csv left over from a previous run so a new one can be saved.
# On a fresh checkout df.csv does not exist yet, so a missing file is expected
# and must not stop the notebook from running top to bottom.
import os
try:
    os.remove('df.csv')
except FileNotFoundError:
    pass

In [None]:
# Save dataframe to csv to use in SQL; index=False keeps the row index out of the file
df.to_csv('df.csv', index=False)

In [None]:
# Features (X) and labels (y) for the train/test split; the fixed random_state
# makes the split repeatable between runs
X, y = data.data, data.target
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Fit the scaler on the training data only, then apply that same scaling
# to both the training and the testing features
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# Score a KNN classifier for every odd k from 1 to 19 on both splits,
# collecting the accuracies so they can be compared
train_scores, test_scores = [], []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score, test_score = (
        knn.score(inputs, labels)
        for inputs, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test))
    )
    train_scores += [train_score]
    test_scores += [test_score]
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Plot train and test accuracy for each k value.
# Both curves share one axis, so each gets a legend entry and the y label
# names accuracy in general rather than the test split alone.
k_values = range(1, 20, 2)  # same odd k values that were scored
plt.plot(k_values, train_scores, marker='o', label='Train')
plt.plot(k_values, test_scores, marker='x', label='Test')
plt.xlabel('k neighbors')
plt.ylabel('Accuracy score')
plt.legend()
plt.show()

In [None]:
# Refit the classifier with k=7, the value picked from the sweep, and
# report its accuracy on the test split
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, y_train)
k7_test_accuracy = knn.score(X_test_scaled, y_test)
print('k=7 Test Acc: %.3f' % k7_test_accuracy)

In [None]:
# Compare the KNN predictions on the test split with the true labels
y_true, y_pred = y_test, knn.predict(X_test_scaled)
confusion_matrix(y_true=y_true, y_pred=y_pred)


In [None]:
# Per-class precision, recall and f1 for the KNN predictions
knn_report = classification_report(y_true, y_pred)
print(knn_report)

In [None]:
# Train a 750-tree random forest (fixed seed) and report accuracy on both splits
clf = RandomForestClassifier(n_estimators=750, random_state=78)
clf.fit(X_train_scaled, y_train)
forest_train_score = clf.score(X_train_scaled, y_train)
forest_test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {forest_train_score}')
print(f'Testing Score: {forest_test_score}')


In [None]:
# Show how much each feature contributes to the fitted model
features = clf.feature_importances_
print(features)
plt.figure(figsize=(10, 4))
# plt.bar has no `index` keyword; bar labels are supplied through `tick_label`
plt.bar(x=range(len(features)), height=features, tick_label=data.feature_names)
plt.xticks(rotation=90)  # feature names are long, so stand them upright
plt.ylabel('Feature importance')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Fit a feature selector built around clf on the scaled training data;
# get_support() returns the boolean mask of the features that were kept
sel = SelectFromModel(clf).fit(X_train_scaled, y_train)
sel.get_support()

In [None]:
# Keep only the columns chosen by the selector. Transforming the existing
# X_train / X_test (rather than re-splitting the whole of X) keeps the rows
# aligned with y_train / y_test without relying on a repeated random_state,
# and leaves y_train / y_test untouched.
X_selected_train = sel.transform(X_train)
X_selected_test = sel.transform(X_test)
# Fit a separate scaler on the reduced training set, then scale both splits
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

In [None]:
# Scores using logistic regression model on full data set.
# fit() returns the estimator itself, so one call both trains and binds clf;
# a second fit on the same data would only repeat the work.
clf = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
# Logistic regression trained only on the features picked by the
# random-forest-based selector, scored on both splits
clf = LogisticRegression().fit(X_selected_train_scaled, y_train)
selected_train_score = clf.score(X_selected_train_scaled, y_train)
selected_test_score = clf.score(X_selected_test_scaled, y_test)
print(f'Training Score: {selected_train_score}')
print(f'Testing Score: {selected_test_score}')