In [None]:
import pandas as pd
import numpy as np

#Graphing
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the dataset (path is relative to the working directory) and preview
# the first rows to check that it parsed as expected.
df = pd.read_csv(filepath_or_buffer='Breast_data.csv')
df.head(n=5)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Feature matrix: every column from the third one onwards, as a NumPy array.
# NOTE(review): presumably the two skipped columns are a sample identifier and
# the `type` label - confirm against the CSV header.
cell_data = df.iloc[:, 2:].to_numpy()
cell_data

In [None]:
# Class balance of the target column: the data contains 6 distinct types,
# so this is a multi-class problem.
df.loc[:, 'type'].value_counts()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel

#getting rid of warnings in my notebook
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Features are the matrix extracted earlier; the target is the `type` column.
X, y = cell_data, df['type']



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. The seed is fixed so that the
# split - and every metric computed from it - is identical on each
# Restart & Run All; without it the reported scores change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)


In [None]:
# Standardise the features with statistics learned from the training split
# only, then apply the same transformation to the test split.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a logistic regression with default settings on the scaled data and
# predict the held-out samples.
LogReg = LogisticRegression().fit(X_train, y_train)

y_pred = LogReg.predict(X_test)
y_pred

In [None]:
# Show the fitted model: class labels, intercepts and coefficients.
for caption, fitted_value in (
    ('Classes', LogReg.classes_),
    ('Intercept', LogReg.intercept_),
    ("Coefficients", LogReg.coef_),
):
    print(caption, fitted_value)

In [None]:
# Overall accuracy plus macro-averaged precision/recall (the unweighted mean of
# the per-class scores). `pos_label` is only meaningful with average='binary';
# with 'macro' scikit-learn ignores it (and warns), so it is not passed.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))

In [None]:
# confusion matrix
# Rows/columns are labelled with the classes the model was trained on, so the
# heatmap stays correct for any number of classes (the target is not binary).
class_labels = LogReg.classes_
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
conf_df = pd.DataFrame(conf_mat, index=class_labels, columns=class_labels)

fig, ax = plt.subplots()
# Draw on the axes created above; the tick labels come from the DataFrame.
sns.heatmap(conf_df, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Run last so the title and axis labels are included in the layout.
fig.tight_layout()

Accuracy for Logistic Regression is **93.55%**

# Decision Tree Classification

In [None]:
#Decision Tree Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Fresh 75/25 split with a fixed seed, then standardise both parts using
# statistics learned from the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit an entropy-based decision tree, predict the held-out samples and draw the
# top levels of the fitted tree.
# random_state is fixed because the tree chooses at random between equally
# good splits, so an unseeded fit can produce a different tree (and a
# different accuracy) on every run.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

fig, ax = plt.subplots(figsize=(12, 12))
# Draw on the axes created above; depth is capped at 3 to keep the plot readable.
tree.plot_tree(decision_tree=clf, max_depth=3, fontsize=12, ax=ax);

In [None]:
# Confusion matrix of the decision tree on the held-out set
# (rows: actual class, columns: predicted class).
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_mat)

In [None]:
# Share of held-out samples the decision tree classified correctly.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy for Decision Tree Classification is **68.42%**

# XGBoost

In [None]:
import xgboost as xgb
from xgboost import plot_importance

In [None]:
# Gradient-boosted trees on the current train/test split.
# * XGBoost expects class labels encoded as integers 0..n_classes-1, so the
#   training labels are encoded first. The encoder is fitted on the full label
#   column so that every class is known, and predictions are mapped back to
#   the original labels before scoring.
# * The `type` target is multi-class, so the multi-class objective is requested
#   instead of 'binary:logistic'.
# * `random_state` is the scikit-learn-style name for the seed.
label_encoder = preprocessing.LabelEncoder().fit(y)
y_train_encoded = label_encoder.transform(y_train)

xg_cl = xgb.XGBClassifier(objective='multi:softprob', max_depth=1000, n_estimators=200, random_state=2000)
xg_cl.fit(X_train, y_train_encoded)
preds = label_encoder.inverse_transform(xg_cl.predict(X_test))
accuracy = accuracy_score(y_test, preds)
print("accuracy: %f" % (accuracy))

In [None]:
# The classifier was already fitted in the previous cell; display it instead of
# training the same model a second time.
xg_cl

In [None]:
#feature importance
# Label each importance with its source column and show only the ten largest.
# The feature matrix was built from df.iloc[:, 2:], so the matching names are
# df.columns[2:].
feature_importances = pd.Series(xg_cl.feature_importances_, index=df.columns[2:])
feature_importances.nlargest(10)

In [None]:
# The dataset has far too many features to chart them all, so only the ten
# highest-ranked ones are drawn.
plot_importance(booster=xg_cl, max_num_features=10)
plt.show()

# KMeans

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_data = scaler.fit_transform(cell_data)

# Elbow scan: fit KMeans for k = 1..19 and record the inertia (within-cluster
# sum of squared distances) of each fit.
cluster_range = range(1, 20)
cluster_errors = []
for nu in cluster_range:
    # Seeded so the centroid initialisation - and therefore the inertia curve -
    # is reproducible between runs.
    clusters_test = KMeans(n_clusters=nu, n_init=10, random_state=0)
    clusters_test.fit(scaled_data)
    cluster_errors.append(clusters_test.inertia_)
clusters_df = pd.DataFrame({"number of clusters": cluster_range, "cluster errors": cluster_errors})
clusters_df