In [1]:
#conda install scikit-learn


In [2]:
#pip install seaborn

In [None]:
# import libraries
from sklearn.datasets import load_breast_cancer
import pandas as pd  # Import Pandas for data manipulation using dataframes
import numpy as np  # Import Numpy for data statistical analysis
import matplotlib.pyplot as plt  # Import matplotlib for data visualisation
import seaborn as sns  # Statistical data visualization
# %matplotlib inline

# Import Cancer data from the Sklearn library



In [None]:
# Load the dataset from a CSV file resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer="BreastCancerDetection.csv")

# VISUALIZING THE DATA

In [None]:
# Display the freshly loaded DataFrame.
data

In [None]:
# Trim both edge columns with a single positional slice: the first column (ID)
# and the last column ("Unnamed: 32").
# NOTE: the slice is positional, so re-running this cell strips two more columns.
data = data.iloc[:, 1:-1]

In [None]:
# List the remaining column labels.
data.columns

In [None]:
# Inspect the target column on its own.
data.loc[:, "diagnosis"]

In [None]:
# Number of rows per distinct diagnosis label.
data["diagnosis"].value_counts()


In [None]:
# Bar chart of the class balance of the target column.
# The column is passed by keyword (x=..., data=...): seaborn >= 0.12 treats the
# first positional argument as `data`, so the previous positional call no longer
# plotted the per-label counts.
sns.countplot(x='diagnosis', data=data, label="Count")

In [None]:
# sns.pairplot(data, hue = 'diagnosis', vars = ['radius_mean', 'texture_mean', 'perimeter_mean',
#        'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
#        'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
#        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
#        'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
#        'fractal_dimension_se', 'radius_worst', 'texture_worst',
#        'perimeter_worst', 'area_worst', 'smoothness_worst',
#        'compactness_worst', 'concavity_worst', 'concave points_worst',
#        'symmetry_worst', 'fractal_dimension_worst'] )

In [None]:
# Pairwise scatter matrix over a subset of the "*_mean" columns, coloured by diagnosis.
sns.pairplot(
    data,
    vars=[
        'radius_mean',
        'texture_mean',
        'perimeter_mean',
        'area_mean',
        'smoothness_mean',
        'compactness_mean',
        'concavity_mean',
        'fractal_dimension_mean',
    ],
    hue='diagnosis',
)

In [None]:
# Same pair plot narrowed to two columns: area_mean vs. smoothness_mean.
sns.pairplot(
    data,
    vars=['area_mean', 'smoothness_mean'],
    hue='diagnosis',
)

In [None]:

# Single scatter of area_mean against smoothness_mean, coloured by diagnosis.
sns.scatterplot(
    data=data,
    x='area_mean',
    y='smoothness_mean',
    hue='diagnosis',
)

In [None]:
# Correlation heatmap between the numeric variables, e.g. to spot strongly
# related pairs such as mean radius / mean perimeter and mean area / mean perimeter.
# `diagnosis` still holds string labels at this point, and DataFrame.corr() raises
# on non-numeric columns in pandas >= 2.0, so only numeric columns are correlated.
plt.figure(figsize=(20,10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

# MODEL TRAINING

In [None]:
# Feature matrix: every column except the diagnosis label.
X = data.drop(columns=['diagnosis'])

In [None]:
# Display the feature matrix.
X

In [None]:
# Target vector: the `diagnosis` column. The bare name below displays it.
y = data['diagnosis']
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
# Shape of the training labels.
y_train.shape

In [None]:
# Shape of the test labels.
y_test.shape

# USING SVM

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Baseline support-vector classifier with default hyper-parameters, trained on
# the unscaled features. fit() returns the estimator itself.
svc_model = SVC().fit(X_train, y_train)
svc_model

# EVALUATING THE MODEL

In [None]:
# Predict the held-out rows and tabulate true vs. predicted labels.
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Confusion-matrix heatmap. fmt="d" prints the cell counts as plain integers
# (the default float format renders counts >= 100 in scientific notation) and
# matches the other confusion-matrix plots in this notebook.
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Text report of the classification metrics for the baseline SVM on the test split.
print(classification_report(y_test, y_predict))

# IMPROVING THE MODEL

In [None]:
# Min-max scaling statistics, computed per column from the training split only.
min_train = X_train.min(axis=0)
range_train = (X_train - min_train).max(axis=0)
range_train  # only this last expression is rendered by the notebook

In [None]:
# Shift each column by its training minimum, then divide by its training range.
X_train_scaled = X_train.sub(min_train).div(range_train)

In [None]:
# Training rows before scaling: area_mean vs. smoothness_mean, coloured by label.
sns.scatterplot(
    x=X_train['area_mean'],
    y=X_train['smoothness_mean'],
    hue=y_train,
)

In [None]:
# The same two columns after min-max scaling.
sns.scatterplot(
    x=X_train_scaled['area_mean'],
    y=X_train_scaled['smoothness_mean'],
    hue=y_train,
)

In [None]:
# Scale the test split with the statistics learned from the TRAINING split
# (min_train / range_train). Deriving a separate min/range from the test rows
# puts the two splits on different scales and lets test data drive the
# preprocessing, so the model would be scored on inconsistently scaled inputs.
X_test_scaled = (X_test - min_train)/range_train

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Same default SVM, now trained on the min-max scaled training features.
svc_model = SVC().fit(X_train_scaled, y_train)
svc_model

In [None]:
# Evaluate the scaled-feature SVM on the scaled test split.
y_predict = svc_model.predict(X_test_scaled)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)

sns.heatmap(data=cm, annot=True, fmt="d")

In [None]:
# Classification metrics for the SVM trained on scaled features.
print(classification_report(y_test,y_predict))

# IMPROVING THE MODEL: VERSION 2

In [None]:
# Candidate hyper-parameters for the RBF-kernel SVM grid search.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over param_grid; refit=True retrains the best configuration
# on the full training data, verbose=4 logs each fit.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=4,
)

In [None]:
# Run the grid search on the scaled training split.
grid.fit(X_train_scaled,y_train)

In [None]:
# Hyper-parameter combination selected by the search.
grid.best_params_

In [None]:
# The refitted estimator built from the selected hyper-parameters.
grid.best_estimator_

In [None]:
# Predict the scaled test split with the search object (delegates to the refitted estimator).
grid_predictions = grid.predict(X_test_scaled)

In [None]:
# True vs. predicted labels for the tuned SVM.
cm = confusion_matrix(y_true=y_test, y_pred=grid_predictions)

In [None]:
# Confusion-matrix heatmap for the tuned SVM. fmt="d" prints integer counts
# instead of the default float format and matches the earlier scaled-SVM plot.
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Classification metrics for the grid-searched SVM.
print(classification_report(y_test,grid_predictions))

# KNN


In [None]:
# Fresh 75/25 split for the KNN experiment. random_state pins the shuffle so the
# metrics below are reproducible after a kernel restart (same seed as the
# earlier SVM split); without it every run evaluated a different partition.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=5)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using the Minkowski metric with p=2,
# fitted on the training split.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

In [None]:
# Score KNN on the held-out rows and plot the confusion matrix with integer counts.
y_predict = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
sns.heatmap(data=cm, annot=True, fmt="d")

In [None]:
# Classification metrics for KNN on the test split.
print(classification_report(y_test, y_predict))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings. random_state is fixed so that tie-breaking
# between equally good splits, and therefore the fitted tree and the feature
# importances read from it below, is the same on every run.
decision_tree = DecisionTreeClassifier(random_state=1)
decision_tree.fit(X_train,y_train)

In [None]:
# One row per feature with the fitted tree's importance score, highest first.
feature_importances = pd.DataFrame(
    {'importance': decision_tree.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

In [None]:
# Display the ranked importance table.
feature_importances

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix on the TRAINING rows (a fit check, not a generalisation estimate).
# fmt="d" prints the counts as integers; the default float format shows
# three-digit counts in scientific notation. The stray mid-cell `y_predict_train`
# expression was dropped because it displayed nothing.
y_predict_train = decision_tree.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Predict the test split with the decision tree and plot its confusion matrix.
# fmt="d" keeps the annotated counts as integers, matching the KNN plot above.
y_predict_test = decision_tree.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Classification metrics for the decision tree on the test split.
print(classification_report(y_test, y_predict_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 150 trees. random_state pins the bootstrap sampling and
# feature sub-sampling so the reported metrics do not change between runs.
RandomForest = RandomForestClassifier(n_estimators=150, random_state=1)
RandomForest.fit(X_train, y_train)

In [None]:
# Confusion matrix of the forest on the TRAINING rows.
# fmt="d" prints integer counts (the default float format renders counts >= 100
# in scientific notation); the no-op mid-cell `y_predict_train` expression was removed.
y_predict_train = RandomForest.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Classification metrics for the random forest on the training split.
print(classification_report(y_train, y_predict_train))

In [None]:
# Predict the test split with the random forest and plot its confusion matrix.
# fmt="d" keeps the annotated counts as integers.
y_predict_test = RandomForest.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Classification metrics for the random forest on the test split.
print(classification_report(y_test, y_predict_test))

 ## Naive Bayes

In [None]:
from sklearn.preprocessing import StandardScaler

# Encode the target as integers: 'M' -> 1, 'B' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make the cell idempotent: re-running it on
# the already encoded column keeps the values instead of mapping them to NaN.
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0})


In [None]:
# Display the frame after the diagnosis column was re-encoded.
data

In [None]:
# Rebuild features and target from the re-encoded frame:
# X is everything except the label, y is the label column.
y = data['diagnosis']
X = data.drop(columns=['diagnosis'])


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split for the Naive Bayes experiment. random_state fixes the shuffle so
# the confusion matrices and report below are reproducible; previously each run
# drew a different partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Display the held-out labels.
y_test

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings; fit() returns the estimator itself.
NB_classifier = GaussianNB().fit(X_train, y_train)
NB_classifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Confusion matrix of Naive Bayes on the TRAINING rows.
# fmt="d" prints integer counts (the default float format renders counts >= 100
# in scientific notation); the no-op mid-cell `y_predict_train` expression was removed.
y_predict_train = NB_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Predict the test split with Naive Bayes and plot its confusion matrix.
# fmt="d" keeps the annotated counts as integers.
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Classification metrics for Naive Bayes on the test split.
print(classification_report(y_test, y_predict_test))

In [None]:
# Sum of the test labels. With the 'M' -> 1 / 'B' -> 0 encoding applied to
# `diagnosis` earlier in this section, this is the number of test rows labelled 1.
# The leading "" argument only makes print() emit a space before the number.
# NOTE(review): the trailing "#t_test=114" presumably records len(y_test) — confirm.
print("", sum(y_test)) #t_test=114

In [None]:
# Display the predicted test labels.
y_predict_test

In [None]:
# Display the true test labels for comparison with the predictions.
y_test

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Split first, then standardise. The scaler is fitted on the training rows only
# and merely applied to the test rows; fitting it on all of X before splitting
# leaked test-set statistics into training. X itself is no longer overwritten,
# so re-running this cell starts from the same unscaled features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fit logistic regression on the standardised training split.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of logistic regression on the training split, integer annotations.
y_predict_train = classifier.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_predict_train)
sns.heatmap(data=cm, annot=True, fmt="d")

In [None]:
# Confusion matrix of logistic regression on the test split, integer annotations.
y_predict_test = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_test)
sns.heatmap(data=cm, annot=True, fmt="d")

In [None]:
# Classification metrics for logistic regression on the test split.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict_test))

# Gabriella


In [None]:
# Positional slice that drops the first and the last column of `data`, then displays it.
# NOTE(review): `data2` is not referenced by any later cell in this notebook.
data2 =data.iloc[:,1:-1]
data2

In [None]:
# from scipy.stats import zscore

# z = np.abs(zscore(data.iloc[:,1:]))

# print(np.where(z > 3))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X = data.drop(['diagnosis'],axis=1)
y = data['diagnosis']

# `X_standard` was never defined in this notebook, so the previous split raised
# NameError on a fresh kernel. The raw features are split instead and then
# standardised, with the scaler fitted on the training rows only so that no
# test-set statistics reach the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=5)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define model
svc_model = SVC()

# Fit model
svc_model.fit(X_train, y_train)

# Check predictions on the held-out split
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)

sns.heatmap(cm,annot=True,fmt="d")


In [None]:
# Classification metrics for the SVM evaluated in the previous cell.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Define model. Specify a number for random_state to ensure same results each run
try_model = DecisionTreeClassifier(random_state=1)

# Fit model
try_model.fit(X_train, y_train)

# Check predictions on the held-out split.
# fmt="d" prints integer counts, consistent with the SVM heatmap in this section.
y_predict = try_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Classification metrics for the decision tree evaluated in the previous cell.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

In [None]:
from sklearn.tree import ExtraTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Define model. Specify a number for random_state to ensure same results each run
try_model = ExtraTreeClassifier(random_state=1)

# Fit model
try_model.fit(X_train, y_train)

# Check predictions on the held-out split.
# fmt="d" prints integer counts, consistent with the SVM heatmap in this section.
y_predict = try_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)

sns.heatmap(cm, annot=True, fmt="d")



In [None]:
# Classification metrics for the extra-tree classifier evaluated in the previous cell.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Define model. random_state is now actually passed (the comment on the original
# cell asked for it but the argument was missing), so weight initialisation and
# batch shuffling, and therefore the reported metrics, are repeatable.
try_model = MLPClassifier(random_state=1)

# Fit model
try_model.fit(X_train, y_train)

# Check predictions on the held-out split.
# fmt="d" prints integer counts, consistent with the SVM heatmap in this section.
y_predict = try_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)

sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Classification metrics for the MLP evaluated in the previous cell.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_predict))