In [1]:
#Description: This notebook trains and compares classifiers that predict a breast cancer diagnosis (malignant vs. benign) from a tabular dataset.

In [2]:
#import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
#Loading the dataset
#Upload a file through google.colab's files.upload(), then read 'breast.csv'
#into a DataFrame and preview its first 7 rows.
#NOTE(review): read_csv uses a hard-coded file name and `uploaded` is not used
#in this cell, so the uploaded file presumably has to be named breast.csv - confirm.
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('breast.csv')
df.head(7)

In [None]:
#Number of rows and columns in the freshly loaded dataset, as a (rows, columns) tuple
df.shape

In [None]:
#Per-column count of missing (NaN / None) entries
df.isnull().sum(axis=0)

In [None]:
#Remove every column that contains at least one missing value
#(intended to get rid of the empty "Unnamed" column)
df = df.dropna(axis='columns', how='any')

In [None]:
#Number of rows and columns after the columns with missing values were dropped
df.shape

In [None]:
#Number of samples per diagnosis label: Malignant (M) vs. Benign (B)
df['diagnosis'].value_counts()

In [None]:
#Visualize the class balance: one bar per diagnosis label.
#The column is passed through the x=/data= keywords (seaborn's documented form)
#instead of positionally, and seaborn is imported in the notebook's import cell
#so this cell also works on a fresh kernel.
sns.countplot(x='diagnosis', data=df).set(ylabel='Count');

In [None]:
#Data type of every column (shows which columns are non-numeric and need encoding)
df.dtypes

In [None]:
#Encoding the categorical data values
#LabelEncoder maps the sorted class labels to consecutive integers
#(for M/B labels: B -> 0, M -> 1).
import seaborn as sns  #kept here as well; re-importing is harmless
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
#Assign by column name so the whole column is replaced and takes the encoder's
#integer dtype. Writing through df.iloc[:,1] sets values in place and can leave
#the column as object dtype, which scikit-learn classifiers reject as a target.
df['diagnosis'] = labelencoder_Y.fit_transform(df['diagnosis'].values)

In [None]:
#Pair plot of the whole frame, with points coloured by diagnosis
#NOTE(review): this plots every pair of columns, which can be slow on a wide frame
sns.pairplot(df, hue='diagnosis')

In [None]:
#Preview the first 5 rows of the cleaned, encoded data
df.head()

In [None]:
#Determining the correlation between the columns at positions 1..11
#(Pearson is the default method; spelled out here for clarity)
df.iloc[:, 1:12].corr(method='pearson')

In [None]:
#Visualizing the correlation matrix of columns 1..11 as an annotated heatmap
corr_matrix = df.iloc[:, 1:12].corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
#Splitting the data set into independent (X) and dependent (Y) datasets
#X takes every column after the diagnosis column (index 1). The slice is left
#open-ended because a fixed stop of 31 silently leaves out the last column
#whenever the cleaned frame has 32 columns.
#NOTE(review): assumes column 0 is a non-feature identifier and that all columns
#from index 2 onward are features - confirm with df.columns / df.shape.
X = df.iloc[:,2:].values
#Y is the encoded diagnosis column
Y = df.iloc[:,1].values

In [None]:
#Hold out 25% of the samples for testing; the remaining 75% are used for training.
#random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
#Scaling the data to bring to the same level of magnitude (Feature Scaling)
#The scaler's statistics are learned from the training split only and then
#reused on the test split. Refitting on the test data would scale it with
#different statistics than the ones the models are trained against, and would
#let test-set information leak into preprocessing.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
#create a function for the models
def models(X_train, Y_train):
  """Fit three classifiers on the training data and print their training accuracy.

  Parameters:
    X_train: training feature matrix, passed unchanged to each estimator's fit/score.
    Y_train: training labels, passed unchanged to each estimator's fit/score.

  Returns:
    A 3-tuple of fitted estimators in the order
    (LogisticRegression, DecisionTreeClassifier, RandomForestClassifier).
  """
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.tree import DecisionTreeClassifier

  #(report label, estimator) pairs, listed in the order they are fitted and returned
  candidates = (
    ('[0]Logistic Regression Training Accuracy:',
     LogisticRegression(random_state=0)),
    ('[1]Decision Tree Classifier Training Accuracy:',
     DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('[2]Random forest Training Accuracy:',
     RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
  )

  #Fit everything first ...
  for _, estimator in candidates:
    estimator.fit(X_train, Y_train)

  #... then report accuracy on the same data the models were trained on
  for report_label, estimator in candidates:
    print(report_label, estimator.score(X_train, Y_train))

  return tuple(estimator for _, estimator in candidates)

In [None]:
#Fit all models on the training split.
#`model` is the tuple returned by models(): index 0 = logistic regression,
#index 1 = decision tree, index 2 = random forest.
model = models(X_train, Y_train)

In [None]:
 #Testing the models on test data using confusion matrix
 #sklearn lays the matrix out as rows = actual class, columns = predicted class,
 #with labels in sorted order. With 0 = no cancer and 1 = cancer (positive class)
 #the matrix is therefore [[TN, FP], [FN, TP]].
 from sklearn.metrics import confusion_matrix
 for i in range(len(model)):
  print("Model ", i)
  cm = confusion_matrix(Y_test, model[i].predict(X_test))

  TN = cm[0][0]  #actual 0, predicted 0
  FP = cm[0][1]  #actual 0, predicted 1
  FN = cm[1][0]  #actual 1, predicted 0
  TP = cm[1][1]  #actual 1, predicted 1

  print(cm)
  print("Testing accuracy = ", (TP+TN)/(TP+TN+FP+FN))
  print()

In [None]:
#Per-model precision / recall / F1 report plus overall accuracy on the test split
from sklearn.metrics import accuracy_score, classification_report

for i, fitted in enumerate(model):
  #predict once and reuse the result for both metrics
  test_pred = fitted.predict(X_test)
  print("Model ", i)
  print(classification_report(Y_test, test_pred))
  print(accuracy_score(Y_test, test_pred))
  print()

In [None]:
#Random forest (model[2]): test-set predictions shown next to the true labels
pred = model[2].predict(X_test)
print("1 - Has Cancer\n0 - Doesnt have cancer\n")
print("Predicted values")
print(pred, end="\n\n")  #extra newline = the blank separator line
print("Actual values")
print(Y_test)

In [None]:
#Decision tree (model[1]): test-set predictions shown next to the true labels
pred = model[1].predict(X_test)
print("1 - Has Cancer\n0 - Doesnt have cancer\n")
#one item per output line; the empty string yields the blank separator line
print("Predicted values", pred, "", "Actual values", Y_test, sep="\n")

In [None]:
#Logistic regression (model[0]): test-set predictions shown next to the true labels
pred = model[0].predict(X_test)
print("1 - Has Cancer\n0 - Doesnt have cancer\n")
report_lines = ["Predicted values", str(pred), "", "Actual values", str(Y_test)]
print("\n".join(report_lines))