# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import os

# Import Dataset

In [None]:
# Load the breast-cancer spreadsheet from the input directory into a DataFrame.
df = pd.read_excel(
    '../input/breast-cancer-detection-dataset/df.xlsx',
)

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

# Data Preprocessing

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Print a concise summary of the DataFrame: each column's dtype and its
# non-null count, which gives a first indication of missing values.
df.info()

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Descriptive statistics of the dataset columns (pandas' default describe()
# summary: count, mean, std, min, quartiles and max for numeric columns).
df.describe()

# Data Visualization

In [None]:
# Bar chart of how many samples fall into each diagnosis class.
sns.countplot(data=df, x='diagnosis')
plt.show()

In [None]:
# Pairwise plots of the columns at positions 1 through 5, coloured by diagnosis.
# The slice has to contain the 'diagnosis' column because it is used as the hue.
pairplot_columns = df.iloc[:, 1:6]
sns.pairplot(pairplot_columns, hue="diagnosis")
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric attributes.
# Non-numeric columns (at this point 'diagnosis' still holds the strings
# 'M'/'B') are excluded first, because DataFrame.corr() raises on them in
# pandas >= 2.0. The matrix is computed once and reused.
corr_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(30,25))
# Draw the heatmap exactly once: a second sns.heatmap() call on the same
# axes would stack a duplicate plot and a second colorbar on the figure.
ax = sns.heatmap(corr_matrix, vmin=-1, vmax=1, center=0, annot=True)
plt.show()

In [None]:
# Kernel density estimate of 'Radius_mean', one filled curve per diagnosis class.
sns.displot(
    df,
    x='Radius_mean',
    hue='diagnosis',
    kind='kde',
    fill=True,
    height=5,
    aspect=1.5,
)

In [None]:
# Number of rows for each distinct value of the 'diagnosis' column.
df['diagnosis'].value_counts()

In [None]:
# Per-class mean of every other column, grouped by diagnosis.
df.groupby('diagnosis').mean()

# Define dependent and independent variables

In [None]:
# Encode the target in place: 'M' becomes 1 and 'B' becomes 0.
diagnosis_codes = {'M': 1, 'B': 0}
df.replace({'diagnosis': diagnosis_codes}, inplace=True)

In [None]:
# Dependent variable (target): the diagnosis column.
y = df['diagnosis']
# Independent variables (features): every column except 'id' and the target.
X = df.drop(columns=['id', 'diagnosis'])

In [None]:
# Display the feature matrix X (independent variables).
X

In [None]:
# Display the target vector y (dependent variable).
y

# Split training and testing dataset

In [None]:
# Split into training and test sets: 20% of the samples are held out for
# testing (80% remain for training). The split is stratified on y and uses a
# fixed random_state so it is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [None]:
# Shapes of the full, training and test feature sets, printed on one line.
print(*(features.shape for features in (X, X_train, X_test)))

In [None]:
# Shapes of the full, training and test target vectors, printed on one line.
print(*(target.shape for target in (y, y_train, y_test)))

In [None]:
# Train a decision tree classifier on the training split.
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator so the fitted tree, and every score derived from it, is
# reproducible from run to run, in line with the seeded train/test split.
classifier = DecisionTreeClassifier(random_state=1)
classifier.fit(X_train, y_train)

In [None]:
# Accuracy on the held-out test set: the fraction of test samples whose
# predicted label matches the true label.
metrics.accuracy_score(y_test, classifier.predict(X_test))

In [None]:
# Compare the test-set predictions with the true labels and plot the
# resulting confusion matrix.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.show()

# Prediction

In [None]:
# Feature values of a single patient as a 1-D float array (30 values).
# NOTE(review): the values are presumably listed in the same order as the
# columns of X -- confirm against the dataset before relying on the result.
patient1 = np.array([
    7.76, 24.54, 47.92, 181.0, 0.05263,
    0.04362, 0.0, 0.0, 0.1587, 0.05884,
    0.3857, 1.428, 2.548, 19.15, 0.007189,
    0.00466, 0.0, 0.0, 0.02676, 0.002783,
    9.456, 30.37, 59.16, 268.6, 0.08996,
    0.06444, 0.0, 0.0, 0.2871, 0.07039,
])

In [None]:
# Predict the diagnosis for the single patient described by `patient1`.
# The classifier was fitted on the DataFrame X, so the sample is reshaped to
# one row and wrapped in a DataFrame carrying the same column names; passing a
# bare ndarray makes scikit-learn warn that X does not have valid feature
# names. A mismatch in the number of values fails here, at construction.
patient1_features = pd.DataFrame(patient1.reshape(1, -1), columns=X.columns)
pred = classifier.predict(patient1_features)
# predict() returns one label per input row; inspect that single label
# explicitly instead of comparing the whole array to a scalar.
# 0 is the encoded 'B' label and 1 the encoded 'M' label.
if pred[0] == 0:
    print('Patient has no cancer')
else:
    print('Patient has cancer')