# **Import Libraries**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder #to encode categorical data
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report #for evaluation
from sklearn.preprocessing import StandardScaler #feature scaling
from imblearn.over_sampling import RandomOverSampler

# **Upload Data**

In [None]:
# The CSV file has to be uploaded to the runtime before this cell is run.
csv_path = "/content/dataset.csv"
dataset = pd.read_csv(csv_path)
# Preview six randomly drawn rows.
dataset.sample(6)

# **Data Preprocessing**

**There are some "?" values in the data.**

**First, replace them with NaN**

In [None]:
# Turn every '?' placeholder into NaN so pandas treats it as missing.
dataset = dataset.replace(to_replace='?', value=np.nan)
# Per-column count of missing entries.
dataset.isnull().sum()

**shape before removing missing values**

In [None]:
# Report (rows, columns) before any rows are removed.
shape_before = dataset.shape
print("Cancer data set dimensions : {}".format(shape_before))

**since there aren't many missing values, we can remove them from the dataset**

In [None]:
# Discard every row that holds at least one missing value.
dataset = dataset.dropna(axis=0, how='any')

**shape after removing missing values**

In [None]:
# Report (rows, columns) once the incomplete rows are gone.
shape_after = dataset.shape
print("Cancer data set dimensions after removing missing values : {}".format(shape_after))

In [None]:
# Raise pandas' column display limit to 35.
pd.options.display.max_columns = 35
# Show the first five rows.
dataset.head(5)

**Drop ID Column**

In [None]:
# Remove the 'id' column by rebinding to the reduced frame.
dataset = dataset.drop(labels=['id'], axis='columns')

In [None]:
# Print the frame summary to inspect the columns before converting dtypes.
dataset.info()

**change data types**

In [None]:
# Target becomes categorical; the two listed numeric columns become floats.
column_dtypes = {
    "Recurrence": 'category',
    "Lymph node status": 'float',
    "Time": 'float',
}
dataset = dataset.astype(column_dtypes)

In [None]:
# Print the frame summary again to check the result of the dtype conversion.
dataset.info()

**encoding categorical data: converting categorical data into integer format so it can be provided to the model**

In [None]:
# Fit the encoder on the class column and replace that column with its
# integer codes; the fitted encoder stays available as `labelencoder`.
labelencoder = LabelEncoder()
recurrence_codes = labelencoder.fit_transform(dataset['Recurrence'])
dataset['Recurrence'] = recurrence_codes
dataset

# **Data Splitting (x,y)**

**we want to predict Y**

**iloc: select a specific row or column from the data set**

In [None]:
# Boolean mask over the columns: True for every column except the target.
feature_mask = dataset.columns != 'Recurrence'
# Features are all non-target columns; the target is the 'Recurrence' column.
X = dataset.iloc[:, feature_mask]
Y = dataset['Recurrence']

**X contains all feature values, Y contains the class**

In [None]:
# Display the feature table and the target column together as a tuple.
X,Y

**The feature values vary in range, so we need to perform feature scaling**

In [None]:
# Learn the scaling parameters from X, then apply them to X.
# NOTE(review): the scaler is fit on every row, before the train/test split
# that happens further down, so test rows influence the scaling. Consider
# fitting on the training partition only.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Wrap the scaled values in a DataFrame for a tabular display.
pd.DataFrame(X)

**to get the number of data points in each class**

In [None]:
# Count the rows for each encoded value of the 'Recurrence' column.
dataset['Recurrence'].value_counts()

**performing oversampling**

In [None]:
oversample = RandomOverSampler(sampling_strategy='minority')
X, Y = oversample.fit_resample(X, Y)

# **Data Splitting (training, testing)**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)
Y_train.value_counts()

# **Build Model**

## Naive Bayes


Here, we used Gaussian Naive Bayes because we have variables with continuous values

In [None]:
from sklearn.naive_bayes import GaussianNB
# Create and train the Gaussian Naive Bayes model (fit returns the estimator).
gnb_classifier = GaussianNB().fit(X_train, Y_train)
# Predictions for the test partition.
gnb_y_pred = gnb_classifier.predict(X_test)
# Mean accuracy on each partition, printed as a percentage.
gnb_train_acc = gnb_classifier.score(X_train, Y_train)
print('Training set: {:.2f}%'.format(gnb_train_acc*100))
gnb_test_acc = gnb_classifier.score(X_test, Y_test)
print('Testing set: {:.2f}%'.format(gnb_test_acc*100))

## SVM

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier, trained on the training partition.
svc_classifier = SVC(kernel='linear').fit(X_train, Y_train)
# Predictions for the test partition.
svc_y_predict = svc_classifier.predict(X_test)
# Mean accuracy on each partition, printed as a percentage.
svc_train_acc = svc_classifier.score(X_train, Y_train)
print('Training set: {:.2f}%'.format(svc_train_acc*100))
svc_test_acc = svc_classifier.score(X_test, Y_test)
print('Testing set: {:.2f}%'.format(svc_test_acc*100))

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier as DT
# Entropy-based tree, seeded, limited to depth 3 with at least 5 samples per leaf.
tree_params = dict(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
DT_classifier = DT(**tree_params)
DT_classifier.fit(X_train, Y_train)
# Predictions for the test partition.
DT_y_predict = DT_classifier.predict(X_test)
# Mean accuracy on each partition, printed as a percentage.
DT_train_acc = DT_classifier.score(X_train, Y_train)
print('Training set: {:.2f}%'.format(DT_train_acc*100))
DT_test_acc = DT_classifier.score(X_test, Y_test)
print('Testing set: {:.2f}%'.format(DT_test_acc*100))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the reported scores are repeatable between runs, in line
# with the seeded train/test split and decision tree.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, Y_train)
# Predictions for the test partition.
rf_y_predict= forest.predict(X_test)
# Mean accuracy on each partition, printed as a percentage.
print('Training set: {:.2f}%'.format(forest.score(X_train, Y_train)*100))
print('Testing set: {:.2f}%'.format(forest.score(X_test, Y_test)*100))

# **Evaluation**

## Confusion Matrix

In [None]:
# Confusion matrix for Naive Bayes: rows are true classes, columns are predicted classes.
gnb_cm=confusion_matrix(Y_test,gnb_y_pred)
# Draw the matrix; fmt='d' prints the counts as plain integers instead of the
# default '.2g', which switches to scientific notation for three-digit counts.
ax = sns.heatmap(gnb_cm,annot=True,fmt='d')
# Label the axes so true and predicted classes can be told apart.
ax.set(xlabel="Predicted label", ylabel="True label")
# set title
plt.title("confusion matrix for Naive Bayes")
# Per-class precision, recall and F1 for the same predictions.
print(classification_report(Y_test, gnb_y_pred))

In [None]:
# Confusion matrix for SVM: rows are true classes, columns are predicted classes.
svc_cm=confusion_matrix(Y_test,svc_y_predict)
# fmt='d' prints the counts as plain integers instead of the default '.2g',
# which switches to scientific notation for three-digit counts.
ax = sns.heatmap(svc_cm,annot=True,fmt='d')
# Label the axes so true and predicted classes can be told apart.
ax.set(xlabel="Predicted label", ylabel="True label")
plt.title("confusion matrix for SVM")
# Per-class precision, recall and F1 for the same predictions.
print(classification_report(Y_test, svc_y_predict))

In [None]:
# Confusion matrix for the decision tree: rows are true classes, columns are predicted classes.
DT_cm=confusion_matrix(Y_test,DT_y_predict)
# fmt='d' prints the counts as plain integers instead of the default '.2g',
# which switches to scientific notation for three-digit counts.
ax = sns.heatmap(DT_cm,annot=True,fmt='d')
# Label the axes so true and predicted classes can be told apart.
ax.set(xlabel="Predicted label", ylabel="True label")
plt.title("confusion matrix for Decision tree")
# Per-class precision, recall and F1 for the same predictions.
print(classification_report(Y_test,DT_y_predict))

In [None]:
# Confusion matrix for the random forest: rows are true classes, columns are predicted classes.
rf = confusion_matrix(Y_test, rf_y_predict)
# fmt='d' prints the counts as plain integers instead of the default '.2g',
# which switches to scientific notation for three-digit counts.
ax = sns.heatmap(rf, annot=True, fmt='d')
# Label the axes so true and predicted classes can be told apart.
ax.set(xlabel="Predicted label", ylabel="True label")
plt.title("confusion matrix for Random Forest")
# Per-class precision, recall and F1 for the same predictions.
print(classification_report(Y_test,rf_y_predict))