In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter also hides deprecation and data-conversion
# warnings emitted by pandas, seaborn and scikit-learn further down.
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read csv file
# NOTE(review): the public Kaggle/UCI release of this dataset marks unanswered
# fields with a literal "?" - confirm against your copy of the file. Parsing
# that token as NaN lets the missing-value check below actually see the gaps
# and keeps the affected columns numeric instead of object/string.
df = pd.read_csv('kag_risk_factors_cervical_cancer.csv', na_values="?")

In [None]:
# Preview the first five records to get a feel for the columns and their values
df.head(n=5)

In [None]:
# Checking the shape (size) of dataset: a (number of rows, number of columns) tuple
df.shape

In [None]:
# Checking the columns' data types (info() also prints the non-null count of each column)
df.info()

In [None]:
# Summary statistics, transposed so that each dataset column becomes one row
df.describe().transpose()

# Data Cleaning

In [None]:
# Count the missing (NaN) entries in every column
df.isna().sum()

In [None]:
# Number of rows that repeat an earlier row exactly (the first occurrence is not counted)
df.duplicated(keep="first").sum()

In [None]:
# Handling the duplicates
# Re-bind the name instead of mutating in place, so the cell visibly produces a
# new frame, and rebuild a gap-free 0..n-1 index: dropping rows otherwise leaves
# holes in the index that no longer line up with frames rebuilt from raw arrays.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Re-check after the removal above: the count of duplicated rows should now be 0
df.duplicated().sum()

# Data Visualization

In [None]:
# Distribution of patient age.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11.
fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(df["Age"], ax=ax)
ax.set_title("Age", size=15)
plt.show()

In [None]:
# Frequency of each reported number of sexual partners.
# The column is passed as `x=` explicitly: seaborn >= 0.12 interprets the first
# positional argument as `data`, which no longer draws one bar per distinct value.
plt.figure(figsize=(10,6))
sns.countplot(x=df["Number of sexual partners"])
plt.title("Number of sexual partners", size=15)
plt.show()

In [None]:
# Frequency of each value in the "Hormonal Contraceptives" column.
# The column is passed as `x=` explicitly: seaborn >= 0.12 interprets the first
# positional argument as `data`, which no longer draws one bar per distinct value.
plt.figure(figsize=(10,6))
sns.countplot(x=df["Hormonal Contraceptives"])
# Title typo fixed ("Contraceptiveses" -> "Contraceptives")
plt.title("Hormonal Contraceptives", size=15)
plt.show()

In [None]:
# Bar chart of the "Biopsy" column aggregated per "Smokes" category
# (seaborn's barplot shows the mean of y for each x value by default)
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df, x="Smokes", y="Biopsy", ax=ax)
ax.set_title("Biopsy vs Smokes", size=15)
plt.show()

In [None]:
# Frequency of each reported number of pregnancies.
# The column is passed as `x=` explicitly: seaborn >= 0.12 interprets the first
# positional argument as `data`, which no longer draws one bar per distinct value.
plt.figure(figsize=(10,6))
sns.countplot(x=df["Num of pregnancies"])
plt.title("Num of pregnancies", size=15)
plt.show()

In [None]:
# Age distribution split by biopsy outcome, both drawn on the same axes.
# Each curve gets a label and a legend is drawn; without it the blue and red
# distributions cannot be told apart by a reader.
plt.figure(figsize=(10,8))
sns.distplot(df[df["Biopsy"]==1]["Age"], color="blue", label="Biopsy = 1")
sns.distplot(df[df["Biopsy"]==0]["Age"], color="red", label="Biopsy = 0")
plt.title("Biopsy vs Age", size=15)
plt.legend()
plt.show()

In [None]:
# Pairwise correlation of the numeric columns.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer drops
# them silently inside corr() and raises instead. On older pandas the result is
# the same as calling df.corr() directly.
plt.figure(figsize=(12,10))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap="RdBu")
plt.title("Correlation Between Variables", size=15)
plt.show()

# Data Preprocessing

In [None]:
# Build the design matrix X from the chosen predictor columns and take
# the "Biopsy" column as the target y
features = [
    "Age",
    "STDs: Number of diagnosis",
    "Dx:Cancer",
    "Dx:CIN",
    "Dx:HPV",
    "Dx",
    "Hinselmann",
    "Schiller",
    "Citology",
]
X = df.loc[:, features]
y = df.loc[:, "Biopsy"]

In [None]:
# Train-Test split
# The split is done BEFORE scaling so that the scaler's mean/std are estimated
# from the training rows only; fitting it on the full dataset leaks test-set
# statistics into training. test_size and random_state are unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the data
# fit_transform on the training split, transform-only on the test split.
# The scaled arrays are wrapped back into DataFrames that keep the original
# column names and row index, so X_* and y_* stay aligned by label as well as
# by position.
ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(ss.transform(X_test), columns=X_test.columns, index=X_test.index)

# y is intentionally left as the 1-D "Biopsy" Series (not re-wrapped into a
# one-column DataFrame): scikit-learn estimators expect a 1-D target and emit a
# DataConversionWarning for column vectors.

# Machine Learning Models

In [None]:
# Empty leaderboard table; every model cell below adds one (name, accuracy) row
leaderboard_columns = ["Model", "Accuracy Score"]
models = pd.DataFrame(columns=leaderboard_columns)

In [None]:
# Logistic regression: fit on the training split, measure accuracy on the test split
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
predictions = log_reg.predict(X_test)
score = accuracy_score(y_test, predictions)
print("LogisticRegression: ", score)

# Record the result in the leaderboard. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "LogisticRegression", "Accuracy Score": score}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
# Gaussian Naive Bayes: fit on the training split, measure accuracy on the test split
GNB = GaussianNB()
GNB.fit(X_train, y_train)
predictions = GNB.predict(X_test)
score = accuracy_score(y_test, predictions)
print("GaussianNB: ", score)

# Record the result in the leaderboard. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "GaussianNB", "Accuracy Score": score}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)


In [None]:
# Bernoulli Naive Bayes: fit on the training split, measure accuracy on the test split
BNB = BernoulliNB()
BNB.fit(X_train, y_train)
predictions = BNB.predict(X_test)
score = accuracy_score(y_test, predictions)
print("BernoulliNB: ", score)

# Record the result in the leaderboard. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "BernoulliNB", "Accuracy Score": score}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
# Support vector classifier: fit on the training split, measure accuracy on the test split
svm = SVC(random_state=0)
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)
score = accuracy_score(y_test, predictions)
print("SVC: ", score)

# Record the result in the leaderboard. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "SVC", "Accuracy Score": score}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)


In [None]:
# Random forest (1000 trees, depth capped at 5, fixed seed): fit on the training
# split, measure accuracy on the test split
randomforest = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42)
randomforest.fit(X_train, y_train)
predictions = randomforest.predict(X_test)
score = accuracy_score(y_test, predictions)
print("RandomForestClassifier: ", score)

# Record the result in the leaderboard. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "RandomForestClassifier", "Accuracy Score": score}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
# XGBoost classifier with default settings: fit on the training split, measure
# accuracy on the test split
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)
score = accuracy_score(y_test, predictions)
print("XGBClassifier: ", score)

# Record the result in the leaderboard. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "XGBClassifier", "Accuracy Score": score}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)


In [None]:
# Baseline k-nearest-neighbours classifier with k = 5 (not added to the leaderboard;
# the tuned variant further down is)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
predictions = knn.predict(X_test)
score = accuracy_score(y_test, predictions)
print("KNeighborsClassifier: ", score)

In [None]:
# Optimizing the KNN Model
# Fit one classifier per candidate k (1 through 19) and store its accuracy on
# the test split, in the same order as the candidates.
# NOTE(review): k is chosen here on the same test split that is later used to
# report the final score.
score_list = []
k_candidates = range(1, 20)

for i in k_candidates:
    knn2 = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    predictions = knn2.predict(X_test)
    score_list.append(accuracy_score(predictions, y_test))

In [None]:
# Accuracy recorded for each candidate k (x axis: k = 1..19, y axis: score)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 20), score_list, marker='o', markerfacecolor='red', markersize=10)

ax.set_title('Score vs K Value', size=15)
ax.set_xlabel('K value')
ax.set_ylabel('Score')

The highest accuracy score is obtained for K values between 12 and 15, so we set the `n_neighbors` parameter to 14.

In [None]:
# k-nearest-neighbours with the k picked from the sweep (14): fit on the training
# split, measure accuracy on the test split
knn3 = KNeighborsClassifier(n_neighbors=14)
knn3.fit(X_train, y_train)
predictions = knn3.predict(X_test)
score = accuracy_score(y_test, predictions)
print("KNeighborsClassifier: ", score)

# Record the result in the leaderboard. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
new_row = {"Model": "KNeighborsClassifier", "Accuracy Score": score}
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

# Comparison of Machine Learning Models

In [None]:
# Leaderboard ordered from the highest to the lowest accuracy
models.sort_values("Accuracy Score", ascending=False)