In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier


df = pd.read_csv("data/breast-cancer-wisconsin.data", header=None)
df = df.dropna()
df = df.loc[df.iloc[:, 6] != "?"]
df.iloc[:, 6] = df.iloc[:, 6].astype("int")

x = df.iloc[:, 1:10]
y = df.iloc[:, 10]

In [103]:
fig = px.imshow(df.corr(), width=500, height=500)
fig.show()

In [104]:
pca = PCA(n_components=2)


x_pca = pca.fit_transform(x)
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(
    x_pca, y, test_size=0.3, random_state=42
)

In [105]:
knn3_pca = KNeighborsClassifier(n_neighbors=3)
knn7_pca = KNeighborsClassifier(n_neighbors=7)

knn3_pca.fit(X_pca_train, y_pca_train)
knn7_pca.fit(X_pca_train, y_pca_train)

knn3_pca.predict(X_pca_test)
knn7_pca.predict(X_pca_test)

print(knn3_pca.score(X_pca_test, y_pca_test))
print(knn7_pca.score(X_pca_test, y_pca_test))

0.9658536585365853
0.975609756097561


In [106]:
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

knn3 = KNeighborsClassifier(n_neighbors=3)
knn7 = KNeighborsClassifier(n_neighbors=7)

knn3.fit(X_train, y_train)
knn7.fit(X_train, y_train)

In [107]:
knn3.predict(X_test)
knn7.predict(X_test)

print(knn3.score(X_test, y_test))
print(knn7.score(X_test, y_test))

0.9512195121951219
0.9560975609756097


In [108]:
log_regrese_model = LogisticRegression(C=1, penalty="l2", solver="liblinear")
log_regrese_model.fit(X_train, y_train)

log_regrese_model.predict(X_test)

print(log_regrese_model.score(X_test, y_test))

0.9512195121951219


In [109]:
log_regrese_model_pca = LogisticRegression(C=1, penalty="l2", solver="liblinear")
log_regrese_model_pca.fit(X_pca_train, y_pca_train)

log_regrese_model_pca.predict(X_pca_test)

print(log_regrese_model_pca.score(X_pca_test, y_pca_test))

0.9512195121951219


In [110]:
suppvm_lin = svm.SVC(kernel="linear")

suppvm_lin.fit(X_train, y_train)
suppvm_lin.predict(X_test)

print(suppvm_lin.score(X_test, y_test))

0.9609756097560975


In [111]:
suppvm_poly = svm.SVC(kernel="poly")

suppvm_poly.fit(X_train, y_train)
suppvm_poly.predict(X_test)

print(suppvm_poly.score(X_test, y_test))

0.9609756097560975


In [112]:
suppvm_rbf = svm.SVC(kernel="rbf")

suppvm_rbf.fit(X_train, y_train)
suppvm_rbf.predict(X_test)

print(suppvm.score(X_test, y_test))

0.9512195121951219


In [113]:
suppvm_pca_linear = svm.SVC(kernel="linear")

suppvm_pca_linear.fit(X_pca_train, y_pca_train)
suppvm_pca_linear.predict(X_pca_test)

print(suppvm_pca_linear.score(X_pca_test, y_pca_test))

0.9512195121951219


In [114]:
suppvm_pca_poly = svm.SVC(kernel="poly")

suppvm_pca_poly.fit(X_pca_train, y_pca_train)
suppvm_pca_poly.predict(X_pca_test)

print(suppvm_pca_poly.score(X_pca_test, y_pca_test))

0.9317073170731708


In [115]:
suppvm_pca_rbf = svm.SVC(kernel="rbf")

suppvm_pca_rbf.fit(X_pca_train, y_pca_train)
suppvm_pca_rbf.predict(X_pca_test)

print(suppvm_pca_rbf.score(X_pca_test, y_pca_test))

0.9707317073170731


In [116]:
rf = RandomForestClassifier(max_depth=100)

rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_test, y_test))

0.9560975609756097


In [117]:
rf_pca = RandomForestClassifier(max_depth=100)

rf_pca.fit(X_pca_train, y_pca_train)
rf_pca.predict(X_pca_test)
print(rf_pca.score(X_pca_test, y_pca_test))

0.9609756097560975


In [118]:
models = [knn3, knn7, log_regrese_model, suppvm_lin, suppvm_poly, suppvm_rbf, rf]
models_pca = [knn3_pca, knn7_pca, log_regrese_model_pca, suppvm_pca_linear, suppvm_pca_poly, suppvm_pca_rbf, rf_pca]
models_score = []
models_pca_score = []


for i in models:
    models_score.append(i.score(X_test, y_test))

for i in models_pca:
    models_pca_score.append(i.score(X_pca_test, y_pca_test))

In [119]:
models = ["knn3", "knn7", "log_regrese_model", "suppvm_lin", "suppvm_poly", "suppvm_rbf", "rf"]
models_pca = ["knn3_pca", "knn7_pca", "log_regrese_model_pca", "suppvm_pca_linear", "suppvm_pca_poly", "suppvm_pca_rbf", "rf_pca"]

fig = px.bar(x=models, y=models_score)
fig.show()

In [120]:
fig = px.bar(x=models_pca, y=models_pca_score)
fig.show()