# Project Cancer Detection

Project: Cancer Detection

1. Implement the following algorithms, and find out which one works the best, to predict cancer
   - Malignant
   - Benign

2. Algorithms:
   - Decision Tree
   - Random Forest
   - AdaBoost
   - KNN

3. Try to change the parameters of the models. Hyperparameters: things you can change for the models:
   - Depth of tree
   - Number of estimators
   - Number of neighbours

4. Find the Best Performance.

5. Best performance will get applause.

(Optional) 6. Make predictions on your own defined inputs, not just the X_test


In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [2]:
# Load Breast Cancer dataset
data = load_breast_cancer()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)   # 0 = Malignant, 1 = Benign

In [3]:
# Train–Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Models Implementation 
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [6]:
# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9649122807017544
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [7]:
# AdaBoost Classifier
ab = AdaBoostClassifier(n_estimators=100, random_state=42)
ab.fit(X_train, y_train)
y_pred_ab = ab.predict(X_test)

print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ab))
print(classification_report(y_test, y_pred_ab))


AdaBoost Accuracy: 0.9736842105263158
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [8]:
# KNN K Neighbors Classifier
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


KNN Accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [9]:
# Performance Comparison
results = pd.DataFrame({
    "Model": ["Decision Tree", "Random Forest", "AdaBoost", "KNN"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_dt),
        accuracy_score(y_test, y_pred_rf),
        accuracy_score(y_test, y_pred_ab),
        accuracy_score(y_test, y_pred_knn)
    ]
})

results

Unnamed: 0,Model,Accuracy
0,Decision Tree,0.947368
1,Random Forest,0.964912
2,AdaBoost,0.973684
3,KNN,0.947368


In [11]:
# Predict on Your Own Input
# example: use first row as custom input
sample = X.iloc[[0]]

prediction = rf.predict(sample)

print("Prediction:", "Benign" if prediction[0] == 1 else "Malignant")


Prediction: Malignant


* We learn multiple ML algorithms because different datasets have different patterns, and understanding each model helps us choose, tune, and justify the best solution in real-world problems.

In [1]:
# code sir
import numpy as np

ar2d = [[1,2,3,4],
        [1,2,3,4],
        [1,2,3,4],
        [1,2,3,4]]

ar2d[0:2][1]

ar2d = np.array(ar2d)
ar2d[:,[0,2]]


array([[1, 3],
       [1, 3],
       [1, 3],
       [1, 3]])

In [9]:
from sklearn.datasets import load_iris
import pandas as pd

data = load_iris()
X = data.data

print(X.shape)   # (150, 4)
print(X.ndim)    # 2

X2 = X[:, :3]

df = pd.DataFrame(X)
df2 = pd.DataFrame(X2)

df2


(150, 4)
2


Unnamed: 0,0,1,2
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4
...,...,...,...
145,6.7,3.0,5.2
146,6.3,2.5,5.0
147,6.5,3.0,5.2
148,6.2,3.4,5.4


In [14]:
X = data.data
X.ndim
X = X[:,[0,1,2,3,4,5]]
X2 = X[:,0:6]
import pandas as pd
df = pd.DataFrame(X)
df2 = pd.DataFrame(X)
df2

IndexError: index 4 is out of bounds for axis 1 with size 4

In [15]:
data_df = pd.DataFrame(data.data)
data_df

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
