### Breast Cancer Wisconsin (Diagnostic) dataset

This dataset is a widely used benchmark for binary classification in cancer diagnosis: each sample describes a breast mass and is labelled malignant or benign.

In [1]:
# Step 1: Dataset Selection
from sklearn.datasets import load_breast_cancer

In [2]:
# Fetch the bundled breast cancer dataset, then pull out the feature matrix and labels
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

In [3]:
# Step 2: Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Create a StandardScaler (still unfitted at this point); by default it standardises
# each feature to zero mean and unit variance once it has been fitted.
scaler = StandardScaler()

In [6]:
# Learn the scaling statistics from the training split only, so nothing about the
# test split leaks into preprocessing, then apply the same transform to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Step 3: Pipeline Setup
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Chain feature scaling and the classifier into a single estimator so both steps
# are fitted and applied together; the fixed seed keeps the forest reproducible.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [9]:
# Step 4: Model Training
pipeline.fit(X_train_scaled, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier(random_state=42))])

In [10]:
# Step 5: Model Evaluation
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Predict on the testing data
y_pred = pipeline.predict(X_test_scaled)

In [12]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9649122807017544
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [13]:
# Step 6: Saving the Model
import joblib

In [14]:
# Save the trained model to disk
joblib.dump(pipeline, 'cancer_diagnosis_model.pkl')

['cancer_diagnosis_model.pkl']

In [15]:
# For real-time use, you can load the pre-trained model and use it to make predictions on new data
# Here, we'll simulate real-time prediction using a subset of the testing data

# Load the pre-trained model
loaded_model = joblib.load('cancer_diagnosis_model.pkl')

In [16]:
# Simulate new data for prediction (subset of testing data)
new_data = X_test[:5]  # Assuming new_data represents new patient records

In [17]:
# Preprocess the new data using the same preprocessing steps as during training
new_data_scaled = scaler.transform(new_data)

In [18]:
# Use the pre-trained model to make predictions on the new data
predictions = loaded_model.predict(new_data_scaled)
print("Predictions for new data:")
print(predictions)

Predictions for new data:
[1 0 0 1 1]
