In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import euclidean

# Step 1: Load the dataset
df = pd.read_csv('/content/data.csv')
# The later cells of this notebook drop 'Unnamed: 32' from the same file as an
# irrelevant column. Leaving it in would feed it into the centroid and the
# distance computation (presumably as NaN, which would turn every distance
# into NaN -- TODO confirm against the raw file). errors='ignore' keeps this
# cell working if the column is absent.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

# Step 2: Convert 'diagnosis' to numerical values (M = 1, B = 0)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Step 3: Calculate the centroid of the normal (Benign) data points
# 'id' and 'diagnosis' are not features, so they are excluded from the centroid.
normal_data = df[df['diagnosis'] == 0].drop(columns=['id', 'diagnosis'])
centroid_normal = normal_data.mean().values  # per-feature mean of the benign rows

# Step 4: Calculate Euclidean distances for each data point from the centroid of normal points
def calculate_euclidean_distance(row, centroid):
    """Return the Euclidean distance between one sample and a reference point.

    Parameters
    ----------
    row : object exposing ``.values``
        The sample's feature vector (a pandas Series when invoked through
        ``DataFrame.apply(..., axis=1)``).
    centroid : array-like
        Reference point with one coordinate per entry of ``row``.

    Returns
    -------
    float
        Distance computed by ``scipy.spatial.distance.euclidean``.
    """
    point = row.values
    distance = euclidean(point, centroid)
    return distance

# Distance of every row (all columns except 'id' and 'diagnosis') to the benign centroid
feature_frame = df.drop(columns=['id', 'diagnosis'])
df['euclidean_distance'] = feature_frame.apply(
    calculate_euclidean_distance, axis=1, args=(centroid_normal,)
)

# Step 5: Min-max scale the distances into [0, 1] to obtain the two signals
distances = df['euclidean_distance']
max_distance = distances.max()
min_distance = distances.min()
distance_range = max_distance - min_distance

# Danger Signal: 0 for the row closest to the centroid, 1 for the farthest
df['DS'] = (distances - min_distance) / distance_range
# Safe Signal: complement of the Danger Signal
df['SS'] = 1 - df['DS']

# Step 6: Initialize Dendritic Cells (DCs)
num_dcs = 10
rng = np.random.default_rng(42)  # seeded so lifespans (and the reported accuracy) are reproducible
dc_lifespan = rng.uniform(10, 15, num_dcs)
cumulative_csm = np.zeros(num_dcs)
cumulative_k = np.zeros(num_dcs)
dc_antigen_count = np.zeros(num_dcs)
# Row positions sampled by each DC since its last reset. A DC's accumulated K
# is its verdict on exactly these antigens, so they must be remembered until
# the DC presents them.
dc_antigens = [[] for _ in range(num_dcs)]

# Step 8 (threshold): a DC whose cumulative K exceeds this value presents its
# antigens as Malignant (1), otherwise as Benign (0). It is defined before the
# loop because DCs hand down their verdict at the moment they are reset.
threshold = 0
predictions = np.zeros(len(df), dtype=int)  # one label per row, filled in as DCs present

# Read the signals once instead of doing a positional lookup on every iteration.
ds_values = df['DS'].to_numpy()
ss_values = df['SS'].to_numpy()

# Step 7: Calculate CSM and K for each DC
for i in range(len(df)):
    dc_index = i % num_dcs  # Assign antigen to a DC round-robin
    csm = ds_values[i] + ss_values[i]
    k = ds_values[i] - 2 * ss_values[i]

    # Update cumulative values for DC
    cumulative_csm[dc_index] += csm
    cumulative_k[dc_index] += k
    dc_antigen_count[dc_index] += 1
    dc_antigens[dc_index].append(i)

    # Update DC lifespan
    dc_lifespan[dc_index] -= csm
    if dc_lifespan[dc_index] <= 0:
        # The DC has exhausted its lifespan: label every antigen it sampled
        # with its context BEFORE the accumulators are wiped. Previously the
        # context was thrown away here and a per-DC array was later written
        # into the per-row column, which raised a length-mismatch ValueError.
        predictions[dc_antigens[dc_index]] = int(cumulative_k[dc_index] > threshold)

        # Reset DC
        dc_lifespan[dc_index] = rng.uniform(10, 15)
        cumulative_csm[dc_index] = 0
        cumulative_k[dc_index] = 0
        dc_antigen_count[dc_index] = 0
        dc_antigens[dc_index] = []

# DCs still alive when the data runs out also present what they hold, so that
# every row receives a label.
for dc_index in range(num_dcs):
    if dc_antigens[dc_index]:
        predictions[dc_antigens[dc_index]] = int(cumulative_k[dc_index] > threshold)

# Step 8: Classify based on the K value of the DC that sampled each antigen
# K > threshold -> Malignant (anomalous), else Benign (normal)
df['Predicted_M/B'] = predictions

# Step 9: Calculate Accuracy
# The label column is named 'diagnosis' (lower case); 'Diagnosis' does not exist.
accuracy = accuracy_score(df['diagnosis'], df['Predicted_M/B'])
print(f'Accuracy: {accuracy * 100:.2f}%')

# Step 10: Save the results to a new CSV file
df.to_csv('breast_cancer_with_predictions.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import StandardScaler

# Step 1: Read the CSV and discard the irrelevant 'Unnamed: 32' column
df = pd.read_csv('/content/data.csv').drop(columns=['Unnamed: 32'])

# Step 2: Encode the label column numerically (M = 1, B = 0)
label_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(label_codes)

# Fill any remaining gaps with the column mean (applied to every column)
imputer = SimpleImputer(strategy='mean')
all_columns = df.columns
df[all_columns] = imputer.fit_transform(df[all_columns])

# Step 3: Rank columns by their correlation with 'diagnosis'.
# After the descending sort, position 0 is 'diagnosis' itself (correlation 1),
# so positions 1..4 are the four features used for DS and SS.
correlation_matrix = df.corr()
class_correlation = correlation_matrix['diagnosis'].sort_values(ascending=False)
best_features = class_correlation.index[1:5]
print(f"Best Features based on correlation: {best_features}")

# Step 4: Standardize the selected features in place
scaler = StandardScaler()
df[best_features] = scaler.fit_transform(df[best_features])

# Step 5: Centroid of the benign rows in the selected-feature space
is_benign = df['diagnosis'] == 0
normal_data = df.loc[is_benign, best_features]
centroid_normal = normal_data.mean().values

def calculate_euclidean_distance(row, centroid):
    """Euclidean distance from a single sample to ``centroid``.

    Args:
        row: Sample exposing its feature vector through ``.values`` (a pandas
            Series when called via ``DataFrame.apply(..., axis=1)``).
        centroid: Array-like reference point with the same number of entries.

    Returns:
        The distance as computed by ``scipy.spatial.distance.euclidean``.
    """
    sample_vector = row.values
    return euclidean(sample_vector, centroid)

# Distance of every row to the benign centroid, measured on the selected features
selected_features = df[best_features]
df['euclidean_distance'] = selected_features.apply(
    calculate_euclidean_distance, axis=1, args=(centroid_normal,)
)

# Step 6: Min-max scale the distances into [0, 1] to derive DS and SS
distances = df['euclidean_distance']
max_distance = distances.max()
min_distance = distances.min()
distance_range = max_distance - min_distance

# Danger Signal grows with distance from the benign centroid
df['DS'] = (distances - min_distance) / distance_range
# Safe Signal is the complement of the Danger Signal
df['SS'] = 1 - df['DS']

# Step 7: Initialize Dendritic Cells (DCs)
num_dcs = 20  # Number of Dendritic Cells
# NOTE(review): computed but not used by the sampling loop below, which assigns
# each antigen to exactly one DC -- confirm whether multi-sampling was intended.
antigens_per_dc = len(df) * 2 // num_dcs
rng = np.random.default_rng(42)  # seeded so lifespans (and the reported accuracy) are reproducible
dc_lifespan = rng.uniform(10, 15, num_dcs)
cumulative_csm = np.zeros(num_dcs)
cumulative_k = np.zeros(num_dcs)
dc_antigen_count = np.zeros(num_dcs)
# Row positions sampled by each DC since its last reset; the DC's accumulated K
# is its verdict on exactly these antigens.
dc_antigens = [[] for _ in range(num_dcs)]

# Step 9 (threshold): If a DC's cumulative K > threshold its antigens are
# Malignant (1), else Benign (0). Defined before the loop because DCs hand
# down their verdict when they are reset.
threshold = 0
predictions = np.zeros(len(df), dtype=int)  # one label per row

# Read the signals once instead of doing a positional lookup on every iteration.
ds_values = df['DS'].to_numpy()
ss_values = df['SS'].to_numpy()

# Step 8: Calculate CSM and K for each DC
for i in range(len(df)):
    dc_index = i % num_dcs  # Assign antigen to a DC in a round-robin fashion
    csm = ds_values[i] + ss_values[i]
    k = ds_values[i] - 2 * ss_values[i]

    # Update cumulative values for the assigned DC
    cumulative_csm[dc_index] += csm
    cumulative_k[dc_index] += k
    dc_antigen_count[dc_index] += 1
    dc_antigens[dc_index].append(i)

    # Update DC lifespan
    dc_lifespan[dc_index] -= csm
    if dc_lifespan[dc_index] <= 0:
        # Lifespan exhausted: label every antigen this DC sampled with its
        # context BEFORE wiping the accumulators. Writing the per-DC array
        # (length num_dcs) straight into the per-row column is what raised
        # "Length of values (20) does not match length of index (569)".
        predictions[dc_antigens[dc_index]] = int(cumulative_k[dc_index] > threshold)

        # Reset DC
        dc_lifespan[dc_index] = rng.uniform(10, 15)
        cumulative_csm[dc_index] = 0
        cumulative_k[dc_index] = 0
        dc_antigen_count[dc_index] = 0
        dc_antigens[dc_index] = []

# DCs still alive when the data runs out also present what they hold, so that
# every row receives a label.
for dc_index in range(num_dcs):
    if dc_antigens[dc_index]:
        predictions[dc_antigens[dc_index]] = int(cumulative_k[dc_index] > threshold)

# Step 9: Classify each antigen by the K value of the DC that sampled it
df['Predicted_M/B'] = predictions

# Step 10: Calculate Accuracy
accuracy = accuracy_score(df['diagnosis'], df['Predicted_M/B'])
print(f'Accuracy: {accuracy * 100:.2f}%')

# Step 11: Save the results to a new CSV file
# Written relative to the working directory so the location matches the message
# printed below (the previous '/mnt/data/' prefix did not).
df.to_csv('breast_cancer_ddca_with_predictions.csv', index=False)
print("Predictions have been saved to 'breast_cancer_ddca_with_predictions.csv'.")


Best Features based on correlation: Index(['concave points_worst', 'perimeter_worst', 'concave points_mean',
       'radius_worst'],
      dtype='object')


ValueError: Length of values (20) does not match length of index (569)

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import StandardScaler

# Step 1: Load the raw table, leaving out the irrelevant 'Unnamed: 32' column
raw_table = pd.read_csv('/content/data.csv')
df = raw_table.drop(columns=['Unnamed: 32'])

# Step 2: Map the textual label to numbers (M = 1, B = 0)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Mean-impute missing values across every column
imputer = SimpleImputer(strategy='mean')
column_names = df.columns
df[column_names] = imputer.fit_transform(df[column_names])

# Step 3: Correlate every column with 'diagnosis' and keep the four strongest.
# Index 0 of the descending ranking is 'diagnosis' itself, hence the 1:5 slice.
correlation_matrix = df.corr()
class_correlation = correlation_matrix['diagnosis'].sort_values(ascending=False)
best_features = class_correlation.index[1:5]
print(f"Best Features based on correlation: {best_features}")

# Step 4: Standardize the chosen features before measuring distances
scaler = StandardScaler()
df[best_features] = scaler.fit_transform(df[best_features])

# Step 5: Mean point of the benign rows (diagnosis == 0) over the chosen features
benign_mask = df['diagnosis'] == 0
normal_data = df.loc[benign_mask, best_features]
centroid_normal = normal_data.mean().values

def calculate_euclidean_distance(row, centroid):
    """Measure how far one sample lies from ``centroid``.

    :param row: sample whose ``.values`` attribute holds its feature vector
        (a pandas Series when used with ``DataFrame.apply(..., axis=1)``).
    :param centroid: array-like reference point of matching length.
    :return: Euclidean distance from ``scipy.spatial.distance.euclidean``.
    """
    coordinates = row.values
    result = euclidean(coordinates, centroid)
    return result

# Per-row distance to the benign centroid over the selected features
feature_view = df[best_features]
df['euclidean_distance'] = feature_view.apply(
    calculate_euclidean_distance, axis=1, args=(centroid_normal,)
)

# Step 6: Rescale distances to [0, 1]; the result is the Danger Signal and its
# complement is the Safe Signal
distance_column = df['euclidean_distance']
max_distance = distance_column.max()
min_distance = distance_column.min()
distance_span = max_distance - min_distance

df['DS'] = (distance_column - min_distance) / distance_span  # Danger Signal
df['SS'] = 1 - df['DS']  # Safe Signal (inverse of Danger Signal)

# Step 7: Initialize Dendritic Cells (DCs)
num_dcs = 20  # Number of Dendritic Cells
# NOTE(review): computed but not used by the sampling loop below, which assigns
# each antigen to exactly one DC -- confirm whether multi-sampling was intended.
antigens_per_dc = len(df) * 2 // num_dcs
rng = np.random.default_rng(42)  # seeded so lifespans (and the reported accuracy) are reproducible
dc_lifespan = rng.uniform(10, 15, num_dcs)  # Random lifespan between 10 and 15 for each DC
cumulative_csm = np.zeros(num_dcs)
cumulative_k = np.zeros(num_dcs)
dc_antigen_count = np.zeros(num_dcs)
# Row positions sampled by each DC since its last reset. Each position occurs
# at most once per list, so fancy-index "+=" below counts it exactly once.
dc_antigens = [[] for _ in range(num_dcs)]
mature_dc_count = np.zeros(len(df))  # Presentations of each antigen in a mature (K > 0) context
presented_count = np.zeros(len(df))  # Presentations of each antigen in any context

# Read the signals once instead of doing a positional lookup on every iteration.
ds_values = df['DS'].to_numpy()
ss_values = df['SS'].to_numpy()

# Step 8: Calculate CSM and K for each DC and present antigens when a DC's lifespan ends
for i in range(len(df)):
    dc_index = i % num_dcs  # Assign antigen to a DC in a round-robin fashion
    csm = ds_values[i] + ss_values[i]
    k = ds_values[i] - 2 * ss_values[i]

    # Update cumulative values for the assigned DC
    cumulative_csm[dc_index] += csm
    cumulative_k[dc_index] += k
    dc_antigen_count[dc_index] += 1
    dc_antigens[dc_index].append(i)

    # Decrease the lifespan of the DC based on the CSM value
    dc_lifespan[dc_index] -= csm

    # Check if the DC has exhausted its lifespan
    if dc_lifespan[dc_index] <= 0:
        # Present EVERY antigen this DC sampled, not only the one that happened
        # to trigger the reset. The context is mature (anomalous) when the
        # danger signal outweighed the safe signal, i.e. cumulative K > 0.
        sampled = dc_antigens[dc_index]
        presented_count[sampled] += 1
        if cumulative_k[dc_index] > 0:
            mature_dc_count[sampled] += 1

        # Reset the DC
        dc_lifespan[dc_index] = rng.uniform(10, 15)
        cumulative_csm[dc_index] = 0
        cumulative_k[dc_index] = 0
        dc_antigen_count[dc_index] = 0
        dc_antigens[dc_index] = []

# DCs still alive when the data runs out also present what they hold, so that
# every antigen has at least one presentation behind its MCAV.
for dc_index in range(num_dcs):
    sampled = dc_antigens[dc_index]
    if sampled:
        presented_count[sampled] += 1
        if cumulative_k[dc_index] > 0:
            mature_dc_count[sampled] += 1

# Step 9: Classify antigens based on their appearance in mature DCs
anomaly_threshold = 0.7  # Threshold for the MCAV (Mature Context Antigen Value)
# MCAV = mature presentations / all presentations of the antigen. Dividing by
# num_dcs instead capped the value at 1/num_dcs, which can never exceed the
# threshold and therefore labelled every row benign.
df['MCAV'] = np.divide(
    mature_dc_count,
    presented_count,
    out=np.zeros_like(mature_dc_count),
    where=presented_count > 0,
)

# Classify antigens as malignant (1) if MCAV is greater than threshold, else benign (0)
df['Predicted_M/B'] = np.where(df['MCAV'] > anomaly_threshold, 1, 0)

# Step 10: Calculate accuracy by comparing with actual 'diagnosis'
accuracy = accuracy_score(df['diagnosis'], df['Predicted_M/B'])
print(f'Accuracy: {accuracy * 100:.2f}%')

# Step 11: Save the results to a new CSV file
df.to_csv('breast_cancer_ddca_with_predictions.csv', index=False)
print("Predictions have been saved to 'breast_cancer_ddca_with_predictions.csv'.")


Best Features based on correlation: Index(['concave points_worst', 'perimeter_worst', 'concave points_mean',
       'radius_worst'],
      dtype='object')
Accuracy: 62.74%
Predictions have been saved to 'breast_cancer_ddca_with_predictions.csv'.
