In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np


label_encoder = LabelEncoder()
# Load the CSV file into a DataFrame
df = pd.read_csv('/kaggle/input/cancer-data/Cancer_Data.csv')

# Display the first few rows of the DataFrame
print(df.head())

# Separate the target and features

X = df.drop(columns=['id', 'diagnosis'])  # Features 
y = label_encoder.fit_transform(df['diagnosis'])  # Target column

# Convert to NumPy arrays
X = X.to_numpy()
X = np.delete(X, 30, axis=1) # nan column

# Display the shapes of X and y
print("Features shape:", X.shape)
print("Target shape:", y.shape)

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [2]:
def pairwise_mean_feature_extraction(data):
    """
    Perform pairwise mean feature extraction on the input data.
    
    Parameters:
    data (numpy.ndarray): Input data where each row represents an observation and each column represents a feature.
    
    Returns:
    numpy.ndarray: Extracted features where each row represents an observation and each column represents the mean of a pair of columns.
    """
    num_features = data.shape[1]
    num_pairs = num_features - 1
    extracted_features = np.zeros((data.shape[0], num_pairs))
    
    for i in range(num_pairs):
        extracted_features[:, i] = np.mean(data[:, i:i+2], axis=1)
    
    return extracted_features

In [3]:
X_extracted = pairwise_mean_feature_extraction(X)
print(X_extracted.shape)

(569, 29)


In [4]:
pip install pyswarm

Collecting pyswarm
  Downloading pyswarm-0.6.tar.gz (4.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyswarm
  Building wheel for pyswarm (setup.py) ... [?25ldone
[?25h  Created wheel for pyswarm: filename=pyswarm-0.6-py3-none-any.whl size=4464 sha256=e25b5fbbe798891eab3e94a26588715ebaa5b32f055750f3c692db88d9133987
  Stored in directory: /root/.cache/pip/wheels/71/67/40/62fa158f497f942277cbab8199b05cb61c571ab324e67ad0d6
Successfully built pyswarm
Installing collected packages: pyswarm
Successfully installed pyswarm-0.6
Note: you may need to restart the kernel to use updated packages.


In [6]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from pyswarm import pso



# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_extracted, y, test_size=0.2, random_state=42)

# Define fitness function
def fitness_function(particle, X_train, X_test, y_train, y_test):
    # Convert particle to a binary mask (threshold at 0.5)
    mask = particle > 0.5
    # Ensure at least one feature is selected
    if np.sum(mask) == 0:
        return 1.0  # return worst score
    
    selected_features_train = X_train[:, mask]
    selected_features_test = X_test[:, mask]
    
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(selected_features_train, y_train)
    accuracy = clf.score(selected_features_test, y_test)
    return -accuracy


# Define lower and upper bounds for each feature
lb = np.zeros(X_train.shape[1], dtype=int)
ub = np.ones(X_train.shape[1], dtype=int)


# Perform PSO for feature selection
best_particle, _ = pso(fitness_function, lb, ub, args=(X_train, X_test, y_train, y_test), swarmsize=100, maxiter=100)

# Convert the best_particle to a binary mask (threshold at 0.5)
best_features_mask = best_particle > 0.5

# Print the indices of the selected features
selected_feature_indices = np.where(best_features_mask)[0]
print("Selected features:", selected_feature_indices)

Stopping search: maximum iterations reached --> 100
Selected features: [ 0  1  2  3  5  7  9 10 13 14 16 17 21 25 26]


In [83]:


X_extracted_selected = X_extracted[: , selected_feature_indices]
print(X_extracted_selected.shape)

(569, 15)


In [95]:
from sklearn.decomposition import NMF

# Apply Non-negative Matrix Factorization (NMF)
nmf = NMF(n_components=5)
X_extracted_reduced = nmf.fit_transform(X_extracted_selected)

print(X_extracted_reduced.shape)


(569, 5)


In [96]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_extracted_reduced, y, test_size=0.2, random_state=42)

In [97]:
class LogisticRegression:
    def __init__(self, learning_rate=0.001, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Gradient descent
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return y_predicted_cls

In [98]:
if __name__ == "__main__":


    # Initialize and fit model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict

    predictions = model.predict(X_test)
    #print(predictions)

In [100]:
from sklearn.metrics import accuracy_score

accuracy_test = accuracy_score(y_test, predictions)


print("Accuracy of the Logistic Classifier on Test Set:", accuracy_test * 100)

Accuracy of the Logistic Classifier on Test Set: 71.05263157894737
