In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



data = datasets.load_breast_cancer()
RANDOM_STATE = 0
np.random.seed(RANDOM_STATE)

scaler = StandardScaler()
df = pd.DataFrame(data = data.data, columns = data.feature_names)


X = scaler.fit_transform(data.data)
y = data.target

df_scaled = pd.DataFrame(data = X, columns = data.feature_names)
df['target'] = y

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state=RANDOM_STATE)


In [None]:
#1
from sklearn.linear_model import LogisticRegression


lr_model = LogisticRegression(max_iter = 1000)
lr_model.fit(X_train, y_train)

lr_model.predict(X_test)

print("Model Accuracy:", lr_model.score(X_test,y_test))



In [None]:
#2
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt
import numpy as np


cluster_range = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35]
silhouette_scores = []

for k in cluster_range:
  gmm = GaussianMixture(n_components=k, random_state = RANDOM_STATE)

  cluster_labels = gmm.fit_predict(X_train)
  score = silhouette_score(X_train, cluster_labels)
  silhouette_scores.append(score)


optimal_k = cluster_range[np.argmax(silhouette_scores)]
optimal_k, silhouette_scores

print("Optimal K value:", optimal_k)
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.grid(True)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.xticks(range(1, max(cluster_range)+1))
plt.title('Silhouette Score vs. Number of Clusters')
plt.show()

If we analyze the silhouette scores across the possible values of k, we observe a sharp decrease when transitioning from k=3 to k=4 clusters. Beyond this, there is a gradual decline from k=4 to k=10. After k=10, the silhouette scores decline at a lower pace.

From this analysis we can conclude that the number of clusters is inversely correlated with the quality of clustering, since higher k's lead to lower silhouette scores. Specifically, the noticeable drop in silhouette scores from 3 to 4 clusters indicates a significant reduction in clustering quality.

In [4]:
#3.

test_probas = []

for k in cluster_range:
  gmm = GaussianMixture(n_components=k,random_state = RANDOM_STATE)
  cluster_labels = gmm.fit_predict(X_train)
  test_proba = gmm.predict_proba(X_test)
  test_probas.append(test_proba)
  
assigned_cluster = []
for i in range(len(test_probas)):
  assigned_cluster_k = []
  for j in range(len(test_probas[i])):
    assigned_cluster_k.append(int(np.argmax(test_probas[i][j])))

  assigned_cluster.append(assigned_cluster_k)



In [None]:
#4
lr_k_clusters_acc = []
for k in range(len(cluster_range)):
  lr_k_cluster = LogisticRegression(random_state = RANDOM_STATE).fit(test_probas[k], y_test)
  y_pred = lr_k_cluster.predict(test_probas[k])
  lr_k_clusters_acc.append(lr_k_cluster.score(test_probas[k], y_test))

plt.figure(figsize=(10, 6))
plt.plot(cluster_range, lr_k_clusters_acc, marker='o')
plt.grid(True)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Accuracy')
plt.xticks(range(1, max(cluster_range)+1))
plt.title('Accuracy vs. Number of Clusters')
plt.show()


Looking at the graph, we can conclude that the optimal value for
k is 9, with an accuracy of around 0.94. This shows that the optimal 
k value in this case is not 2, possibly because having more clusters makes it possible to capture more important information than with just two clusters. Additionally, the sharp decrease noticeable in the Silhouette Score graph does not appear in the accuracy graph until the number of clusters exceeds 10.

In conclusion, there does not seem to be any correlation between the cluster evaluation values and the accuracy values when comparing them for the same number of clusters.

In [6]:
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=RANDOM_STATE)
gmm.fit(X_train)

centers = gmm.means_
covariances = gmm.covariances_
cov_matrixes = covariances

In [None]:
#5
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Sigmoid activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# RBF activation function
def rbf_activation(X, center, cov_matrix):

    diff = X - center
    inv_cov = np.linalg.inv(cov_matrix)
    exponent = -0.5 * np.sum(diff @ inv_cov * diff, axis=1)
    return np.exp(exponent)

def forward_propagation(X, centers, cov_matrixes, weights):

    rbf_outputs = np.array([rbf_activation(X, center, cov_matrix) for (center,cov_matrix) in zip(centers, cov_matrixes)])  # (k, n)
    rbf_outputs = rbf_outputs.T 

    outputs = sigmoid(np.dot(rbf_outputs, weights))

    return rbf_outputs, outputs

# Training function

def train_rbf_network(X, T, centers, cov_matrixes, weights, learning_rate, epochs):
    T = T.reshape(-1, 1) # reshapes the target vector `T` into a column vector with dimensions (n, 1)
    for epoch in range(epochs):
        rbf_Phi, O = forward_propagation(X, centers, cov_matrixes, weights)

        errors = T - O
        # Update weights
        weights += learning_rate * np.dot(rbf_Phi.T,errors)

    return weights

def predict(X, centers, cov_matrixes, weights):

    _, O = forward_propagation(X, centers, cov_matrixes, weights)

    return (O >= 0.5).astype(int)  # Binary classification (0 or 1),
                                  # if over 0.5 than there's more chance of being 1

learning_rate = [0.001, 0.01, 0.1]
epochs_possibilities = [10, 50, 100, 200, 300, 400, 500]

train_accuracies = {lr: [] for lr in learning_rate}
test_accuracies = {lr: [] for lr in learning_rate}


for epochs in epochs_possibilities:
    for lr in learning_rate:
        weights = np.full((len(centers), 1), 0.1)  # Initialize the weights for output layer (binary classification)
        # Train the network
        trained_weights = train_rbf_network(X_train, y_train, centers, cov_matrixes, weights, lr, epochs)

        # Predict and evaluate accuracy for training set
        y_train_pred = predict(X_train, centers, cov_matrixes, trained_weights)
        train_acc = accuracy_score(y_train, y_train_pred)
        train_accuracies[lr].append(train_acc)

        y_test_pred = predict(X_test, centers, cov_matrixes, trained_weights)
        test_acc = accuracy_score(y_test, y_test_pred)
        test_accuracies[lr].append(test_acc)


plt.figure(figsize=(10, 6))

lr_colors = {
    0.001: 'blue',
    0.01: 'green',
    0.1: 'red'
}

for lr in learning_rate:
    plt.plot(epochs_possibilities, train_accuracies[lr], label=f'Training Accuracy (lr={lr})',color=lr_colors[lr])


for lr in learning_rate:
    plt.plot(epochs_possibilities, test_accuracies[lr], linestyle='--', label=f'Testing Accuracy (lr={lr})', color=lr_colors[lr])

plt.title('Training and Testing Accuracy vs. Epochs for Different Learning Rates')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.grid(True)

plt.show()


In [None]:

import seaborn as sns

X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="target")

X['target'] = y

correlation_matrix = X.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', linewidths=0.5)
plt.title('Feature Correlation Matrix with Target Variable')
plt.show()