In [3]:
# importing necessary libraries
import numpy as np
from scipy.stats import norm

In [4]:
# Raw data table, one row per sample.
# Column 0 is the sample id, the last column is the class label, and the
# columns in between are the features used by the classifier.
# Because numbers and strings are mixed, NumPy stores every cell as a string;
# the feature columns are converted back to float where they are used.
# Row 15 (class 'unknown') is the sample to be classified.
# Row 1 is restored to [1, 1, 35, 1, 0, 'normal']: the recorded outputs of the
# grouping and counting cells list the first 'normal' sample as [1, 35, 1, 0].
data = np.array([
    [1, 1, 35, 1, 0, 'normal'],
    [2, 1, 30, 1, 1, 'normal'],
    [3, 0, 32, 1, 0, 'tumor'],
    [4, -1, 20, 1, 0, 'tumor'],
    [5, -1, 15, 0, 0, 'tumor'],
    [6, -1, 13, 0, 1, 'normal'],
    [7, 0, 11, 0, 1, 'tumor'],
    [8, 1, 22, 1, 0, 'normal'],
    [9, 1, 14, 0, 0, 'tumor'],
    [10, -1, 24, 0, 0, 'tumor'],
    [11, 1, 23, 0, 1, 'tumor'],
    [12, 0, 25, 1, 1, 'tumor'],
    [13, 0, 33, 0, 0, 'tumor'],
    [14, -1, 21, 1, 1, 'normal'],
    [15, 1, 12, 1, 1, 'unknown']
])

In [51]:
# Split the table into labels and features.
# Labels: the last column. Features: every column between the first
# (sample id) column and the label column, converted to float.
y = data[:, -1]
X = data[:, 1:-1].astype(float)

In [52]:
# Sample to classify (row 15, the last row of `data`), as a 1 x n_features array.
# Sliced from `data` rather than from `X`, so re-running this cell after `X`
# has been trimmed to the training rows still selects the same sample.
X_pred = data[-1, 1:-1].astype(float).reshape(1, -1)

In [53]:
# Training data: every row except the last one, which is held out for prediction.
# Derived from `data` instead of trimming `X` / `y` in place, so the cell is
# idempotent: re-running it never drops additional rows.
X = data[:-1, 1:-1].astype(float)
y = data[:-1, -1]

In [54]:
# Bucket the training samples by their class label:
# class_data[label] is the list of feature rows carrying that label.
class_data = {}
for i, sample in enumerate(X):
    class_label = y[i]
    # setdefault creates the bucket the first time a label is seen.
    class_data.setdefault(class_label, []).append(sample)

class_data

{'normal': [array([ 1., 35.,  1.,  0.]),
  array([ 1., 30.,  1.,  1.]),
  array([-1., 13.,  0.,  1.]),
  array([ 1., 22.,  1.,  0.]),
  array([-1., 21.,  1.,  1.])],
 'tumor': [array([ 0., 32.,  1.,  0.]),
  array([-1., 20.,  1.,  0.]),
  array([-1., 15.,  0.,  0.]),
  array([ 0., 11.,  0.,  1.]),
  array([ 1., 14.,  0.,  0.]),
  array([-1., 24.,  0.,  0.]),
  array([ 1., 23.,  0.,  1.]),
  array([ 0., 25.,  1.,  1.]),
  array([ 0., 33.,  0.,  0.])]}

In [55]:
# Per-class Gaussian parameters (mean, standard deviation) of one feature.
feature_index = 1  # column of X to model; the original note assumes this is Gene B
class_params = {}
for class_label, class_samples in class_data.items():
    # Values of the chosen feature across every sample of this class.
    feature_values = [sample[feature_index] for sample in class_samples]
    print(f"{class_label}: {feature_values}")
    class_mean = np.mean(feature_values)
    class_std = np.std(feature_values, ddof=1)  # ddof=1 -> sample (n - 1) standard deviation
    class_params[class_label] = {'mean': class_mean, 'std': class_std}
class_params

normal: [35.0, 30.0, 13.0, 22.0, 21.0]
tumor: [32.0, 20.0, 15.0, 11.0, 14.0, 24.0, 23.0, 25.0, 33.0]


{'normal': {'mean': 24.2, 'std': 8.526429498916883},
 'tumor': {'mean': 21.88888888888889, 'std': 7.688375063113864}}

In [63]:
def custom_gauss_pdf(mean, std, x_i):
    """Evaluate the Gaussian (normal) probability density at ``x_i``.

    Parameters
    ----------
    mean : float
        Mean of the distribution.
    std : float
        Standard deviation of the distribution (used as a divisor).
    x_i : float or array-like
        Point(s) at which the density is evaluated.

    Returns
    -------
    The density ``1 / (std * sqrt(2*pi)) * exp(-0.5 * ((x_i - mean) / std)**2)``.
    """
    coefficient = 1 / (std * np.sqrt(2 * np.pi))
    exponent = -0.5 * ((x_i - mean) / std)**2
    return coefficient * np.exp(exponent)

In [65]:
# Class-conditional Gaussian likelihood of the query sample's feature at
# column 1 of X_pred, using the per-class mean / std stored in class_params.
# (custom_gauss_pdf(mean, std, x) evaluates the same density formula by hand.)
likelihoods = {}
for class_label, params in class_params.items():
    gaussian = norm(loc=params['mean'], scale=params['std'])
    likelihoods[class_label] = gaussian.pdf(X_pred[0, 1])
likelihoods

{'normal': 0.016810222465183086, 'tumor': 0.02269024718788692}

In [89]:
# Count, for every class and every feature column, how many training samples
# take each distinct value:
#   feature_counts_by_class[class_label][feature_index][feature_value] -> count
class_labels = np.unique(y)

# Indices of features treated as continuous. They are still counted here;
# the probability cell is where these indices are excluded.
continuous_feature_indices = [1]

# Start with one empty per-feature dictionary for each class.
feature_counts_by_class = {class_label: {} for class_label in class_labels}

for feature_index in range(X.shape[1]):  # one pass per feature column
    column = X[:, feature_index]
    distinct_values = np.unique(column)
    for class_label in class_labels:  # one pass per class
        in_class = (y == class_label)
        # Number of rows of this class holding each distinct value of the column.
        feature_counts_by_class[class_label][feature_index] = {
            feature_value: np.sum((column == feature_value) & in_class)
            for feature_value in distinct_values
        }

# Show the counts of each feature value for every class.
for class_label, feature_counts in feature_counts_by_class.items():
    print(f"Class: {class_label}")
    for feature_index, value_counts in feature_counts.items():
        print(f"  Feature {feature_index}: {value_counts}")



Class: normal
  Feature 0: {-1.0: 2, 0.0: 0, 1.0: 3}
  Feature 1: {11.0: 0, 13.0: 1, 14.0: 0, 15.0: 0, 20.0: 0, 21.0: 1, 22.0: 1, 23.0: 0, 24.0: 0, 25.0: 0, 30.0: 1, 32.0: 0, 33.0: 0, 35.0: 1}
  Feature 2: {0.0: 1, 1.0: 4}
  Feature 3: {0.0: 2, 1.0: 3}
Class: tumor
  Feature 0: {-1.0: 3, 0.0: 4, 1.0: 2}
  Feature 1: {11.0: 1, 13.0: 0, 14.0: 1, 15.0: 1, 20.0: 1, 21.0: 0, 22.0: 0, 23.0: 1, 24.0: 1, 25.0: 1, 30.0: 0, 32.0: 1, 33.0: 1, 35.0: 0}
  Feature 2: {0.0: 6, 1.0: 3}
  Feature 3: {0.0: 6, 1.0: 3}


In [88]:

# Conditional probability of each discrete feature value of X_pred given each
# class, estimated as a plain relative frequency (no Laplace smoothing):
#   probabilities_by_class[class_label][feature_index] = count / total
probabilities_by_class = {}
for class_label in class_labels:
    probabilities_by_class[class_label] = {}

    for feature_index, feature_value in enumerate(X_pred[0]):
        # Features listed as continuous are not estimated from counts.
        if feature_index in continuous_feature_indices:
            continue

        value_counts = feature_counts_by_class[class_label][feature_index]

        # Number of training samples of this class, taken from the counts of
        # the feature being evaluated. Computing it here, inside the loop,
        # avoids reading a `feature_index` left over from another cell or
        # from a previous iteration.
        total_count = sum(value_counts.values())

        # Look the value up as-is: the count keys are the float values found
        # in X, and int() would truncate a non-integer value onto another key.
        # A value never seen in training gets a count of 0.
        count = value_counts.get(feature_value, 0)

        probabilities_by_class[class_label][feature_index] = count / total_count

# Display the calculated probabilities
print("\nProbabilities for each instance of X_pred given each class:")
for class_label, probabilities in probabilities_by_class.items():
    print(f"Class: {class_label}")
    for feature_index, probability in probabilities.items():
        print(f"  Feature {feature_index}: {probability:.4f}")


Probabilities for each instance of X_pred given each class:
Class: normal
  Feature 0: 0.6000
  Feature 2: 0.8000
  Feature 3: 0.6000
Class: tumor
  Feature 0: 0.2222
  Feature 2: 0.3333
  Feature 3: 0.3333


0.2222222222222222