In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score
import bisect

In [2]:
def calculate_caim(xj, yj, D):
    sp = [0] + [np.searchsorted(xj, point) for point in D[1:-1]] + [len(xj)]
    n = len(sp) - 1
    isum = 0
    for j in range(n):
        init = sp[j]
        fin = sp[j + 1]
        Mr = yj[init:fin].shape[0]
        if Mr == 0:
            continue
        _, counts = np.unique(yj[init:fin], return_counts=True)
        maxr = counts.max()
        isum += (maxr / Mr) * maxr
    return isum / n if n > 0 else 0

In [3]:
def caim(X, y, verbose=False):
    json = {}
    clases = np.unique(y)
    min_splits = clases.shape[0]
    if verbose:
        print(f"hay un máximo de {min_splits} cortes")
    for j in range(X.shape[1]):
        xj = X[:, j]
        new_index = xj.argsort()
        xj = xj[new_index]
        yj = y[new_index]
        division_points = np.unique(xj)[1:-1].tolist()
        gc = -1
        D = [xj[0], xj[-1]]
        bc = 0
        mejor_D = D.copy()  # Initialize to avoid reference issues
        k = 1
        while k <= min_splits and (gc < bc or division_points):
            midpoints = np.random.permutation(division_points).tolist()
            best_caim = 0
            k += 1
            if verbose:
                print(f"ahora k vale {k}")
            while midpoints:
                sp = midpoints.pop()
                D_temp = D.copy()
                D_temp.append(sp)
                D_temp.sort()
                caim = calculate_caim(xj, yj, D_temp)
                if verbose:
                    print(f"{D_temp} tiene caim de: {caim}")
                if caim > bc:
                    if verbose:
                        print(f"Este caim es mejor, se actualiza best caim a {caim}")
                    bc = caim
                    mejor_D = D_temp
            if bc > gc:
                gc = bc
                D = mejor_D
                if verbose:
                    print(f"se cambia el D, antes era {D}, ahora es {mejor_D}")
        json[j] = D
        print(f"La columna {j} tiene global caim de {gc}")
    return json

In [5]:
class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.parameters = {}
        for cls in self.classes:
            X_cls = X[y == cls]
            self.parameters[cls] = {
                'prior': len(X_cls) / len(X),
                'likelihoods': {i: self.calculate_likelihood(X_cls[:, i]) for i in range(X.shape[1])}
            }

    def calculate_likelihood(self, feature):
        values, counts = np.unique(feature, return_counts=True)
        return dict(zip(values, counts / len(feature)))

    def predict(self, X):
        y_pred = []
        for x in X:
            posteriors = []
            for cls in self.classes:
                prior = self.parameters[cls]['prior']
                likelihood = np.prod([self.parameters[cls]['likelihoods'][i].get(val, 1e-6) for i, val in enumerate(x)])
                posteriors.append(prior * likelihood)
            y_pred.append(self.classes[np.argmax(posteriors)])
        return np.array(y_pred)

In [6]:
def shuffle_data(X, y):
    n = len(y)  
    for i in range(n-1, 0, -1):
        j = random.randint(0, i)  # Seleccionar un índice aleatorio desde 0 hasta i
        # Intercambiar X[i] con X[j] y y[i] con y[j]
        X[i], X[j] = X[j], X[i]
        y[i], y[j] = y[j], y[i]
    return X, y

In [7]:
def KFold(n_splits, n_samples):
    indices = list(range(n_samples))
    fold_sizes = [n_samples // n_splits] * n_splits
    for i in range(n_samples % n_splits):
        fold_sizes[i] += 1
    
    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        train_indices = indices[:start] + indices[stop:]
        test_indices = indices[start:stop]
        folds.append((train_indices, test_indices))
        current = stop
    return folds

In [8]:
def accuracy_score2(y_true, y_pred):
    correct_predictions = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    total_predictions = len(y_true)
    return correct_predictions / total_predictions


In [9]:
def discretize_data_np(data, partitions):
    discretized_data = np.copy(data)
    
    for col_index, breakpoints in partitions.items():
        min_val = np.min(data[:, col_index])
        max_val = np.max(data[:, col_index])

        effective_breakpoints = [min_val] + sorted(breakpoints) + [max_val]
        
        # Discretiza la columna
        column_data = data[:, col_index]
        discretized_column = np.digitize(column_data, bins=effective_breakpoints, right=False)
        
        max_bin_index = len(breakpoints) + 1  
        discretized_column[discretized_column > max_bin_index] = max_bin_index
        
        discretized_data[:, col_index] = discretized_column
    
    return discretized_data

In [10]:
csv_file = "heart.csv" #cambiar el nombre a alguna otra base de datos

In [11]:
data = pd.read_csv(csv_file) # leer la base de datos

columnas = list(data.columns) # obtener la lista de las variables
variables = columnas[:-1] # todas las variables menos la última que es la clase
clase = columnas[-1] # la útlima columna es la clase

print("las variables son:")
for variable in variables:
    print(f"{variable}\n")
print("las clases son")
print(f"{data[clase].unique()}\n")

X = data[variables] # variables
y = data[clase] # target
               ###########################################
X= np.array(X) # la clase NaiveBayes recibe arrays       #
y= np.array(y) # por eso hay que cambiar el tipo de dato #
               ###########################################
    
    
X,y =shuffle_data(X,y) #shuffle


las variables son:
age

trestbps

chol

thalach

oldpeak

las clases son
[1 0]



In [20]:
min(data["thalach"])

71

In [12]:
for j in range(X.shape[1]):
    print(f"maximo y minimo de {j}")
    print(np.max(X[:,j]))
    print(np.min(X[:,j]))

maximo y minimo de 0
76.0
29.0
maximo y minimo de 1
200.0
94.0
maximo y minimo de 2
564.0
126.0
maximo y minimo de 3
202.0
96.0
maximo y minimo de 4
4.2
0.0


In [13]:
intervalos =caim(X,y,True)
X_disc = discretize_data_np(X, intervalos)

hay un máximo de 2 cortes
ahora k vale 2
[29.0, 35.0, 76.0] tiene caim de: 45.13455149501661
Este caim es mejor, se actualiza best caim a 45.13455149501661
[29.0, 67.0, 76.0] tiene caim de: 45.025719831769656
[29.0, 57.0, 76.0] tiene caim de: 45.02261004891062
[29.0, 42.0, 76.0] tiene caim de: 44.99579124579125
[29.0, 39.0, 76.0] tiene caim de: 45.11628268991283
[29.0, 69.0, 76.0] tiene caim de: 45.11628268991283
[29.0, 49.0, 76.0] tiene caim de: 45.11320317329487
[29.0, 59.0, 76.0] tiene caim de: 44.94282263992439
[29.0, 54.0, 76.0] tiene caim de: 45.18216443393749
Este caim es mejor, se actualiza best caim a 45.18216443393749
[29.0, 68.0, 76.0] tiene caim de: 45.20694444444444
Este caim es mejor, se actualiza best caim a 45.20694444444444
[29.0, 65.0, 76.0] tiene caim de: 44.92715988083416
[29.0, 70.0, 76.0] tiene caim de: 44.952330508474574
[29.0, 43.0, 76.0] tiene caim de: 45.02653137218395
[29.0, 44.0, 76.0] tiene caim de: 45.17017121010638
[29.0, 41.0, 76.0] tiene caim de: 45.044

In [14]:
for j in range(X.shape[1]):
    print(f"maximo y minimo de {j}")
    print(np.max(X[:,j]))
    print(np.min(X[:,j]))

print(intervalos)

maximo y minimo de 0
76.0
29.0
maximo y minimo de 1
200.0
94.0
maximo y minimo de 2
564.0
126.0
maximo y minimo de 3
202.0
96.0
maximo y minimo de 4
4.2
0.0
{0: [29.0, 55.0, 76.0], 1: [94.0, 148.0, 200.0], 2: [126.0, 269.0, 564.0], 3: [96.0, 137.0, 202.0], 4: [0.0, 0.5, 4.2]}


In [15]:
clf = NaiveBayes()
n_samples = len(y)
kf = KFold(n_splits=5, n_samples=n_samples)
accuracies = []  
confusion_matrices = []  

# Validación cruzada
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))

In [16]:
average_precision = np.mean(accuracies)
std_precision = np.std(accuracies)

print("Precisión promedio:", average_precision)
print("Desviación estándar de la presición:", std_precision)

Precisión promedio: 0.4090163934426229
Desviación estándar de la presición: 0.059350400713031395


In [17]:
for e in kf:
    print(f"{e[0]}\n{e[1]}\n")

[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268