In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import pandas as pd
import numpy as np

from ML_loader import DatasetLoader,ModelLoader

In [3]:
#Get data 
loader = DatasetLoader()
loader.read_dataset('partial_database.csv',class_path='labelsDefault.txt',separator=',')
loader.split_dataset()

#Normalize data, if not already normalized
print(np.shape(loader.X_train),np.shape(loader.X_test),np.shape(loader.y_train),np.shape(loader.y_test))

(4048, 17) (1012, 17) (4048,) (1012,)


In [4]:
#Get model
model = ModelLoader()
model.set_model(MiniBatchKMeans(batch_size=1280))

cv = RepeatedStratifiedKFold(n_splits=22, n_repeats=3, random_state=1)

#Optimization 
param = {'n_clusters': list(range(1,23))}
model.set_optimal_params(param)

model.optimize(loader.X_train,loader.y_train,cv=cv,scoring='homogeneity_score',n_iter=10)

#Train model with optimal params
model.set_model(MiniBatchKMeans(**model.optimal_params))

In [6]:
##Split by class, to increment by class:
class_to_group = {1:[1,2,3,4,5,6,7],2:[8,9],3:[10,11,12,13],4:[14,15,16],5:[17,18],6:[19],7:[20,21,22]}
X_train_merged, X_test_merged, y_train_merged, y_test_merged=loader.split_dataset_class(class_to_group)
memory_values = []

#Train model, aime pas memory
for X_train,y_train,X_test,y_test in (zip(X_train_merged,y_train_merged,X_test_merged,y_test_merged)):
    print("Before: ",np.shape(X_train),np.shape(X_test),np.shape(y_train),np.shape(y_test),loader.classes)
    model.model.partial_fit(X_train,y_train,loader.classes)
    y_pred=model.predict(loader.X_test)
    print("Purity score group: ",(y_test,model.predict(X_test)))
    print("Purity score group: ",model.purity(loader.y_test,y_pred))
    """print("Memory: ",np.mean(mem),"MB")
    memory_values.append(np.mean(mem))"""
    
    cm = confusion_matrix(loader.y_test, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()

model.reset_model()

"""average_memory = np.mean(memory_values)
print("Average memory usage: ", average_memory,"MB")"""

Before:  (1288, 17) (322, 17) (1288,) (322,) [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]


ValueError: sample_weight.shape == (22,), expected (1288,)!

In [None]:
X_train_split_data,y_train_split_data,X_test_split_data,y_test_split_data=loader.split_dataset_data(10)
memory_values = []

for x in range(len(X_train_split_data)):
    mem,res=model.partial_fit_train(X_train_split_data[x],y_train_split_data[x],classes=loader.classes)
    y_pred=model.predict(loader.X_test)
    print("Purity score group: ",model.purity(y_test,model.predict(X_test)))
    print("Purity score group: ",model.purity(loader.y_test,y_pred))
    print("Memory: ",np.mean(mem),"MB")
    memory_values.append(np.mean(mem))
    
    cm = confusion_matrix(loader.y_test, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()

model.reset_model()

average_memory = np.mean(memory_values)
print("Average memory usage: ", average_memory,"MB")