In [12]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay

import pathlib
import os
import itertools
import pandas as pd
import seaborn as sns
import networkx as nx
from networkx.algorithms import community
from scipy.spatial.distance import squareform, pdist
from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import ExtraTreesClassifier
######
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
######
import uuid

from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations
from datetime import datetime 

%load_ext autoreload
%autoreload 2
import pysrc.utility as myutil

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
config = {'lp': 3,
          'const': 'False',
          'var': 'False',
          'sty': 'False',
          'cut': 'False',
          'mapID': 4}

conf_str = "-".join(f"{k}:{v}" for k,v in config.items())
    
# Generazioni variabili per la configurazione
lp_l    = range(3,10)
const_l = ["True","False"]
var_l   = ["True","False"]
sty_l   = ["True","False"]
cut_l   = ["True","False"]
mapID_l = range(5)

conf_keys = ["lp", "const", "var", "sty", "cut", "mapID"]

In [15]:
task = !ls featuresData

In [16]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    #"Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    #"AdaBoost",
    "Naive Bayes",
    #"QDA",
]
# svm non lineare, cambiare kernel

classifiers = [
    KNeighborsClassifier(len(task)),
    SVC(C=0.025),
    SVC(C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=1000),
    RandomForestClassifier(n_estimators=1000),
    MLPClassifier(alpha=0.5, max_iter=1000),
    #AdaBoostClassifier(n_estimators=1000),
    GaussianNB(),
    #QuadraticDiscriminantAnalysis(),
]

In [47]:
#config creation
conf_list = []
for idx, m in enumerate(itertools.product(lp_l, const_l, var_l, sty_l, cut_l, mapID_l)):
    conf_keys = ["lp", "const", "var", "sty", "cut", "mapID"]
    conf = dict(zip(conf_keys, [m[i] for i in range(len(conf_keys))]))
    conf_list.append(conf)

# Dataframe result creation
df_result = pd.DataFrame()
list_of_lines = []    
exec_time = {}
folding_number = 5

timestamp = datetime.now().timestamp()
timestamp_str = datetime.fromtimestamp(timestamp).strftime("%d-%m-%Y_%H:%M:%S")

with tqdm(total=len(conf_list)) as pbar:
    for idx, config in enumerate(conf_list):
        dfs = {}
        match = []
        conf_str = "-".join(f"{k}:{v}" for k,v in config.items())
        #print("conf_str -> " + conf_str)
        
        # Find all the folders
        lista = !ls featuresData
        #print("lista -> " + str(lista))

        # Find all the 'csv' in each folder
        for i in lista:
            match_temp = !ls featuresData/{i} | grep {conf_str}.csv
            match.append(i+"/"+match_temp[0])
            
        #print("match -> " + str(match))

        task = []
        for i, l in enumerate(lista):
            task.append(l[1:])

        # Aggiunta colonna 'task' e 'id'
        for l in match:
            p = l.split("/")[0][1:]
            dfs[p] = pd.read_csv("featuresData/"+l, sep=",")
            dfs[p]["task"] = int(p)
            dfs[p]["id"] = dfs[p]["id"] + "-" + p

        # Clean.
        for l in task:
            #print(f"Problem: {l}")
            dfs[l] = myutil.remove_zero_rows(dfs[l], exclude_col=["task"])
            dfs[l] = myutil.clean_and_index(dfs[l])

            #print("    Original: " + str(dfs[l].shape))
            # Feature selection (FEATURE CORRELATION) UNFINET FEATURE SELECTION
            th = dfs[l].var().mean()
                print(th)
            #th = (dfs[l].var()/dfs[l].mean()).mean()/2# remove everything below the average value of variance/mean
            sel = VarianceThreshold(threshold=th)
            df_min = sel.fit_transform(dfs[l])
            # redo the dataframe
            feats = sel.get_feature_names_out()
            index = dfs[l].index
            dfs[l] = pd.DataFrame(df_min)
            dfs[l] = dfs[l].set_index(index)
            dfs[l].columns = feats
            #print("    After feature selection: " + str(dfs[l].shape))

            dfs[l]["task"] = l

           # Outlier Elimination (TENERE TUTTO E PROVARE A VEDERE CHE COSA SUCCEDE)
            #contamination_level = 0.01 # percentage of contamination
            #num_estimators = 300
            #iForest = IsolationForest(random_state=0, contamination=contamination_level, n_estimators=num_estimators).fit_predict(dfs[l])
            #mask = [i for i, f in enumerate(iForest) if f<0]
            #to_drop = dfs[l].iloc[mask].index
            #print(to_drop)
            #dfs[l] = dfs[l].drop(to_drop)
            #print("     After outlier elimination: " + str(dfs[l].shape))

        # Concat all df(s).
        df_tot = pd.concat([dfs[l.split("-")[0][1:]] for l in lista], ignore_index=False, sort=False)
        df_tot = df_tot.fillna(0)
        assert(df_tot.shape[0] == sum([dfs[l.split("-")[0][1:]].shape[0] for l in lista]))

        df_tot = myutil.drop_unuseful_features_all_equal(df_tot)

        print("Final shape:" + str(df_tot.shape))

        # Dataset creation
        #datasets = []
######
        X = df_tot.drop('task', axis=1).values
        #assert('task' not in X.columns and 'id' not in X.columns)
        y = df_tot['task'].values
######
        #datasets.append([X, y])
        
        # TO BE MODIFIED, MORE PRECISE
        # Confusion matrix e metriche
######
        for i, clf in enumerate(classifiers):
            classifier_time_start = time.time()
            
            clf = make_pipeline(StandardScaler(), clf)
            
            actual_classes = np.empty([0], dtype=int)
            predicted_classes = np.empty([0], dtype=int)
            kfold = KFold(n_splits=5, random_state=42, shuffle=True)
            
            for train_ndx, test_ndx in kfold.split(X):
                train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]
                
                clf.fit(train_X, train_y)
                
                actual_classes = np.append(actual_classes, test_y)
                predicted_classes = np.append(predicted_classes, clf.predict(test_X))
                
                #score = cross_val_predict(clf, X, y, cv=folding_number, n_jobs=-1)

            #score = clf.score(X_test, y_test)
            #print(names[i] + ": " + str(score.mean()) + " " + str(score.std()))
            
            matrix = confusion_matrix(actual_classes, predicted_classes, labels=task)
            #print(matrix)
            #sns.heatmap(matrix, xticklabels=task, yticklabels=task)
            
            
            classifier_time_end = time.time()
            exec_time[f"compute_{names[i]}"] = classifier_time_end - classifier_time_start

            line = (config | {'class_number': len(lista), 'classifier': names[i], 'matrix_raw': matrix} )
            #line = (config | {'class_number': len(lista), 'classifier': names[i], 'score_mean': score.mean(), 'score_std': score.std(), 'score_raw': score} 
            #        | {'time_clean': exec_time['cleaning'], 'time_down': exec_time['download'], 'time_classifier': exec_time[f"compute_{names[i]}"]})
            
            list_of_lines.append(line)
######
        #for i, clf in enumerate(classifiees):
        #    line = config | dict(zip(names, scores))
        #    #print(line)
        #    list_of_lines.append(line)

        # Save intermidiate results
        if (idx+1) % 10 == 0:
            df_result_tmp = pd.DataFrame(list_of_lines)
            df_result_tmp.to_csv(f"df_harvesting_{timestamp_str}tmp.csv")

        pbar.update()
        
df_result=pd.DataFrame(list_of_lines)
df_result.to_csv(f"df_harvesting_{timestamp_str}.csv")


  0%|          | 0/560 [00:00<?, ?it/s]

0.1


ValueError: No feature in X meets the variance threshold 0.10000 (X contains only one sample)