In [4]:
import pandas as pd
import numpy as np
from load import Dataset

def add_nominal_noise(df, percentage, random_state=None):
    """Return a copy of ``df`` with random value swaps in its nominal columns.

    For every ``category``/``object`` column except ``'class'``, a random
    ``percentage`` percent of the rows is selected and each selected cell is
    overwritten with a value drawn (with replacement) from the values observed
    in that column. A drawn value may equal the cell's current value, so
    ``percentage`` is an upper bound on the share of cells that really change.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    percentage : float
        Share of rows (0-100) to overwrite in each nominal column.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator used for both the row selection and the value draw,
        making the result reproducible. With the default ``None`` the global
        NumPy random state is used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        A noisy copy of ``df``.
    """
    if random_state is None:
        # Module-level np.random functions share the global state that
        # DataFrame.sample(random_state=None) also draws from.
        rng = np.random
        sample_state = None
    elif isinstance(random_state, np.random.RandomState):
        rng = sample_state = random_state
    else:
        rng = sample_state = np.random.RandomState(random_state)

    noisy_df = df.copy()
    nominal_columns = df.select_dtypes(include=['category', 'object']).columns
    for col in nominal_columns:
        if col == 'class':  # Never corrupt the target column
            continue
        # Lists are unhashable, so they are turned into tuples before de-duplication.
        unique_values = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x).unique()
        indices = noisy_df.sample(frac=percentage / 100, random_state=sample_state).index
        noisy_df.loc[indices, col] = rng.choice(unique_values, size=len(indices))
    return noisy_df

def add_numeric_noise(df, percentage, add_noise=True):
    """Shift each numeric column by a fixed fraction of its own value range.

    The offset for a column is ``(max - min) * percentage / 100``. Every value
    in the column is moved by that same amount: upwards when ``add_noise`` is
    truthy, downwards otherwise. Non-numeric columns are copied unchanged and
    the input frame itself is left untouched.
    """
    shifted_df = df.copy()
    for name in df.select_dtypes(include=[np.number]).columns:
        column = df[name]
        offset = (column.max() - column.min()) * (percentage / 100)
        shifted_df[name] = column.add(offset) if add_noise else column.sub(offset)
    return shifted_df


In [5]:
def generate_noisy_datasets(train_df, valid_df, num_variants=10):
    """Build shifted copies of the train/valid frames at increasing noise levels.

    For each level ``1..num_variants`` (used as the percentage passed to
    ``add_numeric_noise``) two ``(train, valid)`` pairs are appended: first the
    pair with the offset added, then the pair with the offset subtracted.

    Parameters
    ----------
    train_df, valid_df : pandas.DataFrame
        Source frames; they are forwarded to ``add_numeric_noise`` unchanged.
    num_variants : int, default 10
        Number of noise levels to generate. Previously this argument was
        ignored and ten levels were always produced.

    Returns
    -------
    list of tuple
        ``2 * num_variants`` pairs ordered as
        ``[(+1%), (-1%), (+2%), (-2%), ...]``.
    """
    noisy_datasets = []
    for level in range(1, num_variants + 1):
        # Keep the add-then-subtract order so existing index-based lookups still match.
        for add_noise in (True, False):
            noisy_datasets.append((
                add_numeric_noise(train_df, level, add_noise=add_noise),
                add_numeric_noise(valid_df, level, add_noise=add_noise),
            ))
    return noisy_datasets

In [12]:
dataset_name = 'cellcycle'  # Replace with your dataset name
# NOTE(review): nan_strategy='mean' presumably imputes missing values with the
# column mean inside the project-local Dataset loader - confirm in load.py.
dataset = Dataset(dataset_name, nan_strategy='mean')

# Pull the train, valid and test splits exposed by the loader.
# test_df is not used in this cell; the later training cell passes it on.
train_df = dataset.train
valid_df = dataset.valid
test_df = dataset.test

# Generate the shifted (train, valid) pairs and keep them in a list
noisy_datasets = generate_noisy_datasets(train_df, valid_df, num_variants=10)

# Preview one pair as an example. Index 1 is the SECOND entry of the list:
# generate_noisy_datasets appends the "add" pair before the "subtract" pair for
# each level, so this is the level-1 pair with the offset subtracted.
noisy_train_1, noisy_valid_1 = noisy_datasets[1]
noisy_train_1.head(), noisy_valid_1.head()

(   cln3-1    cln3-2  clb2-2  clb2-1  alpha0  alpha7  alpha14  alpha21  \
 0  0.0733 -0.282872 -0.2916 -0.0096  -0.222 -0.2178  -0.2491   0.1301   
 1 -1.2967 -0.333100 -0.1716  0.1504  -0.212 -0.7778   0.0609  -0.3599   
 2 -0.6767  0.946900  0.1684  0.5704  -0.122 -0.5978  -0.5091  -0.0999   
 3  0.1733 -0.853100 -0.2916 -0.6196  -0.102 -0.3378   0.1309  -0.1599   
 4 -0.1967 -0.603100 -0.1916 -0.2596  -0.072 -0.0778   0.0809  -0.2699   
 
    alpha28  alpha35  ...  elu150  elu180  elu210  elu240  elu270  elu300  \
 0  -0.4498  -0.4725  ...   0.083 -0.1454  0.3454  0.0432 -0.1176 -0.3589   
 1  -0.4298  -0.6125  ...   0.253  0.1246  0.3754  0.2132  0.1924 -0.3089   
 2   0.0802  -0.1025  ...  -0.857 -0.2854 -0.1846  0.2732  0.1824 -0.4689   
 3  -0.2998   0.0275  ...   0.093  0.2646 -0.1246 -0.4368 -0.6776 -0.1089   
 4  -0.1598   0.2175  ...  -0.427 -0.2054  0.0954  0.3032 -0.0576 -0.1989   
 
    elu330  elu360  elu390                                              class  
 0  0.0054

In [11]:
# Show the first rows of the unmodified training frame (handy as a reference
# next to the shifted frames previewed in the cell above).
train_df.head()

Unnamed: 0,cln3-1,cln3-2,clb2-2,clb2-1,alpha0,alpha7,alpha14,alpha21,alpha28,alpha35,...,elu150,elu180,elu210,elu240,elu270,elu300,elu330,elu360,elu390,class
0,0.15,-0.219772,-0.22,0.07,-0.15,-0.15,-0.21,0.17,-0.42,-0.44,...,0.11,-0.12,0.37,0.07,-0.09,-0.32,0.04,-0.48,0.04,"[11/02/01, 11/02/02, 11/02/03/01]"
1,-1.22,-0.27,-0.1,0.23,-0.14,-0.71,0.1,-0.32,-0.4,-0.58,...,0.28,0.15,0.4,0.24,0.22,-0.27,-0.1,0.34,0.02,[12/04/02]
2,-0.6,1.01,0.24,0.65,-0.05,-0.53,-0.47,-0.06,0.11,-0.07,...,-0.83,-0.26,-0.16,0.3,0.21,-0.43,0.21,0.6,0.65,"[01/04, 14/01, 14/04, 16/01, 16/19/03, 20/09/0..."
3,0.25,-0.79,-0.22,-0.54,-0.03,-0.27,0.17,-0.12,-0.27,0.06,...,0.12,0.29,-0.1,-0.41,-0.65,-0.07,-0.35,-0.19,0.15,"[10/03/02, 42/10, 43/01/03/09]"
4,-0.12,-0.54,-0.12,-0.18,0.0,-0.01,0.12,-0.23,-0.13,0.25,...,-0.4,-0.18,0.12,0.33,-0.03,-0.16,0.33,0.28,0.01,"[10/01/05/01, 32/01]"


In [None]:
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def scale_data(df, method='standard'):
    """Return a copy of ``df`` whose numeric columns have been rescaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is no longer modified in place; a scaled copy is
        returned instead, so re-running a cell does not rescale already
        scaled data.
    method : {'standard', 'minmax'}, default 'standard'
        ``'standard'`` uses ``StandardScaler``, ``'minmax'`` uses
        ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with every numeric column replaced by its scaled
        values. A frame without numeric columns is returned as a plain copy.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError`` on the unset ``scaler``).
    """
    if method == 'standard':
        use_standard = True
    elif method == 'minmax':
        use_standard = False
    else:
        raise ValueError(
            f"Unknown scaling method {method!r}; expected 'standard' or 'minmax'"
        )

    scaled = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Nothing to scale; the scalers reject input with zero features.
        return scaled

    scaler = StandardScaler() if use_standard else MinMaxScaler()
    scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return scaled

# Outliers handling options

def remove_outliers_zscore(df, threshold=3.0):
    """Drop rows that hold a numeric value with ``|z-score| >= threshold``.

    Z-scores are computed per numeric column (population standard deviation,
    the ``scipy.stats.zscore`` default), ignoring missing values. A z-score
    that cannot be computed - a constant column, or a missing cell - is not
    counted as an outlier. Previously such NaN scores failed the
    ``z < threshold`` test, so a single constant column removed every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are carried along untouched.
    threshold : float, default 3.0
        Rows are kept only when all their computable z-scores are below this.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without outliers (a new frame).
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.empty:
        return df.copy()

    values = numeric.to_numpy(dtype=float, na_value=np.nan)
    z_scores = np.abs(np.asarray(stats.zscore(values, nan_policy='omit'), dtype=float))
    # NaN >= threshold is False, so undefined scores never flag a row.
    is_outlier = z_scores >= threshold
    return df[~is_outlier.any(axis=1)]

def remove_outliers_iqr(df):
    """Drop rows with a numeric value outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    The quartiles are computed per numeric column only. Earlier the whole
    frame was used, so a non-numeric column (such as a label column) either
    made ``quantile`` fail or caused every row to be rejected. Missing values
    are not treated as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are carried along untouched.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose numeric values all lie inside the fences
        (a new frame).
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.empty:
        return df.copy()

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    # Comparisons against NaN are False, so missing cells never flag a row.
    is_outlier = numeric.lt(lower_fence, axis='columns') | numeric.gt(upper_fence, axis='columns')
    return df[~is_outlier.any(axis=1)]

# Imputing Outliers
def clip_outliers(df, lower_percentile=0.01, upper_percentile=0.99):
    """Clip every numeric column to its own lower/upper quantile.

    Only numeric columns are inspected and clipped; other columns are copied
    through unchanged. Earlier the quantiles and the clip were applied to the
    whole frame, which breaks as soon as a non-numeric column is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lower_percentile, upper_percentile : float
        Quantiles in ``[0, 1]`` used as the per-column lower and upper bound.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the numeric columns clipped.
    """
    clipped = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return clipped

    numeric = df[numeric_cols]
    lower_bound = numeric.quantile(lower_percentile)
    upper_bound = numeric.quantile(upper_percentile)
    # axis=1 aligns the per-column bounds with the column labels.
    clipped[numeric_cols] = numeric.clip(lower=lower_bound, upper=upper_bound, axis=1)
    return clipped

def replace_outliers_with_mean(df, threshold=3.0):
    """Return a copy of ``df`` with numeric outliers replaced by the column mean.

    A value is an outlier when its absolute z-score, computed with the
    column's mean and sample standard deviation, exceeds ``threshold``.
    The mean and standard deviation are taken from the original column, so
    they include the outliers themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is no longer modified in place; the caller's data
        stays intact and the cleaned copy is returned.
    threshold : float, default 3.0
        Z-score magnitude above which a value is replaced.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with outliers imputed; non-numeric columns unchanged.
    """
    result = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        column = df[col]
        mean = column.mean()
        std = column.std()
        if not std > 0:
            # Constant, single-row or all-missing column: no z-score is
            # defined, hence nothing can be flagged.
            continue
        outliers = ((column - mean) / std).abs() > threshold
        # mask() upcasts integer columns when a float mean has to be written.
        result[col] = column.mask(outliers, mean)
    return result

In [13]:
from sklearn.linear_model import LogisticRegression
from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
from sklearn.metrics import accuracy_score, classification_report

def train_and_evaluate(train_df, valid_df, test_df, regularization_strength=0.01):
    """Fit a hierarchical multi-label classifier and score it on valid and test.

    Each frame is split into features (every column except ``'class'``) and
    the ``'class'`` target. A ``MultiLabelLocalClassifierPerNode`` wrapping a
    logistic regression is fitted on the training split, then used to predict
    the validation and test splits.

    Parameters
    ----------
    train_df, valid_df, test_df : pandas.DataFrame
        Frames that each contain a ``'class'`` column.
    regularization_strength : float, default 0.01
        Passed unchanged as ``C`` to ``LogisticRegression``. scikit-learn
        documents ``C`` as the *inverse* of the regularization strength, so
        smaller values regularize more strongly.

    Returns
    -------
    dict
        Keys ``'valid_accuracy'``, ``'test_accuracy'``,
        ``'valid_classification_report'`` and ``'test_classification_report'``.

    NOTE(review): the output recorded for this cell ends with
    ``ValueError: You appear to be using a legacy multi-label data
    representation``. That is presumably raised by ``accuracy_score`` /
    ``classification_report`` receiving per-row label sequences - confirm, and
    convert targets and predictions to a binary indicator format before scoring.
    """
    X_train, y_train = train_df.drop(columns=['class']), train_df['class']
    X_valid, y_valid = valid_df.drop(columns=['class']), valid_df['class']
    X_test, y_test = test_df.drop(columns=['class']), test_df['class']
    
    # L2-penalised logistic regression is the estimator handed to the
    # hierarchical wrapper as its local classifier.
    # NOTE(review): meaning of tolerance=0.001 and verbose=1 is defined by
    # hiclass and not visible here - check the hiclass documentation.
    classifier = MultiLabelLocalClassifierPerNode(
        local_classifier=LogisticRegression(penalty='l2', C=regularization_strength, solver='lbfgs', max_iter=10000),
        tolerance=0.001,
        verbose=1
    )
    
    classifier.fit(X_train, y_train)
    
    # Score the validation split
    y_pred_valid = classifier.predict(X_valid)
    valid_accuracy = accuracy_score(y_valid, y_pred_valid)
    
    # Score the test split
    y_pred_test = classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    
    # Bundle the scalar accuracies with the text reports for both splits
    results = {
        'valid_accuracy': valid_accuracy,
        'test_accuracy': test_accuracy,
        'valid_classification_report': classification_report(y_valid, y_pred_valid),
        'test_classification_report': classification_report(y_test, y_pred_test),
    }
    
    return results

# Train on the unmodified splits and report the metrics.
results = train_and_evaluate(train_df, valid_df, test_df, regularization_strength=0.01)

# Print each entry on its own: the classification reports are multi-line
# strings, and printing the dict as a whole would show them as one line full
# of escaped "\n" sequences.
for metric_name, metric_value in results.items():
    print(f"{metric_name}:\n{metric_value}\n")

2024-05-26 15:36:37,868 - LCPN - INFO - Creating digraph from 1628 3D labels
2024-05-26 15:36:39,330 - LCPN - INFO - Detected 6 roots
2024-05-26 15:36:39,331 - LCPN - INFO - Initializing local classifiers
2024-05-26 15:36:39,343 - LCPN - INFO - Initializing siblings binary policy
2024-05-26 15:36:39,343 - LCPN - INFO - Fitting local classifiers
2024-05-26 15:41:02,078 - LCPN - INFO - Cleaning up variables that can take a lot of disk space
2024-05-26 15:41:02,391 - LCPN - INFO - Predicting
2024-05-26 15:41:02,402 - LCPN - INFO - Predicting for node '1'
2024-05-26 15:41:02,414 - LCPN - INFO - Predicting for node '0'
2024-05-26 15:41:02,416 - LCPN - INFO - Predicting for node '2'
2024-05-26 15:41:02,418 - LCPN - INFO - Predicting for node '3'
2024-05-26 15:41:02,421 - LCPN - INFO - Predicting for node '4'
2024-05-26 15:41:02,423 - LCPN - INFO - Predicting for node '9'
2024-05-26 15:41:02,435 - LCPN - INFO - Predicting for node '1'
2024-05-26 15:41:02,435 - LCPN - INFO - Predicting for nod

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.