In [1]:
import os
import sys
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import random

from ficaria.missing_imputation import FCMCentroidImputer, FCMParameterImputer, FCMRoughParameterImputer
from ucimlrepo import fetch_ucirepo 

In [2]:
def load_yeast_uci():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"
    col_names = ["sequence", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "class"]
    df = pd.read_csv(url, sep=r"\s+", names=col_names)
    df = df.drop(columns=["sequence", "class"])
    return df

def load_mice_protein_expression():
    mice_protein_expression = fetch_ucirepo(id=342) 

    X = mice_protein_expression.data.features    
    ids = mice_protein_expression.data.ids 
    y = mice_protein_expression.data.targets

    df_full = pd.concat([X, ids, y], axis=1)
    
    return df_full

In [3]:
def run_imputation_experiment(
    df,
    missing_percents,
    forced_positions,
    imputers_dict=None,
    random_state=42,
    n_clusters=5,
    r_val=2,
    diff=0
):
    if imputers_dict is None:
        imputers_dict = {
            "FCMCentroid": FCMCentroidImputer(n_clusters=n_clusters, max_iter=1000),
            "FCMParameter": FCMParameterImputer(n_clusters=n_clusters, max_iter=1000),
            "FCMRoughParameter": FCMRoughParameterImputer(n_clusters=n_clusters, max_iter=1000)
        }

    results = []
    rng = np.random.default_rng(random_state)
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols

    for perc in missing_percents:
        df_missing = df.copy()
        n_missing = int(total_cells * perc / 100)

        # Losowe braki
        missing_indices = set(
            zip(
                rng.integers(0, n_rows, n_missing),
                rng.integers(0, n_cols, n_missing)
            )
        )

        # Wymuszone pozycje
        forced = forced_positions.get(perc, [])
        forced_zero_based = [(r - 1 + diff, c - 1) for (r, c) in forced]
        missing_indices.update(forced_zero_based)

        # Usuń dane
        for (r0, c0) in missing_indices:
            df_missing.iat[r0, c0] = np.nan

        # Przygotuj dane wynikowe dla każdej wymuszonej pozycji
        position_results = {
            (r, c): {
                "missing_percent": perc,
                "row": r,
                "column": c,
                "actual_value": df.iat[r - 1 + diff, c - 1]
            }
            for (r, c) in forced
        }

        # Imputacja każdą metodą
        for name, imputer in imputers_dict.items():
            imputed = imputer.fit_transform(df_missing)
            df_imputed = pd.DataFrame(imputed, columns=df.columns)

            for (r, c) in forced:
                imputed_val = df_imputed.iat[r - 1 + diff, c - 1]
                position_results[(r, c)][f"imputed_{name}"] = round(imputed_val, r_val)

        results.extend(position_results.values())

    results_df = pd.DataFrame(results)
    return results_df

### Yeast

#### Reproduction of the results using a custom implementation

In [4]:
forced_positions = {
    1: [(14, 7), (16, 2), (30, 1), (57, 6), (73, 7)],
    3: [(5, 1), (9, 6), (15, 7), (32, 7), (38, 6)],
    5: [(1, 4), (7, 2), (7, 7), (8, 2), (10, 5)],
    7: [(2, 6), (9, 4), (13, 3), (14, 8), (15, 2)],
    9: [(1, 1), (1, 3), (2, 3), (2, 6), (4, 4)]
}

df_yeast = load_yeast_uci()
yeast_res_df = run_imputation_experiment(
    df=df_yeast,
    missing_percents=[1, 3, 5, 7, 9],
    forced_positions=forced_positions,
    n_clusters=5
)

print("\n=== Tabela wyników imputacji ===")
yeast_res_df


=== Tabela wyników imputacji ===


Unnamed: 0,missing_percent,row,column,actual_value,imputed_FCMCentroid,imputed_FCMParameter,imputed_FCMRoughParameter
0,1,14,7,0.53,0.5,0.5,0.5
1,1,16,2,0.44,0.49,0.48,0.5
2,1,30,1,0.8,0.7,0.56,0.5
3,1,57,6,0.0,0.01,0.01,0.01
4,1,73,7,0.47,0.5,0.5,0.5
5,3,5,1,0.42,0.51,0.51,0.5
6,3,9,6,0.0,0.01,0.01,0.01
7,3,15,7,0.46,0.5,0.5,0.5
8,3,32,7,0.52,0.5,0.5,0.5
9,3,38,6,0.0,0.01,0.01,0.01


Difference between the results obtained using the custom implementation and those reported in the article

In [5]:
data = [
    [0.53, 0.49, 0.50, 0.51],
    [0.44, 0.49, 0.57, 0.47],
    [0.8,  0.69, 0.64, 0.71],
    [0, 0, 0, 0],
    [0.47, 0.49, 0.50, 0.51],
    [0.42, 0.50, 0.60, 0.49],
    [0, 0.00, 0, 0],
    [0.46, 0.50, 0.50, 0.49],
    [0.52, 0.49, 0.50, 0.51],
    [0, 0.00, 0, 0],
    [0.13, 0.28, 0.41, 0.26],
    [0.54, 0.50, 1.07, 0.56],
    [0.53, 0.49, 1.07, 0.5],
    [0.45, 0.45, 0.56, 0.47],
    [0.5, 0.50, 1.00, 0.5],
    [0, 0, 0, 0],
    [0.36, 0.45, 0.41, 0.37],
    [0.57, 0.51, 0.50, 0.54],
    [0.22, 0.26, 0.27, 0.22],
    [0.55, 0.64, 0.77, 0.62],
    [0.58, 0.70, 1.50, 0.51],
    [0.47, 0.44, 1.50, 0.46],
    [0.48, 0.50, 0.51, 0.49],
    [0, 0, 0.51, 0],
    [0.13, 0.45, 0.47, 0.16]
]

yeast_experiment_result = pd.DataFrame(data, columns=[
    "actual_value",
    "imputed_FCMCentroid",
    "imputed_FCMParameter",
    "imputed_FCMRoughParameter"
])

In [6]:
yeas_diff = yeast_experiment_result - yeast_res_df[["actual_value", "imputed_FCMCentroid", "imputed_FCMParameter", "imputed_FCMRoughParameter"]]
yeas_diff

Unnamed: 0,actual_value,imputed_FCMCentroid,imputed_FCMParameter,imputed_FCMRoughParameter
0,0.0,-0.01,0.0,0.01
1,0.0,0.0,0.09,-0.03
2,0.0,-0.01,0.08,0.21
3,0.0,-0.01,-0.01,-0.01
4,0.0,-0.01,0.0,0.01
5,0.0,-0.01,0.09,-0.01
6,0.0,-0.01,-0.01,-0.01
7,0.0,0.0,0.0,-0.01
8,0.0,-0.01,0.0,0.01
9,0.0,-0.01,-0.01,-0.01


### Mice protein expression

Reproduction of the results using a custom implementation

In [7]:
forced_positions = {
    1: [(1, 9), (3, 46), (5, 8), (5, 52), (5, 55)],
    3: [(1, 16), (1, 34), (1, 41), (1, 64), (2, 33)],
    5: [(2, 7), (2, 9), (2, 19), (2, 32), (2, 45)],
    7: [(3, 2), (3, 5), (3, 8), (3, 9), (3, 23)],
    9: [(6, 22), (6, 24), (6, 28), (6, 29), (6, 30)]
}

df_yeast = load_mice_protein_expression()
mice_protein_expression_res_df = run_imputation_experiment(
    df=df_yeast,
    missing_percents=[1, 3, 5, 7, 9],
    forced_positions=forced_positions,
    n_clusters=3,
    r_val=4,
    diff=75
)

print("\n=== Tabela wyników imputacji ===")
mice_protein_expression_res_df


=== Tabela wyników imputacji ===


Unnamed: 0,missing_percent,row,column,actual_value,imputed_FCMCentroid,imputed_FCMParameter,imputed_FCMRoughParameter
0,1,1,9,0.239283,0.2202,0.2145,0.2274
1,1,3,46,0.17523,0.1684,0.1645,0.1719
2,1,5,8,3.871971,3.3809,3.7011,3.1191
3,1,5,52,0.153741,0.1596,0.1614,0.1578
4,1,5,55,0.167972,0.1584,0.1571,0.1597
5,3,1,16,1.0993,0.892,0.7874,0.925
6,3,1,34,0.487664,0.4913,0.4644,0.4931
7,3,1,41,0.32196,0.3343,0.3167,0.3323
8,3,1,64,2.436587,2.4031,2.3286,2.4361
9,3,2,33,0.337682,0.7098,0.6128,0.7198


Difference between the results obtained using the custom implementation and those reported in the article

In [8]:
data = [
    [0.2392, 0.2193, 0.2155, 0.2210],
    [0.1752, 0.1671, 0.1629, 0.1699],
    [3.8719, 3.3294, 5.8859, 3.7785],
    [0.1537, 0.1561, 5.8859, 0.1443],
    [0.1679, 0.1589, 5.8859, 0.1636],
    [1.0993, 0.7652, 3.7709, 1.0940],
    [0.4876, 0.4469, 3.7709, 0.4939],
    [0.3219, 0.3059, 3.7709, 0.2996],
    [2.4365, 2.2490, 3.7709, 2.4373],
    [0.3376, 0.5164, 2.6262, 0.3423],
    [0.1677, 0.1777, 4.5706, 0.1766],
    [0.2210, 0.2238, 4.5706, 0.2392],
    [0.4091, 0.4620, 4.5706, 0.4268],
    [2.5380, 2.4808, 4.5706, 2.5782],
    [1.0639, 1.0445, 4.5706, 1.0611],
    [0.8528, 0.6652, 16.192, 0.9459],
    [5.3508, 4.3074, 16.192, 4.4561],
    [3.8145, 3.5173, 16.192, 3.8830],
    [0.2223, 0.2210, 16.192, 0.2619],
    [0.1725, 0.1749, 16.192, 0.1924],
    [0.3269, 0.3580, 9.6256, 0.3386],
    [1.4003, 1.4395, 9.6256, 1.3981],
    [0.2923, 0.2899, 9.6256, 0.2700],
    [0.7505, 0.7635, 9.6256, 0.7554],
    [0.1439, 0.1627, 9.6256, 0.1495]
]

mice_protein_expression_experiment_result = pd.DataFrame(data, columns=[
    "actual_value",
    "imputed_FCMCentroid",
    "imputed_FCMParameter",
    "imputed_FCMRoughParameter"
])

In [9]:
mice_protein_expression_diff = mice_protein_expression_experiment_result - mice_protein_expression_res_df[["actual_value", "imputed_FCMCentroid", 
                                                                                                           "imputed_FCMParameter", "imputed_FCMRoughParameter"]]
mice_protein_expression_diff["actual_value"] = round(mice_protein_expression_diff["actual_value"], 2)

mice_protein_expression_diff

Unnamed: 0,actual_value,imputed_FCMCentroid,imputed_FCMParameter,imputed_FCMRoughParameter
0,-0.0,-0.0009,0.001,-0.0064
1,-0.0,-0.0013,-0.0016,-0.002
2,-0.0,-0.0515,2.1848,0.6594
3,-0.0,-0.0035,5.7245,-0.0135
4,-0.0,0.0005,5.7288,0.0039
5,-0.0,-0.1268,2.9835,0.169
6,-0.0,-0.0444,3.3065,0.0008
7,-0.0,-0.0284,3.4542,-0.0327
8,-0.0,-0.1541,1.4423,0.0012
9,-0.0,-0.1934,2.0134,-0.3775
