In [None]:
!pip3 install ../../random-forest-mc/

In [1]:
from random_forest_mc.model import RandomForestMC
from random_forest_mc.utils import LoadDicts
import pandas as pd
import numpy as np

from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

from numbers import Number
import logging as log

# Load dataset

In [2]:
# Look up the "titanic" entry in the dataset metadata shipped with the tests.
dicts = LoadDicts("../../random-forest-mc/tests/")
dataset_dict = dicts.datasets_metadata
ds_name = "titanic"
params = dataset_dict[ds_name]
ds_cols = params["ds_cols"]
target_col = params["target_col"]

# Keep only the feature columns plus the target, drop incomplete rows,
# and cast the columns to their working dtypes in a single step.
dataset = (
    pd.read_csv(params["csv_path"])[ds_cols + [target_col]]
    .dropna()
    .reset_index(drop=True)
    .astype(
        {
            "Age": np.uint8,
            "SibSp": np.uint8,
            "Pclass": str,
            "Fare": np.uint32,
        }
    )
)

# Load current model

In [3]:
# Fit a baseline RandomForestMC on the full dataset.
cls = RandomForestMC(target_col=target_col)
cls.fit(dataset)
# Predict for the first row, then for two independently drawn 10-row samples
# (class predictions for one sample, class probabilities for the other).
row = dataset.reset_index(drop=True).loc[0]
predict_row = cls.predict(row)
predict_ds = cls.predict(dataset.sample(n=10))
predict_probs_ds = cls.predict_proba(dataset.sample(n=10))

Planting the forest: 100%|██████████| 16/16 [00:06<00:00,  2.57it/s]


In [None]:
# Show the predictions obtained for the 10-row sample.
predict_ds

In [None]:
# Show the repr of the fitted baseline model.
cls

# Generate some missing data

In [4]:
# Fixed seed so that both the sampled rows and the injected missing-value
# pattern are reproducible across kernel restarts.
MISSING_SEED = 42
rng_missing = np.random.default_rng(MISSING_SEED)

# Take 20% of the dataset; the untouched copy is kept as ground truth.
df_tmp = dataset.sample(frac=0.2, random_state=MISSING_SEED).reset_index(drop=True)
# One boolean per feature cell: True (p=0.7) keeps the value, False (p=0.3) blanks it.
mask_random = rng_missing.choice([True, False], size=df_tmp[ds_cols].shape, p=[0.7, 0.3])
# Replace the cells that are not kept with NaN; the target column is never blanked.
dataset_missing_values = df_tmp[ds_cols].where(mask_random)
dataset_missing_values[target_col] = df_tmp[target_col]
dataset_missing_values_ground_truth = df_tmp

In [5]:
# Inspect the frame in which feature values were randomly replaced by NaN.
dataset_missing_values

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,,,16.0,2.0,18.0,S,0
1,1,female,40.0,1.0,134.0,C,1
2,,female,15.0,0.0,8.0,Q,1
3,3,,34.0,0.0,,C,0
4,1,male,51.0,0.0,,S,1
...,...,...,...,...,...,...,...
137,3,male,17.0,0.0,7.0,S,0
138,3,,,,17.0,,1
139,,female,16.0,,46.0,,0
140,,male,54.0,1.0,26.0,S,0


In [6]:
# For every feature column, collect the distinct values observed in the full
# dataset; these are the candidates used to fill a missing cell.
dict_values = dict(
    (col, dataset[col].unique().tolist())
    for col in ds_cols
)

In [7]:
# List the features for which candidate fill values were collected.
dict_values.keys()

dict_keys(['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'])

# Load the new extension

In [8]:
# One row of a DataFrame, as yielded by pd.DataFrame.iterrows().
# (Would be `dsRow: TypeAlias = pd.Series` once TypeAlias is available.)
dsRow = pd.Series

# A tree is stored as an asymmetric nesting of dictionaries.
# (Would be `TypeTree: TypeAlias = Dict`.)
TypeTree = Dict

# Type of a class value.
# (Would be `TypeClassVal: TypeAlias = Any`.)
TypeClassVal = Any  # TODO: review whether this must be forced to str.

# A leaf maps each class value to a float.
# (Would be `TypeLeaf: TypeAlias = Dict[TypeClassVal, float]`.)
TypeLeaf = Dict[TypeClassVal, float]

In [9]:
# Shape of the dictionary used to fill missing values:
# feature name -> list of candidate values to try for that feature.
featName = str
featValue = Union[str, Number]
# The values are lists (one entry per candidate), not single scalars.
dictValues = Dict[featName, List[featValue]]


class RandomForestMC_Ext(RandomForestMC):
    """RandomForestMC extension that scores candidate fillings of missing values.

    A row with missing features is expanded into several candidate rows, one
    per candidate value of each missing feature, and every candidate row is
    passed to ``predict_proba``.
    """

    @staticmethod
    def _fill_row_missing(row: dsRow, dict_values: dictValues) -> pd.DataFrame:
        """Expand one row into candidate rows with a missing feature filled in.

        Args:
            row: A single data row.
            dict_values: Maps a feature name to the iterable of candidate
                values to try when that feature is missing in ``row``.

        Returns:
            A DataFrame with one row per (missing feature, candidate value)
            pair. Each missing feature is expanded independently: a candidate
            row fills exactly one missing feature and keeps any other missing
            feature as NaN. If the row has no missing feature among the keys
            of ``dict_values``, the row itself is returned as a one-row frame.
        """
        list_out = []
        for col, vals in dict_values.items():
            if pd.isna(row[col]):
                for val in vals:
                    _row = row.copy()
                    _row[col] = val
                    list_out.append(_row)
        if not list_out:
            # Nothing to fill: pd.concat([]) would raise ValueError, so the
            # unchanged row is its own single candidate.
            return row.to_frame().transpose().reset_index(drop=True)
        return pd.concat(list_out, axis=1).transpose().reset_index(drop=True)

    def predictMissingValues(
        self,
        row_or_matrix: Union[dsRow, pd.DataFrame],
        dict_values: dictValues,
        use_all_Tress: bool = True,
    ) -> pd.DataFrame:
        """Build the candidate rows for the input and predict probabilities for them.

        The candidate rows are stored in ``self.df_data_miss``.

        Args:
            row_or_matrix: A single row (Series) or a DataFrame of rows.
            dict_values: Maps a feature name to its candidate fill values.
            use_all_Tress: Currently unused.

        Returns:
            A DataFrame built from ``self.predict_proba(self.df_data_miss)``,
            row-aligned with ``self.df_data_miss``.

        Raises:
            TypeError: If ``row_or_matrix`` is neither a Series nor a DataFrame.
        """
        # Warn about features in dict_values that no tree of the forest uses.
        used_features = set()
        for Tree in self:
            used_features |= set(Tree.used_features)
        not_have_feats = set(dict_values.keys()) - used_features
        if not_have_feats:
            _tmp = ", ".join(not_have_feats)
            log.warning(f'The Forest model have not the following feature(s): [{_tmp}].')

        if isinstance(row_or_matrix, pd.DataFrame):
            expanded = [
                self._fill_row_missing(row, dict_values)
                for _, row in row_or_matrix.iterrows()
            ]
            self.df_data_miss = pd.concat(expanded).reset_index(drop=True)
        elif isinstance(row_or_matrix, dsRow):
            self.df_data_miss = self._fill_row_missing(row_or_matrix, dict_values)
        else:
            raise TypeError(
                "row_or_matrix must be a pandas Series or DataFrame, "
                f"got {type(row_or_matrix).__name__}."
            )

        # Use this instance's candidate rows, not a notebook-level global.
        df_predict = pd.DataFrame.from_dict(self.predict_proba(self.df_data_miss))
        return df_predict

In [10]:
# Instantiate the extended model and fit it on the complete dataset.
cls_ext = RandomForestMC_Ext(target_col=params["target_col"])
cls_ext.fit(dataset)

Planting the forest: 100%|██████████| 16/16 [00:06<00:00,  2.29it/s]


In [11]:
# Show the repr of the fitted extended model.
cls_ext

RandomForestMC_Ext(len(Forest)=16,n_trees=16,model_version=1.0.4-dev,module_version=1.0.4-dev)

In [12]:
# Check the class name of the extended model instance.
cls_ext.__class__.__name__

'RandomForestMC_Ext'

In [15]:
# Inspect the row at index label 2 of the frame with injected missing values.
dataset_missing_values.loc[2]

Pclass         NaN
Sex         female
Age           15.0
SibSp          0.0
Fare           8.0
Embarked         Q
Survived         1
Name: 2, dtype: object

In [13]:
# Single-row case: expand the row at index label 2 into candidate rows.
cls_ext.predictMissingValues(dataset_missing_values.loc[2], dict_values)

In [14]:
# Candidate rows stored on the model by the predictMissingValues call.
cls_ext.df_data_miss

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,3,female,15.0,0.0,8.0,Q,1
1,1,female,15.0,0.0,8.0,Q,1
2,2,female,15.0,0.0,8.0,Q,1


In [17]:
# NOTE(review): this cell displays dataset_missing_values_A, but the assignment
# to that name only appears in a later cell; on a fresh kernel this cell raises
# NameError unless the defining cell is run first.
dataset_missing_values_A

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
133,1.0,female,39.0,,83.0,C,1
77,,,24.0,,7.0,S,0
40,3.0,male,22.0,0.0,7.0,S,0


In [18]:
# DataFrame case: draw three random rows (unseeded, so they differ per run)
# and expand each of them into candidate rows.
dataset_missing_values_A = dataset_missing_values.sample(n=3).reset_index(drop=True)
cls_ext.predictMissingValues(dataset_missing_values_A, dict_values)

In [19]:
# Candidate rows generated for the three sampled rows.
cls_ext.df_data_miss

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,3.0,male,26.0,1.0,18.0,C,1
1,3.0,male,26.0,0.0,18.0,C,1
2,3.0,male,26.0,3.0,18.0,C,1
3,3.0,male,26.0,4.0,18.0,C,1
4,3.0,male,26.0,2.0,18.0,C,1
5,3.0,male,26.0,5.0,18.0,C,1
6,3.0,male,27.0,0.0,7.0,S,1
7,3.0,male,27.0,0.0,7.0,C,1
8,3.0,male,27.0,0.0,7.0,Q,1
9,3.0,,18.0,0.0,7.0,S,1


In [20]:
# Class probabilities for every candidate row.
# Note: this rebinds predict_probs_ds, which an earlier cell assigned from the baseline model.
predict_probs_ds = cls_ext.predict_proba(cls_ext.df_data_miss)

In [21]:
# The same probabilities, wrapped in a DataFrame so they can be joined to the candidate rows.
df_predict = pd.DataFrame.from_dict(cls_ext.predict_proba(cls_ext.df_data_miss))

In [22]:
# Put the probabilities next to the candidate rows (column-wise, aligned on the index)
# and keep the result as an attribute on the model instance.
cls_ext.df_data_miss_with_preds = pd.concat([cls_ext.df_data_miss, df_predict], axis=1)

In [35]:
# For a source row, select the candidate rows (with predictions) whose values
# equal the row's values on every feature that is not missing in the row.
# The loop variables (row, col, cond, df_tmp) are inspected by later cells.
for i, row in dataset_missing_values_A.iterrows():
    conds = []
    missing_cols = []
    for col in dict_values.keys():
        if not pd.isna(row[col]):
            # One boolean Series per known feature: candidate value == row value.
            conds.append(cls_ext.df_data_miss[col] == row[col])
        else:
            missing_cols.append(col)
    # AND all per-feature conditions together.
    # NOTE(review): conds.pop() raises IndexError when every feature in
    # dict_values is missing for this row (conds is empty).
    cond = conds.pop()
    while conds:
        cond = cond & conds.pop()
        
    # NOTE(review): this rebinds df_tmp, a name an earlier cell used for the sampled ground-truth rows.
    df_tmp = cls_ext.df_data_miss_with_preds.loc[cond]
    # Only the first row is processed; the loop exits after one iteration.
    break

In [31]:
# The source row left bound by the matching loop.
row

Pclass         3
Sex         male
Age         26.0
SibSp        NaN
Fare        18.0
Embarked       C
Survived       1
Name: 0, dtype: object

In [36]:
# Candidate rows (with predicted probabilities) selected for that source row.
df_tmp

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived,0,1
0,3,male,26.0,1,18.0,C,1,0.5625,0.4375
1,3,male,26.0,0,18.0,C,1,0.625,0.375
2,3,male,26.0,3,18.0,C,1,0.5625,0.4375
3,3,male,26.0,4,18.0,C,1,0.5625,0.4375
4,3,male,26.0,2,18.0,C,1,0.5625,0.4375
5,3,male,26.0,5,18.0,C,1,0.5625,0.4375


In [33]:
# Same source row again (duplicate of an earlier display cell).
row

Pclass         3
Sex         male
Age         26.0
SibSp        NaN
Fare        18.0
Embarked       C
Survived       1
Name: 0, dtype: object

In [34]:
# Last feature name visited by the inner loop over dict_values.
col

'Embarked'