In [None]:
!pip3 install ../../random-forest-mc/

In [12]:
from random_forest_mc.model import RandomForestMC, dsRow
from random_forest_mc.utils import LoadDicts
import pandas as pd
import numpy as np

from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

from numbers import Number
import logging as log

# Load dataset

In [13]:
# Look up the metadata entry describing the Titanic test dataset.
dicts = LoadDicts("../../random-forest-mc/tests/")
dataset_dict = dicts.datasets_metadata
ds_name = "titanic"
params = dataset_dict[ds_name]
ds_cols = params["ds_cols"]
target_col = params["target_col"]

# Keep only the feature columns plus the target, drop incomplete rows, then
# fix the dtypes in a single pass.
dataset = (
    pd.read_csv(params["csv_path"])[ds_cols + [target_col]]
    .dropna()
    .reset_index(drop=True)
    .astype(
        {
            "Age": np.uint8,
            "SibSp": np.uint8,
            "Pclass": str,
            "Fare": np.uint32,
        }
    )
)

# Load current model

In [14]:
# Fit a forest on the full dataset; only the target column is configured explicitly.
cls = RandomForestMC(target_col=target_col)
cls.fit(dataset)
# Exercise the prediction entry points: a single row, a frame, and class probabilities.
row = dataset.reset_index(drop=True).loc[0]
predict_row = cls.predict(row)
predict_ds = cls.predict(dataset.sample(n=10))
# Drawn independently of the sample above, so the two results are not row-aligned.
predict_probs_ds = cls.predict_proba(dataset.sample(n=10))

Planting the forest: 100%|██████████| 16/16 [00:06<00:00,  2.65it/s]


In [15]:
predict_ds

['1', '0', '1', '1', '1', '0', '0', '0', '0', '0']

In [16]:
cls

RandomForestMC(len(Forest)=16,n_trees=16,model_version=1.0.4-dev,module_version=1.0.4-dev)

# Generate some missing data

In [17]:
# Fixed seed so the masked sample is identical on every Restart & Run All.
MISSING_SEED = 42
rng_missing = np.random.default_rng(MISSING_SEED)

# Take 20% of the complete dataset as ground truth.
df_tmp = dataset.sample(frac=0.2, random_state=MISSING_SEED).reset_index(drop=True)
# True = keep the value, False = blank it out; each feature cell is kept with probability 0.7.
mask_random = rng_missing.choice([True, False], size=df_tmp[ds_cols].shape, p=[0.7, 0.3])
# `DataFrame.mask` replaces entries where the condition is True, hence the negation.
dataset_missing_values = df_tmp[ds_cols].mask(~mask_random)
# The target column is never masked.
dataset_missing_values[target_col] = df_tmp[target_col]
dataset_missing_values_ground_truth = df_tmp

In [18]:
dataset_missing_values

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,,female,23.0,3.0,,,1
1,,male,21.0,0.0,,,0
2,3,,23.0,,,S,0
3,3,female,27.0,0.0,,,1
4,,male,28.0,0.0,7.0,,0
...,...,...,...,...,...,...,...
137,2,,,1.0,30.0,,1
138,,female,8.0,0.0,26.0,,1
139,,male,34.0,1.0,,,0
140,1,male,,0.0,,S,0


In [19]:
# Candidate values per feature: every distinct value observed in the complete dataset.
dict_values = {col: dataset[col].unique().tolist() for col in ds_cols}

In [20]:
dict_values.keys()

dict_keys(['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'])

# Load the new extension

In [21]:

# Custom exception when missing values not found
class MissingValuesNotFound(Exception):
    """Exception raised when the given dataset or row has no missing values.

    Attributes:
        message -- explanation of the error
    """

    def __init__(
        self,
        message="Dataset or row without missing values! Please, give a dataset or row with missing values using 'NaN'.",
    ):
        # Store the text on the instance so the documented ``message``
        # attribute really exists for callers that inspect it.
        self.message = message
        super().__init__(message)

In [36]:
# How to format a dict with values to fill the missing ones:
# each feature name maps to the LIST of candidate values to try for that feature,
# e.g. {"Sex": ["male", "female"], "SibSp": [0, 1, 2]}.
featName = str
featValue = Union[str, Number]
dictValues = Dict[featName, List[featValue]]


class RandomForestMC_Ext(RandomForestMC):
    """RandomForestMC extension that scores candidate fillings for missing values.

    Each missing (``NaN``) feature of a row is replaced, one feature at a
    time, by every candidate value supplied for it, and each filled copy is
    scored with ``predict_proba``.
    """

    @staticmethod
    def _fill_row_missing(row: dsRow, dict_values: dictValues) -> Optional[pd.DataFrame]:
        """Expand ``row`` into one filled copy per (missing feature, candidate value).

        Only one feature is filled in each copy; any other missing feature of
        the row stays ``NaN`` in that copy.

        Args:
            row: a single data row, indexable by feature name.
            dict_values: maps a feature name to the candidate values to try
                for that feature.

        Returns:
            A DataFrame with one filled copy per line and a fresh 0..n-1
            index, or ``None`` when none of the features listed in
            ``dict_values`` is missing in ``row`` (a warning is logged).
        """
        filled_rows = []
        for col, vals in dict_values.items():
            if not pd.isna(row[col]):
                continue
            for val in vals:
                candidate = row.copy()
                candidate[col] = val
                filled_rows.append(candidate)
        if not filled_rows:
            log.warning('Filling rows process: found row without missing data!')
            return None
        return pd.concat(filled_rows, axis=1).transpose().reset_index(drop=True)

    def predictMissingValues(
        self,
        row_or_matrix: Union[dsRow, pd.DataFrame],
        dict_values: dictValues,
        use_all_Tress: bool = True,
    ) -> pd.DataFrame:
        """Score every candidate filling of the missing values in the input.

        Args:
            row_or_matrix: a single row or a DataFrame whose missing values
                are marked with ``NaN``.
            dict_values: maps a feature name to the candidate values to try
                for that feature.
            use_all_Tress: currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            One DataFrame with, for each input row in order, the original row
            (its probability columns are ``NaN``) followed by its filled
            candidates together with the columns built from ``predict_proba``
            (presumably one column per class -- confirm against the base
            class). The ``row_id`` column holds the 0-based position of the
            input row each line belongs to.

        Raises:
            MissingValuesNotFound: no input row has a missing value among the
                features of ``dict_values``.
            TypeError: ``row_or_matrix`` is neither a row nor a DataFrame.
        """
        used_features = set()
        for Tree in self:
            used_features |= set(Tree.used_features)
        not_have_feats = set(dict_values.keys()) - used_features
        if not_have_feats:
            # Sorted so the warning text is deterministic; str() so that
            # non-string feature names cannot break the join.
            _tmp = ", ".join(sorted(map(str, not_have_feats)))
            log.warning(f'The Forest model have not the following feature(s): [{_tmp}].')

        # Normalize both accepted inputs to a frame with a 0..n-1 index.
        if isinstance(row_or_matrix, dsRow):
            rows = pd.DataFrame(row_or_matrix).transpose().reset_index(drop=True)
        elif isinstance(row_or_matrix, pd.DataFrame):
            rows = row_or_matrix.reset_index(drop=True)
        else:
            raise TypeError(
                f'row_or_matrix must be a row or a pandas DataFrame, got {type(row_or_matrix).__name__}.'
            )

        # Record which input row produced each candidate. Matching candidates
        # back by comparing known values would also pick up candidates that
        # were generated from other, similar rows, and cannot work at all for
        # a row whose features are all missing.
        candidate_frames = []
        owner_ids = []
        for row_id, row in rows.iterrows():
            filled = self._fill_row_missing(row, dict_values)
            if filled is not None:
                candidate_frames.append(filled)
                owner_ids.extend([row_id] * len(filled))
        if not candidate_frames:
            raise MissingValuesNotFound
        df_data_miss = pd.concat(candidate_frames).reset_index(drop=True)
        owner = pd.Series(owner_ids, index=df_data_miss.index)

        df_predict = pd.DataFrame.from_dict(self.predict_proba(df_data_miss))
        df_predict = pd.concat([df_data_miss, df_predict], axis=1)

        out = []
        for row_id, row in rows.iterrows():
            df_tmp = df_predict.loc[owner == row_id]
            # The original row goes first; it has no probability columns, so
            # those are NaN on that line.
            df_tmp = pd.concat([pd.DataFrame(row).transpose(), df_tmp]).drop_duplicates().reset_index(drop=True)
            df_tmp['row_id'] = row_id
            out.append(df_tmp)

        return pd.concat(out).reset_index(drop=True)

In [37]:
# Train the extended forest on the same data and target as the baseline model.
cls_ext = RandomForestMC_Ext(target_col=target_col)
cls_ext.fit(dataset)

Planting the forest: 100%|██████████| 16/16 [00:06<00:00,  2.45it/s]


In [38]:
cls_ext

RandomForestMC_Ext(len(Forest)=16,n_trees=16,model_version=1.0.4-dev,module_version=1.0.4-dev)

In [39]:
cls_ext.__class__.__name__

'RandomForestMC_Ext'

In [40]:
dataset_missing_values.loc[2]

Pclass         3
Sex          NaN
Age         23.0
SibSp        NaN
Fare         NaN
Embarked       S
Survived       0
Name: 2, dtype: object

In [41]:
# Single-row call: row 2 of the masked sample is passed on its own.
df_pred_missings = cls_ext.predictMissingValues(dataset_missing_values.loc[2], dict_values)

In [42]:
df_pred_missings

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived,0,1,row_id
0,3,,23.0,,,S,0,,,0
1,3,male,23.0,,,S,0,0.9375,0.0625,0
2,3,female,23.0,,,S,0,0.5625,0.4375,0
3,3,,23.0,1,,S,0,0.3750,0.6250,0
4,3,,23.0,0,,S,0,0.5000,0.5000,0
...,...,...,...,...,...,...,...,...,...,...
92,3,,23.0,,133,S,0,0.5000,0.5000,0
93,3,,23.0,,25,S,0,0.5000,0.5000,0
94,3,,23.0,,37,S,0,0.5000,0.5000,0
95,3,,23.0,,50,S,0,0.5000,0.5000,0


In [43]:
# DataFrame call on two randomly drawn rows; `row_id` in the result tells the input rows apart.
df_pred_missings = cls_ext.predictMissingValues(dataset_missing_values.sample(n=2), dict_values)
df_pred_missings

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived,0,1,row_id
0,3,female,2.0,,12.0,S,1,,,0
1,3,female,2.0,1,12.0,S,1,0.3125,0.6875,0
2,3,female,2.0,0,12.0,S,1,0.3750,0.6250,0
3,3,female,2.0,3,12.0,S,1,0.3125,0.6875,0
4,3,female,2.0,4,12.0,S,1,0.3125,0.6875,0
...,...,...,...,...,...,...,...,...,...,...
94,3,female,27.0,0.0,50,,1,0.3125,0.6875,1
95,3,female,27.0,0.0,5,,1,0.3750,0.6250,1
96,3,female,27.0,0.0,,S,1,0.3750,0.6250,1
97,3,female,27.0,0.0,,C,1,0.3125,0.6875,1


In [44]:
dataset_missing_values

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,,female,23.0,3.0,,,1
1,,male,21.0,0.0,,,0
2,3,,23.0,,,S,0
3,3,female,27.0,0.0,,,1
4,,male,28.0,0.0,7.0,,0
...,...,...,...,...,...,...,...
137,2,,,1.0,30.0,,1
138,,female,8.0,0.0,26.0,,1
139,,male,34.0,1.0,,,0
140,1,male,,0.0,,S,0


In [45]:
dataset_missing_values.sample(n=20)

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
96,,male,17.0,0.0,7.0,,0
119,1.0,female,39.0,1.0,,,1
113,1.0,male,,1.0,53.0,,0
18,2.0,male,52.0,0.0,13.0,,0
46,3.0,,51.0,,7.0,S,0
67,1.0,,38.0,,227.0,C,1
95,1.0,male,38.0,,,S,1
20,1.0,female,35.0,0.0,,C,1
9,3.0,,32.0,0.0,,S,1
68,3.0,female,1.0,1.0,11.0,S,1


In [46]:
# Same call on a larger random draw (20 rows); this is a fresh sample, not the one displayed above.
df_pred_missings = cls_ext.predictMissingValues(dataset_missing_values.sample(n=20), dict_values)
df_pred_missings



Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived,0,1,row_id
0,3,,21.0,0.0,,S,1,,,0
1,3,male,21.0,0.0,,S,1,0.9375,0.0625,0
2,3,female,21.0,0.0,,S,1,0.3750,0.6250,0
3,3,,21.0,0.0,7,S,1,0.5000,0.5000,0
4,3,,21.0,0.0,71,S,1,0.5625,0.4375,0
...,...,...,...,...,...,...,...,...,...,...
925,3,male,26.0,1.0,14.0,,0,0.4375,0.5625,19
926,3,female,26.0,1.0,14.0,,0,0.1875,0.8125,19
927,3,,26.0,1.0,14.0,S,0,0.5000,0.5000,19
928,3,,26.0,1.0,14.0,C,0,0.1875,0.8125,19


In [47]:
# `dataset` had its incomplete rows dropped when it was loaded, so there is nothing to
# fill and the call is expected to raise. The exception is caught and shown so that
# Restart & Run All does not stop at this cell.
try:
    cls_ext.predictMissingValues(dataset.sample(n=20), dict_values)
except MissingValuesNotFound as err:
    print(f"{type(err).__name__}: {err}")
else:
    raise AssertionError("expected MissingValuesNotFound for a dataset without NaN")



MissingValuesNotFound: Dataset or row without missing values! Please, give a dataset or row with missing values using 'NaN'.

In [48]:
# Same check for a single complete row: the call is expected to raise, and the
# exception is caught and shown so the notebook keeps running.
try:
    cls_ext.predictMissingValues(dataset.loc[0], dict_values)
except MissingValuesNotFound as err:
    print(f"{type(err).__name__}: {err}")
else:
    raise AssertionError("expected MissingValuesNotFound for a row without NaN")



MissingValuesNotFound: Dataset or row without missing values! Please, give a dataset or row with missing values using 'NaN'.