In [1]:
import polars as pl
from datetime import datetime
import pandas as pd

from feataz.utils import check_data

In [98]:
df = pl.read_csv("data/train.csv")

In [3]:
import numpy as np
isinstance(df, (np.generic, np.ndarray))

False

In [4]:
df.__len__()

891

In [27]:
from typing import Any, List, Union, Dict
import polars as pl

def find_eligible_categorical_variables(X: pl.DataFrame, n_unique: int = 20) -> List[Union[str, int]]:

    variables = [x for x in X.columns if X.schema[x] in 
                    [pl.datatypes.String,
                      pl.datatypes.Categorical,
                      pl.datatypes.Enum,
                      pl.datatypes.Utf8]]

    if len(variables) == 0:
        raise TypeError(
            "No categorical variables found in this dataframe. Please check "
            "variable format with pandas dtypes."
        )

    vars = []
    for v in variables:
        if X[v].n_unique() <= n_unique:
            vars.append(v)
    return vars

In [7]:
find_eligible_categorical_variables(df)

['Sex', 'Embarked']

In [116]:
from typing import List, Optional, Dict, Union

from pydantic import BaseModel

def is_variable_available(data, var):
    not_cols = []
    for i in var:
        if i not in data.columns:
            not_cols.append(i)

    not_cols = ",".join(["'"+x+"'" for x in not_cols])        

    if len(not_cols)>0:
        raise KeyError(f"Error: This variables {not_cols} is/are not available in your dataset")
    

class OneHot():
    """One Hot Encoder class."""

    def __init__(self, 
                 variable:  Union[None, int, str, List[Union[str, int]], Dict] = None, 
                 n_top_category: Optional[int] = None, 
                 drop_first: bool = False, 
                 keep_original: bool = False
                 ) -> None:
        """Init.

        Args:
            variable (str | list | dict): list of features to encode
            n_top_category (int): drop or keep the one hot encoded column
            drop_first
            keep_original
        """
        self.n_top_category = n_top_category
        self.drop_first = drop_first
        self.variable = variable
        self.keep_original = keep_original

    def fit(self, data: Union[pl.DataFrame, pd.DataFrame]):
        """
        """
        
        df = check_data(data)

        most_freq = {}
        if self.variable:
            if type(self.variable) == dict:
                var = list(self.variable.keys)
                for j in var:
                    n_top = self.variable[j]
                    most_freq[j] = df[j].value_counts().sort(by = 'count',
                                         descending = True)[:n_top][j].to_list()
            else:
                var = list(self.variable)

        else:
            var = find_eligible_categorical_variables(df)

        if self.n_top_category:
            for j in var:
                most_freq[j] = df[j].value_counts(
                                    ).sort(by = 'count',
                                     descending = True)[:self.n_top_category][j].to_list()
        
        is_variable_available(df, var)

        self.var = var
        self.most_freq = most_freq
        return self

    def transform(self, data):
        df = check_data(data)

        is_variable_available(df, self.var)

        if not self.most_freq:
            dum = df[self.var].to_dummies(drop_first = self.drop_first)
        else:
            dum = pl.DataFrame([])
            for i in self.var:
                items = [str(x) for x in self.most_freq[i]]
                tmp = df.select(i).with_columns(pl.when(pl.col(i).cast(str).is_in(items) == False)
                                     .then(pl.lit(None))
                                     .otherwise(pl.col(i))
                                     .alias(i)
                                     )
                dum = pl.concat([dum, tmp], how = 'horizontal')
        dum = dum.to_dummies(drop_first = self.drop_first)
        if not self.keep_original:
            result = pl.concat([df.drop(self.var),dum ], how = 'horizontal')
        else:
            result = pl.concat([df,dum ], how = 'horizontal')
            
        return result

In [117]:
df.n_unique()

891

In [119]:
%%time
ohc = OneHot(variable=['Cabin'], n_top_category=3)
fit = ohc.fit(df)
df_new = fit.transform(df)

CPU times: user 7.94 ms, sys: 1.59 ms, total: 9.53 ms
Wall time: 4.36 ms


In [97]:

d = df_new.select(pl.all().implode().list.unique()).to_dict()
d = {k: v.to_list()[0] for k, v in d.items()}
print(d)

{'PassengerId': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 2

In [80]:
fit.most_freq

{'Cabin': [None, 'G6', 'C23 C25 C27', 'B96 B98']}

In [54]:
list([1,2])

[1, 2]

In [46]:
i = 'Cabin'
df[[i]].with_columns(pl.when(pl.col(i).is_in(['C85']) == False)
                                     .then(pl.lit(None))
                                     .otherwise(pl.col(i))
                                     .alias(i)
                                     )

Cabin
str
""
"""C85"""
""
""
""
…
""
""
""
""


In [53]:
type('Cabin') == str

True

In [4]:
data

NameError: name 'data' is not defined

In [74]:
z = '1'
if z:
    print('a')

a


In [23]:
def UserForm(fruits):
    fruits: List | Dict 

    return fruits

In [24]:
UserForm(fruits='a')

'a'

In [31]:
from typing import List, Optional

from pydantic import BaseModel


class Foo(BaseModel):
    count: int
    size: Optional[float] = None


class Bar(BaseModel):
    apple: str = 'x'
    banana: str = 'y'


class Spam(BaseModel):
    foo: Foo
    bars: Bar

Spam()

ValidationError: 2 validation errors for Spam
foo
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
bars
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing

In [28]:
def 

Spam(foo=Foo(count=4, size=None), bars=[Bar(apple='x1', banana='y'), Bar(apple='x2', banana='y')])

In [9]:
import pandas as pd
df = pd.DataFrame()

In [14]:
isinstance(df, pd.DataFrame)

True