In [1]:
from typing import Sequence

import numpy as np
import pandas as pd
import pandas.api.types as pdt

import visions
from visions.relations import IdentityRelation, TypeRelation
from visions.typesets.typeset import get_type_from_path


class Nominal(visions.VisionsBaseType):
    """Categorical variable without an ordering.

    Accepts any series that is not stored with a pandas categorical dtype, and
    any categorical-dtype series whose categories are unordered.
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Nominal as an identity-related child of Categorical."""
        return [IdentityRelation(Categorical)]

    @classmethod
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        """Return True unless ``series`` is an ordered pandas categorical.

        Args:
            series: The series to test.
            state: Shared detection state; not used by this check.
        """
        # `pdt.is_categorical_dtype` is deprecated in recent pandas releases;
        # the isinstance check on the dtype is the documented replacement.
        if not isinstance(series.dtype, pd.CategoricalDtype):
            return True
        return not series.cat.ordered


class Categorical(visions.VisionsBaseType):
    """Root of the categorical branch of the variable typeset.

    Accepts object-dtype series and series stored with a pandas categorical
    dtype.
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Categorical as an identity-related child of Generic."""
        # This example could be extended to show how low-cardinality discrete variables would be
        # inferred to nominal / ordinal. This can be achieved with an InferenceRelation.
        return [IdentityRelation(visions.Generic)]

    @classmethod
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        """Return True for object-dtype or pandas-categorical-dtype series.

        Args:
            series: The series to test.
            state: Shared detection state; not used by this check.
        """
        # The child types in this file (Nominal / Ordinal) test for a pandas
        # categorical dtype. A categorical-dtype series is not object dtype, so
        # it has to be admitted here explicitly or those checks can never be
        # reached for it.
        return pdt.is_object_dtype(series) or isinstance(
            series.dtype, pd.CategoricalDtype
        )


class Binary(visions.VisionsBaseType):
    """Nominal variable that takes exactly two distinct values."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Binary as an identity-related child of Nominal."""
        parent_relation = IdentityRelation(Nominal)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        """Return True when ``series.nunique()`` equals 2.

        The distinct count is cached in ``state["n_distinct"]``; a falsy cached
        value is recomputed.
        """
        distinct_count = state.get("n_distinct")
        if not distinct_count:
            distinct_count = series.nunique()
        state["n_distinct"] = distinct_count
        return distinct_count == 2


class Ordinal(visions.VisionsBaseType):
    """Categorical variable whose categories carry an order.

    Accepts only series stored with a pandas categorical dtype that is marked
    as ordered.
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Ordinal as an identity-related child of Categorical."""
        return [IdentityRelation(Categorical)]

    @classmethod
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        """Return True when ``series`` is an ordered pandas categorical.

        Args:
            series: The series to test.
            state: Shared detection state; not used by this check.
        """
        # `pdt.is_categorical_dtype` is deprecated in recent pandas releases;
        # the isinstance check on the dtype is the documented replacement.
        if not isinstance(series.dtype, pd.CategoricalDtype):
            return False
        return series.cat.ordered


class Numeric(visions.VisionsBaseType):
    """Root of the numeric branch of the variable typeset."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Numeric as an identity-related child of Generic."""
        parent_relation = IdentityRelation(visions.Generic)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        """Return whether pandas reports ``series`` as a numeric dtype."""
        is_numeric = pdt.is_numeric_dtype(series)
        return is_numeric


class Continuous(visions.VisionsBaseType):
    """Numeric variable that is not stored with an integer dtype."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Continuous as an identity-related child of Numeric."""
        parent_relation = IdentityRelation(Numeric)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        """Return True when ``series`` does not have an integer dtype."""
        has_integer_dtype = pdt.is_integer_dtype(series)
        return not has_integer_dtype


class Discrete(visions.VisionsBaseType):
    """Numeric variable stored with an integer dtype."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Discrete as an identity-related child of Numeric."""
        parent_relation = IdentityRelation(Numeric)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        """Return True when ``series`` has an integer dtype."""
        has_integer_dtype = pdt.is_integer_dtype(series)
        return has_integer_dtype


class VariableTypeset(visions.VisionsTypeset):
    """Typeset of the statistical variable types defined in this notebook."""

    def __init__(self):
        """Register Generic plus the categorical and numeric variable types."""
        categorical_branch = {Categorical, Nominal, Ordinal, Binary}
        numeric_branch = {Numeric, Continuous, Discrete}
        members = {visions.Generic} | categorical_branch | numeric_branch
        super().__init__(members)


# Module-level typeset instance; the ML problem types defined further down call
# `variable_set.detect_type(series)` to find a series' statistical type.
variable_set = VariableTypeset()
# Export the typeset graph via `output_graph` to "variable_set.pdf".
variable_set.output_graph("variable_set.pdf")


class Classification(visions.VisionsBaseType):
    """ML problem type for targets whose statistical type is categorical."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Classification as an identity-related child of Generic."""
        parent_relation = IdentityRelation(visions.Generic)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series, state):
        """Return True when the detected type is Nominal, Categorical, Ordinal or Binary.

        The detected statistical type is cached in ``state["dtype"]``.
        """
        detected = state.get("dtype")
        if not detected:
            detected = variable_set.detect_type(series)
        state["dtype"] = detected
        return detected in (Nominal, Categorical, Ordinal, Binary)


class BinaryClassification(visions.VisionsBaseType):
    """Classification problem whose target is of the Binary statistical type."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare BinaryClassification as an identity-related child of Classification."""
        parent_relation = IdentityRelation(Classification)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series, state):
        """Return True when the detected statistical type equals Binary.

        The detected statistical type is cached in ``state["dtype"]``.
        """
        detected = state.get("dtype")
        if not detected:
            detected = variable_set.detect_type(series)
        state["dtype"] = detected
        return detected == Binary


class MultiClassification(visions.VisionsBaseType):
    """Classification problem whose target is not of the Binary statistical type."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare MultiClassification as an identity-related child of Classification."""
        parent_relation = IdentityRelation(Classification)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series, state):
        """Return True when the detected statistical type differs from Binary.

        The detected statistical type is cached in ``state["dtype"]``.
        """
        detected = state.get("dtype")
        if not detected:
            detected = variable_set.detect_type(series)
        state["dtype"] = detected
        return detected != Binary


class Regression(visions.VisionsBaseType):
    """ML problem type for targets whose statistical type is numeric."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare Regression as an identity-related child of Generic."""
        parent_relation = IdentityRelation(visions.Generic)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series, state):
        """Return True when the detected statistical type is Continuous or Discrete.

        The detected statistical type is cached in ``state["dtype"]``.
        """
        detected = state.get("dtype")
        if not detected:
            detected = variable_set.detect_type(series)
        state["dtype"] = detected
        return detected in (Continuous, Discrete)


class PoissonRegression(visions.VisionsBaseType):
    """Regression problem for Discrete targets whose mean roughly equals their variance."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare PoissonRegression as an identity-related child of Regression."""
        return [IdentityRelation(Regression)]

    @classmethod
    def contains_op(cls, series, state):
        """Return True when ``series`` is Discrete and mean/variance is close to 1.

        Args:
            series: The target series to test.
            state: Shared detection state. ``state["dtype"]`` caches the
                detected statistical type and ``state["mean_var_ratio"]``
                caches ``np.mean(series) / np.var(series)``.
        """
        state["dtype"] = state.get("dtype") or variable_set.detect_type(series)
        if not state["dtype"] == Discrete:
            return False

        # This is a simplified test if poisson regression applies that doesn't take into account if
        # the ratio is significant
        # The cache is read under the same key it is written to; the previous
        # lookup used "mean_var_rate", which is never set, so the ratio was
        # recomputed on every call.
        if state.get("mean_var_ratio") is None:
            state["mean_var_ratio"] = np.mean(series) / np.var(series)
        return np.isclose(state["mean_var_ratio"], 1, rtol=0.05)


class NegBinomRegression(visions.VisionsBaseType):
    """Regression problem for Discrete targets selected by their mean/variance ratio."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare NegBinomRegression as an identity-related child of Regression."""
        return [IdentityRelation(Regression)]

    @classmethod
    def contains_op(cls, series, state):
        """Return True when ``series`` is Discrete and mean/variance exceeds 1.05.

        Args:
            series: The target series to test.
            state: Shared detection state. ``state["dtype"]`` caches the
                detected statistical type and ``state["mean_var_ratio"]``
                caches ``np.mean(series) / np.var(series)``.
        """
        state["dtype"] = state.get("dtype") or variable_set.detect_type(series)
        if not state["dtype"] == Discrete:
            return False

        # Simplified test: the significance of the ratio is not taken into account.
        # The cache is read under the same key it is written to; the previous
        # lookup used "mean_var_rate", which is never set, so the ratio was
        # recomputed on every call.
        if state.get("mean_var_ratio") is None:
            state["mean_var_ratio"] = np.mean(series) / np.var(series)
        # NOTE(review): negative binomial models are normally chosen for
        # overdispersed counts (variance > mean, i.e. a ratio below 1), whereas
        # this test selects ratios above 1.05 — confirm the intended direction.
        return state["mean_var_ratio"] > 1.05


class OrdinalRegression(visions.VisionsBaseType):
    """Problem type for targets of the Ordinal statistical type."""

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Declare OrdinalRegression as an identity-related child of Classification."""
        parent_relation = IdentityRelation(Classification)
        return [parent_relation]

    @classmethod
    def contains_op(cls, series, state):
        """Return True when the detected statistical type equals Ordinal.

        The detected statistical type is cached in ``state["dtype"]``.
        """
        detected = state.get("dtype")
        if not detected:
            detected = variable_set.detect_type(series)
        state["dtype"] = detected
        return detected == Ordinal


class MLProblemTypeset(visions.VisionsTypeset):
    """Typeset of the ML problem types defined in this notebook."""

    def __init__(self):
        """Register Generic plus the classification and regression problem types."""
        classification_branch = {
            Classification,
            BinaryClassification,
            MultiClassification,
            OrdinalRegression,
        }
        regression_branch = {Regression, PoissonRegression, NegBinomRegression}
        members = {visions.Generic} | classification_branch | regression_branch
        super().__init__(members)


# Module-level typeset instance used by the example loops below via
# `problem_set.detect(series)`.
problem_set = MLProblemTypeset()
# Export the typeset graph via `output_graph` to "problem_set.pdf".
problem_set.output_graph("problem_set.pdf")


# Example: three toy target columns (three classes, two classes, small integers)
dataset = pd.DataFrame(
    dict(
        target_3=["cat", "dog", "dog", "cat", "horse"],
        target_2=["cat", "dog", "dog", "cat", "dog"],
        target_num=[1, 2, 2, 1, 2],
    )
)


# Report the detected statistical type and the suggested problem type per column.
for target in dataset.columns:
    _, problem_types, state = problem_set.detect(dataset[target])
    problem_type = get_type_from_path(problem_types)

    print(
        f"The target variable '{target}' is of the {state['dtype']} statistical type.",
        f"Our logic found that a {problem_type} model should be used.",
        sep="\n",
    )

The target variable 'target_3' is of the Nominal statistical type.
Our logic found that a MultiClassification model should be used.
The target variable 'target_2' is of the Binary statistical type.
Our logic found that a BinaryClassification model should be used.
The target variable 'target_num' is of the Discrete statistical type.
Our logic found that a NegBinomRegression model should be used.


In [2]:
# Load the Adult CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../dataset/adult/adult.csv")

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
# Report the detected statistical type and the suggested problem type per column.
for target in df.columns:
    _, problem_types, state = problem_set.detect(df[target])
    problem_type = get_type_from_path(problem_types)

    print(
        f"The target variable '{target}' is of the {state['dtype']} statistical type.",
        '-----',
        f"Our logic found that a {problem_type} model should be used.",
        '================',
        sep="\n",
    )

The target variable 'age' is of the Discrete statistical type.
-----
Our logic found that a Regression model should be used.
The target variable 'workclass' is of the Nominal statistical type.
-----
Our logic found that a MultiClassification model should be used.
The target variable 'fnlwgt' is of the Discrete statistical type.
-----
Our logic found that a Regression model should be used.
The target variable 'education' is of the Nominal statistical type.
-----
Our logic found that a MultiClassification model should be used.
The target variable 'education.num' is of the Discrete statistical type.
-----
Our logic found that a NegBinomRegression model should be used.
The target variable 'marital.status' is of the Nominal statistical type.
-----
Our logic found that a MultiClassification model should be used.
The target variable 'occupation' is of the Nominal statistical type.
-----
Our logic found that a MultiClassification model should be used.
The target variable 'relationship' is of t

In [6]:
# Dump everything `problem_set.detect` returns for each column, one item per
# section, each followed by a "=" separator line.
for target in df.columns:
    a, problem_types, state = problem_set.detect(df[target])
    problem_type = get_type_from_path(problem_types)
    for item in (target, a, problem_types, state, problem_type):
        print(item)
        print("=")
    print("--------------")
    

age
=
0        90
1        82
2        66
3        54
4        41
         ..
32556    22
32557    27
32558    40
32559    58
32560    22
Name: age, Length: 32561, dtype: int64
=
[Generic, Regression]
=
{'dtype': Discrete, 'mean_var_ratio': 0.20736612561090012}
=
Regression
=
--------------
workclass
=
0              ?
1        Private
2              ?
3        Private
4        Private
          ...   
32556    Private
32557    Private
32558    Private
32559    Private
32560    Private
Name: workclass, Length: 32561, dtype: object
=
[Generic, Classification, MultiClassification]
=
{'dtype': Nominal}
=
MultiClassification
=
--------------
fnlwgt
=
0         77053
1        132870
2        186061
3        140359
4        264663
          ...  
32556    310152
32557    257302
32558    154374
32559    151910
32560    201490
Name: fnlwgt, Length: 32561, dtype: int64
=
[Generic, Regression]
=
{'dtype': Discrete, 'mean_var_ratio': 1.7035063254174622e-05}
=
Regression
=
--------------
education