In [5]:
import os
import numpy as np
import pandas as pd


import sklearn

In [25]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.ensemble import RandomForestClassifier


class OrdinalRandomForestClassifier(ClassifierMixin, BaseEstimator):
    """Ordinal classifier built from K-1 binary random forests.

    For K ordered classes c_0 < c_1 < ... < c_{K-1}, the i-th forest is trained
    on the binary target ``y > c_i``. Class probabilities are recovered by
    differencing the estimated exceedance probabilities P(y > c_i).

    The ordering of the classes is the sorted order returned by
    ``unique_labels``; labels must therefore be encoded so that their natural
    sort order is the ordinal order (e.g. integer codes).

    Parameters
    ----------
    rf_params_kwargs : dict or None, default=None
        Keyword arguments forwarded to every ``RandomForestClassifier``.
        ``None`` means "use the RandomForestClassifier defaults".

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Sorted unique labels seen in ``fit``.
    n_classes_ : int
        Number of distinct labels.
    rfs : dict[int, RandomForestClassifier]
        Fitted binary forests, keyed by threshold index ``i`` (target ``y > classes_[i]``).
    """

    # Mixins are listed before BaseEstimator, as recommended by scikit-learn,
    # so that the mixin's estimator type / tags take precedence in the MRO.

    def __init__(self, rf_params_kwargs=None):
        # scikit-learn convention: store constructor arguments untouched.
        self.rf_params_kwargs = rf_params_kwargs

    def fit(self, X, y):
        """Fit one binary random forest per ordinal threshold.

        :param X: array-like of shape (n_samples, n_features)
        :param y: array-like of shape (n_samples,), ordinal labels
        :return: self
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # an ordered array of unique labels.
        self.classes_ = unique_labels(y)

        # number of target labels
        self.n_classes_ = len(self.classes_)

        # initialize the random forest classifiers (one per threshold)
        rf_params = self.rf_params_kwargs or {}
        self.rfs = {
            i: RandomForestClassifier(**rf_params)
            for i in range(self.n_classes_ - 1)
        }

        # create the ordinal target
        ordinal_target = self.create_ordinal_target(y)

        # fit the random forest classifiers
        for i, model in self.rfs.items():
            model.fit(X, ordinal_target[:, i])

        # Return the classifier
        return self

    # -------------------------------------------
    def create_ordinal_target(self, y):
        """Encode labels as K-1 binary "exceeds threshold" indicators.

        Column ``i`` is 1 where ``y > classes_[i]`` and 0 otherwise. The
        comparison uses the actual class labels (not the column index), so
        non-contiguous labels such as ``[1, 2, 7]`` are handled correctly.

        :param y: array-like of shape (n_samples,)
        :return: int ndarray of shape (n_samples, n_classes_ - 1)
        """
        labels = np.asarray(y).reshape(-1, 1)
        # The largest class is never a threshold: nothing can exceed it.
        thresholds = np.asarray(self.classes_)[:-1].reshape(1, -1)
        return (labels > thresholds).astype(int)

    # -------------------------------------------
    def predict_prob(self, X):
        '''
        Predict the probability of each class for each sample in X.

        With c_k = classes_[k] and the k-th forest estimating P(y > c_k):
        P(y = c_0)     = 1 - P(y > c_0)
        P(y = c_k)     = P(y > c_{k-1}) - P(y > c_k)
        P(y = c_{K-1}) = P(y > c_{K-2})

        The forests are trained independently, so the estimated exceedance
        probabilities are not guaranteed to be monotone in k; negative
        differences are clipped to 0 and each row is renormalised to sum to 1.

        :param X: array-like of shape (n_samples, n_features)
        :return: ndarray of shape (n_samples, n_classes_), columns ordered as classes_
        '''
        check_is_fitted(self)
        X = check_array(X)
        n_samples = X.shape[0]

        # exceed[:, i] = P(y > classes_[i])
        exceed = np.zeros((n_samples, self.n_classes_ - 1))
        for i, model in self.rfs.items():
            # Each binary target built in fit contains both 0 and 1, so
            # column 1 of predict_proba is the positive ("exceeds") class.
            exceed[:, i] = model.predict_proba(X)[:, 1]

        # Pad with P(y > -inf) = 1 on the left and P(y > c_{K-1}) = 0 on the
        # right so all three cases of the formula become a single difference.
        upper = np.hstack([np.ones((n_samples, 1)), exceed])
        lower = np.hstack([exceed, np.zeros((n_samples, 1))])
        prob = np.clip(upper - lower, 0.0, None)

        # Before clipping each row sums to exactly 1 (telescoping sum), and
        # clipping can only increase it, so the row sum is always positive.
        prob /= prob.sum(axis=1, keepdims=True)
        return prob

    def predict_proba(self, X):
        """scikit-learn-conventional alias of :meth:`predict_prob`."""
        return self.predict_prob(X)

    def predict(self, X):
        """Return, for each sample, the class label with the highest probability.

        :param X: array-like of shape (n_samples, n_features)
        :return: ndarray of shape (n_samples,) with entries from classes_
        """
        best = np.argmax(self.predict_prob(X), axis=1)
        return self.classes_[best]

    # -------------------------------------------
    @property
    def feature_importance_(self):
        """Feature importances of each binary forest, keyed by threshold index."""
        check_is_fitted(self)
        return {i: model.feature_importances_ for i, model in self.rfs.items()}

In [14]:
# Toy target with non-contiguous integer labels, used to inspect what
# unique_labels returns for it.
y = np.tile([2, 1, 7], 10)
unique_labels(y)

array([1, 2, 7])

In [13]:
# Unique labels of the toy target, explicitly sorted in ascending order.
cl = np.sort(unique_labels(y))

In [8]:
# Display the target series. The recorded output below shows it as the ordered
# categorical column 'apply' with levels unlikely < somewhat likely < very likely.
data_Y

0          very likely
1      somewhat likely
2             unlikely
3      somewhat likely
4      somewhat likely
            ...       
395           unlikely
396           unlikely
397    somewhat likely
398    somewhat likely
399        very likely
Name: apply, Length: 400, dtype: category
Categories (3, object): ['unlikely' < 'somewhat likely' < 'very likely']

In [26]:
# NOTE: this cell relies on data_X / data_Y created by the data-loading cell
# (the one calling pd.read_stata); run that cell first on a fresh kernel.
# A fixed random_state makes the fitted forests reproducible across runs.
model = OrdinalRandomForestClassifier(
    {'n_estimators': 100, 'max_depth': 2, 'random_state': 0}
)
model.fit(data_X, data_Y)

# Inspect the binary "label exceeds threshold" targets the forests were trained on.
model.create_ordinal_target(data_Y)

array([[1, 1],
       [1, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [1, 0],
       [1, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 1],
       [0, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [0,

In [24]:
# Load the UCLA "ologit" example data set (Stata format) straight from the web.
url = "https://stats.idre.ucla.edu/stat/data/ologit.dta"
data_student = pd.read_stata(url)

# Predictors and the ordinal response.
data_X = data_student[['pared', 'public', 'gpa']]
data_Y = data_student['apply']

# Encode the ordered string levels as integers whose numeric order matches
# the ordinal order, then cast to a plain int dtype.
data_Y = (
    data_Y
    .map({'unlikely': 0, 'somewhat likely': 1, 'very likely': 2})
    .astype(int)
)

# One-hot view of the encoded target. dtype must be given a value (the bare
# `dtype=` was a SyntaxError); bool matches the True/False output recorded below.
pd.get_dummies(data_Y, dtype=bool)

Unnamed: 0,0,1,2
0,False,False,True
1,False,True,False
2,True,False,False
3,False,True,False
4,False,True,False
...,...,...,...
395,True,False,False
396,True,False,False
397,False,True,False
398,False,True,False


In [11]:
# Baseline for comparison: an ordered logit model from statsmodels fitted on
# the same response (data_Y) and predictors (data_X).
from statsmodels.miscmodels.ordinal_model import OrderedModel
mod_log = OrderedModel(data_Y, data_X, distr='logit')

# Fit with the BFGS optimiser; disp=False silences the optimiser's console output.
res_log = mod_log.fit(method='bfgs', disp=False)
# NOTE(review): the threshold rows in the summary (e.g. 'somewhat likely/very likely')
# may be reported in statsmodels' transformed parameterisation rather than as raw
# cut points — confirm against the OrderedModel docs before interpreting them.
res_log.summary()

0,1,2,3
Dep. Variable:,apply,Log-Likelihood:,-358.51
Model:,OrderedModel,AIC:,727.0
Method:,Maximum Likelihood,BIC:,747.0
Date:,"Thu, 09 May 2024",,
Time:,22:44:05,,
No. Observations:,400,,
Df Residuals:,395,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pared,1.0476,0.266,3.942,0.000,0.527,1.569
public,-0.0586,0.298,-0.197,0.844,-0.642,0.525
gpa,0.6158,0.261,2.363,0.018,0.105,1.127
unlikely/somewhat likely,2.2035,0.780,2.827,0.005,0.676,3.731
somewhat likely/very likely,0.7398,0.080,9.236,0.000,0.583,0.897


In [15]:
# Direct integer cast of the target. The ValueError recorded below
# ("Cannot cast object dtype to int32") presumably comes from running this while
# data_Y still held the string-labelled categorical, i.e. before the explicit
# label -> integer mapping in the data-loading cell — TODO confirm and drop this cell.
data_Y.astype(int)

ValueError: Cannot cast object dtype to int32