In [82]:
from dataclasses import dataclass
from typing import List, Set, Tuple

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype,is_numeric_dtype
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold

In [439]:
class DataUtils:
    """Stateless helpers for dtype checks, encoding, scaling and fold splitting."""

    @staticmethod
    def is_numeric(series:pd.Series) -> bool:
        """Return True when pandas reports a numeric dtype for ``series``."""
        return is_numeric_dtype(series)

    @staticmethod
    def is_categorical(series:pd.Series) -> bool:
        """Return True for every series that is not numeric.

        This is a plain complement of ``is_numeric``; no dedicated
        categorical-dtype check is performed.
        """
        return not DataUtils.is_numeric(series)

    @staticmethod
    def encode(series:pd.Series) -> Tuple[np.ndarray, list]:
        """Ordinal-encode ``series``.

        Returns:
            A pair ``(codes, categories)``: ``codes`` is the encoder output
            cast to ``int`` (one column, one row per value) and
            ``categories`` is the fitted encoder's ``categories_`` attribute.
        """
        encoder = preprocessing.OrdinalEncoder()
        # scikit-learn transformers expect a 2-D (n_samples, n_features) input.
        column = series.to_numpy().reshape(-1, 1)
        return encoder.fit_transform(column).astype(int), encoder.categories_

    @staticmethod
    def normalize(series:pd.Series) -> np.ndarray:
        """Rescale ``series`` with a freshly fitted ``MinMaxScaler``.

        Returns:
            The scaler output for the values reshaped to a single column.
        """
        min_max_scaler = preprocessing.MinMaxScaler()
        column = series.to_numpy().reshape(-1, 1)
        return min_max_scaler.fit_transform(column)

    @staticmethod
    def split_into_folds(df:pd.DataFrame, fold_num=5, random_state=None):
        """Split ``df`` into stratified train/test folds.

        The last column of ``df`` is used as the class label; all other
        columns are features. ``df`` itself is left untouched, so the call
        can be repeated on the same frame.

        Args:
            df: Frame whose last column holds the class labels.
            fold_num: Number of folds to generate.
            random_state: Optional seed forwarded to ``StratifiedKFold`` so
                the shuffled folds can be reproduced. ``None`` keeps the
                previous unseeded behaviour.

        Returns:
            Four lists ``(x_train, y_train, x_test, y_test)`` with one entry
            per fold. Feature entries are DataFrames without the class
            column; label entries are single-column DataFrames.
        """
        fold_generator = StratifiedKFold(
            n_splits=fold_num, shuffle=True, random_state=random_state
        )

        # Positional slicing instead of `del df[...]`: the caller's frame must
        # keep its class column.
        features = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        x_train, y_train, x_test, y_test = [], [], [], []
        for train_indices, test_indices in fold_generator.split(features, y):
            x_train.append(features.iloc[train_indices])
            y_train.append(pd.DataFrame(y.iloc[train_indices]))
            x_test.append(features.iloc[test_indices])
            y_test.append(pd.DataFrame(y.iloc[test_indices]))

        return x_train, y_train, x_test, y_test



In [467]:
# Load the dataset; split_into_folds treats its last column as the class label.
df = pd.read_csv('../data/data.csv')

# Build 4 stratified folds; each returned list holds one entry per fold.
x_train, y_train, x_test, y_test = DataUtils.split_into_folds(df,4)

In [433]:
x_train[0]

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [434]:
y_train[0]

Unnamed: 0,Class
0,Benign
1,Benign
2,Benign
3,Benign
4,Benign
...,...
694,Benign
695,Benign
696,Malignant
697,Malignant


In [458]:
class RiseClassifier:
    """Turns a labelled DataFrame into instances and one initial rule per instance.

    Attributes set by ``fit``:
        categorical_econding: dict mapping each categorical column name to the
            list of its categories (attribute name kept as-is, including the
            spelling, because it is public state).
        instances: list of ``Instance`` objects, one per training row.
        rules: set of ``InstanceRule`` objects built from ``instances``.
    """

    def __init__(self,columns):
        """Store the attribute (column) names the classifier works with."""
        self.columns = columns


    def fit(self,x_df:pd.DataFrame,y_df:pd.DataFrame):
        """Encode the features and build instances plus their initial rules.

        Args:
            x_df: Feature frame; must contain every name in ``self.columns``.
                It is copied before encoding, so the caller's frame is not
                modified.
            y_df: Labels, one row per row of ``x_df``. A single-column
                DataFrame or a Series is accepted.
        """
        df = self.__process_df(x_df.copy())
        # Select the declared columns explicitly: attribute names are paired
        # with values by position, so the order must match self.columns.
        features = df[list(self.columns)].to_numpy()
        self.instances = self.__df_to_instances(features, y_df.to_numpy())
        # Equal rules collapse because the rule classes are hashable.
        self.rules = {InstanceRule(inst) for inst in self.instances}


    def __df_to_instances(self,x_df,y_df):
        """Convert feature rows and label rows into a list of ``Instance``.

        Args:
            x_df: 2-D array of feature values, columns ordered like
                ``self.columns``.
            y_df: Array of labels; each element is either a scalar label or a
                row whose first element is the label.
        """
        instances = []
        categorical_columns = self.categorical_econding.keys()
        for x,y in zip(x_df,y_df):
            # Each attribute is described as
            # (column_name, value, is_this_attribute_categorical).
            attributes = [
                (column, value, column in categorical_columns)
                for column, value in zip(self.columns, x)
            ]
            # Rows of a 2-D label array carry the label at index 0; a 1-D
            # label array yields the label itself.
            label = y[0] if np.ndim(y) else y
            instances.append(Instance.build(label, attributes))
        return instances


    def __process_df(self,df:pd.DataFrame):
        """Ordinal-encode the categorical columns of ``df`` in place.

        The categories of every encoded column are saved in
        ``self.categorical_econding``. Numeric columns are left unchanged;
        no normalization is applied here.

        Returns:
            The same ``df`` object with categorical columns replaced by codes.
        """
        self.categorical_econding = {}
        for column in self.columns:
            if DataUtils.is_categorical(df[column]):
                encoded,categories = DataUtils.encode(df[column])
                self.categorical_econding[column] = list(categories[0])
                df[column] = encoded
        return df
            


In [None]:
class RiseUtils:
    """Namespace for helpers that evaluate a rule set."""

    @staticmethod
    def rules_precision(rules:"Set[InstanceRule]",instances:"List[Instance]"):
        """Placeholder for a precision measure of ``rules`` over ``instances``.

        The original cell declared this method without a body, which is a
        syntax error. The intended computation is not visible in this
        notebook, so the stub fails loudly instead of returning a guess.
        TODO: implement once the metric is defined.

        The annotations are strings because the referenced classes are
        defined in later cells.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("RiseUtils.rules_precision is not implemented yet")

In [459]:
@dataclass(unsafe_hash=True)
class GenericInstanceAttribute:
    """Base type for a single named attribute of an instance.

    The dataclass machinery supplies the constructor, equality and hashing,
    all based on ``attribute_name``.
    """
    attribute_name: str

@dataclass(unsafe_hash=True)
class NumericInstanceAttribute(GenericInstanceAttribute):
    """Named attribute carrying a numeric ``value``.

    Constructed as ``NumericInstanceAttribute(attribute_name, value)``.
    """
    value: float
        
@dataclass(unsafe_hash=True)
class CategoricalInstanceAttribute(GenericInstanceAttribute):
    """Named attribute carrying an integer category code in ``value``.

    Constructed as ``CategoricalInstanceAttribute(attribute_name, value)``.
    """
    value: int


@dataclass(unsafe_hash=True,eq=True)
class Instance(object):
    """A labelled example: a class label plus an ordered tuple of attributes.

    Equality and hashing are generated by ``dataclass`` from ``label`` and
    ``properties``.
    """
    label:str
    # Forward reference keeps this class definable on its own.
    properties: Tuple["GenericInstanceAttribute", ...]

    def __init__(self,label:str,properties:Tuple["GenericInstanceAttribute", ...]):
        """Store the label and the attributes.

        ``properties`` may be any iterable; it is stored as a tuple because a
        list would make the generated ``__hash__`` raise ``TypeError``.
        """
        self.label = label
        self.properties = tuple(properties)

    def __str__(self):
        # The field is called `label`; the former `self.clazz` did not exist
        # and made str(instance) raise AttributeError.
        return f"{self.properties} + [{self.label}]"

    def to_numpy(self):
        """Return the attribute values, in order, as a NumPy array."""
        return np.array([ prop.value for prop in self.properties])

    @staticmethod
    def build(label,attributes_metadata):
        """Create an ``Instance`` from ``(name, value, is_categorical)`` entries.

        Args:
            label: Class label of the instance.
            attributes_metadata: Iterable of indexable entries where index 0
                is the attribute name, index 1 its value and index 2 a flag
                that selects ``CategoricalInstanceAttribute`` when truthy and
                ``NumericInstanceAttribute`` otherwise.
        """
        attributes = []
        for attr_metadata in attributes_metadata:
            if attr_metadata[2]:
                attr = CategoricalInstanceAttribute(attr_metadata[0],attr_metadata[1])
            else:
                attr = NumericInstanceAttribute(attr_metadata[0],attr_metadata[1])
            attributes.append(attr)

        return Instance(label,tuple(attributes))
    


In [482]:
from dataclasses import dataclass
from typing import List,Set,Tuple
from scipy import spatial

@dataclass(unsafe_hash=True)
class GenericAttributeRule:
    """Base type for a rule condition on one named attribute."""
    attribute_name: str

    def __init__(self, name):
        # Hand-written constructor: its parameter is called `name`, while the
        # stored field is `attribute_name`.
        self.attribute_name = name


@dataclass(unsafe_hash=True)
class AtributeNumericalRule(GenericAttributeRule):
    """Rule condition on a numeric attribute, stored as a lower and an upper bound."""
    lower_bound: float
    upper_bound: float

    def __init__(self, name, lower_bound, upper_bound):
        super().__init__(name)
        self.lower_bound, self.upper_bound = lower_bound, upper_bound


@dataclass(unsafe_hash=True)
class AtributeCategoricalRule(GenericAttributeRule):
    """Rule condition on a categorical attribute, stored as a single integer code."""
    value: int

    def __init__(self, name, value):
        super().__init__(name)
        self.value = value


@dataclass(unsafe_hash=True)
class InstanceRule:
    """Rule derived from one instance: a tuple of attribute conditions plus a label.

    Equality and hashing are generated by ``dataclass`` from ``conclusions``
    and ``label``.
    """
    conclusions: Tuple[GenericAttributeRule]
    label:str

    def __init__(self,instance:Instance):
        """Create the rule that covers exactly ``instance``.

        Categorical attributes become a value condition; numeric attributes
        become a bound pair with both bounds set to the attribute's value.
        Attributes of any other type are left out.
        """
        self.label = instance.label

        def as_condition(attribute):
            if isinstance(attribute, CategoricalInstanceAttribute):
                return AtributeCategoricalRule(attribute.attribute_name,attribute.value)
            if isinstance(attribute, NumericInstanceAttribute):
                return AtributeNumericalRule(attribute.attribute_name,attribute.value,attribute.value)
            return None

        candidates = (as_condition(attribute) for attribute in instance.properties)
        self.conclusions = tuple(cond for cond in candidates if cond is not None)

    def get_numerical_rules(self):
        """Return the numeric conditions, in order, as a list."""
        return [cond for cond in self.conclusions if isinstance(cond,AtributeNumericalRule)]

    def get_categorical_rules(self):
        """Return the categorical conditions, in order, as a list."""
        return [cond for cond in self.conclusions if isinstance(cond,AtributeCategoricalRule)]

    def get_rule(self,name):
        """Return a list with every condition whose attribute name equals ``name``."""
        return [cond for cond in self.conclusions if cond.attribute_name == name]

    def to_numpy(self):
        """Return one number per condition as a NumPy array.

        Numeric conditions contribute their lower bound, categorical
        conditions their value.
        """
        values = []
        for cond in self.conclusions:
            if isinstance(cond,AtributeNumericalRule):
                values.append(cond.lower_bound)
            elif isinstance(cond,AtributeCategoricalRule):
                values.append(cond.value)
        return np.array(values)

    def distance(self,instance:Instance):
        """Return the cosine distance between this rule's vector and ``instance``'s."""
        rule_vector = self.to_numpy()
        instance_vector = instance.to_numpy()
        return spatial.distance.cosine(rule_vector,instance_vector)



In [483]:
# Train on the first fold only; the column names of its feature frame are
# passed in as the attribute names.
rise = RiseClassifier(x_train[0].columns)
rise.fit(x_train[0],y_train[0])

In [488]:
# Build the rule that corresponds to the first training instance.
instance = rise.instances[0]
rule = InstanceRule(instance)

# Cosine distance between that rule and the fifth training instance.
rule.distance(rise.instances[4])

0.7113248654051871

In [107]:
l1 = ['young','myope','yes','reduced']
l2 = ['young','myope','no','reduced']
def jaccard_similarity(list1, list2):
    """Return the Jaccard distance between the distinct items of two sequences.

    Despite its name, the function returns ``1 - |A ∩ B| / |A ∪ B|``: 0.0
    means identical sets and 1.0 means disjoint sets. The name is kept so
    existing calls keep working.

    Args:
        list1: Iterable of hashable items.
        list2: Iterable of hashable items.

    Returns:
        The distance as a float. Two empty inputs are treated as identical
        (0.0) instead of raising ``ZeroDivisionError``.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return 1 - len(s1 & s2) / len(union)

jaccard_similarity(l1,l2)

0.4

In [258]:
import random
import numpy as np

dat = [0,0,1,2,3,4,5,6,7,8,9,10]

def discretize(data, bins):
    """Assign each value of ``data`` to one of ``bins`` equal-frequency bins.

    The sorted values are cut into ``bins`` chunks; the last value of every
    chunk except the final one becomes a cutoff, and each value is labelled
    with the index of the first cutoff it does not exceed.

    Returns:
        ``(labels, cutoffs)`` where ``labels`` is an integer array aligned
        with ``data`` and ``cutoffs`` is the list of bin edges.
    """
    chunks = np.array_split(np.sort(data), bins)
    edges = [chunk[-1] for chunk in chunks]
    # The largest chunk's upper edge is not a cutoff: values above the last
    # remaining edge fall into the final bin.
    edges = edges[:len(edges) - 1]
    labels = np.digitize(data, edges, right=True)
    return labels, edges

# Bin the sample into 5 groups and print the labels next to the cutoffs.
discrete_dat, cutoff = discretize(dat, 5)
print( "dat: {}".format(dat))
print( "discrete_dat: {}".format(discrete_dat))
print ("cutoff: {}".format(cutoff))

# Reference output for a different input (consistent with bins=3), not for the call above:
# dat: [ 0.5  1.   1.5  2.   2.5  3.   3.5  4.   4.5  5.   5.5  6. ]
# >> discrete_dat: [0 0 0 0 1 1 1 1 2 2 2 2]
# >> cutoff: [2.0, 4.0]

dat: [0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
discrete_dat: [0 0 0 1 1 1 2 2 3 3 4 4]
cutoff: [1, 4, 6, 8]


In [391]:
from scipy import spatial


# Cosine similarity of the two vectors, computed as 1 - cosine distance.
1- spatial.distance.cosine(np.array([0.2,0.3,0.3,7]),np.array([0.2,0.3,0.4,0]))

0.06617138199854622