# Extracting binary features from confusion matrices

A binary feature for n items is coded as an array of n binary values. This script finds the k best features from all x<sup>n-1</sup> possible features to describe all the items by considering mutual entropies between the input feature and output, and the mutual information between the input features.

In [1]:
import numpy as np
import pandas as pd
from itertools import combinations, combinations_with_replacement
from tqdm.notebook import tqdm

In [2]:
def mutual_information(matrix):
    """Calculating mutual information from tabulated data"""
    total = matrix.sum()
    flatM = matrix.ravel()
    margin0 = matrix.sum(axis=0)
    margin1 = matrix.sum(axis=1)
    divisor = np.outer(margin1,margin0).ravel()
    indices = flatM.nonzero()
    return np.sum(flatM[indices]/total*np.log2(flatM[indices]*total/divisor[indices]))
    
def mutual_x_y(matrix,feature, output_grouping = True): 
    """Tabulating data based on a feature and the outcome from the full confusion matrix. 
    If output_grouping is True, the outcome is also grouped based on the feature"""
    if output_grouping:
        indices0 = []
        indices1 = []
        for i,v in enumerate(feature):
            if v:
                indices1.append(i)
            else:
                indices0.append(i)

        matrix_xy = np.array([[matrix[np.ix_(indices0,indices0)].sum(),
                          matrix[np.ix_(indices1,indices0)].sum()],
                         [matrix[np.ix_(indices0,indices1)].sum(),
                          matrix[np.ix_(indices1,indices1)].sum()]])
    else:
        a0 = matrix[:,feature].sum(axis=1)
        a1 = matrix[:,np.invert(feature)].sum(axis=1)
        matrix_xy = np.vstack([a0,a1]).transpose()
        
    return mutual_information(matrix_xy)
    

def mutual_xi_xj(counts, indices_i1, indices_j1):
    """Tabulating the input based on two features"""
    indices_i0 = np.invert(indices_i1)
    indices_j0 = np.invert(indices_j1)

    matrix_xx = np.array([[counts[indices_i0&indices_j0].sum(),
                      counts[indices_i1&indices_j0].sum()],
                     [counts[indices_i0&indices_j1].sum(),
                      counts[indices_i1&indices_j1].sum()]])
    return mutual_information(matrix_xx)


def mutual_xi_xj_Y(cm, indices_i1, indices_j1): 
    """Calculating conditional mutual information"""
    mi = 0
    for r in range(cm.ny):
        mi+= cm.py[r]*mutual_xi_xj(cm.matrix[r,], indices_i1, indices_j1)
    return mi
      

    
    
def mRMR(cm,k):
    """To maximize MI_xy - (1/k) * MI_xx (Peng et al., 2005)"""
    indices = cm.order
    selected = np.array([],dtype='int')
    
    for i in tqdm(range(k)):
        best = 0
        j_max = 0
        for j in range(len(indices)):
            f = indices[j]
            I_xx = 0
            for s in selected:
                if f < s:       
                    a = f
                    b = s
                else:
                    a = s
                    b = f
                if not (a,b) in cm.MI_xx:
                    cm.MI_xx[(a,b)] = mutual_xi_xj(cm.x_counts,cm.features[a],cm.features[b])                  
                I_xx += cm.MI_xx[(a,b)]
            if I_xx:
                I_xx = I_xx/len(selected)
            j_x = cm.MI_xy[f] - I_xx
            if j_x > j_max:
                j_max = j_x
                best = j
        selected = np.append(selected, indices[best])
        indices=np.delete(indices,best)
    return selected

def ran(cm,k):
    """customized J(x) with MI_xx = 1"""
    indices = cm.order
    selected = np.array([],dtype='int')
    
    for i in tqdm(range(k)):
        best = 0
        j_max = 0
        for j in range(len(indices)):
            f = indices[j]
            I_xx = 0
            for s in selected:
                if f < s:       
                    a = f
                    b = s
                else:
                    a = s
                    b = f
                if not (a,b) in cm.MI_xx:
                    cm.MI_xx[(a,b)] = mutual_xi_xj(cm.x_counts,cm.features[a],cm.features[b])                  
                I_xx += cm.MI_xx[(a,b)]
            j_x = cm.MI_xy[f] - I_xx
            if j_x > j_max:
                j_max = j_x
                best = j
        selected = np.append(selected, indices[best])
        indices=np.delete(indices,best)
    return selected
    
def DCSF(cm,k):
    """(Gao et al., 2018)"""
    indices = cm.order
    selected = np.array([],dtype='int')

    for i in tqdm(range(k)):
        best = 0
        j_max = 0
        for j in range(len(indices)):
            f = indices[j]
            I_xx = 0
            I_xx_Y = 0
            for s in selected:
                if f < s:       
                    a = f
                    b = s
                else:
                    a = s
                    b = f
                if not (a,b) in cm.MI_xx:
                    cm.MI_xx[(a,b)] = mutual_xi_xj(cm.x_counts,cm.features[a],cm.features[b]) 
                if not (a,b) in cm.MI_xx_Y:
                    cm.MI_xx_Y[(a,b)] = mutual_xi_xj_Y(cm,cm.features[a],cm.features[b]) 
                I_xx += cm.MI_xx[a,b]
                I_xx_Y += cm.MI_xx_Y[a,b]
            if I_xx:
                I_xx = I_xx/len(selected)
                I_xx_Y = I_xx_Y/len(selected)
            j_x = cm.MI_xy[f] - 3 * I_xx + 2 * I_xx_Y
            if j_x > j_max:
                j_max = j_x
                best = j
        selected = np.append(selected, indices[best])
        indices=np.delete(indices,best)
    return selected

def JMI(cm,k):
    """(Yang and Moody, 2000)"""
    indices = cm.order
    selected = np.array([],dtype='int')

    for i in tqdm(range(k)):
        best = 0
        j_max = 0
        for j in range(len(indices)):
            f = indices[j]
            I_xx = 0
            I_xx_Y = 0
            for s in selected:
                if f < s:       
                    a = f
                    b = s
                else:
                    a = s
                    b = f
                if not (a,b) in cm.MI_xx:
                    cm.MI_xx[(a,b)] = mutual_xi_xj(cm.x_counts,cm.features[a],cm.features[b]) 
                if not (a,b) in cm.MI_xx_Y:
                    cm.MI_xx_Y[(a,b)] = mutual_xi_xj_Y(cm,cm.features[a],cm.features[b]) 
                I_xx += cm.MI_xx[a,b]
                I_xx_Y += cm.MI_xx_Y[a,b]
            if I_xx:
                I_xx = I_xx/len(selected)
                I_xx_Y = I_xx_Y/len(selected)
            j_x = cm.MI_xy[f] - I_xx + I_xx_Y
            if j_x > j_max:
                j_max = j_x
                best = j
        selected = np.append(selected, indices[best])
        indices=np.delete(indices,best)
    
    return selected
    
class confusion_matrix:
    def __init__(self, file_path):
        self.df = pd.read_csv(file_path, index_col=0)
        self.df.reindex(sorted(self.df.index), axis=0)
        self.df.reindex(sorted(self.df.columns), axis=1)
        
        self.matrix = self.df.to_numpy()
        self.x_counts = self.matrix.sum(axis=0)
        self.n = self.matrix.shape[0]
        self.ny = self.matrix.shape[1]
        self.n_features = 2**(self.n-1)
        self.py = self.matrix.sum(axis=1)/self.matrix.sum()
        self.results = dict()
        
        self.MI_xy = np.array([])
        self.MI_xx = dict()
        # Uppercase Y means Y is the conditioning variable
        self.MI_xx_Y = dict()


    def select(self, feature_lim = None, k = 8, method = "DCSF", output_grouping = False):
        
        
        self.features = np.array([[j=='1' 
                                   for j in '1'+np.binary_repr(i, width=self.n-1)] 
                                   for i in range(self.n_features)])
        
        if self.MI_xy.size==0:     
            print('Computing I(x;y)')

            for i in tqdm(range(self.n_features)):
                self.MI_xy= np.append(self.MI_xy,mutual_x_y(self.matrix,self.features[i],output_grouping))

        self.order = np.arange(self.n_features, dtype='int')  
        
        try:
            exec(method)
            print('Selecting features with ' + method)
        except NameError:
            print('Selection methods ' + method + ' is not defined')        
        
        exec('self.results[method] = {}(self,{})'.format(method,k))
        self.results[method] = self.results[method][np.argsort(-self.MI_xy[self.results[method]])]
        
    def show_selected(self, method):
        results = self.results[method]
        print('Printing {} selected features:'.format(len(results)))
        print('  | '+(' | ').join(production.df.index)+ ' | MI_xy')
        not_diff = []
        
        for i in range(len(results)):
            j = results[i]
            f = self.features[j]
            if not i:
                fs = f
            else:
                fs = np.vstack([fs, f])
                
            print('{:<2}'.format(i+1)+'| '+
                  ' | '.join(('x'+np.binary_repr(j, width=self.n-1)).replace('1','x').replace('0',' '))+ 
                  ' | {:.4f}'.format(self.MI_xy[j]))
            
        for i in range(fs.shape[1]):
            for j in range(i+1,fs.shape[1]):
                if (fs[:,i]==fs[:,j]).all():
                    not_diff.append([self.df.index[i],self.df.index[j]])
        print('\n'+'-'*40)
        print('\nPairs of items that cannot be differentiated with the selected features:')
        if not i:
            print('None')
        else:
            for i in not_diff:
                print(i)

## selection with original outcomes

In [3]:
production = confusion_matrix('cm_production.csv')
perception = confusion_matrix('cm_perception.csv')

In [4]:
production.select(method = 'DCSF')
production.select(method = 'mRMR')
production.select(method = 'JMI')
production.select(method = 'ran')

Computing I(x;y)


  0%|          | 0/32768 [00:00<?, ?it/s]

Selecting features with DCSF


  0%|          | 0/8 [00:00<?, ?it/s]

Selecting features with mRMR


  0%|          | 0/8 [00:00<?, ?it/s]

Selecting features with JMI


  0%|          | 0/8 [00:00<?, ?it/s]

Selecting features with ran


  0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
production.show_selected('DCSF')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x |   | x | x | x | x | x | x |   |   | x |   |   |   |   | 0.5572
2 | x | x | x |   | x |   | x | x |   |   |   |   | x | x | x |   | 0.4554
3 | x |   |   | x |   | x | x | x | x | x | x | x |   |   |   |   | 0.4531
4 | x | x | x |   | x | x |   |   |   | x | x | x |   |   | x |   | 0.3764
5 | x | x |   |   | x |   | x | x |   | x | x |   |   |   |   | x | 0.3730
6 | x | x |   | x | x |   |   |   | x | x | x |   | x | x |   |   | 0.3564
7 | x | x |   |   | x | x |   |   |   |   |   | x | x | x |   | x | 0.3510
8 | x | x | x | x | x |   |   |   | x |   |   |   |   |   | x | x | 0.3256

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:
['d', 'g']
['ð', 'ʒ']
['f', 'p']
['k', 't']
['m', 'n']
['s', 'ʃ']
['v', 'z']


In [6]:
production.show_selected('mRMR')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x |   | x | x | x | x | x | x |   |   | x |   |   |   |   | 0.5572
2 | x | x |   | x | x | x | x | x | x |   |   | x | x |   |   |   | 0.5523
3 | x | x |   |   | x | x | x | x | x | x | x | x |   |   |   |   | 0.4758
4 | x | x |   | x | x | x |   |   | x | x | x | x |   |   |   | x | 0.4702
5 | x |   |   | x |   | x |   |   | x | x | x | x | x |   |   | x | 0.4594
6 | x |   | x | x | x | x |   |   | x |   |   |   | x | x | x |   | 0.4303
7 | x | x | x |   | x |   |   |   |   |   |   |   | x | x |   |   | 0.4155
8 | x |   |   | x |   |   | x | x | x | x | x |   | x | x |   |   | 0.4030

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:
['m', 'n']
['s', 'ʃ']


In [7]:
production.show_selected('JMI')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x |   | x | x | x | x | x | x |   |   | x |   |   |   |   | 0.5572
2 | x | x |   |   | x | x | x | x | x |   |   | x |   |   | x |   | 0.5115
3 | x | x | x | x | x | x |   |   | x |   |   | x | x |   |   |   | 0.4618
4 | x |   |   | x |   | x | x | x | x |   |   | x | x | x |   |   | 0.4596
5 | x | x |   | x | x | x |   |   | x |   |   | x | x | x |   |   | 0.4552
6 | x | x | x |   | x |   | x | x |   |   |   |   | x | x |   |   | 0.4546
7 | x |   |   | x |   | x |   |   | x | x | x | x |   |   |   | x | 0.4489
8 | x |   | x | x |   | x |   |   | x |   |   | x | x | x | x |   | 0.3787

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:
['d', 'g']
['k', 'p']
['k', 't']
['m', 'n']
['p', 't']
['s', 'ʃ']
['s', 'θ']
['ʃ', 'θ']


In [8]:
production.show_selected('ran')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x |   | x | x | x | x | x | x |   |   | x |   |   |   |   | 0.5572
2 | x |   |   | x |   | x |   |   | x | x | x | x | x |   |   | x | 0.4594
3 | x |   | x | x | x | x |   |   | x |   |   |   | x | x | x |   | 0.4303
4 | x | x |   |   | x | x |   |   |   | x | x |   |   |   | x |   | 0.3868
5 | x |   |   | x |   |   | x | x | x | x | x |   |   | x | x |   | 0.3660
6 | x | x |   | x |   |   | x |   |   |   |   |   | x |   | x | x | 0.3100
7 | x | x |   | x | x |   |   | x |   | x |   | x | x | x |   |   | 0.2809
8 | x |   |   |   |   | x | x | x |   |   | x | x | x | x | x |   | 0.2791

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:


In [9]:
perception.select(method='DCSF')
perception.select(method='mRMR')
perception.select(method='JMI')
perception.select(method='ran')

Computing I(x;y)


  0%|          | 0/32768 [00:00<?, ?it/s]

Selecting features with DCSF


  0%|          | 0/8 [00:00<?, ?it/s]

Selecting features with mRMR


  0%|          | 0/8 [00:00<?, ?it/s]

Selecting features with JMI


  0%|          | 0/8 [00:00<?, ?it/s]

Selecting features with ran


  0%|          | 0/8 [00:00<?, ?it/s]

In [10]:
perception.show_selected('DCSF')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x | x |   | x |   | x | x |   |   |   |   | x | x | x |   | 0.8111
2 | x | x | x | x | x |   |   |   |   |   | x |   | x | x | x | x | 0.5262
3 | x | x |   |   | x |   | x | x |   | x |   |   | x | x | x |   | 0.5208
4 | x |   |   | x |   | x | x | x | x |   |   | x | x |   |   | x | 0.4953
5 | x | x | x | x | x | x |   |   | x |   |   | x | x | x | x | x | 0.4666
6 | x |   | x | x |   |   | x | x |   |   |   |   | x |   |   | x | 0.4499
7 | x |   | x |   |   | x |   |   | x | x |   | x | x |   |   |   | 0.3906
8 | x |   |   |   |   |   |   |   |   | x | x |   | x |   |   |   | 0.3635

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:
['b', 'v']
['d', 'g']
['d', 'z']
['d', 'ʒ']
['f', 'θ']
['g', 'z']
['g', 'ʒ']
['k', 'p']
['k', 't']
['m', 'n']
['p', 't']
['z', 'ʒ']


In [11]:
perception.show_selected('mRMR')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x | x |   | x |   | x | x |   |   |   |   | x | x | x |   | 0.8111
2 | x |   |   | x |   | x |   |   | x | x | x | x |   |   |   | x | 0.7003
3 | x | x | x |   | x |   |   |   |   | x | x |   | x | x | x |   | 0.6455
4 | x | x | x | x | x |   |   |   |   |   |   |   | x | x | x | x | 0.5914
5 | x |   |   | x |   | x |   |   | x | x |   | x | x |   |   | x | 0.5691
6 | x | x | x | x | x |   |   |   |   | x | x |   | x | x | x | x | 0.5545
7 | x |   | x |   |   |   | x | x |   |   |   |   | x |   |   |   | 0.5447
8 | x | x | x |   | x | x |   |   | x |   |   | x | x | x | x |   | 0.5410

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:
['d', 'g']
['d', 'z']
['d', 'ʒ']
['f', 'θ']
['g', 'z']
['g', 'ʒ']
['k', 'p']
['k', 't']
['m', 'n']
['p', 't']
['z', 'ʒ']


In [12]:
perception.show_selected('JMI')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x | x |   | x |   | x | x |   |   |   |   | x | x | x |   | 0.8111
2 | x |   |   | x |   | x |   |   | x | x | x | x |   |   |   | x | 0.7003
3 | x | x | x |   | x |   |   |   |   |   | x |   | x | x | x |   | 0.6820
4 | x |   |   | x |   | x |   |   | x | x |   | x | x |   |   | x | 0.5691
5 | x | x | x | x | x |   |   |   |   | x |   |   | x | x | x | x | 0.5590
6 | x | x | x | x | x |   |   |   |   | x | x |   | x | x | x | x | 0.5545
7 | x | x | x |   | x | x |   |   | x |   |   | x | x | x | x |   | 0.5410
8 | x |   |   |   |   |   | x | x |   |   | x |   | x |   |   |   | 0.4640

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:
['d', 'ð']
['d', 'g']
['d', 'z']
['d', 'ʒ']
['ð', 'g']
['ð', 'z']
['ð', 'ʒ']
['f', 's']
['f', 'θ']
['g', 'z']
['g', 'ʒ']
['k', 'p']
['k', 't']
['m', 'n']
['p', 't']
['s', 'θ']
['z'

In [13]:
perception.show_selected('ran')

Printing 8 selected features:
  | b | d | ð | f | g | k | m | n | p | s | ʃ | t | v | z | ʒ | θ | MI_xy
1 | x | x | x |   | x |   | x | x |   |   |   |   | x | x | x |   | 0.8111
2 | x | x | x | x | x |   |   |   |   | x | x |   | x | x | x | x | 0.5545
3 | x | x | x |   | x | x |   |   | x |   |   | x | x | x | x |   | 0.5410
4 | x |   | x | x |   | x |   |   | x |   |   | x | x |   |   | x | 0.5053
5 | x |   | x |   |   |   | x | x |   |   | x |   | x |   |   |   | 0.4971
6 | x |   |   |   |   | x |   |   |   | x | x | x | x | x |   |   | 0.2676
7 | x |   |   | x |   | x | x | x |   |   |   |   |   | x | x |   | 0.2263
8 | x |   |   | x | x |   | x |   | x | x |   |   | x |   |   |   | 0.1811

----------------------------------------

Pairs of items that cannot be differentiated with the selected features:
