# Extracting binary features from confusion matrices

A binary feature for n items is coded as an array of n binary values. This script finds the k best features from all x<sup>n-1</sup> possible features to describe all the items by considering mutual entropies between the input feature and output, and the mutual information between the input features.

In [1]:
import numpy as np
import pandas as pd
from itertools import combinations, combinations_with_replacement
from tqdm.notebook import tqdm

In [2]:
def mutual_information(matrix):
    """Calculating mutual information from tabulated data"""
    total = matrix.sum()
    flatM = matrix.ravel()
    margin0 = matrix.sum(axis=0)
    margin1 = matrix.sum(axis=1)
    divisor = np.outer(margin1,margin0).ravel()
    indices = flatM.nonzero()
    return np.sum(flatM[indices]/total*np.log2(flatM[indices]*total/divisor[indices]))
    
def mutual_x_y(matrix,feature, output_grouping = True): 
    """Tabulating data based on a feature and the outcome from the full confusion matrix. 
    If output_grouping is True, the outcome is also grouped based on the feature"""
    if output_grouping:
        indices0 = []
        indices1 = []
        for i,v in enumerate(feature):
            if v:
                indices1.append(i)
            else:
                indices0.append(i)

        matrix_xy = np.array([[matrix[np.ix_(indices0,indices0)].sum(),
                          matrix[np.ix_(indices1,indices0)].sum()],
                         [matrix[np.ix_(indices0,indices1)].sum(),
                          matrix[np.ix_(indices1,indices1)].sum()]])
    else:
        a0 = matrix[:,feature].sum(axis=1)
        a1 = matrix[:,np.invert(feature)].sum(axis=1)
        matrix_xy = np.vstack([a0,a1]).transpose()
        
    return mutual_information(matrix_xy)
    

def mutual_xi_xj(counts, indices_i1, indices_j1):
    """Tabulating the input based on two features"""
    indices_i0 = np.invert(indices_i1)
    indices_j0 = np.invert(indices_j1)

    matrix_xx = np.array([[counts[indices_i0&indices_j0].sum(),
                      counts[indices_i1&indices_j0].sum()],
                     [counts[indices_i0&indices_j1].sum(),
                      counts[indices_i1&indices_j1].sum()]])
    return mutual_information(matrix_xx)


# def mutual_xi_y_xj(counts, indices_i1, indices_j1): 
    """Calculating conditional mutual information"""
    #to fill

def mRMR(indices, features, MI_xy,MI_xx, x_counts,k):
    """To minimize MI_xy - (1/k) * MI_xx (Peng et al., 2005)"""

    selected = np.array([],dtype='int')
    
    for i in range(k):
        best = 0
        j_max = 0
        for j in range(len(indices)):
            f = indices[j]
            I_xx = 0
            for s in selected:
                if f < s:       
                    a = f
                    b = s
                else:
                    a = s
                    b = f
                if MI_xx[a,b] == -1:
                    MI_xx[a,b] = mutual_xi_xj(x_counts,features[a],features[b])
                I_xx += MI_xx[a,b]
            if I_xx:
                I_xx = I_xx/len(selected)
            j_x = MI_xy[f] - I_xx
            if j_x > j_max:
                j_max = j_x
                best = j
        selected = np.append(selected, indices[best])
        indices=np.delete(indices,best)
    
    return selected
    
    
class confusion_matrix:
    def __init__(self, file_path):
        self.df = pd.read_csv(file_path, index_col=0)
        self.df.reindex(sorted(self.df.index), axis=0)
        self.df.reindex(sorted(self.df.columns), axis=1)
        self.matrix = self.df.to_numpy()
        self.x_counts = self.matrix.sum(axis=0)
        self.n = self.matrix.shape[0]
        self.n_features = 2**(self.n-1)


    def select(self, feature_lim = None, k = 8, batch_size = 100, output_grouping = True):
        
        if k >= batch_size:
            print('Error: Batch size is smaller than the number of features to be selected')
        
        self.features = np.array([[j=='1' 
                                   for j in '1'+np.binary_repr(i, width=self.n-1)] 
                                   for i in range(self.n_features)])
        
        print('Computing I(x;y)')
        self.MI_xy = np.array([])
        for i in tqdm(range(self.n_features)):
            self.MI_xy= np.append(self.MI_xy,mutual_x_y(self.matrix,self.features[i],output_grouping))
            
        self.order = np.arange(self.n_features, dtype='int') #this or the commented line below
        np.random.shuffle(self.order)

# arranging features according to I(x,y) with a descending order 
#         self.order = np.argsort(self.MI_xy)
        
# filtering the features with I(x,y) only       
#         if self.feature_lim & self.feature_lim >self.n_features:
#             print('Only analyzing the {} features with the highest entropies'.format(feature_lim))
#             self.order = self.order[:feature_lim]        
        
        print('Selecting features with mRMR')
        self.MI_xx = -np.ones((len(self.features),len(self.features)))
        self.results = np.array([],dtype='int')
        for i in tqdm(range(np.ceil(self.n_features/batch_size).astype('int'))):
            start = i*batch_size
            batch = self.order[start:start+batch_size]
            batch = np.append(batch, self.results)
            self.results = mRMR(batch, self.features, self.MI_xy, 
                           self.MI_xx, self.x_counts,k)
        self.results = self.results[np.argsort(-self.MI_xy[self.results])]

    
    def show_selected(self):
        print('Printing {} selected features:'.format(len(self.results)))
        not_diff = []
        
        for i in range(len(self.results)):
            j = self.results[i]
            f = self.features[j]
            if not i:
                fs = f
            else:
                fs = np.vstack([fs, f])
            print('\nfeature {}: MI_xy = {}'.format(i+1, self.MI_xy[j]))
            print(list(self.df.index[f]),list(self.df.index[np.invert(f)]))
            
        for i in range(fs.shape[1]):
            for j in range(i+1,fs.shape[1]):
                if (fs[:,i]==fs[:,j]).all():
                    not_diff.append([self.df.index[i],self.df.index[j]])
        print('\n'+'-'*40)
        print('\nPairs of items that cannot be differentiated with the selected features:')
        if not i:
            print('None')
        else:
            for i in not_diff:
                print(i)

## selection with outcomes coded with features

In [3]:
production = confusion_matrix('cm_production.csv')
perception = confusion_matrix('cm_perception.csv')

In [4]:
production.select()

Computing I(x;y)


  0%|          | 0/32768 [00:00<?, ?it/s]

Selecting features with mRMR


  0%|          | 0/328 [00:00<?, ?it/s]

In [5]:
production.show_selected()

Printing 8 selected features:

feature 1: MI_xy = 0.5093390859080247
['b', 'd', 'f', 'g', 'k', 'm', 'n', 'p', 't', 'v'] ['ð', 's', 'ʃ', 'z', 'ʒ', 'θ']

feature 2: MI_xy = 0.46865112627117056
['b', 'd', 'ð', 'g', 'k', 'm', 'n', 'p', 't'] ['f', 's', 'ʃ', 'v', 'z', 'ʒ', 'θ']

feature 3: MI_xy = 0.4267946168356088
['b', 'd', 'f', 'g', 'k', 'p', 's', 'ʃ', 't', 'θ'] ['ð', 'm', 'n', 'v', 'z', 'ʒ']

feature 4: MI_xy = 0.42436985042306735
['b', 'd', 'ð', 'g', 'm', 'n', 'z', 'ʒ'] ['f', 'k', 'p', 's', 'ʃ', 't', 'v', 'θ']

feature 5: MI_xy = 0.39319095228223927
['b', 'd', 'f', 'g', 'k', 'p', 's', 'ʃ', 't', 'v', 'z'] ['ð', 'm', 'n', 'ʒ', 'θ']

feature 6: MI_xy = 0.3864864898490794
['b', 'd', 'f', 'k', 'm', 'n', 'p', 's', 'ʃ', 't', 'ʒ'] ['ð', 'g', 'v', 'z', 'θ']

feature 7: MI_xy = 0.3835056389751981
['b', 'f', 'k', 'm', 'n', 'p', 't', 'v', 'z', 'θ'] ['d', 'ð', 'g', 's', 'ʃ', 'ʒ']

feature 8: MI_xy = 0.37197597873146876
['b', 'ð', 'f', 'g', 'k', 'p', 'v', 'z', 'ʒ'] ['d', 'm', 'n', 's', 'ʃ', 't', 'θ'

In [6]:
perception.select()

Computing I(x;y)


  0%|          | 0/32768 [00:00<?, ?it/s]

Selecting features with mRMR


  0%|          | 0/328 [00:00<?, ?it/s]

In [7]:
perception.show_selected()

Printing 8 selected features:

feature 1: MI_xy = 0.797844140221025
['b', 'd', 'ð', 'g', 'm', 'n', 'v', 'z', 'ʒ'] ['f', 'k', 'p', 's', 'ʃ', 't', 'θ']

feature 2: MI_xy = 0.674671087985163
['b', 'd', 'ð', 'g', 'n', 'v', 'z', 'ʒ'] ['f', 'k', 'm', 'p', 's', 'ʃ', 't', 'θ']

feature 3: MI_xy = 0.638879803214022
['b', 'd', 'ð', 'g', 'ʃ', 'v', 'z', 'ʒ'] ['f', 'k', 'm', 'n', 'p', 's', 't', 'θ']

feature 4: MI_xy = 0.5389162433934669
['b', 'ð', 'f', 'k', 'p', 's', 'ʃ', 't', 'v', 'θ'] ['d', 'g', 'm', 'n', 'z', 'ʒ']

feature 5: MI_xy = 0.5212657025301919
['b', 'd', 'ð', 'f', 'g', 's', 'ʃ', 'v', 'z', 'ʒ', 'θ'] ['k', 'm', 'n', 'p', 't']

feature 6: MI_xy = 0.5058607622773384
['b', 'd', 'ð', 'g', 'k', 'p', 't', 'v', 'z', 'ʒ'] ['f', 'm', 'n', 's', 'ʃ', 'θ']

feature 7: MI_xy = 0.45562199221666655
['b', 'ð', 'm', 'n', 'ʃ', 'v'] ['d', 'f', 'g', 'k', 'p', 's', 't', 'z', 'ʒ', 'θ']

feature 8: MI_xy = 0.4012766059695936
['b', 'ð', 'f', 'm', 'n', 'v', 'θ'] ['d', 'g', 'k', 'p', 's', 'ʃ', 't', 'z', 'ʒ']

---

## selection with original outcomes

In [8]:
production = confusion_matrix('cm_production.csv')
perception = confusion_matrix('cm_perception.csv')

In [9]:
production.select(output_grouping=False)

Computing I(x;y)


  0%|          | 0/32768 [00:00<?, ?it/s]

Selecting features with mRMR


  0%|          | 0/328 [00:00<?, ?it/s]

In [10]:
production.show_selected()

Printing 8 selected features:

feature 1: MI_xy = 0.5571793506990322
['b', 'd', 'f', 'g', 'k', 'm', 'n', 'p', 't'] ['ð', 's', 'ʃ', 'v', 'z', 'ʒ', 'θ']

feature 2: MI_xy = 0.4775546805879241
['b', 'd', 'ð', 'g', 'k', 'm', 'n', 'p', 't', 'θ'] ['f', 's', 'ʃ', 'v', 'z', 'ʒ']

feature 3: MI_xy = 0.4758139923287613
['b', 'd', 'g', 'k', 'm', 'n', 'p', 's', 'ʃ', 't'] ['ð', 'f', 'v', 'z', 'ʒ', 'θ']

feature 4: MI_xy = 0.4643684965414386
['b', 'd', 'f', 'k', 'm', 'n', 'p', 't', 'v', 'θ'] ['ð', 'g', 's', 'ʃ', 'z', 'ʒ']

feature 5: MI_xy = 0.45940910147508035
['b', 'f', 'k', 'p', 's', 'ʃ', 't', 'v', 'θ'] ['d', 'ð', 'g', 'm', 'n', 'z', 'ʒ']

feature 6: MI_xy = 0.44131149793095104
['b', 'f', 'g', 'k', 'p', 'v', 'z', 'ʒ'] ['d', 'ð', 'm', 'n', 's', 'ʃ', 't', 'θ']

feature 7: MI_xy = 0.41733458030477516
['b', 'ð', 'm', 'n', 'v', 'z'] ['d', 'f', 'g', 'k', 'p', 's', 'ʃ', 't', 'ʒ', 'θ']

feature 8: MI_xy = 0.40961501293016506
['b', 'd', 'ð', 'g', 'k', 'v', 'ʒ'] ['f', 'm', 'n', 'p', 's', 'ʃ', 't', 'z', 'θ'

In [11]:
perception.select(output_grouping=False)

Computing I(x;y)


  0%|          | 0/32768 [00:00<?, ?it/s]

Selecting features with mRMR


  0%|          | 0/328 [00:00<?, ?it/s]

In [12]:
perception.show_selected()

Printing 8 selected features:

feature 1: MI_xy = 0.8111237162102998
['b', 'd', 'ð', 'g', 'm', 'n', 'v', 'z', 'ʒ'] ['f', 'k', 'p', 's', 'ʃ', 't', 'θ']

feature 2: MI_xy = 0.7496630095174159
['b', 'd', 'ð', 'g', 'v', 'z', 'ʒ'] ['f', 'k', 'm', 'n', 'p', 's', 'ʃ', 't', 'θ']

feature 3: MI_xy = 0.7003071389926607
['b', 'f', 'k', 'p', 's', 'ʃ', 't', 'θ'] ['d', 'ð', 'g', 'm', 'n', 'v', 'z', 'ʒ']

feature 4: MI_xy = 0.6454576189386554
['b', 'd', 'ð', 'g', 's', 'ʃ', 'v', 'z', 'ʒ'] ['f', 'k', 'm', 'n', 'p', 't', 'θ']

feature 5: MI_xy = 0.5544992649478546
['b', 'd', 'ð', 'f', 'g', 's', 'ʃ', 'v', 'z', 'ʒ', 'θ'] ['k', 'm', 'n', 'p', 't']

feature 6: MI_xy = 0.540968789728632
['b', 'd', 'ð', 'g', 'k', 'p', 't', 'v', 'z', 'ʒ'] ['f', 'm', 'n', 's', 'ʃ', 'θ']

feature 7: MI_xy = 0.5409013256412882
['b', 'ð', 'f', 'k', 'p', 's', 't', 'v', 'θ'] ['d', 'g', 'm', 'n', 'ʃ', 'z', 'ʒ']

feature 8: MI_xy = 0.4971159824291359
['b', 'ð', 'm', 'n', 'ʃ', 'v'] ['d', 'f', 'g', 'k', 'p', 's', 't', 'z', 'ʒ', 'θ']

--