This notebook tests how to calculate normalized, task- and collection-specific confusion values for three categories of image pairs: within-task (0), within-collection but between-task (1), and between-collection (2).

In [19]:
import pandas
import numpy

In [2]:
# A pandas DataFrame is basically a labelled wrapper around a 2D numpy array;
# here it is built from a nested list of counts.
raw_counts = [
    [10, 1, 2, 0],
    [1, 12, 3, 1],
    [3, 1, 14, 0],
    [1, 1, 3, 16],
]
confusion_mtx = pandas.DataFrame(raw_counts)
confusion_mtx

Unnamed: 0,0,1,2,3
0,10,1,2,0
1,1,12,3,1
2,3,1,14,0
3,1,1,3,16


In [3]:
# Rows and columns can be given names after the DataFrame has been created
image_names = ["image1", "image2", "image3", "image4"]
confusion_mtx.columns = image_names
confusion_mtx.index = image_names  # "index" holds the row names
confusion_mtx
# The names can also be supplied at construction time, e.g.
# mtx = pandas.DataFrame(index=image_names, columns=image_names)

Unnamed: 0,image1,image2,image3,image4
image1,10,1,2,0
image2,1,12,3,1
image3,3,1,14,0
image4,1,1,3,16


In [4]:
# Summing along an axis is easy; the result is what pandas calls a "Series".
# axis="index" is the same as axis=0: it yields one total per column.
confusion_mtx.sum(axis="index")

image1    15
image2    15
image3    22
image4    17
dtype: int64

In [6]:
# A DataFrame can be divided by a Series: the Series labels are matched against
# the column names, so every column is divided by its own total.
column_totals = confusion_mtx.sum(axis=0)
confusion_mtx = confusion_mtx / column_totals
confusion_mtx

Unnamed: 0,image1,image2,image3,image4
image1,0.666667,0.066667,0.090909,0.0
image2,0.066667,0.8,0.136364,0.058824
image3,0.2,0.066667,0.636364,0.0
image4,0.066667,0.066667,0.136364,0.941176


In [7]:
# Keep each set of labels in a dictionary keyed by image name, so that a label
# can be looked up directly from a row or column name.
# (With only one set of labels we would just use them as the DataFrame's index and columns.)
collection = {
    "image1": 0,
    "image2": 0,
    "image3": 1,
    "image4": 1,
}
task = {
    "image1": 0,
    "image2": 0,
    "image3": 0,
    "image4": 1,
}

In [15]:
# Start from an empty, labelled DataFrame; cells receive their values as we assign to them.
# Category codes: within-task (0), within-collection (between-task) (1), bw-collection (2)
confusion_categories = pandas.DataFrame(columns=image_names, index=image_names)
confusion_categories

Unnamed: 0,image1,image2,image3,image4
image1,,,,
image2,,,,
image3,,,,
image4,,,,


In [16]:
# Label every off-diagonal cell with its confusion category:
#   0 = same collection and same task
#   1 = same collection, different task
#   2 = different collections
# Diagonal cells are skipped, so they keep the DataFrame's default NaN.
for image1_name, row_counts in confusion_mtx.iterrows():
    # iterrows() yields (row name, row) pairs; the row is again a Series.
    # Only the row's labels are needed here. The row is deliberately not written
    # to: iterrows() may return a copy or a view depending on dtypes, so assigning
    # into it could silently modify confusion_mtx.
    for image2_name in row_counts.index:
        if image1_name == image2_name:
            continue
        if collection[image1_name] != collection[image2_name]:
            category = 2
        elif task[image1_name] != task[image2_name]:
            category = 1
        else:
            category = 0
        # You use "loc" to index by row / column name
        confusion_categories.loc[image1_name, image2_name] = category

confusion_categories

Unnamed: 0,image1,image2,image3,image4
image1,,0.0,2.0,2.0
image2,0.0,,2.0,2.0
image3,2.0,2.0,,1.0
image4,2.0,2.0,1.0,


The above shows a potential bug when using a plain numpy array: if the person forgot that the diagonal had been set to 0, he/she might have counted it. With pandas the default value is NaN (you can change this by specifying a value as the first argument, e.g. `pandas.DataFrame(1, index=image_names, columns=image_names)`).

In [17]:
# Now we want to calculate the normalized value, this is where pandas comes in handy
# (the above that we just did is about the same)
# Count how often each category occurs in every column, treat categories that are
# absent from a column as 0, then add the per-column counts together.
# Series.value_counts is used because the top-level pandas.value_counts is deprecated.
value_counts = confusion_categories.apply(pandas.Series.value_counts).fillna(0).sum(axis=1)
# The index is the category value found in the DataFrame, the value is how many cells have it
value_counts

0    2
1    2
2    8
dtype: float64

In [21]:
# One normalized confusion value per category (0, 1, 2): select the cells of
# confusion_mtx whose category matches, divide them by that category's cell
# count, and average the result.
# (Boolean-masking the underlying numpy arrays; there may be a nicer pandas-only way.)
# NOTE(review): numpy.mean already divides by the number of selected cells, so this
# is effectively sum / count**2 -- confirm that this is the intended normalization.
normalized_confusion = [
    numpy.mean(
        confusion_mtx.values[confusion_categories.values == category]
        / value_counts.loc[category]
    )
    for category in range(3)
]

normalized_confusion

[0.033333333333333333, 0.034090909090909088, 0.010720254010695188]

It looks like we got the same middle value, but the other two are different. I think that is because you were including some of the zeros that we don't want to include.